Local file to S3 upload. #27

Merged · 9 commits · Feb 9, 2024

Changes from all commits
6 changes: 6 additions & 0 deletions docs/index.md
@@ -23,6 +23,7 @@ Available options are:
- `http(s)`
- Google Cloud Storage (`gs://`)
- Google Drive (`gdrive://` or https://drive.google.com/...). The file must be publicly accessible.
- Amazon S3 bucket (`s3://`)
- **local_name**: The name to save the file as locally
- **tag**: A tag to use to filter downloads
- **api**: The API to use to download the file. Currently supported: `elasticsearch`
@@ -36,6 +37,11 @@ Available options are:
> - [add the service account to the relevant bucket](https://cloud.google.com/storage/docs/access-control/using-iam-permissions#bucket-iam) and
> - [download a JSON key](https://cloud.google.com/iam/docs/keys-create-delete) for that service account.
> Then, set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable to point to that file.
>
> Mirroring local files to an Amazon S3 bucket requires the following:
> - [Create an AWS account](https://portal.aws.amazon.com/)
> - [Create an IAM user in AWS](https://docs.aws.amazon.com/IAM/latest/UserGuide/getting-started.html): This provides the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` needed for authentication. These two should be stored as environment variables on the user's system.
> - [Create an S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html): This will be the destination for pushing local files.
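>
> Once those are in place, a quick way to confirm that `boto3` can see the credentials is a minimal sketch like the following (the bucket name is a placeholder, not part of this repository):
>
> ```python
> import os
> import boto3
>
> # boto3 reads AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY from the environment
> assert "AWS_ACCESS_KEY_ID" in os.environ and "AWS_SECRET_ACCESS_KEY" in os.environ
>
> s3 = boto3.client("s3")
> s3.head_bucket(Bucket="my-example-bucket")  # raises a ClientError if the bucket is not reachable
> ```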

You can also include any secrets, such as API keys, that you have set as environment variables by using `{VARIABLE_NAME}`, for example:
```yaml
3 changes: 3 additions & 0 deletions example/download.yaml
Expand Up @@ -12,6 +12,9 @@
- url: https://drive.google.com/uc?id=10ojJffrPSl12OMcu4gyx0fak2CNu6qOs
local_name: gdrive_test_2.txt

- url: s3://monarch-kg-test/kghub_downloader_test_file.yaml
local_name: test_file.yaml

# - url: https://www.ebi.ac.uk/chembl/elk/es/
# api: elasticsearch
# query_file: example/query.json
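With the new `s3://` entry in `example/download.yaml`, the whole config can be run through the downloader; a minimal sketch of that call (the keyword argument names `yaml_file` and `output_dir` are assumptions about the signature, not taken from this diff):

```python
from kghub_downloader.download_utils import download_from_yaml

# Fetches every entry in the config, including the s3:// test file,
# into the given output directory (parameter names assumed).
download_from_yaml(yaml_file="example/download.yaml", output_dir="data")
```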
59 changes: 40 additions & 19 deletions kghub_downloader/download_utils.py
@@ -1,25 +1,26 @@
import os, pathlib, re
import json
import logging

import json, yaml
import compress_json # type: ignore

# from compress_json import compress_json

import os
import pathlib
import re
from multiprocessing.sharedctypes import Value

from typing import List, Optional
from urllib.error import URLError
from urllib.request import Request, urlopen


import boto3
import compress_json # type: ignore
import elasticsearch
import elasticsearch.helpers

from tqdm.auto import tqdm # type: ignore
import gdown
import yaml
from botocore.exceptions import NoCredentialsError
from google.cloud import storage
from google.cloud.storage.blob import Blob
from typing import List, Optional
import gdown
from tqdm.auto import tqdm # type: ignore

# from compress_json import compress_json


GDOWN_MAP = {"gdrive": "https://drive.google.com/uc?id="}

@@ -104,6 +105,11 @@ def download_from_yaml(
Blob.from_string(url, client=storage.Client()).download_to_filename(
outfile
)
elif url.startswith("s3://"):
s3 = boto3.client("s3")
bucket_name = url.split("/")[2]
remote_file = "/".join(url.split("/")[3:])
s3.download_file(bucket_name, remote_file, outfile)
elif any(
url.startswith(str(i))
for i in list(GDOWN_MAP.keys()) + list(GDOWN_MAP.values())
@@ -157,6 +163,8 @@ def download_from_yaml(


def mirror_to_bucket(local_file, bucket_url, remote_file) -> None:
bucket_split = bucket_url.split("/")
bucket_name = bucket_split[2]
with open(local_file, "rb"):
if bucket_url.startswith("gs://"):

@@ -165,8 +173,6 @@ def mirror_to_bucket(local_file, bucket_url, remote_file) -> None:

# Connect to GCS Bucket
storage_client = storage.Client()
bucket_split = bucket_url.split("/")
bucket_name = bucket_split[2]
bucket = storage_client.bucket(bucket_name)

# Upload blob from local file
@@ -188,12 +194,27 @@ def mirror_to_bucket(local_file, bucket_url, remote_file) -> None:
blob.upload_from_filename(local_file)

elif bucket_url.startswith("s3://"):
raise ValueError("Currently, only Google Cloud storage is supported.")
# bashCommand = f"aws s3 cp {outfile} {mirror}"
# subprocess.run(bashCommand.split())
# Create an S3 client
s3 = boto3.client("s3")

try:
# Upload the file
# ! This will only work if the user has the AWS IAM user
# ! access keys set up as environment variables.
s3.upload_file(local_file, bucket_name, remote_file)
print(f"File {local_file} uploaded to {bucket_name}/{remote_file}")
return True
except FileNotFoundError:
print(f"The file {local_file} was not found")
return False
except NoCredentialsError:
print("Credentials not available")
return False

else:
raise ValueError("Currently, only Google Cloud storage is supported.")
raise ValueError(
"Currently, only Google Cloud and S3 storage is supported."
)

return None

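Taken together, the new branches reduce to two `boto3` calls; the following standalone sketch illustrates the same idea outside the downloader (the upload key is a placeholder, and this is an illustration of the approach rather than the PR's exact code path):

```python
import boto3
from botocore.exceptions import NoCredentialsError

url = "s3://monarch-kg-test/kghub_downloader_test_file.yaml"

# Split "s3://<bucket>/<key>" into bucket name and object key,
# mirroring the string handling in download_from_yaml above.
bucket_name = url.split("/")[2]             # "monarch-kg-test"
remote_file = "/".join(url.split("/")[3:])  # "kghub_downloader_test_file.yaml"

s3 = boto3.client("s3")  # credentials come from AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY

# Download the object to a local file, as the new elif branch does
s3.download_file(bucket_name, remote_file, "test_file.yaml")

# Mirror a local file back to the bucket, as mirror_to_bucket now does
try:
    s3.upload_file("test_file.yaml", bucket_name, "uploads/test_file.yaml")
except NoCredentialsError:
    print("Credentials not available")
```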
3 changes: 2 additions & 1 deletion kghub_downloader/main.py
@@ -1,6 +1,7 @@
from typing import Optional, List
from typing import List, Optional

import typer

from kghub_downloader.download_utils import download_from_yaml

typer_app = typer.Typer()