Merge pull request #265 from weingartlorenz/main

This is a component designed to download the Xview dataset
claimed-framework · May 31, 2024 · b6c60c3 · b6c60c3
2 parents 2dcb166 + 5759903
commit b6c60c3
Show file tree

Hide file tree

Showing 4 changed files with 331 additions and 0 deletions.
diff --git a/component-library/input/input-Xview-download.cwl b/component-library/input/input-Xview-download.cwl
@@ -0,0 +1,57 @@
+# CWL description that invokes the `claimed` command with the Xview download component image.
+cwlVersion: 'v1.2'
+class: CommandLineTool
+
+baseCommand: claimed
+
+inputs:
+  component:
+    type: string
+    default: 'docker.io/mdorzweiler/claimed-input-xview-download:0.1'
+    inputBinding:
+      position: 1
+      prefix: '--component'
+  log_level:
+    type: string
+    default: 'INFO'
+    inputBinding:
+      position: 2
+      prefix: '--log_level'
+  username:
+    type: string
+    default: 'None'  # the literal string 'None', not a YAML null (same for the inputs below)
+    inputBinding:
+      position: 3
+      prefix: '--username'
+  password:
+    type: string
+    default: 'None'
+    inputBinding:
+      position: 4
+      prefix: '--password'
+  move_to_dir:
+    type: string
+    default: 'None'
+    inputBinding:
+      position: 5
+      prefix: '--move_to_dir'
+  chromedriver_path:
+    type: string
+    default: 'None'
+    inputBinding:
+      position: 6
+      prefix: '--chromedriver_path'
+  max_download_time:
+    type: string
+    default: 'None'
+    inputBinding:
+      position: 7
+      prefix: '--max_download_time'
+  label:
+    type: string
+    default: 'None'
+    inputBinding:
+      position: 8
+      prefix: '--label'
+
+outputs: []
diff --git a/component-library/input/input-Xview-download.ipynb b/component-library/input/input-Xview-download.ipynb
@@ -0,0 +1,213 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "147f9480",
+   "metadata": {},
+   "source": [
+    "## Xview Dataset Download \n",
+    "\n",
+    "This component is designed to download a labeled overhead image dataset, provided a chromedriver, to a specified location. \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c185c1f0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install selenium"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dc0554b5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "import os\n",
+    "import shutil\n",
+    "import time\n",
+    "from selenium import webdriver\n",
+    "from selenium.webdriver.common.by import By\n",
+    "from selenium.webdriver.support.ui import WebDriverWait\n",
+    "from selenium.webdriver.support import expected_conditions as EC\n",
+    "from urllib.parse import urlparse\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "866d16c3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# username for the Xview webpage to authorize login\n",
+    "username = os.environ.get('username')\n",
+    "\n",
+    "# password for the Xview webpage to authorize login\n",
+    "password = os.environ.get('password')\n",
+    "\n",
+    "# move_to_dir the directory where the dataset should be saved\n",
+    "move_to_dir = os.environ.get('move_to_dir')\n",
+    "\n",
+    "# chromedriver_path the directory where the local copy of chromedriver is saved\n",
+    "chromedriver_path = os.environ.get('chromedriver_path')\n",
+    "\n",
+    "# max_download_time in seconds before timeout, must be adjusted according to the file size and internet speed\n",
+    "max_download_time = os.environ.get('max_download_time')\n",
+    "\n",
+    "# The label of the file desired to download.\n",
+    "# Choose from \"TI.zip\", \"TL.zip\", \"VI.zip\", \"TI.tgz\", \"TL.tgz\", \"VI.tgz\",\n",
+    "# standing for TI=Training Images, TL=Training Labels, VI=Validation Images\n",
+    "label = os.environ.get('label')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "794506c5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "def login_and_download(username, password, move_to_dir, chromedriver_path, max_download_time, label):\n",
+    "    \"\"\"Log in to the xView challenge site and download one dataset archive.\n",
+    "\n",
+    "    Args:\n",
+    "        username: value typed into the login form's 'email' field.\n",
+    "        password: value typed into the login form's 'password' field.\n",
+    "        move_to_dir: directory Chrome downloads into (created if missing).\n",
+    "        chromedriver_path: path of the chromedriver executable.\n",
+    "        max_download_time: seconds to wait for the file; number or numeric string.\n",
+    "        label: archive to fetch, one of the keys of link_texts below.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: if label is not a known download option.\n",
+    "    \"\"\"\n",
+    "    # Local import: Selenium 4 takes the driver path through a Service object.\n",
+    "    from selenium.webdriver.chrome.service import Service\n",
+    "\n",
+    "    # Link text shown on the download page for each supported label.\n",
+    "    link_texts = {\n",
+    "        \"TI.zip\": \"Download Training Images (zip)\",\n",
+    "        \"TL.zip\": \"Download Training Labels (zip)\",\n",
+    "        \"VI.zip\": \"Download Validation Images (zip)\",\n",
+    "        \"TI.tgz\": \"Download Training Images (tgz)\",\n",
+    "        \"TL.tgz\": \"Download Training Labels (tgz)\",\n",
+    "        \"VI.tgz\": \"Download Validation Images (tgz)\",\n",
+    "    }\n",
+    "    # Validate inputs before a browser is started so bad values fail fast.\n",
+    "    if label not in link_texts:\n",
+    "        raise ValueError(\"Error: This is an invalid download option\")\n",
+    "    search_text = f'//a[contains(text(), \"{link_texts[label]}\")]'\n",
+    "    # Environment values arrive as strings; comparing one to a float raises TypeError.\n",
+    "    timeout_seconds = float(max_download_time)\n",
+    "    # Create the download directory before the download is triggered.\n",
+    "    os.makedirs(move_to_dir, exist_ok=True)\n",
+    "\n",
+    "    # Set Chrome options to automatically download files to the specified directory\n",
+    "    options = webdriver.ChromeOptions()\n",
+    "    prefs = {\n",
+    "        \"download.default_directory\": move_to_dir,\n",
+    "        \"download.prompt_for_download\": False,\n",
+    "        \"download.directory_upgrade\": True,\n",
+    "        \"safebrowsing.enabled\": True\n",
+    "    }\n",
+    "    options.add_experimental_option(\"prefs\", prefs)\n",
+    "\n",
+    "    # Start a new instance of Chrome web browser\n",
+    "    driver = webdriver.Chrome(service=Service(executable_path=chromedriver_path), options=options)\n",
+    "\n",
+    "    # try/finally closes the browser even when a step below raises.\n",
+    "    try:\n",
+    "        # Open the login page\n",
+    "        driver.get('https://challenge.xviewdataset.org/login')\n",
+    "        # Find the username and password fields and enter credentials\n",
+    "        username_field = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.NAME, 'email')))\n",
+    "        password_field = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.NAME, 'password')))\n",
+    "        username_field.send_keys(username)\n",
+    "        password_field.send_keys(password)\n",
+    "\n",
+    "        # Find and click the login button\n",
+    "        login_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, 'btn.primary')))\n",
+    "        login_button.click()\n",
+    "        # Wait for the page to load after login\n",
+    "        time.sleep(1)\n",
+    "        # Open the Download page\n",
+    "        driver.get('https://challenge.xviewdataset.org/download-links')\n",
+    "\n",
+    "        # Wait for the automatic pop-up overlay, then dismiss it by clicking the page body\n",
+    "        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'overlay--active')))\n",
+    "        # The find_element_by_* helpers were removed in Selenium 4.3; use find_element(By...).\n",
+    "        driver.find_element(By.TAG_NAME, 'body').click()\n",
+    "        time.sleep(1)\n",
+    "\n",
+    "        # Wait for the download link to be present\n",
+    "        download_link_element = WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.XPATH, search_text)))\n",
+    "\n",
+    "        # Get the dynamic download link from the href attribute\n",
+    "        download_link = download_link_element.get_attribute('href')\n",
+    "        if not download_link:\n",
+    "            print(\"Failed to get the download link.\")\n",
+    "            return\n",
+    "\n",
+    "        # Download the dataset using the obtained link\n",
+    "        driver.get(download_link)\n",
+    "        print(\"Dataset download started successfully.\")\n",
+    "\n",
+    "        # Extract the filename from the download link URL\n",
+    "        filename = urlparse(download_link).path.split('/')[-1]\n",
+    "        downloaded_file = os.path.join(move_to_dir, filename)\n",
+    "        print(downloaded_file)\n",
+    "        # Poll until the file exists and is non-empty, or the time budget is spent\n",
+    "        start_time = time.time()\n",
+    "        while True:\n",
+    "            if os.path.exists(downloaded_file) and os.path.getsize(downloaded_file) > 0:\n",
+    "                print(\"File downloaded successfully.\")\n",
+    "                break\n",
+    "            if time.time() - start_time > timeout_seconds:\n",
+    "                print(\"Error: Maximum wait time exceeded.\")\n",
+    "                break\n",
+    "            time.sleep(5)\n",
+    "    finally:\n",
+    "        driver.quit()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e7b2f96d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "login_and_download(username, password, move_to_dir, chromedriver_path, max_download_time, label)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/component-library/input/input-Xview-download.job.yaml b/component-library/input/input-Xview-download.job.yaml
@@ -0,0 +1,30 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: input-xview-download
+spec:
+  template:
+    spec:
+      containers:
+        - name: input-xview-download
+          image: 'docker.io/mdorzweiler/claimed-input-xview-download:0.1'
+          workingDir: /opt/app-root/src/
+          command: ['/opt/app-root/bin/ipython', 'claimed_input-Xview-download.ipynb']
+          env:  # NOTE(review): the value_of_* strings look like placeholders to substitute before deploying - confirm
+            - name: log_level
+              value: value_of_log_level
+            - name: username
+              value: value_of_username
+            - name: password
+              value: value_of_password
+            - name: move_to_dir
+              value: value_of_move_to_dir
+            - name: chromedriver_path
+              value: value_of_chromedriver_path
+            - name: max_download_time
+              value: value_of_max_download_time
+            - name: label
+              value: value_of_label
+      restartPolicy: OnFailure
+      imagePullSecrets:
+        - name: image_pull_secret
diff --git a/component-library/input/input-Xview-download.yaml b/component-library/input/input-Xview-download.yaml
@@ -0,0 +1,31 @@
+name: input-xview-download
+description: "## Xview Dataset Download   – CLAIMED V0.1"
+
+inputs:
+- {name: log_level, type: String, description: "update log level", default: "INFO"}
+- {name: username, type: String, description: "username for the Xview webpage to authorize login"}
+- {name: password, type: String, description: "password for the Xview webpage to authorize login"}
+- {name: move_to_dir, type: String, description: "move_to_dir the directory where the dataset should be saved"}
+- {name: chromedriver_path, type: String, description: "chromedriver_path the directory where the local copy of chromedriver is saved"}
+- {name: max_download_time, type: String, description: "max_download_time in seconds before timeout, must be adjusted according to the file size and internet speed"}
+- {name: label, type: String, description: "label of the file to download, one of TI.zip, TL.zip, VI.zip, TI.tgz, TL.tgz, VI.tgz (TI=Training Images, TL=Training Labels, VI=Validation Images)"}
+
+# The component has no outputs; an explicit empty list avoids the null a bare key would produce.
+outputs: []
+
+# Each inputValue below reaches the shell script as a positional argument ${0}..${6}, in the order listed.
+implementation:
+    container:
+        image: docker.io/mdorzweiler/claimed-input-xview-download:0.1
+        command:
+        - sh
+        - -ec
+        - |
+          ipython ./claimed_input-Xview-download.ipynb log_level="${0}" username="${1}" password="${2}" move_to_dir="${3}" chromedriver_path="${4}" max_download_time="${5}" label="${6}"
+        - {inputValue: log_level}
+        - {inputValue: username}
+        - {inputValue: password}
+        - {inputValue: move_to_dir}
+        - {inputValue: chromedriver_path}
+        - {inputValue: max_download_time}
+        - {inputValue: label}