From 77849391915623a2f01dc8467ca8175755ac77a1 Mon Sep 17 00:00:00 2001 From: Nir Ben-Or Date: Tue, 17 Sep 2024 15:26:12 -0500 Subject: [PATCH 1/9] Initial commit, adding the bootstrap from export guide, bootstrap.shscript, and linking the bootstrap guide to the main DB README.md doc Signed-off-by: Nir Ben-Or --- docs/database/README.md | 4 + docs/database/bootstrap.md | 404 ++++++++++++++++++ .../main/resources/db/scripts/bootstrap.sh | 257 +++++++++++ 3 files changed, 665 insertions(+) create mode 100644 docs/database/bootstrap.md create mode 100644 hedera-mirror-importer/src/main/resources/db/scripts/bootstrap.sh diff --git a/docs/database/README.md b/docs/database/README.md index d23e4a324d5..492e697fa4f 100644 --- a/docs/database/README.md +++ b/docs/database/README.md @@ -304,3 +304,7 @@ is expected to migrate full mainnet data in 10 days. ## Citus Backup and Restore Please refer to this [document](/docs/database/citus.md) for the steps. + +## Bootstrap a DB from exported data + +Please refer to this [document](/docs/database/bootstrap.md) for instructions. \ No newline at end of file diff --git a/docs/database/bootstrap.md b/docs/database/bootstrap.md new file mode 100644 index 00000000000..b2ca7c41282 --- /dev/null +++ b/docs/database/bootstrap.md @@ -0,0 +1,404 @@ +# Mirror Node Database Bootstrap Guide + +This guide provides step-by-step instructions for setting up a fresh PostgreSQL 14 database and importing Mirror Node data into it. The process involves initializing the database, configuring environment variables, and running the import script. The data import is a long-running process, so it's recommended to run it within a `screen` or `tmux` session. + +--- + +## Table of Contents + +- [Mirror Node Database Bootstrap Guide](#mirror-node-database-bootstrap-guide) + - [Table of Contents](#table-of-contents) + - [Prerequisites](#prerequisites) + - [Database Initialization](#database-initialization) + - [1. 
Configure Environment Variables](#1-configure-environment-variables) + - [2. Important Note for Google Cloud SQL Users](#2-important-note-for-google-cloud-sql-users) + - [3. Run the Initialization Script](#3-run-the-initialization-script) + - [4. Import the Database Schema](#4-import-the-database-schema) + - [Data Import Process](#data-import-process) + - [1. Download the Database Export Data](#1-download-the-database-export-data) + - [2. Download the Import Script](#2-download-the-import-script) + - [3. Run the Import Script](#3-run-the-import-script) + - [Mirror Node Version Compatibility](#mirror-node-version-compatibility) + - [Handling Failed Imports](#handling-failed-imports) + - [Steps to Handle Failed Imports:](#steps-to-handle-failed-imports) + - [Additional Notes](#additional-notes) + - [Troubleshooting](#troubleshooting) + +--- + +## Prerequisites + +- **PostgreSQL 14** installed and running. +- Access to a machine where you can run the initialization and import scripts and connect to the PostgreSQL database. +- A Google Cloud Platform (GCP) account with a valid billing account attached (required for downloading data from a Requester Pays bucket). + +--- + +## Database Initialization + +### 1. Configure Environment Variables + +Set the following environment variables on the machine from which you will run the initialization and import scripts. These variables allow for database connectivity and authentication. + +**Database Connection Variables:** + +```bash +export PGUSER="postgres" +export PGPASSWORD="YOUR_POSTGRES_PASSWORD" +export PGDATABASE="postgres" +export PGHOST="DB_IP_ADDRESS" +``` + +- `PGUSER`: The PostgreSQL superuser with administrative privileges (typically `postgres`). +- `PGPASSWORD`: Password for the PostgreSQL superuser. +- `PGDATABASE`: The default database to connect to (`postgres` by default). +- `PGHOST`: The IP address or hostname of your PostgreSQL database server. 
+ +**Database User Password Variables:** + +Set the following environment variables to define passwords for the various database users that will be created during initialization. + +```bash +export GRAPHQL_PASSWORD="SET_PASSWORD" +export GRPC_PASSWORD="SET_PASSWORD" +export IMPORTER_PASSWORD="SET_PASSWORD" +export OWNER_PASSWORD="SET_PASSWORD" +export REST_PASSWORD="SET_PASSWORD" +export REST_JAVA_PASSWORD="SET_PASSWORD" +export ROSETTA_PASSWORD="SET_PASSWORD" +export WEB3_PASSWORD="SET_PASSWORD" +``` + +- Replace `SET_PASSWORD` with strong, unique passwords for each respective user. + +### 2. Important Note for Google Cloud SQL Users + +If you are using **Google Cloud SQL** for your PostgreSQL database, an additional step is required before running the `init.sh` script to ensure proper initialization. + +**Add the Following Line to the Initialization Script:** + +Before running the `init.sh` script, you need to grant the `mirror_node` role to the `postgres` user. This is necessary because Google Cloud SQL restricts certain permissions for the `postgres` user. + +Add the following line **before** running the `init.sh` script: + +```sql +GRANT mirror_node TO postgres; +``` + +**Revised Section of `init.sh`:** + +```sql +-- Create database & owner +CREATE USER :ownerUsername WITH LOGIN PASSWORD :'ownerPassword'; +GRANT mirror_node TO postgres; +CREATE DATABASE :dbName WITH OWNER :ownerUsername; +``` + +- This adjustment ensures that the `postgres` user has the necessary permissions to execute the initialization script correctly on Google Cloud SQL. + +### 3. Run the Initialization Script + +Download the initialization script `init.sh` from the repository: + +```bash +curl -O https://raw.githubusercontent.com/hashgraph/hedera-mirror-node/main/hedera-mirror-importer/src/main/resources/db/scripts/init.sh +chmod +x init.sh +``` + +Run the initialization script: + +```bash +./init.sh +echo "EXIT STATUS: $?" 
+``` + +- The exit status `0` indicates the script executed successfully. +- The script will create the `mirror_node` database, along with all necessary roles, users, and permissions within your PostgreSQL database, using the passwords specified in the environment variables. + +### 4. Import the Database Schema + +After the initialization script completes successfully, update the environment variables to connect using the `mirror_node` user and database: + +```bash +export PGUSER="mirror_node" +export PGPASSWORD="$OWNER_PASSWORD" # Use the password set for OWNER_PASSWORD +export PGDATABASE="mirror_node" +``` + +Import the database schema: + +```bash +psql -f schema.sql +echo "EXIT STATUS: $?" +``` + +- Ensure the exit status is `0` to confirm the schema was imported successfully. + +--- + +## Data Import Process + +### 1. Download the Database Export Data + +The Mirror Node database export data is available in a Google Cloud Storage (GCS) bucket: + +- **Bucket URL:** [mirrornode-db-export](https://console.cloud.google.com/storage/browser/mirrornode-db-export) + +**Important Notes:** + +- The bucket is **read-only** to the public. +- It is configured as **Requester Pays**, meaning you need a GCP account with a valid billing account attached to download the data. +- You will be billed for the data transfer fees incurred during the download. + +**Download Instructions:** + +1. **Authenticate with GCP:** + + Ensure you have the [Google Cloud SDK](https://cloud.google.com/sdk/docs/install) installed and authenticated: + + ```bash + gcloud auth login + gcloud config set billing/disable_usage_reporting false + ``` + +2. **Set the Default Project:** + + ```bash + gcloud config set project YOUR_GCP_PROJECT_ID + ``` + +3. 
**Download the Data:** + + Create an empty directory to store the data and download all files and subdirectories: + + ```bash + mkdir -p /path/to/db_export + gsutil -u YOUR_GCP_PROJECT_ID -m cp -r gs://mirrornode-db-export/* /path/to/db_export/ + ``` + + - Replace `/path/to/db_export` with your desired directory path. + - Ensure all files and subdirectories are downloaded into this single parent directory. + - **Note:** The `-m` flag enables parallel downloads to speed up the process. + +### 2. Download the Import Script + +Download the import script `bootstrap.sh` from the repository: + +```bash +curl -O https://raw.githubusercontent.com/hashgraph/hedera-mirror-node/main/hedera-mirror-importer/src/main/resources/db/scripts/bootstrap.sh +chmod +x bootstrap.sh +``` + +### 3. Run the Import Script + +The import script is designed to efficiently import the Mirror Node data into your PostgreSQL database. It handles compressed CSV files and uses parallel processing to speed up the import. + +**Script Summary:** + +- **Name:** `bootstrap.sh` +- **Functionality:** Imports data from compressed CSV files into the PostgreSQL database using parallel processing. It processes multiple tables concurrently based on the number of CPU cores specified. +- **Requirements:** Ensure that the environment variables for database connectivity are set (`PGUSER`, `PGPASSWORD`, `PGDATABASE`, `PGHOST`). + +**Instructions:** + +1. **Ensure Environment Variables are Set:** + + The environment variables should still be set from the previous steps. Verify them: + + ```bash + echo $PGUSER # Should output 'mirror_node' + echo $PGPASSWORD # Should output the password you set for OWNER_PASSWORD + echo $PGDATABASE # Should output 'mirror_node' + echo $PGHOST # Should be set to your DB IP address + ``` + +2. 
**Run the Import Script within a `screen` or `tmux` Session:** + + It's recommended to run the import script within a `screen` or `tmux` session, as the import process may take several hours to complete. + + **Using `screen`:** + + ```bash + screen -S db_import + ``` + + **Run the Import Script:** + + ```bash + ./bootstrap.sh 8 /path/to/db_export/ + ``` + + - `8` is the number of CPU cores on the database instance. The script keeps one core free on the database instance and one on the local machine, so it runs at most `min(8, local cores) - 1` imports in parallel. Use a value of at least `2` and run the script on a machine with at least two cores; otherwise no import job can start. Adjust this number based on your system's resources. + - `/path/to/db_export/` is the directory where you downloaded the database export data. + + **Detach from the `screen` Session:** + + Press `Ctrl+A` then `D`. + + - This allows the import process to continue running in the background. + + **Reattach to the `screen` Session Later:** + + ```bash + screen -r db_import + ``` + +3. **Monitor the Import Process:** + + - The script will output logs indicating the progress of the import. + - Check the `import.log` file for detailed logs and any error messages. + +4. **Check the Exit Status:** + + After the script completes, check the exit status: + + ```bash + echo "EXIT STATUS: $?" + ``` + + - An exit status of `0` indicates the import completed successfully. + - If the exit status is not `0`, refer to the `import.log` file and `import_tracking.txt` for troubleshooting. + +--- + +## Mirror Node Version Compatibility + +Before initializing your Mirror Node with the imported database, it's crucial to ensure version compatibility. + +**MIRRORNODE_VERSION File:** + +- In the database export data, there is a file named `MIRRORNODE_VERSION`. +- This file contains the version of the Mirror Node at the time of the database export. + +**Importance:** + +- Your Mirror Node instance must be initialized with the **same version** as specified in the `MIRRORNODE_VERSION` file. +- Using a different version may lead to compatibility issues and/or schema mismatches. + +**Action Required:** + +1. 
**Check the Mirror Node Version:** + + - Open the `MIRRORNODE_VERSION` file: + + ```bash + cat /path/to/db_export/MIRRORNODE_VERSION + ``` + + - Note the version number specified. + +--- + +## Handling Failed Imports + +During the import process, the script generates a file named `import_tracking.txt`, which logs the status of each file import. Each line in this file contains the path and name of a file, followed by its import status: `NOT_STARTED`, `IN_PROGRESS`, `IMPORTED`, or `FAILED_TO_IMPORT`. + +**Statuses:** + +- `NOT_STARTED`: The file has not yet been processed. +- `IN_PROGRESS`: The file is currently being imported. +- `IMPORTED`: The file was successfully imported. +- `FAILED_TO_IMPORT`: The file failed to import. + +**Example of `import_tracking.txt`:** + +``` +/path/to/db_export/record_file.csv.gz IMPORTED +/path/to/db_export/transaction/transaction_part_1.csv.gz IMPORTED +/path/to/db_export/transaction/transaction_part_2.csv.gz FAILED_TO_IMPORT +/path/to/db_export/account.csv.gz NOT_STARTED +``` + +### Steps to Handle Failed Imports: + +1. **Identify Files to Re-import:** + + - Open the `import_tracking.txt` file. + - Look for files with the status `FAILED_TO_IMPORT` or `NOT_STARTED`. + - These files either failed to import or were not processed due to interruption. + +2. **Re-run the Import Script:** + + - You can re-run the import script; it will skip files marked as `IMPORTED` and attempt to import files with statuses `NOT_STARTED`, `IN_PROGRESS`, or `FAILED_TO_IMPORT`. + + ```bash + ./bootstrap.sh 8 /path/to/db_export/ + ``` + + - The script will resume importing where it left off. + +3. 
**Alternatively, Collect Specific Files to Re-import:** + + - Create a new directory to hold the files to be re-imported: + + ```bash + mkdir -p /path/to/reimport_files + ``` + + - Copy the failed and not started files to the new directory, keeping each file's path relative to the export directory. The script takes the table name from the parent directory for files inside a subdirectory (for example, `transaction/transaction_part_2.csv.gz` is imported into `transaction`), so copying every file into a single flat directory would make it target the wrong table. Replace `/path/to/db_export/` with the export directory exactly as it appears in `import_tracking.txt`: + + ```bash + grep -E "FAILED_TO_IMPORT|NOT_STARTED" import_tracking.txt | awk '{print $1}' | while read -r f; do rel="${f#/path/to/db_export/}"; mkdir -p "/path/to/reimport_files/$(dirname "$rel")"; cp "$f" "/path/to/reimport_files/$rel"; done + ``` + + - Run the import script, pointing it to the new directory: + + ```bash + ./bootstrap.sh 8 /path/to/reimport_files/ + ``` + +4. **Verify the Imports:** + + - Check the `import_tracking.txt` and `import.log` files to ensure that all files have been imported successfully. + - If files continue to fail, review the error messages in `import.log` for troubleshooting. + +**Note on Data Consistency:** + +- When a file import fails, the database transaction ensures that **no partial data** is committed. +- This means that when you re-run the import script, you can safely re-import failed files without worrying about duplicates or inconsistencies. +- The database tables remain in the same state as before the failed import attempt. + +--- + +## Additional Notes + +- **Data Integrity:** The import script ensures data integrity by using transactions. If an error occurs during the import of a file, that file's data will not be committed to the database. +- **System Resources:** Adjust the number of CPU cores used (`8` in the example) based on your system's capabilities to prevent overloading the server. +- **Security:** Ensure that the passwords set in the environment variables are kept secure and not exposed in logs or command history. +- **Concurrent Write Safety:** The script uses file locking (`flock`) to safely handle concurrent writes to `import_tracking.txt`. This prevents race conditions and ensures the tracking file remains consistent. 
+- **Resuming Imports:** The script maintains the status of all files in `import_tracking.txt`, allowing you to resume imports after an interruption without re-importing already imported files. +- **Required Tools:** Ensure that all required tools (`psql`, `gunzip`, `realpath`, `flock`, `nproc`, `pgrep`) are installed on your system. The script only verifies the first four at startup. + +--- + +## Troubleshooting + +- **Connection Errors:** + + - Confirm that `PGHOST` is correctly set to the IP address or hostname of your database server. + - Ensure that the database server allows connections from your client machine. + +- **Import Failures:** + + - Check the `import.log` file generated by the import script for detailed error messages. + - Review the `import_tracking.txt` file to identify which files failed to import. + +- **Interruption Handling:** + + - If the import process is interrupted (e.g., due to a network issue or manual cancellation), each file keeps the last status recorded in `import_tracking.txt`. On manual cancellation the script terminates the running import jobs; it does not rewrite their entries. + - Files that were in progress will be marked as `IN_PROGRESS` or remain as `NOT_STARTED` if they had not begun. + - Upon restarting the script, it will: + - Skip files marked as `IMPORTED`. + - Attempt to import files with statuses `NOT_STARTED`, `IN_PROGRESS`, or `FAILED_TO_IMPORT`. + +- **Bash Version Compatibility:** + + - The import script requires Bash version 4.3 or higher. Check your Bash version with: + + ```bash + bash --version + ``` + + - If using an older version of Bash, consider updating to the minimum required version. 
+ +--- diff --git a/hedera-mirror-importer/src/main/resources/db/scripts/bootstrap.sh b/hedera-mirror-importer/src/main/resources/db/scripts/bootstrap.sh new file mode 100644 index 00000000000..64a5ad65723 --- /dev/null +++ b/hedera-mirror-importer/src/main/resources/db/scripts/bootstrap.sh @@ -0,0 +1,257 @@ +#!/bin/bash + +# Enable job control +set -m + +show_help() { + echo "Usage: $0 [OPTIONS] DB_CPU_CORES IMPORT_DIR" + echo + echo "Imports data into a PostgreSQL database from compressed CSV files." + echo + echo "Options:" + echo " -h, --help, -H Show this help message and exit." + echo + echo "Arguments:" + echo " DB_CPU_CORES Number of CPU cores on the DB instance to thread the import jobs." + echo " IMPORT_DIR Path to the directory containing the compressed CSV files." + echo + echo "Example:" + echo " $0 8 /path/to/db_export" + echo +} + +# Parse options +if [[ $# -eq 0 ]]; then + echo "No arguments provided. Use --help or -h for usage information." + exit 1 +fi + +while [[ "$#" -gt 0 ]]; do + case $1 in + -h|--help|-H) + show_help + exit 0 + ;; + *) + break + ;; + esac +done + +# Check if required arguments are supplied +if [[ -z "$1" || -z "$2" ]]; then + echo "Error: Both DB_CPU_CORES and IMPORT_DIR must be provided." + echo "Use --help or -h for usage information." + exit 1 +fi + +DB_CPU_CORES="$1" +IMPORT_DIR="$2" + +# Convert IMPORT_DIR to an absolute path +IMPORT_DIR="$(realpath "$IMPORT_DIR")" + +# Check if IMPORT_DIR exists and is a directory +if [[ ! -d "$IMPORT_DIR" ]]; then + echo "Error: IMPORT_DIR '$IMPORT_DIR' does not exist or is not a directory." 
+ exit 1 +fi + +AVAILABLE_CORES=$(( $(nproc) - 1 )) # Leave one core free for the local system +DB_AVAILABLE_CORES=$((DB_CPU_CORES - 1)) # Leave one core free for the DB instance + +if [[ $AVAILABLE_CORES -lt $DB_AVAILABLE_CORES ]]; then + DB_AVAILABLE_CORES=$AVAILABLE_CORES +fi + +max_jobs="$DB_AVAILABLE_CORES" + +# Set PostgreSQL environment variables +export PGUSER=${PGUSER:-"DB_OWNER"} +export PGPASSWORD=${PGPASSWORD:-"DB_PASSWORD"} +export PGHOST=${PGHOST:-"DB_ADDRESS"} +export PGDATABASE=${PGDATABASE:-"DB_NAME"} + +LOG_FILE="import.log" +TRACKING_FILE="import_tracking.txt" +LOCK_FILE="import_tracking.lock" + +# Check if required tools are installed +REQUIRED_TOOLS=("psql" "gunzip" "realpath" "flock") +for tool in "${REQUIRED_TOOLS[@]}"; do + if ! command -v "$tool" &> /dev/null; then + echo "Error: $tool is not installed. Please install it to continue." + exit 1 + fi +done + +# Log using UTC times +log() { + local msg="$1" + local level="${2:-INFO}" + local timestamp + timestamp=$(date -u '+%Y-%m-%d %H:%M:%S') + + echo "$timestamp - $level - $msg" | tee -a "$LOG_FILE" +} + +# Function to kill a process and its descendants +kill_descendants() { + local pid="$1" + local children + children=$(pgrep -P "$pid") + for child in $children; do + kill_descendants "$child" + done + kill -TERM "$pid" 2>/dev/null +} + +# Function to handle script termination +cleanup() { + log "Script interrupted. Terminating background jobs..." "ERROR" + # Ignore further signals during cleanup + trap '' SIGINT SIGTERM + + # Kill all background jobs and their descendants + for pid in "${pids[@]}"; do + kill_descendants "$pid" + done + + wait 2>/dev/null + log "All background jobs terminated." 
+ exit 1 +} + +# Trap signals +trap 'cleanup' SIGINT SIGTERM + +# Function to safely write to tracking file with lock +write_tracking_file() { + local file="$1" + local status="$2" + ( + flock -x 200 + + # Remove any existing entry for the file + grep -v "^$file " "$TRACKING_FILE" > "${TRACKING_FILE}.tmp" 2>/dev/null || true + mv "${TRACKING_FILE}.tmp" "$TRACKING_FILE" + + # Add the new status + echo "$file $status" >> "$TRACKING_FILE" + ) 200>"$LOCK_FILE" +} + +# Function to read status from tracking file +read_tracking_status() { + local file="$1" + grep "^$file " "$TRACKING_FILE" 2>/dev/null | awk '{print $2}' +} + +# Function to collect all import tasks +collect_import_tasks() { + find "$IMPORT_DIR" -type f -name "*.csv.gz" +} + +# Main script execution +log "Starting DB import." + +# Get the list of files to import +mapfile -t files < <(collect_import_tasks) + +# Initialize the tracking file with all files as NOT_STARTED +( + flock -x 200 + for file in "${files[@]}"; do + # Only add if not already in tracking file + if ! grep -q "^$file " "$TRACKING_FILE" 2>/dev/null; then + echo "$file NOT_STARTED" >> "$TRACKING_FILE" + fi + done +) 200>"$LOCK_FILE" + +# Initialize variables +pids=() +overall_success=0 + +# Export necessary functions and variables +export -f import_file log kill_descendants write_tracking_file read_tracking_status +export IMPORT_DIR LOG_FILE TRACKING_FILE LOCK_FILE PGUSER PGPASSWORD PGHOST PGDATABASE + +# Function to import a single file +import_file() { + local file="$1" + local table + + # Determine the table name + if [[ "$(dirname "$file")" == "$IMPORT_DIR" ]]; then + table=$(basename "$file" .csv.gz) + else + table=$(basename "$(dirname "$file")") + fi + + # Update status to IN_PROGRESS + write_tracking_file "$file" "IN_PROGRESS" + log "Importing table $table from $file" + + if { + echo "BEGIN;" + echo "\\copy $table FROM STDIN WITH CSV HEADER;" + gunzip -c "$file" + echo "\." 
+ echo "COMMIT;" + } | psql -q -v ON_ERROR_STOP=1; then + log "Successfully imported $file into $table" + # Update the status to IMPORTED + write_tracking_file "$file" "IMPORTED" + else + log "Failed to import $file into $table" "ERROR" + # Update the status to FAILED_TO_IMPORT + write_tracking_file "$file" "FAILED_TO_IMPORT" + return 1 + fi +} + +# Loop through files and manage parallel execution +for file in "${files[@]}"; do + # Check if the file has already been imported + status=$(read_tracking_status "$file") + if [[ "$status" == "IMPORTED" ]]; then + log "Skipping already imported file $file" + continue + fi + + # Wait if max_jobs are already running + while [[ ${#pids[@]} -ge $max_jobs ]]; do + # Wait for any job to finish + if ! wait -n; then + overall_success=1 + fi + + # Remove completed PIDs from the array + new_pids=() + for pid in "${pids[@]}"; do + if kill -0 "$pid" 2>/dev/null; then + new_pids+=("$pid") + fi + done + pids=("${new_pids[@]}") + done + + # Start import in background + import_file "$file" & + pids+=($!) +done + +# Wait for all remaining jobs to finish +for pid in "${pids[@]}"; do + if ! wait "$pid"; then + overall_success=1 + fi +done + +if [[ $overall_success -eq 0 ]]; then + log "DB import completed successfully." 
+else + log "DB import completed with errors" "ERROR" + exit 1 +fi \ No newline at end of file From 563ccab619753b456487a3cfd7a6d5f349edd72a Mon Sep 17 00:00:00 2001 From: Nir Ben-Or Date: Tue, 17 Sep 2024 15:39:29 -0500 Subject: [PATCH 2/9] Improve comments Signed-off-by: Nir Ben-Or --- .../src/main/resources/db/scripts/bootstrap.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/hedera-mirror-importer/src/main/resources/db/scripts/bootstrap.sh b/hedera-mirror-importer/src/main/resources/db/scripts/bootstrap.sh index 64a5ad65723..8aecee08c84 100644 --- a/hedera-mirror-importer/src/main/resources/db/scripts/bootstrap.sh +++ b/hedera-mirror-importer/src/main/resources/db/scripts/bootstrap.sh @@ -95,7 +95,7 @@ log() { echo "$timestamp - $level - $msg" | tee -a "$LOG_FILE" } -# Function to kill a process and its descendants +# Kill a process and its descendants kill_descendants() { local pid="$1" local children @@ -106,7 +106,7 @@ kill_descendants() { kill -TERM "$pid" 2>/dev/null } -# Function to handle script termination +# Handle script termination cleanup() { log "Script interrupted. Terminating background jobs..." 
"ERROR" # Ignore further signals during cleanup @@ -125,7 +125,7 @@ cleanup() { # Trap signals trap 'cleanup' SIGINT SIGTERM -# Function to safely write to tracking file with lock +# Safely write to tracking file with lock write_tracking_file() { local file="$1" local status="$2" @@ -141,13 +141,13 @@ write_tracking_file() { ) 200>"$LOCK_FILE" } -# Function to read status from tracking file +# Read status from tracking file read_tracking_status() { local file="$1" grep "^$file " "$TRACKING_FILE" 2>/dev/null | awk '{print $2}' } -# Function to collect all import tasks +# Collect all import tasks collect_import_tasks() { find "$IMPORT_DIR" -type f -name "*.csv.gz" } @@ -177,7 +177,7 @@ overall_success=0 export -f import_file log kill_descendants write_tracking_file read_tracking_status export IMPORT_DIR LOG_FILE TRACKING_FILE LOCK_FILE PGUSER PGPASSWORD PGHOST PGDATABASE -# Function to import a single file +# Import a single file import_file() { local file="$1" local table From bba77e987ae0623e8fa37ed2307ad75841f3fd94 Mon Sep 17 00:00:00 2001 From: Nir Ben-Or <119968212+nirbosl@users.noreply.github.com> Date: Thu, 19 Sep 2024 10:33:44 -0500 Subject: [PATCH 3/9] Update title - suggestion accepted Co-authored-by: Steven Sheehy <17552371+steven-sheehy@users.noreply.github.com> Signed-off-by: Nir Ben-Or <119968212+nirbosl@users.noreply.github.com> --- docs/database/bootstrap.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/database/bootstrap.md b/docs/database/bootstrap.md index b2ca7c41282..b470f72633b 100644 --- a/docs/database/bootstrap.md +++ b/docs/database/bootstrap.md @@ -1,4 +1,4 @@ -# Mirror Node Database Bootstrap Guide +# Database Bootstrap Guide This guide provides step-by-step instructions for setting up a fresh PostgreSQL 14 database and importing Mirror Node data into it. The process involves initializing the database, configuring environment variables, and running the import script. 
The data import is a long-running process, so it's recommended to run it within a `screen` or `tmux` session. From 26dffb1f6a4ddae9b5b7ccd67d929a4749d020f8 Mon Sep 17 00:00:00 2001 From: Nir Ben-Or Date: Thu, 19 Sep 2024 11:51:02 -0500 Subject: [PATCH 4/9] Several changes and updates following Steven's review comments; More to come Signed-off-by: Nir Ben-Or --- docs/database/bootstrap.md | 38 +++++++++---------- .../src/main/resources/db/scripts/init.sh | 12 +++++- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/docs/database/bootstrap.md b/docs/database/bootstrap.md index b470f72633b..c78fa192175 100644 --- a/docs/database/bootstrap.md +++ b/docs/database/bootstrap.md @@ -1,34 +1,32 @@ # Database Bootstrap Guide -This guide provides step-by-step instructions for setting up a fresh PostgreSQL 14 database and importing Mirror Node data into it. The process involves initializing the database, configuring environment variables, and running the import script. The data import is a long-running process, so it's recommended to run it within a `screen` or `tmux` session. +This guide provides step-by-step instructions for setting up a fresh PostgreSQL database and importing Mirror Node data into it. The process involves initializing the database, configuring environment variables, and running the import script. The data import is a long-running process, so it's recommended to run it within a `screen` or `tmux` session. --- ## Table of Contents -- [Mirror Node Database Bootstrap Guide](#mirror-node-database-bootstrap-guide) - - [Table of Contents](#table-of-contents) - - [Prerequisites](#prerequisites) - - [Database Initialization](#database-initialization) - - [1. Configure Environment Variables](#1-configure-environment-variables) - - [2. Important Note for Google Cloud SQL Users](#2-important-note-for-google-cloud-sql-users) - - [3. Run the Initialization Script](#3-run-the-initialization-script) - - [4. 
Import the Database Schema](#4-import-the-database-schema) - - [Data Import Process](#data-import-process) - - [1. Download the Database Export Data](#1-download-the-database-export-data) - - [2. Download the Import Script](#2-download-the-import-script) - - [3. Run the Import Script](#3-run-the-import-script) - - [Mirror Node Version Compatibility](#mirror-node-version-compatibility) - - [Handling Failed Imports](#handling-failed-imports) - - [Steps to Handle Failed Imports:](#steps-to-handle-failed-imports) - - [Additional Notes](#additional-notes) - - [Troubleshooting](#troubleshooting) +- [Prerequisites](#prerequisites) +- [Database Initialization](#database-initialization) + - [1. Configure Environment Variables](#1-configure-environment-variables) + - [2. Important Note for Google Cloud SQL Users](#2-important-note-for-google-cloud-sql-users) + - [3. Run the Initialization Script](#3-run-the-initialization-script) + - [4. Import the Database Schema](#4-import-the-database-schema) +- [Data Import Process](#data-import-process) + - [1. Download the Database Export Data](#1-download-the-database-export-data) + - [2. Download the Import Script](#2-download-the-import-script) + - [3. Run the Import Script](#3-run-the-import-script) +- [Mirror Node Version Compatibility](#mirror-node-version-compatibility) +- [Handling Failed Imports](#handling-failed-imports) + - [Steps to Handle Failed Imports:](#steps-to-handle-failed-imports) +- [Additional Notes](#additional-notes) +- [Troubleshooting](#troubleshooting) --- ## Prerequisites -- **PostgreSQL 14** installed and running. +- **PostgreSQL 16** installed and running. - Access to a machine where you can run the initialization and import scripts and connect to the PostgreSQL database. - A Google Cloud Platform (GCP) account with a valid billing account attached (required for downloading data from a Requester Pays bucket). 
@@ -47,12 +45,14 @@ export PGUSER="postgres" export PGPASSWORD="YOUR_POSTGRES_PASSWORD" export PGDATABASE="postgres" export PGHOST="DB_IP_ADDRESS" +export PGPORT="DB_PORT" ``` - `PGUSER`: The PostgreSQL superuser with administrative privileges (typically `postgres`). - `PGPASSWORD`: Password for the PostgreSQL superuser. - `PGDATABASE`: The default database to connect to (`postgres` by default). - `PGHOST`: The IP address or hostname of your PostgreSQL database server. +- `PGPORT`: The database server port number (`5432` by default). **Database User Password Variables:** diff --git a/hedera-mirror-importer/src/main/resources/db/scripts/init.sh b/hedera-mirror-importer/src/main/resources/db/scripts/init.sh index ba6d7d2198e..a7fcbf7e7ff 100755 --- a/hedera-mirror-importer/src/main/resources/db/scripts/init.sh +++ b/hedera-mirror-importer/src/main/resources/db/scripts/init.sh @@ -4,7 +4,9 @@ set -e export PGCONNECT_TIMEOUT="${PGCONNECT_TIMEOUT:-3}" export PGDATABASE="${POSTGRES_DB:-postgres}" export PGHOST="${PGHOST}" +export PGPORT="${PGPORT:-5432}" export PGUSER="${POSTGRES_USER:-postgres}" +export IS_GCP_CLOUD_SQL="${IS_GCP_CLOUD_SQL:-false}" DB_SPECIFIC_EXTENSION_SQL="create extension btree_gist; create extension pg_trgm;" @@ -47,10 +49,18 @@ psql --set ON_ERROR_STOP=1 \ --set "rosettaUsername=${ROSETTA_USERNAME:-mirror_rosetta}" \ --set "web3Password=${WEB3_PASSWORD:-mirror_web3_pass}" \ --set "web3Username=${WEB3_USERNAME:-mirror_web3}" \ - --set "tempSchema=${DB_TEMPSCHEMA:-temporary}" <<__SQL__ + --set "tempSchema=${DB_TEMPSCHEMA:-temporary}" \ + --set "isGcpCloudSql=${IS_GCP_CLOUD_SQL}" \ + --set "pgUser=${PGUSER}" <<__SQL__ -- Create database & owner create user :ownerUsername with login password :'ownerPassword'; + +-- Conditional GRANT statement for Google Cloud SQL +\if :isGcpCloudSql + grant mirror_node to :pgUser; +\endif + create database :dbName with owner :ownerUsername; -- Add extensions From e3e0b9cf73438cdb4167c529cf5302611bc78885 Mon Sep 17 
00:00:00 2001 From: Nir Ben-Or Date: Thu, 19 Sep 2024 12:22:27 -0500 Subject: [PATCH 5/9] Continuation of the review comments resolution Signed-off-by: Nir Ben-Or --- docs/database/bootstrap.md | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/docs/database/bootstrap.md b/docs/database/bootstrap.md index c78fa192175..34baefae28c 100644 --- a/docs/database/bootstrap.md +++ b/docs/database/bootstrap.md @@ -73,32 +73,15 @@ export WEB3_PASSWORD="SET_PASSWORD" ### 2. Important Note for Google Cloud SQL Users -If you are using **Google Cloud SQL** for your PostgreSQL database, an additional step is required before running the `init.sh` script to ensure proper initialization. - -**Add the Following Line to the Initialization Script:** - -Before running the `init.sh` script, you need to grant the `mirror_node` role to the `postgres` user. This is necessary because Google Cloud SQL restricts certain permissions for the `postgres` user. - -Add the following line **before** running the `init.sh` script: - -```sql -GRANT mirror_node TO postgres; -``` - -**Revised Section of `init.sh`:** - -```sql --- Create database & owner -CREATE USER :ownerUsername WITH LOGIN PASSWORD :'ownerPassword'; -GRANT mirror_node TO postgres; -CREATE DATABASE :dbName WITH OWNER :ownerUsername; +If you are using **Google Cloud SQL** for your PostgreSQL database, you'll need to set an additional environment variable: +```bash +export IS_GCP_CLOUD_SQL="true" ``` - -- This adjustment ensures that the `postgres` user has the necessary permissions to execute the initialization script correctly on Google Cloud SQL. +*Note*: For non-Google Cloud SQL environments, you do not need to set this variable, as it defaults to false. ### 3. 
Run the Initialization Script -Download the initialization script `init.sh` from the repository: +Download the initialization script [`init.sh`](../../hedera-mirror-importer/src/main/resources/db/scripts/init.sh) from the repository: ```bash curl -O https://raw.githubusercontent.com/hashgraph/hedera-mirror-node/main/hedera-mirror-importer/src/main/resources/db/scripts/init.sh From a4528cdae5b2de1a3cf0c4cdda0be1d92ec9791a Mon Sep 17 00:00:00 2001 From: Nir Ben-Or <119968212+nirbosl@users.noreply.github.com> Date: Thu, 19 Sep 2024 12:31:05 -0500 Subject: [PATCH 6/9] Remove redundant "mirror node" in the title Co-authored-by: Steven Sheehy <17552371+steven-sheehy@users.noreply.github.com> Signed-off-by: Nir Ben-Or <119968212+nirbosl@users.noreply.github.com> --- docs/database/bootstrap.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/database/bootstrap.md b/docs/database/bootstrap.md index 34baefae28c..8e9bf71c66f 100644 --- a/docs/database/bootstrap.md +++ b/docs/database/bootstrap.md @@ -244,7 +244,7 @@ The import script is designed to efficiently import the Mirror Node data into yo --- -## Mirror Node Version Compatibility +## Version Compatibility Before initializing your Mirror Node with the imported database, it's crucial to ensure version compatibility. From 1bc16ebe10ded74bf1a87791e22535d775de5ff7 Mon Sep 17 00:00:00 2001 From: Nir Ben-Or Date: Thu, 19 Sep 2024 14:32:54 -0500 Subject: [PATCH 7/9] Several more changes based on comments Signed-off-by: Nir Ben-Or --- docs/database/bootstrap.md | 135 ++++------ .../main/resources/db/scripts/bootstrap.sh | 235 +++++++++++------- 2 files changed, 188 insertions(+), 182 deletions(-) diff --git a/docs/database/bootstrap.md b/docs/database/bootstrap.md index 8e9bf71c66f..fa59abec9e4 100644 --- a/docs/database/bootstrap.md +++ b/docs/database/bootstrap.md @@ -16,7 +16,6 @@ This guide provides step-by-step instructions for setting up a fresh PostgreSQL - [1. 
Download the Database Export Data](#1-download-the-database-export-data) - [2. Download the Import Script](#2-download-the-import-script) - [3. Run the Import Script](#3-run-the-import-script) -- [Mirror Node Version Compatibility](#mirror-node-version-compatibility) - [Handling Failed Imports](#handling-failed-imports) - [Steps to Handle Failed Imports:](#steps-to-handle-failed-imports) - [Additional Notes](#additional-notes) @@ -26,9 +25,35 @@ This guide provides step-by-step instructions for setting up a fresh PostgreSQL ## Prerequisites -- **PostgreSQL 16** installed and running. -- Access to a machine where you can run the initialization and import scripts and connect to the PostgreSQL database. -- A Google Cloud Platform (GCP) account with a valid billing account attached (required for downloading data from a Requester Pays bucket). +1. **Version Compatibility** + + Before initializing your Mirror Node with the imported database, it's crucial to ensure version compatibility. + + **MIRRORNODE_VERSION File:** + + - In the database export data, there is a file named `MIRRORNODE_VERSION`. + - This file contains the version of the Mirror Node at the time of the database export. + + **Importance:** + + - Your Mirror Node instance must be initialized with the **same version** as specified in the `MIRRORNODE_VERSION` file. + - Using a different version may lead to compatibility issues and/or schema mismatches. + + **Action Required:** + + 1. **Check the Mirror Node Version:** + + - Open the `MIRRORNODE_VERSION` file: + + ```bash + cat /path/to/db_export/MIRRORNODE_VERSION + ``` + + - Note the version number specified. + +2. **PostgreSQL 16** installed and running. +3. Access to a machine where you can run the initialization and import scripts and connect to the PostgreSQL database. +4. A Google Cloud Platform (GCP) account with a valid billing account attached (required for downloading data from a Requester Pays bucket). 
--- @@ -54,6 +79,8 @@ export PGPORT="DB_PORT" - `PGHOST`: The IP address or hostname of your PostgreSQL database server. - `PGPORT`: The database server port number (`5432` by default). + + **Database User Password Variables:** Set the following environment variables to define passwords for the various database users that will be created during initialization. @@ -71,6 +98,8 @@ export WEB3_PASSWORD="SET_PASSWORD" - Replace `SET_PASSWORD` with strong, unique passwords for each respective user. +- **Security Note:** Ensure that the passwords set in the environment variables are kept secure and not exposed in logs or command history. + ### 2. Important Note for Google Cloud SQL Users If you are using **Google Cloud SQL** for your PostgreSQL database, you'll need to set an additional environment variable: @@ -244,34 +273,6 @@ The import script is designed to efficiently import the Mirror Node data into yo --- -## Version Compatibility - -Before initializing your Mirror Node with the imported database, it's crucial to ensure version compatibility. - -**MIRRORNODE_VERSION File:** - -- In the database export data, there is a file named `MIRRORNODE_VERSION`. -- This file contains the version of the Mirror Node at the time of the database export. - -**Importance:** - -- Your Mirror Node instance must be initialized with the **same version** as specified in the `MIRRORNODE_VERSION` file. -- Using a different version may lead to compatibility issues and/or schema mismatches. - -**Action Required:** - -1. **Check the Mirror Node Version:** - - - Open the `MIRRORNODE_VERSION` file: - - ```bash - cat /path/to/db_export/MIRRORNODE_VERSION - ``` - - - Note the version number specified. - ---- - ## Handling Failed Imports During the import process, the script generates a file named `import_tracking.txt`, which logs the status of each file import. 
Each line in this file contains the path and name of a file, followed by its import status: `NOT_STARTED`, `IN_PROGRESS`, `IMPORTED`, or `FAILED_TO_IMPORT`. @@ -294,63 +295,29 @@ During the import process, the script generates a file named `import_tracking.tx ### Steps to Handle Failed Imports: -1. **Identify Files to Re-import:** - - - Open the `import_tracking.txt` file. - - Look for files with the status `FAILED_TO_IMPORT` or `NOT_STARTED`. - - These files either failed to import or were not processed due to interruption. - -2. **Re-run the Import Script:** - - - You can re-run the import script; it will skip files marked as `IMPORTED` and attempt to import files with statuses `NOT_STARTED`, `IN_PROGRESS`, or `FAILED_TO_IMPORT`. - - ```bash - ./bootstrap.sh 8 /path/to/db_export/ - ``` - - - The script will resume importing where it left off. - -3. **Alternatively, Collect Specific Files to Re-import:** - - - Create a new directory to hold the files to be re-imported: - - ```bash - mkdir -p /path/to/reimport_files - ``` - - - Copy the failed and not started files to the new directory: +1. **Re-run the Import Script:** + - Simply re-run the import script; it will automatically skip files marked as `IMPORTED` and attempt to import files with statuses `NOT_STARTED`, `IN_PROGRESS`, or `FAILED_TO_IMPORT`. + ```bash - grep -E "FAILED_TO_IMPORT|NOT_STARTED" import_tracking.txt | awk '{print $1}' | xargs -I {} cp "{}" /path/to/reimport_files/ + ./your_import_script.sh 8 /path/to/db_export/ ``` + + - The script manages the import process, ensuring that only the necessary files are processed without manual intervention. - - Run the import script, pointing it to the new directory: - - ```bash - ./bootstrap.sh 8 /path/to/reimport_files/ - ``` - -4. **Verify the Imports:** +2. **Verify the Imports:** - Check the `import_tracking.txt` and `import.log` files to ensure that all files have been imported successfully. 
+ - If files continue to fail, review the error messages in `import.log` for troubleshooting. -**Note on Data Consistency:** - -- When a file import fails, the database transaction ensures that **no partial data** is committed. -- This means that when you re-run the import script, you can safely re-import failed files without worrying about duplicates or inconsistencies. -- The database tables remain in the same state as before the failed import attempt. - ---- - -## Additional Notes - -- **Data Integrity:** The import script ensures data integrity by using transactions. If an error occurs during the import of a file, that file's data will not be committed to the database. +**Notes on Data Consistency:** + - **System Resources:** Adjust the number of CPU cores used (`8` in the example) based on your system's capabilities to prevent overloading the server. -- **Security:** Ensure that the passwords set in the environment variables are kept secure and not exposed in logs or command history. + +- **Data Integrity:** When a file import fails, the database transaction ensures that **no partial data** is committed. This means that when you re-run the import script, you can safely re-import failed files without worrying about duplicates or inconsistencies; the database tables remain in the same state as before the failed import attempt. + - **Concurrent Write Safety:** The script uses file locking (`flock`) to safely handle concurrent writes to `import_tracking.txt`. This prevents race conditions and ensures the tracking file remains consistent. -- **Resuming Imports:** The script maintains the status of all files in `import_tracking.txt`, allowing you to resume imports after an interruption without re-importing already imported files. -- **Required Tools:** Ensure that all required tools (`psql`, `gunzip`, `realpath`, `flock`) are installed on your system.
--- @@ -374,14 +341,4 @@ During the import process, the script generates a file named `import_tracking.tx - Skip files marked as `IMPORTED`. - Attempt to import files with statuses `NOT_STARTED`, `IN_PROGRESS`, or `FAILED_TO_IMPORT`. -- **Bash Version Compatibility:** - - - The import script requires Bash version 4.3 or higher. Check your Bash version with: - - ```bash - bash --version - ``` - - - If using an older version of Bash, consider updating to the minimum required version. - --- diff --git a/hedera-mirror-importer/src/main/resources/db/scripts/bootstrap.sh b/hedera-mirror-importer/src/main/resources/db/scripts/bootstrap.sh index 8aecee08c84..e7127f636ed 100644 --- a/hedera-mirror-importer/src/main/resources/db/scripts/bootstrap.sh +++ b/hedera-mirror-importer/src/main/resources/db/scripts/bootstrap.sh @@ -3,62 +3,31 @@ # Enable job control set -m -show_help() { - echo "Usage: $0 [OPTIONS] DB_CPU_CORES IMPORT_DIR" - echo - echo "Imports data into a PostgreSQL database from compressed CSV files." - echo - echo "Options:" - echo " -h, --help, -H Show this help message and exit." - echo - echo "Arguments:" - echo " DB_CPU_CORES Number of CPU cores on the DB instance to thread the import jobs." - echo " IMPORT_DIR Path to the directory containing the compressed CSV files." - echo - echo "Example:" - echo " $0 8 /path/to/db_export" - echo -} - -# Parse options -if [[ $# -eq 0 ]]; then - echo "No arguments provided. Use --help or -h for usage information." - exit 1 -fi +#################################### +# Variables +#################################### -while [[ "$#" -gt 0 ]]; do - case $1 in - -h|--help|-H) - show_help - exit 0 - ;; - *) - break - ;; - esac -done +# Define minimum required Bash version +REQUIRED_BASH_MAJOR=4 +REQUIRED_BASH_MINOR=3 -# Check if required arguments are supplied -if [[ -z "$1" || -z "$2" ]]; then - echo "Error: Both DB_CPU_CORES and IMPORT_DIR must be provided." - echo "Use --help or -h for usage information." 
- exit 1 -fi +# PostgreSQL environment variables +export PGUSER=${PGUSER:-"DB_OWNER"} +export PGPASSWORD=${PGPASSWORD:-"DB_PASSWORD"} +export PGHOST=${PGHOST:-"DB_ADDRESS"} +export PGPORT="${PGPORT:-5432}" # Added PGPORT with default value +export PGDATABASE=${PGDATABASE:-"DB_NAME"} +# Import script arguments DB_CPU_CORES="$1" IMPORT_DIR="$2" # Convert IMPORT_DIR to an absolute path IMPORT_DIR="$(realpath "$IMPORT_DIR")" -# Check if IMPORT_DIR exists and is a directory -if [[ ! -d "$IMPORT_DIR" ]]; then - echo "Error: IMPORT_DIR '$IMPORT_DIR' does not exist or is not a directory." - exit 1 -fi - -AVAILABLE_CORES=$(( $(nproc) - 1 )) # Leave one core free for the local system -DB_AVAILABLE_CORES=$((DB_CPU_CORES - 1)) # Leave one core free for the DB instance +# Calculate available CPU cores +AVAILABLE_CORES=$(( $(nproc) - 1 )) # Leave one core free for the local system +DB_AVAILABLE_CORES=$((DB_CPU_CORES - 1)) # Leave one core free for the DB instance if [[ $AVAILABLE_CORES -lt $DB_AVAILABLE_CORES ]]; then DB_AVAILABLE_CORES=$AVAILABLE_CORES @@ -66,26 +35,49 @@ fi max_jobs="$DB_AVAILABLE_CORES" -# Set PostgreSQL environment variables -export PGUSER=${PGUSER:-"DB_OWNER"} -export PGPASSWORD=${PGPASSWORD:-"DB_PASSWORD"} -export PGHOST=${PGHOST:-"DB_ADDRESS"} -export PGDATABASE=${PGDATABASE:-"DB_NAME"} - +# Logging and tracking files LOG_FILE="import.log" TRACKING_FILE="import_tracking.txt" LOCK_FILE="import_tracking.lock" -# Check if required tools are installed +# Required tools REQUIRED_TOOLS=("psql" "gunzip" "realpath" "flock") -for tool in "${REQUIRED_TOOLS[@]}"; do - if ! command -v "$tool" &> /dev/null; then - echo "Error: $tool is not installed. Please install it to continue." + +#################################### +# Functions +#################################### + +# display help message +show_help() { + echo "Usage: $0 [OPTIONS] DB_CPU_CORES IMPORT_DIR" + echo + echo "Imports data into a PostgreSQL database from compressed CSV files." 
+ echo + echo "Options:" + echo " -h, --help, -H Show this help message and exit." + echo + echo "Arguments:" + echo " DB_CPU_CORES Number of CPU cores on the DB instance to thread the import jobs." + echo " IMPORT_DIR Path to the directory containing the compressed CSV files." + echo + echo "Example:" + echo " $0 8 /path/to/db_export" + echo +} + +# check Bash version +check_bash_version() { + local current_major=${BASH_VERSINFO[0]} + local current_minor=${BASH_VERSINFO[1]} + + if (( current_major < REQUIRED_BASH_MAJOR )) || \ + (( current_major == REQUIRED_BASH_MAJOR && current_minor < REQUIRED_BASH_MINOR )); then + echo "Error: Bash version ${REQUIRED_BASH_MAJOR}.${REQUIRED_BASH_MINOR}+ is required. Current version is ${BASH_VERSION}." exit 1 fi -done +} -# Log using UTC times +# log messages with UTC timestamps log() { local msg="$1" local level="${2:-INFO}" @@ -95,7 +87,7 @@ log() { echo "$timestamp - $level - $msg" | tee -a "$LOG_FILE" } -# Kill a process and its descendants +# kill a process and its descendants kill_descendants() { local pid="$1" local children @@ -106,7 +98,7 @@ kill_descendants() { kill -TERM "$pid" 2>/dev/null } -# Handle script termination +# handle script termination cleanup() { log "Script interrupted. Terminating background jobs..." 
"ERROR" # Ignore further signals during cleanup @@ -122,10 +114,7 @@ cleanup() { exit 1 } -# Trap signals -trap 'cleanup' SIGINT SIGTERM - -# Safely write to tracking file with lock +# safely write to the tracking file with a lock write_tracking_file() { local file="$1" local status="$2" @@ -141,43 +130,18 @@ write_tracking_file() { ) 200>"$LOCK_FILE" } -# Read status from tracking file +# read status from the tracking file read_tracking_status() { local file="$1" grep "^$file " "$TRACKING_FILE" 2>/dev/null | awk '{print $2}' } -# Collect all import tasks +# collect all import tasks (compressed CSV files) collect_import_tasks() { find "$IMPORT_DIR" -type f -name "*.csv.gz" } -# Main script execution -log "Starting DB import." - -# Get the list of files to import -mapfile -t files < <(collect_import_tasks) - -# Initialize the tracking file with all files as NOT_STARTED -( - flock -x 200 - for file in "${files[@]}"; do - # Only add if not already in tracking file - if ! grep -q "^$file " "$TRACKING_FILE" 2>/dev/null; then - echo "$file NOT_STARTED" >> "$TRACKING_FILE" - fi - done -) 200>"$LOCK_FILE" - -# Initialize variables -pids=() -overall_success=0 - -# Export necessary functions and variables -export -f import_file log kill_descendants write_tracking_file read_tracking_status -export IMPORT_DIR LOG_FILE TRACKING_FILE LOCK_FILE PGUSER PGPASSWORD PGHOST PGDATABASE - -# Import a single file +# import a single file into the database import_file() { local file="$1" local table @@ -211,6 +175,90 @@ import_file() { fi } +#################################### +# Execution +#################################### + +# Perform the Bash version check +check_bash_version + +# display help if no arguments are provided +if [[ $# -eq 0 ]]; then + echo "No arguments provided. Use --help or -h for usage information." 
+ exit 1 +fi + +# Parse options +while [[ "$#" -gt 0 ]]; do + case $1 in + -h|--help|-H) + show_help + exit 0 + ;; + *) + break + ;; + esac +done + +# Check if required arguments are supplied +if [[ -z "$DB_CPU_CORES" || -z "$IMPORT_DIR" ]]; then + echo "Error: Both DB_CPU_CORES and IMPORT_DIR must be provided." + echo "Use --help or -h for usage information." + exit 1 +fi + +# Check if IMPORT_DIR exists and is a directory +if [[ ! -d "$IMPORT_DIR" ]]; then + echo "Error: IMPORT_DIR '$IMPORT_DIR' does not exist or is not a directory." + exit 1 +fi + +# Check if required tools are installed +missing_tools=() +for tool in "${REQUIRED_TOOLS[@]}"; do + if ! command -v "$tool" &> /dev/null; then + missing_tools+=("$tool") + fi +done + +if [[ ${#missing_tools[@]} -gt 0 ]]; then + echo "Error: The following required tools are not installed:" + for tool in "${missing_tools[@]}"; do + echo " - $tool" + done + echo "Please install them to continue." + exit 1 +fi + +# Trap signals for cleanup +trap 'cleanup' SIGINT SIGTERM + +# Log the start of the import process +log "Starting DB import." + +# Get the list of files to import +mapfile -t files < <(collect_import_tasks) + +# Initialize the tracking file with all files as NOT_STARTED +( + flock -x 200 + for file in "${files[@]}"; do + # Only add if not already in tracking file + if ! 
grep -q "^$file " "$TRACKING_FILE" 2>/dev/null; then + echo "$file NOT_STARTED" >> "$TRACKING_FILE" + fi + done +) 200>"$LOCK_FILE" + +# Initialize variables for background processes +pids=() +overall_success=0 + +# Export necessary functions and variables for subshells +export -f import_file log kill_descendants write_tracking_file read_tracking_status +export IMPORT_DIR LOG_FILE TRACKING_FILE LOCK_FILE PGUSER PGPASSWORD PGHOST PGDATABASE + # Loop through files and manage parallel execution for file in "${files[@]}"; do # Check if the file has already been imported @@ -249,6 +297,7 @@ for pid in "${pids[@]}"; do fi done +# Log the final status of the import process if [[ $overall_success -eq 0 ]]; then log "DB import completed successfully." else From cd6fe752f73a91823b9b17e0efe3825790e02834 Mon Sep 17 00:00:00 2001 From: Nir Ben-Or Date: Thu, 19 Sep 2024 18:20:43 -0500 Subject: [PATCH 8/9] Almost done, two more items left - will be in the next commit Signed-off-by: Nir Ben-Or --- docs/database/bootstrap.md | 63 +++++++++++++++----------------------- 1 file changed, 25 insertions(+), 38 deletions(-) diff --git a/docs/database/bootstrap.md b/docs/database/bootstrap.md index fa59abec9e4..4fbabff1064 100644 --- a/docs/database/bootstrap.md +++ b/docs/database/bootstrap.md @@ -53,7 +53,16 @@ This guide provides step-by-step instructions for setting up a fresh PostgreSQL 2. **PostgreSQL 16** installed and running. 3. Access to a machine where you can run the initialization and import scripts and connect to the PostgreSQL database. -4. A Google Cloud Platform (GCP) account with a valid billing account attached (required for downloading data from a Requester Pays bucket). +4. Ensure the following tools are installed on your machine: + - psql + - gunzip + - realpath + - flock +5. Install [Google Cloud SDK](https://cloud.google.com/sdk/docs/install), then authenticate: + ``` + gcloud auth login + ``` +6. 
A Google Cloud Platform (GCP) account with a valid billing account attached (required for downloading data from a Requester Pays bucket). --- @@ -164,28 +173,13 @@ The Mirror Node database export data is available in a Google Cloud Storage (GCS **Download Instructions:** -1. **Authenticate with GCP:** - - Ensure you have the [Google Cloud SDK](https://cloud.google.com/sdk/docs/install) installed and authenticated: - - ```bash - gcloud auth login - gcloud config set billing/disable_usage_reporting false - ``` - -2. **Set the Default Project:** - - ```bash - gcloud config set project YOUR_GCP_PROJECT_ID - ``` - -3. **Download the Data:** +1. **Download the Data:** Create an empty directory to store the data and download all files and subdirectories: ```bash mkdir -p /path/to/db_export - gsutil -u YOUR_GCP_PROJECT_ID -m cp -r gs://mirrornode-db-export/* /path/to/db_export/ + gsutil -u YOUR_GCP_PROJECT_ID -m cp -r gs://mirrornode-db-export/<$VERSION_NUMBER>/* /path/to/db_export/ ``` - Replace `/path/to/db_export` with your desired directory path. @@ -293,24 +287,6 @@ During the import process, the script generates a file named `import_tracking.tx /path/to/db_export/account.csv.gz NOT_STARTED ``` -### Steps to Handle Failed Imports: - -1. **Re-run the Import Script:** - - - Simply re-run the import script; it will automatically skip files marked as `IMPORTED` and attempt to import files with statuses `NOT_STARTED`, `IN_PROGRESS`, or `FAILED_TO_IMPORT`. - - ```bash - ./your_import_script.sh 8 /path/to/db_export/ - ``` - - - The script manages the import process, ensuring that only the necessary files are processed without manual intervention. - -2. **Verify the Imports:** - - - Check the `import_tracking.txt` and `import.log` files to ensure that all files have been imported successfully. - - - If files continue to fail, review the error messages in `import.log` for troubleshooting. 
- **Notes on Data Consistency:** - **System Resources:** Adjust the number of CPU cores used (`8` in the example) based on your system's capabilities to prevent overloading the server. @@ -330,8 +306,19 @@ During the import process, the script generates a file named `import_tracking.tx - **Import Failures:** - - Check the `import.log` file generated by the import script for detailed error messages. - - Review the `import_tracking.txt` file to identify which files failed to import. + - Simply re-run the import script; it will automatically skip files marked as `IMPORTED` and attempt to import files with statuses `NOT_STARTED`, `IN_PROGRESS`, or `FAILED_TO_IMPORT`. + + ```bash + ./bootstrap.sh 8 /path/to/db_export/ + ``` + + - The script manages the import process, ensuring that only the necessary files are processed without manual intervention. + + - Verify the Imports: + + - Check the `import_tracking.txt` and `import.log` files to ensure that all files have been imported successfully. + + - If files continue to fail, review the error messages in `import.log` for troubleshooting. 
- **Interruption Handling:** From e526a4f67d8ffb33aa6af40b7d58b881c01d6bc0 Mon Sep 17 00:00:00 2001 From: Nir Ben-Or Date: Fri, 20 Sep 2024 14:48:28 -0500 Subject: [PATCH 9/9] Changed the import command to a one-liner, and added a conditional revoke of the extra grant for GCP cloud sql instances Signed-off-by: Nir Ben-Or --- .../src/main/resources/db/scripts/bootstrap.sh | 8 +------- .../src/main/resources/db/scripts/init.sh | 5 +++++ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/hedera-mirror-importer/src/main/resources/db/scripts/bootstrap.sh b/hedera-mirror-importer/src/main/resources/db/scripts/bootstrap.sh index e7127f636ed..a6a65c6ff25 100644 --- a/hedera-mirror-importer/src/main/resources/db/scripts/bootstrap.sh +++ b/hedera-mirror-importer/src/main/resources/db/scripts/bootstrap.sh @@ -157,13 +157,7 @@ import_file() { write_tracking_file "$file" "IN_PROGRESS" log "Importing table $table from $file" - if { - echo "BEGIN;" - echo "\\copy $table FROM STDIN WITH CSV HEADER;" - gunzip -c "$file" - echo "\." - echo "COMMIT;" - } | psql -q -v ON_ERROR_STOP=1; then + if gunzip -c "$file" | psql -q -v ON_ERROR_STOP=1 -c "\COPY $table FROM STDIN WITH CSV HEADER"; then log "Successfully imported $file into $table" # Update the status to IMPORTED write_tracking_file "$file" "IMPORTED" diff --git a/hedera-mirror-importer/src/main/resources/db/scripts/init.sh b/hedera-mirror-importer/src/main/resources/db/scripts/init.sh index a7fcbf7e7ff..eb9d3b5c67d 100755 --- a/hedera-mirror-importer/src/main/resources/db/scripts/init.sh +++ b/hedera-mirror-importer/src/main/resources/db/scripts/init.sh @@ -120,4 +120,9 @@ ${DB_SPECIFIC_EXTENSION_SQL} -- Alter search path \connect postgres postgres alter database :dbName set search_path = :dbSchema, public, :tempSchema; + +-- Conditional REVOKE statement for Google Cloud SQL +\if :isGcpCloudSql + revoke mirror_node from :pgUser; +\endif __SQL__