Commit a9d3b36

Merge branch 'master' of github.com:saalfeldlab/hot-knife

StephanPreibisch committed Jun 7, 2024
2 parents fb7e19c + 0abead9

Showing 43 changed files with 4,159 additions and 64 deletions.
4 changes: 2 additions & 2 deletions run_scripts/multi-sem/wafer-53-center7/00_config.sh
@@ -46,7 +46,7 @@ export FLINTSTONE="/groups/flyTEM/flyTEM/render/spark/spark-janelia/flintstone.s
# --------------------------------------------------------------------
export N5_SAMPLE_PATH="/nrs/hess/data/hess_wafer_53/export/${RENDER_OWNER}.n5"
export N5_HEIGHT_FIELDS_DOWNSAMPLING_FACTORS="2,2,1"
export N5_FLAT_DATASET_ROOT="/flat/${RAW_SLAB}"
export N5_FLAT_DATASET_ROOT="/flat_clahe/${RAW_SLAB}"
export N5_FLAT_RAW_DATASET="${N5_FLAT_DATASET_ROOT}/raw/s0"
export N5_SURFACE_ROOT="/surface-align/run_${RUN_TIMESTAMP}"

@@ -1739,4 +1739,4 @@ getSlabProjectName () {
LAST_Z="402"
fi
echo "slab_${FIRST_Z}_to_${LAST_Z}"
-}
+}
@@ -0,0 +1,55 @@
#!/bin/bash

set -e

ABSOLUTE_SCRIPT=$(readlink -m "$0")
SCRIPT_DIR=$(dirname "${ABSOLUTE_SCRIPT}")

source "${SCRIPT_DIR}"/00_config.sh "NA"

umask 0002

DATASET_CSV="$1"
N_NODES="15"
export RUNTIME="233:59" # batches with 35 slabs took between 5 and 10 hours to complete

if [[ ! -f ${DATASET_CSV} ]]; then
echo "ERROR: csv file ${DATASET_CSV} not found"
exit 1
fi

#-----------------------------------------------------------
# Spark executor setup with 11 cores per worker ...

export N_EXECUTORS_PER_NODE=2
export N_CORES_PER_EXECUTOR=5
# To distribute work evenly, the recommended number of tasks/partitions is 3 times the number of cores.
#N_TASKS_PER_EXECUTOR_CORE=3
export N_OVERHEAD_CORES_PER_WORKER=1
#N_CORES_PER_WORKER=$(( (N_EXECUTORS_PER_NODE * N_CORES_PER_EXECUTOR) + N_OVERHEAD_CORES_PER_WORKER ))
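# => with the settings above: (2 executors * 5 cores) + 1 overhead core = 11 cores per worker, matching the header comment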
export N_CORES_DRIVER=1

#-----------------------------------------------------------
RUN_TIME=$(date +"%Y%m%d_%H%M%S")
CLASS="org.janelia.saalfeldlab.hotknife.SparkMaskedCLAHEMultiSEMBatch"

ARGV="\
--n5PathInput=${N5_SAMPLE_PATH} \
--datasetCsv=${DATASET_CSV} \
--blockFactorXY=32 \
--blockFactorZ=1 \
--overwrite"

LOG_DIR="logs/64_masked_clahe"
LOG_FILE="${LOG_DIR}/masked_clahe_batch.${RUN_TIME}.out"
mkdir -p ${LOG_DIR}

# use shell group to tee all output to log file
{

echo "Running with arguments:
${ARGV}
"
/groups/flyTEM/flyTEM/render/spark/spark-janelia/flintstone.sh $N_NODES $HOT_KNIFE_JAR $CLASS $ARGV
} 2>&1 | tee -a "${LOG_FILE}"

@@ -0,0 +1,55 @@
#!/bin/bash

set -e

ABSOLUTE_SCRIPT=$(readlink -m "$0")
SCRIPT_DIR=$(dirname "${ABSOLUTE_SCRIPT}")

source "${SCRIPT_DIR}"/00_config.sh "NA"

umask 0002

DATASET_CSV="$1"
N_NODES="20"
export RUNTIME="233:59" # using 20 11-core nodes, batches with 31 slabs took between 5 and 7 hours to complete

if [[ ! -f ${DATASET_CSV} ]]; then
echo "ERROR: csv file ${DATASET_CSV} not found"
exit 1
fi

#-----------------------------------------------------------
# Spark executor setup with 11 cores per worker ...

export N_EXECUTORS_PER_NODE=2
export N_CORES_PER_EXECUTOR=5
# To distribute work evenly, the recommended number of tasks/partitions is 3 times the number of cores.
#N_TASKS_PER_EXECUTOR_CORE=3
export N_OVERHEAD_CORES_PER_WORKER=1
#N_CORES_PER_WORKER=$(( (N_EXECUTORS_PER_NODE * N_CORES_PER_EXECUTOR) + N_OVERHEAD_CORES_PER_WORKER ))
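# => (2 executors * 5 cores) + 1 overhead core = 11 cores per worker, as noted in the header comment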
export N_CORES_DRIVER=1

#-----------------------------------------------------------
RUN_TIME=$(date +"%Y%m%d_%H%M%S")
CLASS="org.janelia.saalfeldlab.hotknife.SparkExportFlattenedVolumeMultiSEMBatch"

ARGV="\
--n5RootPath=${N5_SAMPLE_PATH} \
--datasetCsv=${DATASET_CSV} \
--padding=3 \
--blockSize=128,128,64 \
--downsample"

LOG_DIR="logs/71_export_flat"
LOG_FILE="${LOG_DIR}/export_flat.${RUN_TIME}.out"
mkdir -p ${LOG_DIR}

# use shell group to tee all output to log file
{

echo "Running with arguments:
${ARGV}
"
/groups/flyTEM/flyTEM/render/spark/spark-janelia/flintstone.sh $N_NODES $HOT_KNIFE_JAR $CLASS $ARGV
} 2>&1 | tee -a "${LOG_FILE}"

@@ -0,0 +1,54 @@
#!/bin/bash

set -e

ABSOLUTE_SCRIPT=$(readlink -m "$0")
SCRIPT_DIR=$(dirname "${ABSOLUTE_SCRIPT}")

source "${SCRIPT_DIR}"/00_config.sh "NA"

umask 0002

DATASET_CSV="$1"
N_NODES="20"
export RUNTIME="233:59" # using 20 11-core nodes, batches with 31 slabs took between ? and ? hours to complete

if [[ ! -f ${DATASET_CSV} ]]; then
echo "ERROR: csv file ${DATASET_CSV} not found"
exit 1
fi

#-----------------------------------------------------------
# Spark executor setup with 11 cores per worker ...

export N_EXECUTORS_PER_NODE=2
export N_CORES_PER_EXECUTOR=5
# To distribute work evenly, the recommended number of tasks/partitions is 3 times the number of cores.
#N_TASKS_PER_EXECUTOR_CORE=3
export N_OVERHEAD_CORES_PER_WORKER=1
#N_CORES_PER_WORKER=$(( (N_EXECUTORS_PER_NODE * N_CORES_PER_EXECUTOR) + N_OVERHEAD_CORES_PER_WORKER ))
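# => (2 executors * 5 cores) + 1 overhead core = 11 cores per worker, as noted in the header comment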
export N_CORES_DRIVER=1

#-----------------------------------------------------------
RUN_TIME=$(date +"%Y%m%d_%H%M%S")
CLASS="org.janelia.saalfeldlab.hotknife.SparkGenerateFaceScaleSpaceMultiSEMBatch"

ARGV="\
--n5Path=${N5_SAMPLE_PATH} \
--datasetCsv=${DATASET_CSV} \
--blockSize=1024,1024 \
--invert"

LOG_DIR="logs/72_face"
LOG_FILE="${LOG_DIR}/extract_face.${RUN_TIME}.out"
mkdir -p ${LOG_DIR}

# use shell group to tee all output to log file
{

echo "Running with arguments:
${ARGV}
"
/groups/flyTEM/flyTEM/render/spark/spark-janelia/flintstone.sh $N_NODES $HOT_KNIFE_JAR $CLASS $ARGV
} 2>&1 | tee -a "${LOG_FILE}"

@@ -6,14 +6,14 @@ ABSOLUTE_SCRIPT=$(readlink -m "${0}")
SCRIPT_DIR=$(dirname "${ABSOLUTE_SCRIPT}")
source "${SCRIPT_DIR}/00_config.sh" "tab_not_applicable"

-# This runs quickly! A 1 node job for 10 slabs in wafer 53 took 4 minutes to finish.
+# This runs quickly! A 41 node job for 402 slabs in wafer_53_center7 took 5 minutes to finish.
N_NODES=41
N5_GROUP_OUTPUT="${N5_SURFACE_ROOT}/pass00"

# Face dataset order is important.
unset FACE_DATASET_ARGS
for SLAB in ${ALL_SLABS}; do
FACE_DATASET_ARGS="${FACE_DATASET_ARGS} -d /flat/${SLAB}/top4/face_local-inverted -d /flat/${SLAB}/bot4/face_local-inverted"
FACE_DATASET_ARGS="${FACE_DATASET_ARGS} -d /flat_clahe/${SLAB}/top4i/face -d /flat_clahe/${SLAB}/bot4i/face"
done
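# e.g. a single slab named s001 (name illustrative) contributes:
#   -d /flat_clahe/s001/top4i/face -d /flat_clahe/s001/bot4i/face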

# need scaleIndex=5 for larger wafer 53 slabs
@@ -22,7 +22,8 @@ ARGV="
--n5GroupOutput=${N5_GROUP_OUTPUT} \
--scaleIndex=5 \
--iterations 100000 \
---maxError 400 \
+--maxError 320 \
+--filter RANSAC \
${FACE_DATASET_ARGS}"

CLASS="org.janelia.saalfeldlab.hotknife.SparkAlignAffineGlobal"
@@ -18,6 +18,20 @@ set -e
# 4|5|6) N_NODES=150
# pass04: 38 minutes, pass05: 59 minutes, pass06: 58 minutes

+# Run times for all 402 wafer_53_center7 slabs with 7 mFOVs:
+# 1|2|3) N_NODES=60
+# pass01: 150 minutes, pass02: 112 minutes, pass03: 200 minutes
+# 4) N_NODES=150
+# pass04: 377 minutes
+# 5) N_NODES=200
+# pass05: 245 minutes
+# 6|7|8|9) N_NODES=???
+# pass06: ??? minutes, pass07: ??? minutes, pass08: ??? minutes, pass09: ??? minutes
+# 10|11) N_NODES=200
+# pass10: 193 minutes, pass11: 324 minutes
+# 12) N_NODES=250
+# pass12: 922 minutes

if (( $# < 1 )); then
echo "USAGE $0 <pass (1-12)> [number of nodes (overrides default)]"
exit 1
@@ -59,7 +73,9 @@ fi
# setup pass specific run class
case "${PASS}" in
1|2|3) N_NODES=${2:-60}; CLASS="org.janelia.saalfeldlab.hotknife.SparkPairAlignSIFTAverage" ;; # wafer 52: 20 node default
-4|5|6|7) N_NODES=${2:-150}; CLASS="org.janelia.saalfeldlab.hotknife.SparkPairAlignSIFTAverage" ;; # wafer 52: 50 node default
+4) N_NODES=${2:-150}; CLASS="org.janelia.saalfeldlab.hotknife.SparkPairAlignSIFTAverage" ;; # wafer 52: 50 node default
+5) N_NODES=${2:-200}; CLASS="org.janelia.saalfeldlab.hotknife.SparkPairAlignSIFTAverage" ;; # wafer 52: 50 node default
+6|7) N_NODES=${2:-150}; CLASS="org.janelia.saalfeldlab.hotknife.SparkPairAlignSIFTAverage" ;; # wafer 52: 50 node default
8|9|10) N_NODES=${2:-150}; CLASS="org.janelia.saalfeldlab.hotknife.SparkPairAlignFlow" ;; # wafer 52: 50 node default
11|12) N_NODES=${2:-210}; CLASS="org.janelia.saalfeldlab.hotknife.SparkPairAlignFlow" ;; # wafer 52: 70 node default
*)
@@ -90,6 +106,11 @@ ARGV="

LOG_FILE=$(setupRunLog "surface-align-pass${PADDED_PASS}")

+# Using a single core driver for the larger wafer 53 jobs,
+# we sometimes got a driver failure: TERM_MEMLIMIT: job killed after reaching LSF memory usage limit.
+# So, bumping up driver to 4 cores here to avoid that possibility.
+export N_CORES_DRIVER=4

# use shell group to tee all output to log file
{

@@ -2,12 +2,13 @@

set -e

-if (( $# != 1 )); then
-echo "USAGE $0 <start with pass (1-12)>"
+if (( $# < 1 )); then
+echo "USAGE $0 <start with pass (1-12)> [number of nodes (overrides default for each pass)]"
exit 1
fi

START_PASS="${1}"
N_NODES="${2}"

ABSOLUTE_SCRIPT=$(readlink -m "${0}")
SCRIPT_DIR=$(dirname "${ABSOLUTE_SCRIPT}")
@@ -33,7 +34,7 @@ for PASS in $( seq "${START_PASS}" 12 ); do
waiting to start setup for pass ${PASS} ...
"
sleep 2
source "${SCRIPT_DIR}"/74_spark_surface_align_pass_n.sh "${PASS}"
source "${SCRIPT_DIR}"/74_spark_surface_align_pass_n.sh "${PASS}" ${N_NODES}
done

unset FIRST_LAUNCH_SCRIPT