diff --git a/ci/Dockerfile b/ci/Dockerfile
index 31f05a7d..35298181 100644
--- a/ci/Dockerfile
+++ b/ci/Dockerfile
@@ -35,7 +35,7 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linu
     && conda init
 
 # install cuML
-ARG CUML_VER=23.04
+ARG CUML_VER=23.06
 RUN conda install -c conda-forge mamba && \
-    mamba install -y -c rapidsai -c nvidia -c conda-forge cuml=$CUML_VER python=3.8 cuda-toolkit=11.5 \
+    mamba install -y -c rapidsai -c nvidia -c conda-forge cuml=$CUML_VER python=3.9 cuda-toolkit=11.5 \
     && mamba clean --all -f -y
diff --git a/docker/Dockerfile.pip b/docker/Dockerfile.pip
index 865a70de..f12193ef 100644
--- a/docker/Dockerfile.pip
+++ b/docker/Dockerfile.pip
@@ -18,7 +18,7 @@ ARG CUDA_VERSION=11.8.0
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
 
 ARG PYSPARK_VERSION=3.3.1
-ARG RAPIDS_VERSION=23.4.0
+ARG RAPIDS_VERSION=23.6.0
 
 # Install packages to build spark-rapids-ml
 RUN apt-get update -y \
diff --git a/docker/Dockerfile.python b/docker/Dockerfile.python
index 187c1b2d..0f94e8ea 100644
--- a/docker/Dockerfile.python
+++ b/docker/Dockerfile.python
@@ -17,7 +17,7 @@ ARG CUDA_VERSION=11.5.2
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
 
-ARG CUML_VERSION=23.04
+ARG CUML_VERSION=23.06
 
 # Install packages to build spark-rapids-ml
 RUN apt update -y \
@@ -38,7 +38,7 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linu
 
 # install cuML
-RUN conda install -y -c rapidsai -c nvidia -c conda-forge python=3.8 cuda-toolkit=11.5 cuml=$CUML_VERSION \
+RUN conda install -y -c rapidsai -c nvidia -c conda-forge python=3.9 cuda-toolkit=11.5 cuml=$CUML_VERSION \
     && conda clean --all -f -y
 
 # install python dependencies
diff --git a/docs/source/conf.py b/docs/source/conf.py
index f27fd682..6d6c7bd2 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -9,7 +9,7 @@
 project = 'spark-rapids-ml'
 copyright = '2023, NVIDIA'
 author = 'NVIDIA'
-release = '23.4.0'
+release = '23.6.0'
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
diff --git a/jvm/README.md b/jvm/README.md
index a1dd6561..54726456 100644
--- a/jvm/README.md
+++ b/jvm/README.md
@@ -74,7 +74,7 @@ the _project root path_ with:
 cd jvm
 mvn clean package
 ```
-Then `rapids-4-spark-ml_2.12-23.04.0-SNAPSHOT.jar` will be generated under `target` folder.
+Then `rapids-4-spark-ml_2.12-23.06.0-SNAPSHOT.jar` will be generated under `target` folder.
 
 Users can also use the _release_ version spark-rapids plugin as the dependency if it's already been
 released in public maven repositories, see [rapids-4-spark maven repository](https://mvnrepository.com/artifact/com.nvidia/rapids-4-spark)
@@ -94,8 +94,8 @@ repository, usually in your `~/.m2/repository`.
 Add the artifact jar to the Spark, for example:
 ```bash
-ML_JAR="target/rapids-4-spark-ml_2.12-23.04.0-SNAPSHOT.jar"
-PLUGIN_JAR="~/.m2/repository/com/nvidia/rapids-4-spark_2.12/23.04.0-SNAPSHOT/rapids-4-spark_2.12-23.04.0-SNAPSHOT.jar"
+ML_JAR="target/rapids-4-spark-ml_2.12-23.06.0-SNAPSHOT.jar"
+PLUGIN_JAR="~/.m2/repository/com/nvidia/rapids-4-spark_2.12/23.06.0-SNAPSHOT/rapids-4-spark_2.12-23.06.0-SNAPSHOT.jar"
 
 $SPARK_HOME/bin/spark-shell --master $SPARK_MASTER \
 --driver-memory 20G \
diff --git a/jvm/pom.xml b/jvm/pom.xml
index fde9dbcd..eff9ee8c 100644
--- a/jvm/pom.xml
+++ b/jvm/pom.xml
@@ -20,7 +20,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>com.nvidia</groupId>
   <artifactId>rapids-4-spark-ml_2.12</artifactId>
-  <version>23.04.0-SNAPSHOT</version>
+  <version>23.06.0-SNAPSHOT</version>
   <name>RAPIDS Accelerator for Apache Spark ML</name>
   <description>The RAPIDS cuML library for Apache Spark</description>
   <inceptionYear>2021</inceptionYear>
@@ -93,7 +93,7 @@
       <groupId>com.nvidia</groupId>
       <artifactId>rapids-4-spark_2.12</artifactId>
-      <version>23.04.0</version>
+      <version>23.06.0</version>
diff --git a/notebooks/aws-emr/init-bootstrap-action.sh b/notebooks/aws-emr/init-bootstrap-action.sh
index 396d85a2..8a136702 100755
--- a/notebooks/aws-emr/init-bootstrap-action.sh
+++ b/notebooks/aws-emr/init-bootstrap-action.sh
@@ -8,7 +8,7 @@ sudo chmod a+rwx -R /sys/fs/cgroup/devices
 sudo yum install -y gcc openssl-devel bzip2-devel libffi-devel tar gzip wget make mysql-devel
 sudo bash -c "wget https://www.python.org/ftp/python/3.9.9/Python-3.9.9.tgz && tar xzf Python-3.9.9.tgz && cd Python-3.9.9 && ./configure --enable-optimizations && make altinstall"
 
-RAPIDS_VERSION=23.4.0
+RAPIDS_VERSION=23.6.0
 
 # install scikit-learn
 sudo /usr/local/bin/pip3.9 install scikit-learn
diff --git a/notebooks/databricks/README.md b/notebooks/databricks/README.md
index 01c98990..d1b24ea8 100644
--- a/notebooks/databricks/README.md
+++ b/notebooks/databricks/README.md
@@ -41,7 +41,7 @@ If you already have a Databricks account, you can run the example notebooks on a
     spark.task.resource.gpu.amount 1
     spark.databricks.delta.preview.enabled true
     spark.python.worker.reuse true
-    spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-23.04.0.jar:/databricks/spark/python
+    spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-23.06.0.jar:/databricks/spark/python
     spark.sql.execution.arrow.maxRecordsPerBatch 100000
     spark.rapids.memory.gpu.minAllocFraction 0.0001
     spark.plugins com.nvidia.spark.SQLPlugin
diff --git a/notebooks/databricks/init-pip-cuda-11.8.sh b/notebooks/databricks/init-pip-cuda-11.8.sh
index f71f82a7..63d27268 100644
--- a/notebooks/databricks/init-pip-cuda-11.8.sh
+++ b/notebooks/databricks/init-pip-cuda-11.8.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 # set portion of path below after /dbfs/ to dbfs zip file location
 SPARK_RAPIDS_ML_ZIP=/dbfs/path/to/zip/file
-# IMPORTANT: specify RAPIDS_VERSION fully 23.4.0 and not 23.4
-# also RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.4.0 and not 23.04.0)
-# while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 23.04.0 and not 23.4.0)
-RAPIDS_VERSION=23.4.0
-SPARK_RAPIDS_VERSION=23.04.0
+# IMPORTANT: specify RAPIDS_VERSION fully 23.6.0 and not 23.6
+# also RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.6.0 and not 23.06.0)
+# while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 23.06.0 and not 23.6.0)
+RAPIDS_VERSION=23.6.0
+SPARK_RAPIDS_VERSION=23.06.0
 
-curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar
+curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda11.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar
 
 # install cudatoolkit 11.8 via runfile approach
 wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
@@ -34,7 +34,7 @@ ldconfig
 /databricks/python/bin/pip install --upgrade pip
 
 # install cudf, cuml and their rapids dependencies
-# using ~= pulls in lates micro version patches
+# using ~= pulls in latest micro version patches
 /databricks/python/bin/pip install cudf-cu11~=${RAPIDS_VERSION} \
     cuml-cu11~=${RAPIDS_VERSION} \
     pylibraft-cu11~=${RAPIDS_VERSION} \
diff --git a/notebooks/dataproc/README.md b/notebooks/dataproc/README.md
index 7b423d8e..11eb1133 100644
--- a/notebooks/dataproc/README.md
+++ b/notebooks/dataproc/README.md
@@ -20,17 +20,18 @@ If you already have a Dataproc account, you can run the example notebooks on a D
   gcloud storage buckets create gs://${GCS_BUCKET}
   ```
-- Upload the initialization script to your GCS bucket:
+- Upload the initialization scripts to your GCS bucket:
   ```
   gsutil cp spark_rapids_ml.sh gs://${GCS_BUCKET}/spark_rapids_ml.sh
+  gsutil cp ../../python/benchmark/dataproc/spark-rapids.sh gs://${GCS_BUCKET}/spark-rapids.sh
   ```
 - Create a cluster with at least two single-gpu workers. **Note**: in addition to the initialization script from above, this also uses the standard [initialization actions](https://github.com/GoogleCloudDataproc/initialization-actions) for installing the GPU drivers and RAPIDS:
   ```
   export CUDA_VERSION=11.8
-  export RAPIDS_VERSION=23.4
+  export RAPIDS_VERSION=23.6
 
   gcloud dataproc clusters create $USER-spark-rapids-ml \
-  --image-version=2.0.29-ubuntu18 \
+  --image-version=2.1-ubuntu \
   --region ${COMPUTE_REGION} \
   --master-machine-type n1-standard-16 \
   --master-accelerator type=nvidia-tesla-t4,count=1 \
@@ -40,7 +41,7 @@ If you already have a Dataproc account, you can run the example notebooks on a D
   --worker-machine-type n1-standard-16 \
   --num-worker-local-ssds 4 \
   --worker-local-ssd-interface=NVME \
-  --initialization-actions gs://goog-dataproc-initialization-actions-us-central1/gpu/install_gpu_driver.sh,gs://goog-dataproc-initialization-actions-us-central1/rapids/rapids.sh,gs://${GCS_BUCKET}/spark_rapids_ml.sh \
+  --initialization-actions gs://goog-dataproc-initialization-actions-us-central1/gpu/install_gpu_driver.sh,gs://${GCS_BUCKET}/spark-rapids.sh,gs://${GCS_BUCKET}/spark_rapids_ml.sh \
   --optional-components=JUPYTER \
   --metadata gpu-driver-provider="NVIDIA" \
   --metadata rapids-runtime=SPARK \
@@ -48,7 +49,8 @@ If you already have a Dataproc account, you can run the example notebooks on a D
   --metadata rapids-version=${RAPIDS_VERSION} \
   --bucket ${GCS_BUCKET} \
   --enable-component-gateway \
-  --subnet=default
+  --subnet=default \
+  --no-shielded-secure-boot
   ```
 - In the [Dataproc console](https://console.cloud.google.com/dataproc/clusters), select your cluster, go to the "Web Interfaces" tab, and click on the "JupyterLab" link.
 - In JupyterLab, upload the desired [notebook](../) via the `Upload Files` button.
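Note (not part of the diff): the init scripts above pin the RAPIDS Python packages with pip's `~=` operator, so any 23.6.x micro release may be installed. A minimal sanity-check sketch like the following, run in a Python shell on a provisioned node, can confirm the pins resolved as intended; only the 23.6 prefix comes from this change, the rest is illustrative.

```python
# Illustrative check, assuming the init scripts above have completed on this node.
import cudf
import cuml

for name, module in [("cudf", cudf), ("cuml", cuml)]:
    version = module.__version__
    # RAPIDS_VERSION=23.6.0 combined with "~=" may resolve to any 23.6.x micro release.
    assert version.startswith("23.6"), f"{name} {version} does not match the 23.6 pin"
    print(f"{name} {version} matches the expected RAPIDS release")
```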
diff --git a/notebooks/dataproc/spark_rapids_ml.sh b/notebooks/dataproc/spark_rapids_ml.sh
index 84353f3c..e3f37f39 100644
--- a/notebooks/dataproc/spark_rapids_ml.sh
+++ b/notebooks/dataproc/spark_rapids_ml.sh
@@ -1,10 +1,13 @@
 #!/bin/bash
 
-RAPIDS_VERSION=23.4.0
+RAPIDS_VERSION=23.6.0
 
 # patch existing packages
 mamba install "llvmlite<0.40,>=0.39.0dev0" "numba>=0.56.2"
 
+# dataproc 2.1 pyarrow and arrow conda installation is not compatible with cudf
+mamba uninstall -y pyarrow arrow
+
 # install cudf and cuml
 pip install --upgrade pip
 pip install cudf-cu11~=${RAPIDS_VERSION} cuml-cu11~=${RAPIDS_VERSION} \
diff --git a/python/README.md b/python/README.md
index 31546c0f..07c547d4 100644
--- a/python/README.md
+++ b/python/README.md
@@ -8,9 +8,9 @@ For simplicity, the following instructions just use Spark local mode, assuming a
 First, install RAPIDS cuML per [these instructions](https://rapids.ai/start.html).
 ```bash
-conda create -n rapids-23.04 \
+conda create -n rapids-23.06 \
     -c rapidsai -c nvidia -c conda-forge \
-    cuml=23.04 python=3.8 cudatoolkit=11.5
+    cuml=23.06 python=3.9 cudatoolkit=11.5
 ```
 
 **Note**: while testing, we recommend using conda or docker to simplify installation and isolate your environment while experimenting. Once you have a working environment, you can then try installing directly, if necessary.
@@ -19,7 +19,7 @@ conda create -n rapids-23.04 \
 Once you have the conda environment, activate it and install the required packages.
 ```bash
-conda activate rapids-23.04
+conda activate rapids-23.06
 
 # for development access to notebooks, tests, and benchmarks
 git clone --branch main https://github.com/NVIDIA/spark-rapids-ml.git
@@ -152,7 +152,7 @@ While the Spark Rapids ML API attempts to mirror the PySpark ML API to minimize
 ```python
 # from pyspark.ml.clustering import KMeans
 from spark_rapids_ml.clustering import KMeans
-form pyspark.ml.linalg import Vectors
+from pyspark.ml.linalg import Vectors
 
 data = [(Vectors.dense([0.0, 0.0]), 2.0), (Vectors.dense([1.0, 1.0]), 2.0),
         (Vectors.dense([9.0, 8.0]), 2.0), (Vectors.dense([8.0, 9.0]), 2.0)]
diff --git a/python/benchmark/aws-emr/README.md b/python/benchmark/aws-emr/README.md
index aabfaba3..074be171 100644
--- a/python/benchmark/aws-emr/README.md
+++ b/python/benchmark/aws-emr/README.md
@@ -1,4 +1,4 @@
-# Benchmarking on Dataproc
+# Benchmarking on AWS EMR
 
 This directory contains shell scripts for running larger-scale benchmarks on an AWS EMR cluster. You will need an AWS account to run them. The benchmarks use datasets synthetically generated using [gen_data.py](../gen_data.py). For convenience, these have been precomputed and are available in the public S3 bucket `spark-rapids-ml-bm-datasets-public`. The benchmark scripts are currently configured to read the data from there.
 
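For context on the python/README.md hunk above (which fixes the `form` → `from` typo), a minimal end-to-end sketch of that KMeans example follows. It assumes an active SparkSession and that the estimator accepts the same `k` and `featuresCol` parameters as its PySpark counterpart, which the README states it mirrors; the column names and `k=2` are illustrative choices, not part of the diff.

```python
# Hypothetical continuation of the README's KMeans snippet (illustrative only).
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from spark_rapids_ml.clustering import KMeans  # GPU-backed counterpart of pyspark.ml's KMeans

spark = SparkSession.builder.getOrCreate()

data = [(Vectors.dense([0.0, 0.0]), 2.0), (Vectors.dense([1.0, 1.0]), 2.0),
        (Vectors.dense([9.0, 8.0]), 2.0), (Vectors.dense([8.0, 9.0]), 2.0)]
df = spark.createDataFrame(data, ["features", "weight"])

# Parameter names follow the PySpark ML API that spark_rapids_ml mirrors.
kmeans = KMeans(k=2, featuresCol="features")
model = kmeans.fit(df)
model.transform(df).show()
```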
diff --git a/python/benchmark/databricks/gpu_cluster_spec.sh b/python/benchmark/databricks/gpu_cluster_spec.sh
index becde51f..b302b944 100644
--- a/python/benchmark/databricks/gpu_cluster_spec.sh
+++ b/python/benchmark/databricks/gpu_cluster_spec.sh
@@ -9,7 +9,7 @@ cat <=0.39.0dev0" "numba>=0.56.2"
@@ -16,6 +16,10 @@ mamba install "llvmlite<0.40,>=0.39.0dev0" "numba>=0.56.2"
 # install cudf and cuml
 # using ~= pulls in lates micro version patches
 pip install --upgrade pip
+
+# dataproc 2.1 pyarrow and arrow conda installation is not compatible with cudf
+mamba uninstall -y pyarrow arrow
+
 pip install cudf-cu11~=${RAPIDS_VERSION} cuml-cu11~=${RAPIDS_VERSION} \
     pylibraft-cu11~=${RAPIDS_VERSION} \
     rmm-cu11~=${RAPIDS_VERSION} \
@@ -32,5 +36,6 @@ gsutil cp gs://${BENCHMARK_HOME}/benchmark_runner.py .
 gsutil cp gs://${BENCHMARK_HOME}/spark_rapids_ml.zip .
 gsutil cp gs://${BENCHMARK_HOME}/benchmark.zip .
 
-unzip spark_rapids_ml.zip -d /opt/conda/miniconda3/lib/python3.8/site-packages
-unzip benchmark.zip -d /opt/conda/miniconda3/lib/python3.8/site-packages
+python_ver=`python --version | grep -oP '3\.[0-9]+'`
+unzip spark_rapids_ml.zip -d /opt/conda/miniconda3/lib/python${python_ver}/site-packages
+unzip benchmark.zip -d /opt/conda/miniconda3/lib/python${python_ver}/site-packages
diff --git a/python/benchmark/dataproc/setup.sh b/python/benchmark/dataproc/setup.sh
index 354ce1bb..163bd8a1 100755
--- a/python/benchmark/dataproc/setup.sh
+++ b/python/benchmark/dataproc/setup.sh
@@ -12,6 +12,8 @@ SPARK_RAPIDS_ML_HOME='../..'
 echo "**** copying benchmarking related files to ${BENCHMARK_HOME} ****"
 
 gsutil cp init_benchmark.sh gs://${BENCHMARK_HOME}/init_benchmark.sh
+curl -LO https://raw.githubusercontent.com/GoogleCloudDataproc/initialization-actions/master/spark-rapids/spark-rapids.sh
+gsutil cp spark-rapids.sh gs://${BENCHMARK_HOME}/spark-rapids.sh
 
 pushd ${SPARK_RAPIDS_ML_HOME}/benchmark
 zip -r - benchmark >benchmark.zip
diff --git a/python/benchmark/dataproc/start_cluster.sh b/python/benchmark/dataproc/start_cluster.sh
index d8639149..0ee83d32 100755
--- a/python/benchmark/dataproc/start_cluster.sh
+++ b/python/benchmark/dataproc/start_cluster.sh
@@ -14,16 +14,14 @@ fi
 BENCHMARK_HOME=${BENCHMARK_HOME:-${GCS_BUCKET}/benchmark}
 
 CUDA_VERSION=${CUDA_VERSION:-11.8}
-RAPIDS_VERSION=${RAPIDS_VERSION:-23.4.0}
 
 gpu_args=$(cat <