diff --git a/ci/Dockerfile b/ci/Dockerfile
index 31f05a7d..35298181 100644
--- a/ci/Dockerfile
+++ b/ci/Dockerfile
@@ -35,7 +35,7 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linu
&& conda init
# install cuML
-ARG CUML_VER=23.04
+ARG CUML_VER=23.06
RUN conda install -c conda-forge mamba && \
- mamba install -y -c rapidsai -c nvidia -c conda-forge cuml=$CUML_VER python=3.8 cuda-toolkit=11.5 \
+ mamba install -y -c rapidsai -c nvidia -c conda-forge cuml=$CUML_VER python=3.9 cuda-toolkit=11.5 \
&& mamba clean --all -f -y
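# Illustrative sketch, not part of the patch: a quick sanity check that the bumped
# cuML/Python pins resolve in the built CI image. The image tag is hypothetical and
# this assumes the image's conda python is on PATH.
docker build -t spark-rapids-ml-ci ci/
docker run --rm spark-rapids-ml-ci \
    python -c "import cuml, sys; print(cuml.__version__, sys.version_info[:2])"
# expected: a 23.06.* cuml build on (3, 9)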
diff --git a/docker/Dockerfile.pip b/docker/Dockerfile.pip
index 865a70de..f12193ef 100644
--- a/docker/Dockerfile.pip
+++ b/docker/Dockerfile.pip
@@ -18,7 +18,7 @@ ARG CUDA_VERSION=11.8.0
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
ARG PYSPARK_VERSION=3.3.1
-ARG RAPIDS_VERSION=23.4.0
+ARG RAPIDS_VERSION=23.6.0
# Install packages to build spark-rapids-ml
RUN apt-get update -y \
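# Illustrative sketch, not part of the patch: the ARGs above stay overridable at
# build time, so a one-off image against a different release needs no file edit:
docker build -f docker/Dockerfile.pip \
    --build-arg RAPIDS_VERSION=23.6.0 --build-arg PYSPARK_VERSION=3.3.1 docker/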
diff --git a/docker/Dockerfile.python b/docker/Dockerfile.python
index 187c1b2d..0f94e8ea 100644
--- a/docker/Dockerfile.python
+++ b/docker/Dockerfile.python
@@ -17,7 +17,7 @@
ARG CUDA_VERSION=11.5.2
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
-ARG CUML_VERSION=23.04
+ARG CUML_VERSION=23.06
# Install packages to build spark-rapids-ml
RUN apt update -y \
@@ -38,7 +38,7 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linu
# install cuML
-RUN conda install -y -c rapidsai -c nvidia -c conda-forge python=3.8 cuda-toolkit=11.5 cuml=$CUML_VERSION \
+RUN conda install -y -c rapidsai -c nvidia -c conda-forge python=3.9 cuda-toolkit=11.5 cuml=$CUML_VERSION \
&& conda clean --all -f -y
# install python dependencies
diff --git a/docs/source/conf.py b/docs/source/conf.py
index f27fd682..6d6c7bd2 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -9,7 +9,7 @@
project = 'spark-rapids-ml'
copyright = '2023, NVIDIA'
author = 'NVIDIA'
-release = '23.4.0'
+release = '23.6.0'
# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
diff --git a/jvm/README.md b/jvm/README.md
index a1dd6561..54726456 100644
--- a/jvm/README.md
+++ b/jvm/README.md
@@ -74,7 +74,7 @@ the _project root path_ with:
cd jvm
mvn clean package
```
-Then `rapids-4-spark-ml_2.12-23.04.0-SNAPSHOT.jar` will be generated under `target` folder.
+Then `rapids-4-spark-ml_2.12-23.06.0-SNAPSHOT.jar` will be generated under the `target` folder.
Users can also use the _release_ version spark-rapids plugin as the dependency if it's already been
released in public maven repositories, see [rapids-4-spark maven repository](https://mvnrepository.com/artifact/com.nvidia/rapids-4-spark)
@@ -94,8 +94,8 @@ repository, usually in your `~/.m2/repository`.
Add the artifact jar to the Spark, for example:
```bash
-ML_JAR="target/rapids-4-spark-ml_2.12-23.04.0-SNAPSHOT.jar"
-PLUGIN_JAR="~/.m2/repository/com/nvidia/rapids-4-spark_2.12/23.04.0-SNAPSHOT/rapids-4-spark_2.12-23.04.0-SNAPSHOT.jar"
+ML_JAR="target/rapids-4-spark-ml_2.12-23.06.0-SNAPSHOT.jar"
+PLUGIN_JAR="~/.m2/repository/com/nvidia/rapids-4-spark_2.12/23.06.0-SNAPSHOT/rapids-4-spark_2.12-23.06.0-SNAPSHOT.jar"
$SPARK_HOME/bin/spark-shell --master $SPARK_MASTER \
--driver-memory 20G \
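# Illustrative sketch, not part of the patch: the spark-shell invocation continues
# beyond the lines shown; the two jars defined above are typically handed to Spark
# via --jars, roughly:
$SPARK_HOME/bin/spark-shell --master $SPARK_MASTER \
    --driver-memory 20G \
    --jars ${ML_JAR},${PLUGIN_JAR}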
diff --git a/jvm/pom.xml b/jvm/pom.xml
index fde9dbcd..eff9ee8c 100644
--- a/jvm/pom.xml
+++ b/jvm/pom.xml
@@ -20,7 +20,7 @@
4.0.0
com.nvidia
rapids-4-spark-ml_2.12
- 23.04.0-SNAPSHOT
+ 23.06.0-SNAPSHOT
RAPIDS Accelerator for Apache Spark ML
The RAPIDS cuML library for Apache Spark
2021
@@ -93,7 +93,7 @@
com.nvidia
rapids-4-spark_2.12
- 23.04.0
+ 23.06.0
diff --git a/notebooks/aws-emr/init-bootstrap-action.sh b/notebooks/aws-emr/init-bootstrap-action.sh
index 396d85a2..8a136702 100755
--- a/notebooks/aws-emr/init-bootstrap-action.sh
+++ b/notebooks/aws-emr/init-bootstrap-action.sh
@@ -8,7 +8,7 @@ sudo chmod a+rwx -R /sys/fs/cgroup/devices
sudo yum install -y gcc openssl-devel bzip2-devel libffi-devel tar gzip wget make mysql-devel
sudo bash -c "wget https://www.python.org/ftp/python/3.9.9/Python-3.9.9.tgz && tar xzf Python-3.9.9.tgz && cd Python-3.9.9 && ./configure --enable-optimizations && make altinstall"
-RAPIDS_VERSION=23.4.0
+RAPIDS_VERSION=23.6.0
# install scikit-learn
sudo /usr/local/bin/pip3.9 install scikit-learn
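# Illustrative sketch, not part of the patch: RAPIDS_VERSION is consumed further
# down this bootstrap script by the cu11 wheel installs, roughly as below (the
# NVIDIA extra index URL is an assumption):
sudo /usr/local/bin/pip3.9 install --extra-index-url=https://pypi.nvidia.com \
    "cudf-cu11~=${RAPIDS_VERSION}" "cuml-cu11~=${RAPIDS_VERSION}"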
diff --git a/notebooks/databricks/README.md b/notebooks/databricks/README.md
index 01c98990..d1b24ea8 100644
--- a/notebooks/databricks/README.md
+++ b/notebooks/databricks/README.md
@@ -41,7 +41,7 @@ If you already have a Databricks account, you can run the example notebooks on a
spark.task.resource.gpu.amount 1
spark.databricks.delta.preview.enabled true
spark.python.worker.reuse true
- spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-23.04.0.jar:/databricks/spark/python
+ spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-23.06.0.jar:/databricks/spark/python
spark.sql.execution.arrow.maxRecordsPerBatch 100000
spark.rapids.memory.gpu.minAllocFraction 0.0001
spark.plugins com.nvidia.spark.SQLPlugin
diff --git a/notebooks/databricks/init-pip-cuda-11.8.sh b/notebooks/databricks/init-pip-cuda-11.8.sh
index f71f82a7..63d27268 100644
--- a/notebooks/databricks/init-pip-cuda-11.8.sh
+++ b/notebooks/databricks/init-pip-cuda-11.8.sh
@@ -1,13 +1,13 @@
#!/bin/bash
# set portion of path below after /dbfs/ to dbfs zip file location
SPARK_RAPIDS_ML_ZIP=/dbfs/path/to/zip/file
-# IMPORTANT: specify RAPIDS_VERSION fully 23.4.0 and not 23.4
-# also RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.4.0 and not 23.04.0)
-# while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 23.04.0 and not 23.4.0)
-RAPIDS_VERSION=23.4.0
-SPARK_RAPIDS_VERSION=23.04.0
+# IMPORTANT: specify RAPIDS_VERSION fully, e.g. 23.6.0 and not 23.6
+# also RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.6.0 and not 23.06.0)
+# while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 23.06.0 and not 23.6.0)
+RAPIDS_VERSION=23.6.0
+SPARK_RAPIDS_VERSION=23.06.0
-curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar
+curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda11.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar
# install cudatoolkit 11.8 via runfile approach
wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
@@ -34,7 +34,7 @@ ldconfig
/databricks/python/bin/pip install --upgrade pip
# install cudf, cuml and their rapids dependencies
-# using ~= pulls in lates micro version patches
+# using ~= pulls in latest micro version patches
/databricks/python/bin/pip install cudf-cu11~=${RAPIDS_VERSION} \
cuml-cu11~=${RAPIDS_VERSION} \
pylibraft-cu11~=${RAPIDS_VERSION} \
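# Illustrative note, not part of the patch: "~=" is pip's compatible-release
# operator, so a fully specified 23.6.0 floats only on the micro digit:
/databricks/python/bin/pip install "cuml-cu11~=23.6.0"  # allows 23.6.x, never 23.7+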
diff --git a/notebooks/dataproc/README.md b/notebooks/dataproc/README.md
index 7b423d8e..11eb1133 100644
--- a/notebooks/dataproc/README.md
+++ b/notebooks/dataproc/README.md
@@ -20,17 +20,18 @@ If you already have a Dataproc account, you can run the example notebooks on a D
gcloud storage buckets create gs://${GCS_BUCKET}
```
-- Upload the initialization script to your GCS bucket:
+- Upload the initialization scripts to your GCS bucket:
```
gsutil cp spark_rapids_ml.sh gs://${GCS_BUCKET}/spark_rapids_ml.sh
+ gsutil cp ../../python/benchmark/dataproc/spark-rapids.sh gs://${GCS_BUCKET}/spark-rapids.sh
```
- Create a cluster with at least two single-gpu workers. **Note**: in addition to the initialization scripts from above, this also uses the standard [initialization actions](https://github.com/GoogleCloudDataproc/initialization-actions) for installing the GPU drivers and RAPIDS:
```
export CUDA_VERSION=11.8
- export RAPIDS_VERSION=23.4
+ export RAPIDS_VERSION=23.6
gcloud dataproc clusters create $USER-spark-rapids-ml \
- --image-version=2.0.29-ubuntu18 \
+ --image-version=2.1-ubuntu \
--region ${COMPUTE_REGION} \
--master-machine-type n1-standard-16 \
--master-accelerator type=nvidia-tesla-t4,count=1 \
@@ -40,7 +41,7 @@ If you already have a Dataproc account, you can run the example notebooks on a D
--worker-machine-type n1-standard-16 \
--num-worker-local-ssds 4 \
--worker-local-ssd-interface=NVME \
- --initialization-actions gs://goog-dataproc-initialization-actions-us-central1/gpu/install_gpu_driver.sh,gs://goog-dataproc-initialization-actions-us-central1/rapids/rapids.sh,gs://${GCS_BUCKET}/spark_rapids_ml.sh \
+   --initialization-actions gs://goog-dataproc-initialization-actions-us-central1/gpu/install_gpu_driver.sh,gs://${GCS_BUCKET}/spark-rapids.sh,gs://${GCS_BUCKET}/spark_rapids_ml.sh \
--optional-components=JUPYTER \
--metadata gpu-driver-provider="NVIDIA" \
--metadata rapids-runtime=SPARK \
@@ -48,7 +49,8 @@ If you already have a Dataproc account, you can run the example notebooks on a D
--metadata rapids-version=${RAPIDS_VERSION} \
--bucket ${GCS_BUCKET} \
--enable-component-gateway \
- --subnet=default
+ --subnet=default \
+ --no-shielded-secure-boot
```
- In the [Dataproc console](https://console.cloud.google.com/dataproc/clusters), select your cluster, go to the "Web Interfaces" tab, and click on the "JupyterLab" link.
- In JupyterLab, upload the desired [notebook](../) via the `Upload Files` button.
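# Illustrative sketch, not part of the patch: before opening JupyterLab, GPU driver
# installation on a worker can be spot-checked over SSH (the zone variable and the
# Dataproc "-w-0" worker-name suffix are assumptions):
gcloud compute ssh ${USER}-spark-rapids-ml-w-0 --zone=${COMPUTE_ZONE} --command="nvidia-smi"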
diff --git a/notebooks/dataproc/spark_rapids_ml.sh b/notebooks/dataproc/spark_rapids_ml.sh
index 84353f3c..e3f37f39 100644
--- a/notebooks/dataproc/spark_rapids_ml.sh
+++ b/notebooks/dataproc/spark_rapids_ml.sh
@@ -1,10 +1,13 @@
#!/bin/bash
-RAPIDS_VERSION=23.4.0
+RAPIDS_VERSION=23.6.0
# patch existing packages
mamba install "llvmlite<0.40,>=0.39.0dev0" "numba>=0.56.2"
+# dataproc 2.1 pyarrow and arrow conda installation is not compatible with cudf
+mamba uninstall -y pyarrow arrow
+
# install cudf and cuml
pip install --upgrade pip
pip install cudf-cu11~=${RAPIDS_VERSION} cuml-cu11~=${RAPIDS_VERSION} \
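# Illustrative sketch, not part of the patch: after the init action runs, the
# pip-provided arrow stack can be verified on a node, since the conda pyarrow
# removed above would otherwise shadow it:
python -c "import cudf, pyarrow; print(cudf.__version__, pyarrow.__version__)"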
diff --git a/python/README.md b/python/README.md
index 31546c0f..07c547d4 100644
--- a/python/README.md
+++ b/python/README.md
@@ -8,9 +8,9 @@ For simplicity, the following instructions just use Spark local mode, assuming a
First, install RAPIDS cuML per [these instructions](https://rapids.ai/start.html).
```bash
-conda create -n rapids-23.04 \
+conda create -n rapids-23.06 \
-c rapidsai -c nvidia -c conda-forge \
- cuml=23.04 python=3.8 cudatoolkit=11.5
+ cuml=23.06 python=3.9 cudatoolkit=11.5
```
**Note**: while testing, we recommend using conda or docker to simplify installation and isolate your environment while experimenting. Once you have a working environment, you can then try installing directly, if necessary.
@@ -19,7 +19,7 @@ conda create -n rapids-23.04 \
Once you have the conda environment, activate it and install the required packages.
```bash
-conda activate rapids-23.04
+conda activate rapids-23.06
# for development access to notebooks, tests, and benchmarks
git clone --branch main https://github.com/NVIDIA/spark-rapids-ml.git
@@ -152,7 +152,7 @@ While the Spark Rapids ML API attempts to mirror the PySpark ML API to minimize
```python
# from pyspark.ml.clustering import KMeans
from spark_rapids_ml.clustering import KMeans
-form pyspark.ml.linalg import Vectors
+from pyspark.ml.linalg import Vectors
data = [(Vectors.dense([0.0, 0.0]), 2.0), (Vectors.dense([1.0, 1.0]), 2.0),
(Vectors.dense([9.0, 8.0]), 2.0), (Vectors.dense([8.0, 9.0]), 2.0)]
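# Illustrative sketch, not part of the patch: a hypothetical end-to-end run of the
# corrected snippet from a shell, assuming a GPU plus pyspark and spark-rapids-ml
# installed in the active environment:
python - <<'PYEOF'
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from spark_rapids_ml.clustering import KMeans

spark = SparkSession.builder.master("local[1]").getOrCreate()
data = [(Vectors.dense([0.0, 0.0]), 2.0), (Vectors.dense([1.0, 1.0]), 2.0),
        (Vectors.dense([9.0, 8.0]), 2.0), (Vectors.dense([8.0, 9.0]), 2.0)]
df = spark.createDataFrame(data, ["features", "weight"])
model = KMeans(k=2).setFeaturesCol("features").fit(df)
model.transform(df).show()
PYEOF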
diff --git a/python/benchmark/aws-emr/README.md b/python/benchmark/aws-emr/README.md
index aabfaba3..074be171 100644
--- a/python/benchmark/aws-emr/README.md
+++ b/python/benchmark/aws-emr/README.md
@@ -1,4 +1,4 @@
-# Benchmarking on Dataproc
+# Benchmarking on AWS EMR
This directory contains shell scripts for running larger-scale benchmarks on an AWS EMR cluster. You will need an AWS account to run them. The benchmarks use datasets synthetically generated using [gen_data.py](../gen_data.py). For convenience, these have been precomputed and are available in the public S3 bucket `spark-rapids-ml-bm-datasets-public`. The benchmark scripts are currently configured to read the data from there.
diff --git a/python/benchmark/databricks/gpu_cluster_spec.sh b/python/benchmark/databricks/gpu_cluster_spec.sh
index becde51f..b302b944 100644
--- a/python/benchmark/databricks/gpu_cluster_spec.sh
+++ b/python/benchmark/databricks/gpu_cluster_spec.sh
@@ -9,7 +9,7 @@ cat <<EOF
diff --git a/python/benchmark/dataproc/init_benchmark.sh b/python/benchmark/dataproc/init_benchmark.sh
--- a/python/benchmark/dataproc/init_benchmark.sh
+++ b/python/benchmark/dataproc/init_benchmark.sh
@@ -16,6 +16,10 @@ mamba install "llvmlite<0.40,>=0.39.0dev0" "numba>=0.56.2"
# install cudf and cuml
# using ~= pulls in latest micro version patches
pip install --upgrade pip
+
+# dataproc 2.1 pyarrow and arrow conda installation is not compatible with cudf
+mamba uninstall -y pyarrow arrow
+
pip install cudf-cu11~=${RAPIDS_VERSION} cuml-cu11~=${RAPIDS_VERSION} \
pylibraft-cu11~=${RAPIDS_VERSION} \
rmm-cu11~=${RAPIDS_VERSION} \
@@ -32,5 +36,6 @@ gsutil cp gs://${BENCHMARK_HOME}/benchmark_runner.py .
gsutil cp gs://${BENCHMARK_HOME}/spark_rapids_ml.zip .
gsutil cp gs://${BENCHMARK_HOME}/benchmark.zip .
-unzip spark_rapids_ml.zip -d /opt/conda/miniconda3/lib/python3.8/site-packages
-unzip benchmark.zip -d /opt/conda/miniconda3/lib/python3.8/site-packages
+python_ver=`python --version | grep -oP '3\.[0-9]+'`
+unzip spark_rapids_ml.zip -d /opt/conda/miniconda3/lib/python${python_ver}/site-packages
+unzip benchmark.zip -d /opt/conda/miniconda3/lib/python${python_ver}/site-packages
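# Illustrative note, not part of the patch: the python_ver probe above captures only
# major.minor, so the unzip target tracks whatever Python the image ships; e.g. with
# an assumed "Python 3.10.8":
echo "Python 3.10.8" | grep -oP '3\.[0-9]+'   # prints 3.10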
diff --git a/python/benchmark/dataproc/setup.sh b/python/benchmark/dataproc/setup.sh
index 354ce1bb..163bd8a1 100755
--- a/python/benchmark/dataproc/setup.sh
+++ b/python/benchmark/dataproc/setup.sh
@@ -12,6 +12,8 @@ SPARK_RAPIDS_ML_HOME='../..'
echo "**** copying benchmarking related files to ${BENCHMARK_HOME} ****"
gsutil cp init_benchmark.sh gs://${BENCHMARK_HOME}/init_benchmark.sh
+curl -LO https://raw.githubusercontent.com/GoogleCloudDataproc/initialization-actions/master/spark-rapids/spark-rapids.sh
+gsutil cp spark-rapids.sh gs://${BENCHMARK_HOME}/spark-rapids.sh
pushd ${SPARK_RAPIDS_ML_HOME}/benchmark
zip -r - benchmark >benchmark.zip
diff --git a/python/benchmark/dataproc/start_cluster.sh b/python/benchmark/dataproc/start_cluster.sh
index d8639149..0ee83d32 100755
--- a/python/benchmark/dataproc/start_cluster.sh
+++ b/python/benchmark/dataproc/start_cluster.sh
@@ -14,16 +14,14 @@ fi
BENCHMARK_HOME=${BENCHMARK_HOME:-${GCS_BUCKET}/benchmark}
CUDA_VERSION=${CUDA_VERSION:-11.8}
-RAPIDS_VERSION=${RAPIDS_VERSION:-23.4.0}
gpu_args=$(cat <<EOF