Add MPS test to e2e test suite #734

Draft
wants to merge 4 commits into base: main
32 changes: 25 additions & 7 deletions tests/plugin-config.yaml
@@ -1,7 +1,25 @@
-version: v1
-sharing:
-  timeSlicing:
-    renameByDefault: false
-    resources:
-    - name: nvidia.com/gpu
-      replicas: 10
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: plugin-config
+data:
+  time-slicing: |-
+    version: v1
+    flags:
+      migStrategy: none
+    sharing:
+      timeSlicing:
+        renameByDefault: false
+        resources:
+        - name: nvidia.com/gpu
+          replicas: 10
+  mps: |-
+    version: v1
+    flags:
+      migStrategy: none
+    sharing:
+      mps:
+        renameByDefault: false
+        resources:
+        - name: nvidia.com/gpu
+          replicas: 4
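
Note: each key under data (time-slicing and mps) is a named configuration for the NVIDIA device plugin, and a node is normally switched between them via the nvidia.com/device-plugin.config node label. A minimal sketch of opting one node into the mps profile; the node name worker-0 is a placeholder, and this labeling step is an assumption about the test harness, not part of this diff:

    # Point the device plugin on one node at the "mps" entry of the
    # ConfigMap above; "worker-0" is a placeholder node name.
    kubectl label node worker-0 nvidia.com/device-plugin.config=mps --overwrite

    # Once MPS sharing is active, GFD should publish the strategy label
    # that tests/plugin-mps-test.yaml selects on.
    kubectl get node worker-0 --show-labels | grep -o 'nvidia.com/gpu.sharing-strategy=[^,]*'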
35 changes: 35 additions & 0 deletions tests/plugin-mps-test.yaml
@@ -0,0 +1,35 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: nvidia-plugin-mps-test
+  labels:
+    app: nvidia-plugin-mps-test
+spec:
+  replicas: 4
+  selector:
+    matchLabels:
+      app: nvidia-plugin-mps-test
+  template:
+    metadata:
+      labels:
+        app: nvidia-plugin-mps-test
+    spec:
+      tolerations:
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+      serviceAccountName: nvidia-device-plugin
+      containers:
+      - name: nvidia-plugin-test-ctr
+        image: nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.10.1
+        imagePullPolicy: IfNotPresent
+        command: ['sh', '-c']
+        args:
+        - "while true; do vectorAdd; sleep 30; done"
+        securityContext:
+          allowPrivilegeEscalation: false
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+      nodeSelector: # Schedule on the node with MPS
+        nvidia.com/gpu.sharing-strategy: "mps"
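
A quick way to exercise this manifest once the mps config is active on a node: all four replicas should fit on a single GPU node, since the mps config advertises nvidia.com/gpu with replicas: 4. The rollout timeout below is an illustrative choice, not from this diff:

    kubectl apply -f tests/plugin-mps-test.yaml
    # Four pods, each requesting 1 GPU, should all schedule onto the MPS node.
    kubectl rollout status deployment/nvidia-plugin-mps-test --timeout=10m
    kubectl get pods -l app=nvidia-plugin-mps-test -o wide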
39 changes: 17 additions & 22 deletions tests/scripts/checks.sh
@@ -1,7 +1,10 @@
 #!/bin/bash
 
+source ${SCRIPT_DIR}/collect-logs.sh
+
 check_pod_ready() {
   local pod_label=$1
+  local timeout_minutes=${2:-45}
   local current_time=0
   while :; do
     echo "Checking $pod_label pod"
@@ -21,8 +24,9 @@ check_pod_ready() {
       fi
     fi
 
-    if [[ "${current_time}" -gt $((60 * 45)) ]]; then
+    if [[ "${current_time}" -gt $((60 * ${timeout_minutes})) ]]; then
       echo "timeout reached"
+      collect_logs
       exit 1;
     fi
 
@@ -55,6 +59,7 @@ check_pod_deleted() {
 
     if [[ "${current_time}" -gt $((60 * 45)) ]]; then
       echo "timeout reached"
+      collect_logs
       exit 1;
     fi
 
@@ -73,6 +78,7 @@ check_no_restarts() {
   if [ $restartCount -gt 1 ]; then
     echo "$pod_label restarted multiple times: $restartCount"
     kubectl logs -p -lapp=$pod_label --all-containers -n ${TEST_NAMESPACE}
+    collect_logs
     exit 1
   fi
   echo "Repeated restarts not observed for pod $pod_label"
@@ -107,20 +113,15 @@ test_restart_operator() {
     fi
   done
 
-  echo "Timeout reached, the GPU Operator is still not ready. See below for logs:"
-  kubectl logs -n gpu-operator "$(kubectl get pods -n "${ns}" -o json | jq -r '.items[0].metadata.name')"
+  echo "Timeout reached, the GPU Operator is still not ready."
+  collect_logs
   exit 1
 }
 
 check_gpu_pod_ready() {
-  local log_dir=$1
   local current_time=0
 
-  # Ensure the log directory exists
-  mkdir -p ${log_dir}
-
   while :; do
-    pods="$(kubectl get --all-namespaces pods -o json | jq '.items[] | {name: .metadata.name, ns: .metadata.namespace}' | jq -s -c .)"
     status=$(kubectl get pods gpu-operator-test -o json | jq -r .status.phase)
     if [ "${status}" = "Succeeded" ]; then
       echo "GPU pod terminated successfully"
@@ -130,23 +131,12 @@ check_gpu_pod_ready() {
 
     if [[ "${current_time}" -gt $((60 * 45)) ]]; then
       echo "timeout reached"
+      collect_logs
       exit 1
     fi
 
-    # Echo useful information on stdout
-    kubectl get pods --all-namespaces
-
-    for pod in $(echo "$pods" | jq -r .[].name); do
-      ns=$(echo "$pods" | jq -r ".[] | select(.name == \"$pod\") | .ns")
-      echo "Generating logs for pod: ${pod} ns: ${ns}"
-      echo "------------------------------------------------" >> "${log_dir}/${pod}.describe"
-      kubectl -n "${ns}" describe pods "${pod}" >> "${log_dir}/${pod}.describe"
-      kubectl -n "${ns}" logs "${pod}" --all-containers=true > "${log_dir}/${pod}.logs" || true
-    done
-
-    echo "Generating cluster logs"
-    echo "------------------------------------------------" >> "${log_dir}/cluster.logs"
-    kubectl get --all-namespaces pods >> "${log_dir}/cluster.logs"
+    # Echo useful information on stdout
+    kubectl get pods --all-namespaces
 
     echo "Sleeping 5 seconds"
     current_time=$((${current_time} + 5))
@@ -177,6 +167,7 @@ check_nvidia_driver_pods_ready() {
 
     if [[ "${current_time}" -gt $((60 * 45)) ]]; then
       echo "timeout reached"
+      collect_logs
       exit 1;
     fi
 
@@ -194,6 +185,7 @@ check_no_driver_pod_restarts() {
   if [ $restartCount -gt 1 ]; then
     echo "nvidia driver pod restarted multiple times: $restartCount"
     kubectl logs -p -l "app.kubernetes.io/component=nvidia-driver" --all-containers -n ${TEST_NAMESPACE}
+    collect_logs
     exit 1
   fi
   echo "Repeated restarts not observed for the nvidia driver pod"
@@ -221,6 +213,7 @@ wait_for_driver_upgrade_done() {
 
     if [[ "${current_time}" -gt $((60 * 45)) ]]; then
       echo "timeout reached"
+      collect_logs
      exit 1;
     fi
 
@@ -234,3 +227,5 @@ wait_for_driver_upgrade_done() {
     sleep 5
   done
 }
+
+
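The new optional timeout_minutes argument keeps existing callers on the 45-minute default while letting a faster test use a shorter budget. A hypothetical call site; the 10-minute value and the second pod label are illustrative, not from this diff:

    # Shorter, explicit timeout for the MPS test pods.
    check_pod_ready "nvidia-plugin-mps-test" 10

    # Existing call style is unchanged and still gets the 45-minute default
    # (pod label here is a placeholder).
    check_pod_ready "nvidia-device-plugin-daemonset"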
21 changes: 21 additions & 0 deletions tests/scripts/collect-logs.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+source ${SCRIPT_DIR}/.definitions.sh
+
+collect_logs() {
+  # Ensure the log directory exists
+  mkdir -p ${LOG_DIR}
+
+  pods="$(kubectl get --all-namespaces pods -o json | jq '.items[] | {name: .metadata.name, ns: .metadata.namespace}' | jq -s -c .)"
+  for pod in $(echo "$pods" | jq -r .[].name); do
+    ns=$(echo "$pods" | jq -r ".[] | select(.name == \"$pod\") | .ns")
+    echo "Generating logs for pod: ${pod} ns: ${ns}"
+    echo "------------------------------------------------" >> "${LOG_DIR}/${pod}.describe"
+    kubectl -n "${ns}" describe pods "${pod}" >> "${LOG_DIR}/${pod}.describe"
+    kubectl -n "${ns}" logs "${pod}" --all-containers=true > "${LOG_DIR}/${pod}.logs" || true
+  done
+
+  echo "Generating cluster logs"
+  echo "------------------------------------------------" >> "${LOG_DIR}/cluster.logs"
+  kubectl get --all-namespaces pods >> "${LOG_DIR}/cluster.logs"
+}
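
collect_logs takes LOG_DIR from .definitions.sh and is pulled in by checks.sh via SCRIPT_DIR, so every check that sources checks.sh gets it for free. A minimal standalone sketch for running it directly; the exported paths are placeholders and assume .definitions.sh does not override them:

    #!/bin/bash
    # Hypothetical direct invocation, e.g. to gather logs after a failed run.
    export SCRIPT_DIR="$(pwd)/tests/scripts"   # assumed checkout layout
    export LOG_DIR="${LOG_DIR:-/tmp/e2e-logs}" # placeholder output directory

    source "${SCRIPT_DIR}/collect-logs.sh"
    collect_logs
    echo "Logs written to ${LOG_DIR}"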