Add MPS test to e2e test suite #734

Draft
wants to merge 4 commits into base: main
32 changes: 25 additions & 7 deletions tests/plugin-config.yaml
@@ -1,7 +1,25 @@
-version: v1
-sharing:
-  timeSlicing:
-    renameByDefault: false
-    resources:
-    - name: nvidia.com/gpu
-      replicas: 10
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: plugin-config
+data:
+  time-slicing: |-
+    version: v1
+    flags:
+      migStrategy: none
+    sharing:
+      timeSlicing:
+        renameByDefault: false
+        resources:
+        - name: nvidia.com/gpu
+          replicas: 10
+  mps: |-
+    version: v1
+    flags:
+      migStrategy: none
+    sharing:
+      mps:
+        renameByDefault: false
+        resources:
+        - name: nvidia.com/gpu
+          replicas: 4
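
Note: each key under data (time-slicing and mps) is a named configuration for the NVIDIA device plugin, and a node is normally switched between them via the nvidia.com/device-plugin.config node label. A minimal sketch of opting one node into the mps profile; the node name worker-0 is a placeholder, and this labeling step is an assumption about the test harness, not part of this diff:

    # Point the device plugin on one node at the "mps" entry of the
    # ConfigMap above; "worker-0" is a placeholder node name.
    kubectl label node worker-0 nvidia.com/device-plugin.config=mps --overwrite

    # Once MPS sharing is active, GFD should publish the strategy label
    # that tests/plugin-mps-test.yaml selects on.
    kubectl get node worker-0 --show-labels | grep -o 'nvidia.com/gpu.sharing-strategy=[^,]*'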
35 changes: 35 additions & 0 deletions tests/plugin-mps-test.yaml
@@ -0,0 +1,35 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: nvidia-plugin-mps-test
+  labels:
+    app: nvidia-plugin-mps-test
+spec:
+  replicas: 4
+  selector:
+    matchLabels:
+      app: nvidia-plugin-mps-test
+  template:
+    metadata:
+      labels:
+        app: nvidia-plugin-mps-test
+    spec:
+      tolerations:
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+      serviceAccountName: nvidia-device-plugin
+      containers:
+      - name: nvidia-plugin-test-ctr
+        image: nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.10.1
+        imagePullPolicy: IfNotPresent
+        command: ['sh', '-c']
+        args:
+        - "while true; do vectorAdd; sleep 30; done"
+        securityContext:
+          allowPrivilegeEscalation: false
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+      nodeSelector: # Schedule on the node with MPS
+        nvidia.com/gpu.sharing-strategy: "mps"
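
A quick way to exercise this manifest once the mps config is active on a node: all four replicas should fit on a single GPU node, since the mps config advertises nvidia.com/gpu with replicas: 4. The rollout timeout below is an illustrative choice, not from this diff:

    kubectl apply -f tests/plugin-mps-test.yaml
    # Four pods, each requesting 1 GPU, should all schedule onto the MPS node.
    kubectl rollout status deployment/nvidia-plugin-mps-test --timeout=10m
    kubectl get pods -l app=nvidia-plugin-mps-test -o wide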
39 changes: 17 additions & 22 deletions tests/scripts/checks.sh
@@ -1,7 +1,10 @@
 #!/bin/bash
 
+source ${SCRIPT_DIR}/collect-logs.sh
+
 check_pod_ready() {
   local pod_label=$1
+  local timeout_minutes=${2:-45}
   local current_time=0
   while :; do
     echo "Checking $pod_label pod"
@@ -21,8 +24,9 @@ check_pod_ready() {
       fi
     fi
 
-    if [[ "${current_time}" -gt $((60 * 45)) ]]; then
+    if [[ "${current_time}" -gt $((60 * ${timeout_minutes})) ]]; then
       echo "timeout reached"
+      collect_logs
       exit 1;
     fi
 
@@ -55,6 +59,7 @@ check_pod_deleted() {
 
     if [[ "${current_time}" -gt $((60 * 45)) ]]; then
       echo "timeout reached"
+      collect_logs
       exit 1;
     fi
 
@@ -73,6 +78,7 @@ check_no_restarts() {
   if [ $restartCount -gt 1 ]; then
     echo "$pod_label restarted multiple times: $restartCount"
     kubectl logs -p -lapp=$pod_label --all-containers -n ${TEST_NAMESPACE}
+    collect_logs
     exit 1
   fi
   echo "Repeated restarts not observed for pod $pod_label"
@@ -107,20 +113,15 @@ test_restart_operator() {
     fi
   done
 
-  echo "Timeout reached, the GPU Operator is still not ready. See below for logs:"
-  kubectl logs -n gpu-operator "$(kubectl get pods -n "${ns}" -o json | jq -r '.items[0].metadata.name')"
+  echo "Timeout reached, the GPU Operator is still not ready."
+  collect_logs
   exit 1
 }
 
 check_gpu_pod_ready() {
-  local log_dir=$1
   local current_time=0
 
-  # Ensure the log directory exists
-  mkdir -p ${log_dir}
-
   while :; do
-    pods="$(kubectl get --all-namespaces pods -o json | jq '.items[] | {name: .metadata.name, ns: .metadata.namespace}' | jq -s -c .)"
     status=$(kubectl get pods gpu-operator-test -o json | jq -r .status.phase)
     if [ "${status}" = "Succeeded" ]; then
       echo "GPU pod terminated successfully"
@@ -130,23 +131,12 @@ check_gpu_pod_ready() {
 
     if [[ "${current_time}" -gt $((60 * 45)) ]]; then
       echo "timeout reached"
+      collect_logs
       exit 1
     fi
 
-    # Echo useful information on stdout
-    kubectl get pods --all-namespaces
-
-    for pod in $(echo "$pods" | jq -r .[].name); do
-      ns=$(echo "$pods" | jq -r ".[] | select(.name == \"$pod\") | .ns")
-      echo "Generating logs for pod: ${pod} ns: ${ns}"
-      echo "------------------------------------------------" >> "${log_dir}/${pod}.describe"
-      kubectl -n "${ns}" describe pods "${pod}" >> "${log_dir}/${pod}.describe"
-      kubectl -n "${ns}" logs "${pod}" --all-containers=true > "${log_dir}/${pod}.logs" || true
-    done
-
-    echo "Generating cluster logs"
-    echo "------------------------------------------------" >> "${log_dir}/cluster.logs"
-    kubectl get --all-namespaces pods >> "${log_dir}/cluster.logs"
+    # Echo useful information on stdout
+    kubectl get pods --all-namespaces
 
     echo "Sleeping 5 seconds"
     current_time=$((${current_time} + 5))
@@ -177,6 +167,7 @@ check_nvidia_driver_pods_ready() {
 
     if [[ "${current_time}" -gt $((60 * 45)) ]]; then
       echo "timeout reached"
+      collect_logs
       exit 1;
     fi
 
@@ -194,6 +185,7 @@ check_no_driver_pod_restarts() {
   if [ $restartCount -gt 1 ]; then
     echo "nvidia driver pod restarted multiple times: $restartCount"
     kubectl logs -p -l "app.kubernetes.io/component=nvidia-driver" --all-containers -n ${TEST_NAMESPACE}
+    collect_logs
     exit 1
   fi
   echo "Repeated restarts not observed for the nvidia driver pod"
@@ -221,6 +213,7 @@ wait_for_driver_upgrade_done() {
 
     if [[ "${current_time}" -gt $((60 * 45)) ]]; then
       echo "timeout reached"
+      collect_logs
      exit 1;
     fi
 
@@ -234,3 +227,5 @@ wait_for_driver_upgrade_done() {
     sleep 5
   done
 }
+
+
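The new optional timeout_minutes argument keeps existing callers on the 45-minute default while letting a faster test use a shorter budget. A hypothetical call site; the 10-minute value and the second pod label are illustrative, not from this diff:

    # Shorter, explicit timeout for the MPS test pods.
    check_pod_ready "nvidia-plugin-mps-test" 10

    # Existing call style is unchanged and still gets the 45-minute default
    # (pod label here is a placeholder).
    check_pod_ready "nvidia-device-plugin-daemonset"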
21 changes: 21 additions & 0 deletions tests/scripts/collect-logs.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+source ${SCRIPT_DIR}/.definitions.sh
+
+collect_logs() {
+  # Ensure the log directory exists
+  mkdir -p ${LOG_DIR}
+
+  pods="$(kubectl get --all-namespaces pods -o json | jq '.items[] | {name: .metadata.name, ns: .metadata.namespace}' | jq -s -c .)"
+  for pod in $(echo "$pods" | jq -r .[].name); do
+    ns=$(echo "$pods" | jq -r ".[] | select(.name == \"$pod\") | .ns")
+    echo "Generating logs for pod: ${pod} ns: ${ns}"
+    echo "------------------------------------------------" >> "${LOG_DIR}/${pod}.describe"
+    kubectl -n "${ns}" describe pods "${pod}" >> "${LOG_DIR}/${pod}.describe"
+    kubectl -n "${ns}" logs "${pod}" --all-containers=true > "${LOG_DIR}/${pod}.logs" || true
+  done
+
+  echo "Generating cluster logs"
+  echo "------------------------------------------------" >> "${LOG_DIR}/cluster.logs"
+  kubectl get --all-namespaces pods >> "${LOG_DIR}/cluster.logs"
+}
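
collect_logs takes LOG_DIR from .definitions.sh and is pulled in by checks.sh via SCRIPT_DIR, so every check that sources checks.sh gets it for free. A minimal standalone sketch for running it directly; the exported paths are placeholders and assume .definitions.sh does not override them:

    #!/bin/bash
    # Hypothetical direct invocation, e.g. to gather logs after a failed run.
    export SCRIPT_DIR="$(pwd)/tests/scripts"   # assumed checkout layout
    export LOG_DIR="${LOG_DIR:-/tmp/e2e-logs}" # placeholder output directory

    source "${SCRIPT_DIR}/collect-logs.sh"
    collect_logs
    echo "Logs written to ${LOG_DIR}"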