diff --git a/charts/opea/common/guardrails-usvc/.helmignore b/charts/opea/common/guardrails-usvc/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/charts/opea/common/guardrails-usvc/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/charts/opea/common/guardrails-usvc/Chart.yaml b/charts/opea/common/guardrails-usvc/Chart.yaml new file mode 100644 index 0000000..86a4f27 --- /dev/null +++ b/charts/opea/common/guardrails-usvc/Chart.yaml @@ -0,0 +1,14 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v2 +name: guardrails-usvc +description: The Helm chart for deploying guardrails-usvc as microservice +type: application +version: 1.0.0 +appVersion: "v1.0" +dependencies: + - name: tgi + version: 1.0.0 + repository: file://../tgi + condition: autodependency.enabled diff --git a/charts/opea/common/guardrails-usvc/README.md b/charts/opea/common/guardrails-usvc/README.md new file mode 100644 index 0000000..3e7f5dc --- /dev/null +++ b/charts/opea/common/guardrails-usvc/README.md @@ -0,0 +1,57 @@ +# guardrails-usvc + +Helm chart for deploying the guardrails microservice. + +guardrails-usvc depends on TGI; you should set SAFETY_GUARD_ENDPOINT as the tgi endpoint. + +## (Option1): Installing the chart separately + +First, you need to install the tgi chart, please refer to the [tgi](../tgi) chart for more information. Please use model `meta-llama/Meta-Llama-Guard-2-8B` during installation. + +After you've deployed the tgi chart successfully, please run `kubectl get svc` to get the tgi service endpoint, i.e. `http://tgi`. 
+ +To install the chart, run the following: + +```console +cd GenAIInfra/helm-charts/common/guardrails-usvc +export HFTOKEN="insert-your-huggingface-token-here" +export SAFETY_GUARD_ENDPOINT="http://tgi" +export SAFETY_GUARD_MODEL_ID="meta-llama/Meta-Llama-Guard-2-8B" +helm dependency update +helm install guardrails-usvc . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set SAFETY_GUARD_ENDPOINT=${SAFETY_GUARD_ENDPOINT} --set SAFETY_GUARD_MODEL_ID=${SAFETY_GUARD_MODEL_ID} --wait +``` + +## (Option2): Installing the chart with dependencies automatically + +```console +cd GenAIInfra/helm-charts/common/guardrails-usvc +export HFTOKEN="insert-your-huggingface-token-here" +helm dependency update +helm install guardrails-usvc . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set autodependency.enabled=true --wait +``` + +## Verify + +To verify the installation, run the command `kubectl get pod` to make sure all pods are running. + +Then run the command `kubectl port-forward svc/guardrails-usvc 9090:9090` to expose the llm-uservice service for access. + +Open another terminal and run the following command to verify the service if working: + +```console +curl http://localhost:9090/v1/guardrails \ + -X POST \ + -d '{"text":"How do you buy a tiger in the US?","parameters":{"max_new_tokens":32}}' \ + -H 'Content-Type: application/json' +``` + +## Values + +| Key | Type | Default | Description | +| ------------------------------- | ------ | ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| global.HUGGINGFACEHUB_API_TOKEN | string | `""` | Your own Hugging Face API token | +| global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, tgi will not download if the model is cached here. 
The host path "modelUseHostPath" will be mounted to container as /data directory | +| image.repository | string | `"opea/guardrails-usvc"` | | +| service.port | string | `"9090"` | | +| SAFETY_GUARD_ENDPOINT | string | `""` | LLM endpoint | +| SAFETY_GUARD_MODEL_ID | string | `"meta-llama/Meta-Llama-Guard-2-8B"` | Model ID for the underlying LLM service is using | diff --git a/charts/opea/common/guardrails-usvc/templates/_helpers.tpl b/charts/opea/common/guardrails-usvc/templates/_helpers.tpl new file mode 100644 index 0000000..088f884 --- /dev/null +++ b/charts/opea/common/guardrails-usvc/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "guardrails-usvc.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "guardrails-usvc.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "guardrails-usvc.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "guardrails-usvc.labels" -}} +helm.sh/chart: {{ include "guardrails-usvc.chart" . }} +{{ include "guardrails-usvc.selectorLabels" . 
}} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "guardrails-usvc.selectorLabels" -}} +app.kubernetes.io/name: {{ include "guardrails-usvc.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "guardrails-usvc.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "guardrails-usvc.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/charts/opea/common/guardrails-usvc/templates/configmap.yaml b/charts/opea/common/guardrails-usvc/templates/configmap.yaml new file mode 100644 index 0000000..694bf4c --- /dev/null +++ b/charts/opea/common/guardrails-usvc/templates/configmap.yaml @@ -0,0 +1,29 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "guardrails-usvc.fullname" . }}-config + labels: + {{- include "guardrails-usvc.labels" . | nindent 4 }} +data: + {{- if .Values.SAFETY_GUARD_ENDPOINT }} + SAFETY_GUARD_ENDPOINT: {{ tpl .Values.SAFETY_GUARD_ENDPOINT . 
| quote}} + {{- else }} + SAFETY_GUARD_ENDPOINT: "http://{{ .Release.Name }}-tgi" + {{- end }} + SAFETY_GUARD_MODEL_ID: {{ .Values.SAFETY_GUARD_MODEL_ID | quote }} + HUGGINGFACEHUB_API_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote}} + HF_HOME: "/tmp/.cache/huggingface" + LOGFLAG: {{ .Values.LOGFLAG | quote }} + {{- if .Values.global.HF_ENDPOINT }} + HF_ENDPOINT: {{ .Values.global.HF_ENDPOINT | quote}} + {{- end }} + http_proxy: {{ .Values.global.http_proxy | quote }} + https_proxy: {{ .Values.global.https_proxy | quote }} + {{- if and (not .Values.SAFETY_GUARD_ENDPOINT) (or .Values.global.http_proxy .Values.global.https_proxy) }} + no_proxy: "{{ .Release.Name }}-tgi,{{ .Values.global.no_proxy }}" + {{- else }} + no_proxy: {{ .Values.global.no_proxy | quote }} + {{- end }} diff --git a/charts/opea/common/guardrails-usvc/templates/deployment.yaml b/charts/opea/common/guardrails-usvc/templates/deployment.yaml new file mode 100644 index 0000000..bb6d396 --- /dev/null +++ b/charts/opea/common/guardrails-usvc/templates/deployment.yaml @@ -0,0 +1,88 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "guardrails-usvc.fullname" . }} + labels: + {{- include "guardrails-usvc.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + {{- include "guardrails-usvc.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "guardrails-usvc.selectorLabels" . | nindent 8 }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: {{ .Release.Name }} + envFrom: + - configMapRef: + name: {{ include "guardrails-usvc.fullname" . 
}}-config + {{- if .Values.global.extraEnvConfig }} + - configMapRef: + name: {{ .Values.global.extraEnvConfig }} + optional: true + {{- end }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + ports: + - name: guardrails-usvc + containerPort: 9090 + protocol: TCP + volumeMounts: + - mountPath: /tmp + name: tmp + {{- if .Values.livenessProbe }} + livenessProbe: + {{- toYaml .Values.livenessProbe | nindent 12 }} + {{- end }} + {{- if .Values.readinessProbe }} + readinessProbe: + {{- toYaml .Values.readinessProbe | nindent 12 }} + {{- end }} + {{- if .Values.startupProbe }} + startupProbe: + {{- toYaml .Values.startupProbe | nindent 12 }} + {{- end }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + volumes: + - name: tmp + emptyDir: {} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- if .Values.evenly_distributed }} + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + {{- include "guardrails-usvc.selectorLabels" . | nindent 14 }} + {{- end }} diff --git a/charts/opea/common/guardrails-usvc/templates/service.yaml b/charts/opea/common/guardrails-usvc/templates/service.yaml new file mode 100644 index 0000000..594312f --- /dev/null +++ b/charts/opea/common/guardrails-usvc/templates/service.yaml @@ -0,0 +1,18 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: {{ include "guardrails-usvc.fullname" . }} + labels: + {{- include "guardrails-usvc.labels" . 
| nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: 9090 + protocol: TCP + name: guardrails-usvc + selector: + {{- include "guardrails-usvc.selectorLabels" . | nindent 4 }} diff --git a/charts/opea/common/guardrails-usvc/templates/tests/test-pod.yaml b/charts/opea/common/guardrails-usvc/templates/tests/test-pod.yaml new file mode 100644 index 0000000..ec077d4 --- /dev/null +++ b/charts/opea/common/guardrails-usvc/templates/tests/test-pod.yaml @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "guardrails-usvc.fullname" . }}-testpod" + labels: + {{- include "guardrails-usvc.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test + #"helm.sh/hook-delete-policy": "hook-succeeded, hook-failure" +spec: + containers: + - name: curl + image: python:3.10.14 + command: ['bash', '-c'] + args: + - | + max_retry=20; + for ((i=1; i<=max_retry; i++)); do + curl http://{{ include "guardrails-usvc.fullname" . }}:{{ .Values.service.port }}/v1/guardrails -sS --fail-with-body \ + -X POST \ + -d '{"text":"How do you buy a tiger in the US?","parameters":{"max_new_tokens":32}}' \ + -H 'Content-Type: application/json' && break; + curlcode=$? + if [[ $curlcode -eq 7 ]]; then sleep 10; else echo "curl failed with code $curlcode"; exit 1; fi; + done; + if [ $i -gt $max_retry ]; then echo "test failed with maximum retry"; exit 1; fi + restartPolicy: Never diff --git a/charts/opea/common/guardrails-usvc/values.yaml b/charts/opea/common/guardrails-usvc/values.yaml new file mode 100644 index 0000000..314791e --- /dev/null +++ b/charts/opea/common/guardrails-usvc/values.yaml @@ -0,0 +1,97 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Default values for guardrails-usvc. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. 
+ +autodependency: + enabled: false + +replicaCount: 1 + +# TGI service endpoint +SAFETY_GUARD_ENDPOINT: "" +# Guard Model Id +SAFETY_GUARD_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B" +# Set it as a non-null string, such as true, if you want to enable logging facility, +# otherwise, keep it as "" to disable it. +LOGFLAG: "" + +tgi: + LLM_MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B" + +image: + repository: opea/guardrails-tgi + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "latest" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +podAnnotations: {} + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + capabilities: + drop: + - ALL + seccompProfile: + type: RuntimeDefault + +service: + type: ClusterIP + port: 9090 + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
+ # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +livenessProbe: + httpGet: + path: v1/health_check + port: guardrails-usvc + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 24 +readinessProbe: + httpGet: + path: v1/health_check + port: guardrails-usvc + initialDelaySeconds: 5 + periodSeconds: 5 +startupProbe: + httpGet: + path: v1/health_check + port: guardrails-usvc + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 120 + +nodeSelector: {} + +tolerations: [] + +affinity: {} + +global: + http_proxy: "" + https_proxy: "" + no_proxy: "" + HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" diff --git a/charts/opea/common/tgi/.helmignore b/charts/opea/common/tgi/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/charts/opea/common/tgi/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/charts/opea/common/tgi/Chart.yaml b/charts/opea/common/tgi/Chart.yaml new file mode 100644 index 0000000..7ab58f8 --- /dev/null +++ b/charts/opea/common/tgi/Chart.yaml @@ -0,0 +1,10 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v2 +name: tgi +description: The Helm chart for HuggingFace Text Generation Inference Server +type: application +version: 1.0.0 +# The HF TGI version +appVersion: "2.1.0" diff --git a/charts/opea/common/tgi/README.md b/charts/opea/common/tgi/README.md new file mode 100644 index 0000000..0100378 --- /dev/null +++ b/charts/opea/common/tgi/README.md @@ -0,0 +1,51 @@ +# tgi + +Helm chart for deploying Hugging Face Text Generation Inference service. 
+ +## Installing the Chart + +To install the chart, run the following: + +```console +cd GenAIInfra/helm-charts/common +export MODELDIR=/mnt/opea-models +export MODELNAME="bigscience/bloom-560m" +export HFTOKEN="insert-your-huggingface-token-here" +helm install tgi tgi --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} +# To deploy on Gaudi enabled kubernetes cluster +# helm install tgi tgi --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --values gaudi-values.yaml +``` + +By default, the tgi service will download the "bigscience/bloom-560m" model, which is about 1.1GB. + +If you already cached the model locally, you can pass it to the container like this example: + +MODELDIR=/mnt/opea-models + +MODELNAME="/data/models--bigscience--bloom-560m" + +## Verify + +To verify the installation, run the command `kubectl get pod` to make sure all pods are running. + +Then run the command `kubectl port-forward svc/tgi 2080:80` to expose the tgi service for access. 
+ +Open another terminal and run the following command to verify the service if working: + +```console +curl http://localhost:2080/generate \ + -X POST \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \ + -H 'Content-Type: application/json' +``` + +## Values + +| Key | Type | Default | Description | +| ------------------------------- | ------ | ------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| LLM_MODEL_ID | string | `"bigscience/bloom-560m"` | Models id from https://huggingface.co/, or predownloaded model directory | +| global.HUGGINGFACEHUB_API_TOKEN | string | `insert-your-huggingface-token-here` | Hugging Face API token | +| global.modelUseHostPath | string | `"/mnt/opea-models"` | Cached models directory, tgi will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. | +| image.repository | string | `"ghcr.io/huggingface/text-generation-inference"` | | +| image.tag | string | `"1.4"` | | +| horizontalPodAutoscaler.enabled | bool | false | Enable HPA autoscaling for the service deployment based on metrics it provides. See HPA section in ../../README.md before enabling! | diff --git a/charts/opea/common/tgi/gaudi-values.yaml b/charts/opea/common/tgi/gaudi-values.yaml new file mode 100644 index 0000000..25546c4 --- /dev/null +++ b/charts/opea/common/tgi/gaudi-values.yaml @@ -0,0 +1,20 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Default values for tgi. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. 
+ +accelDevice: "gaudi" + +image: + repository: ghcr.io/huggingface/tgi-gaudi + tag: "2.0.1" + +MAX_INPUT_LENGTH: "1024" +MAX_TOTAL_TOKENS: "2048" +CUDA_GRAPHS: "" + +resources: + limits: + habana.ai/gaudi: 1 diff --git a/charts/opea/common/tgi/nv-values.yaml b/charts/opea/common/tgi/nv-values.yaml new file mode 100644 index 0000000..798af89 --- /dev/null +++ b/charts/opea/common/tgi/nv-values.yaml @@ -0,0 +1,18 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Default values for tgi. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +accelDevice: "nvidia" + +image: + repository: ghcr.io/huggingface/text-generation-inference + tag: "2.2.0" + +resources: + limits: + nvidia.com/gpu: 1 + +CUDA_GRAPHS: "" diff --git a/charts/opea/common/tgi/templates/_helpers.tpl b/charts/opea/common/tgi/templates/_helpers.tpl new file mode 100644 index 0000000..b672e83 --- /dev/null +++ b/charts/opea/common/tgi/templates/_helpers.tpl @@ -0,0 +1,69 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "tgi.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "tgi.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. 
+*/}} +{{- define "tgi.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Convert chart name to a string suitable as metric prefix +*/}} +{{- define "tgi.metricPrefix" -}} +{{- include "tgi.fullname" . | replace "-" "_" | regexFind "[a-zA-Z_:][a-zA-Z0-9_:]*" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "tgi.labels" -}} +helm.sh/chart: {{ include "tgi.chart" . }} +{{ include "tgi.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "tgi.selectorLabels" -}} +app.kubernetes.io/name: {{ include "tgi.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "tgi.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "tgi.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/charts/opea/common/tgi/templates/configmap.yaml b/charts/opea/common/tgi/templates/configmap.yaml new file mode 100644 index 0000000..e44d8ee --- /dev/null +++ b/charts/opea/common/tgi/templates/configmap.yaml @@ -0,0 +1,31 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "tgi.fullname" . }}-config + labels: + {{- include "tgi.labels" . 
| nindent 4 }} +data: + MODEL_ID: {{ .Values.LLM_MODEL_ID | quote }} + PORT: {{ .Values.port | quote }} + HF_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote}} + {{- if .Values.global.HF_ENDPOINT }} + HF_ENDPOINT: {{ .Values.global.HF_ENDPOINT | quote}} + {{- end }} + http_proxy: {{ .Values.global.http_proxy | quote }} + https_proxy: {{ .Values.global.https_proxy | quote }} + no_proxy: {{ .Values.global.no_proxy | quote }} + HABANA_LOGS: "/tmp/habana_logs" + NUMBA_CACHE_DIR: "/tmp" + HF_HOME: "/tmp/.cache/huggingface" + {{- if .Values.MAX_INPUT_LENGTH }} + MAX_INPUT_LENGTH: {{ .Values.MAX_INPUT_LENGTH | quote }} + {{- end }} + {{- if .Values.MAX_TOTAL_TOKENS }} + MAX_TOTAL_TOKENS: {{ .Values.MAX_TOTAL_TOKENS | quote }} + {{- end }} + {{- if .Values.CUDA_GRAPHS }} + CUDA_GRAPHS: {{ .Values.CUDA_GRAPHS | quote }} + {{- end }} diff --git a/charts/opea/common/tgi/templates/deployment.yaml b/charts/opea/common/tgi/templates/deployment.yaml new file mode 100644 index 0000000..511cead --- /dev/null +++ b/charts/opea/common/tgi/templates/deployment.yaml @@ -0,0 +1,124 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "tgi.fullname" . }} + labels: + {{- include "tgi.labels" . | nindent 4 }} +spec: + {{- if ne (int .Values.replicaCount) 1 }} + # remove if replica count should not be reset on pod update with HPA + replicas: {{ .Values.replicaCount }} + {{- end }} + selector: + matchLabels: + {{- include "tgi.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "tgi.selectorLabels" . | nindent 8 }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . 
| nindent 8 }} + {{- end }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: {{ .Chart.Name }} + envFrom: + - configMapRef: + name: {{ include "tgi.fullname" . }}-config + {{- if .Values.global.extraEnvConfig }} + - configMapRef: + name: {{ .Values.global.extraEnvConfig }} + optional: true + {{- end }} + securityContext: + {{- if .Values.global.modelUseHostPath }} + {} + {{- else }} + {{- toYaml .Values.securityContext | nindent 12 }} + {{- end }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + {{- if .Values.extraCmdArgs }} + args: + {{- range .Values.extraCmdArgs }} + - {{ . | quote }} + {{- end }} + {{- end }} + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + - mountPath: /tmp + name: tmp + ports: + - name: http + containerPort: {{ .Values.port }} + protocol: TCP + {{- if .Values.livenessProbe }} + livenessProbe: + {{- toYaml .Values.livenessProbe | nindent 12 }} + {{- end }} + {{- if .Values.readinessProbe }} + readinessProbe: + {{- toYaml .Values.readinessProbe | nindent 12 }} + {{- end }} + {{- if .Values.startupProbe }} + startupProbe: + {{- toYaml .Values.startupProbe | nindent 12 }} + {{- end }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + volumes: + - name: model-volume + {{- if .Values.global.modelUsePVC }} + persistentVolumeClaim: + claimName: {{ .Values.global.modelUsePVC }} + {{- else if .Values.global.modelUseHostPath }} + hostPath: + path: {{ .Values.global.modelUseHostPath }} + type: Directory + {{- else }} + emptyDir: {} + {{- end }} + - name: shm + emptyDir: + medium: Memory + sizeLimit: {{ .Values.shmSize }} + - name: tmp + emptyDir: {} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . 
| nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- if not .Values.accelDevice }} + # extra time to finish processing buffered requests on CPU before pod is forcibly terminated + terminationGracePeriodSeconds: 120 + {{- end }} + {{- if .Values.evenly_distributed }} + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + {{- include "tgi.selectorLabels" . | nindent 14 }} + {{- end }} diff --git a/charts/opea/common/tgi/templates/horizontal-pod-autoscaler.yaml b/charts/opea/common/tgi/templates/horizontal-pod-autoscaler.yaml new file mode 100644 index 0000000..646ea9c --- /dev/null +++ b/charts/opea/common/tgi/templates/horizontal-pod-autoscaler.yaml @@ -0,0 +1,53 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.horizontalPodAutoscaler.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "tgi.fullname" . }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "tgi.fullname" . }} + minReplicas: 1 + maxReplicas: {{ .Values.horizontalPodAutoscaler.maxReplicas }} + metrics: + - type: Object + object: + metric: + # TGI time metrics are in seconds + name: {{ include "tgi.metricPrefix" . }}_request_latency + describedObject: + apiVersion: v1 + # get metric for named object of given type (in same namespace) + kind: Service + name: {{ include "tgi.fullname" . }} + target: + # tgi_request_latency is average for all the TGI pods. 
To avoid replica fluctuations when + # TGI startup + request processing takes longer than HPA evaluation period, this uses + # "Value" (replicas = metric.value / target.value), instead of "averageValue" type: + # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details + type: Value + value: 4 + behavior: + scaleDown: + stabilizationWindowSeconds: 180 + policies: + - type: Percent + value: 25 + periodSeconds: 90 + scaleUp: + selectPolicy: Max + stabilizationWindowSeconds: 0 + policies: + # Slow linear rampup in case additional CPU pods go to same node + # (i.e. interfere with each other) + - type: Pods + value: 1 + periodSeconds: 90 + #- type: Percent + # value: 25 + # periodSeconds: 90 +{{- end }} diff --git a/charts/opea/common/tgi/templates/service.yaml b/charts/opea/common/tgi/templates/service.yaml new file mode 100644 index 0000000..011cc37 --- /dev/null +++ b/charts/opea/common/tgi/templates/service.yaml @@ -0,0 +1,18 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: {{ include "tgi.fullname" . }} + labels: + {{- include "tgi.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: 80 + targetPort: {{ .Values.port }} + protocol: TCP + name: tgi + selector: + {{- include "tgi.selectorLabels" . 
| nindent 4 }} diff --git a/charts/opea/common/tgi/templates/servicemonitor.yaml b/charts/opea/common/tgi/templates/servicemonitor.yaml new file mode 100644 index 0000000..fdb1159 --- /dev/null +++ b/charts/opea/common/tgi/templates/servicemonitor.yaml @@ -0,0 +1,24 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +# Dashboard for the exposed TGI metrics: +# - https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/ +# Metric descriptions: +# - https://github.com/huggingface/text-generation-inference/discussions/1127#discussioncomment-7240527 + +{{- if .Values.horizontalPodAutoscaler.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "tgi.fullname" . }} + labels: + release: {{ .Values.global.prometheusRelease }} +spec: + selector: + matchLabels: + {{- include "tgi.selectorLabels" . | nindent 6 }} + endpoints: + - interval: 4s + port: tgi + scheme: http +{{- end }} diff --git a/charts/opea/common/tgi/templates/tests/test-pod.yaml b/charts/opea/common/tgi/templates/tests/test-pod.yaml new file mode 100644 index 0000000..948f238 --- /dev/null +++ b/charts/opea/common/tgi/templates/tests/test-pod.yaml @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "tgi.fullname" . }}-testpod" + labels: + {{- include "tgi.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test + #"helm.sh/hook-delete-policy": "hook-succeeded, hook-failure" +spec: + containers: + - name: curl + image: python:3.10.14 + command: ['bash', '-c'] + args: + - | + max_retry=20; + for ((i=1; i<=max_retry; i++)); do + curl http://{{ include "tgi.fullname" . }}/generate -sS --fail-with-body \ + -X POST \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \ + -H 'Content-Type: application/json' && break; + curlcode=$? 
+ if [[ $curlcode -eq 7 ]]; then sleep 10; else echo "curl failed with code $curlcode"; exit 1; fi; + done; + if [ $i -gt $max_retry ]; then echo "test failed with maximum retry"; exit 1; fi + restartPolicy: Never diff --git a/charts/opea/common/tgi/values.yaml b/charts/opea/common/tgi/values.yaml new file mode 100644 index 0000000..805df10 --- /dev/null +++ b/charts/opea/common/tgi/values.yaml @@ -0,0 +1,138 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Default values for tgi. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +replicaCount: 1 + +# Enabling HPA will: +# - Ignore above replica count, as it will be controlled by HPA +# - Add example HPA scaling rules with thresholds suitable for Xeon deployments +# - Require custom metrics ConfigMap available in the main application chart +horizontalPodAutoscaler: + maxReplicas: 4 + enabled: false + +port: 2080 +shmSize: 1Gi + +# Set extraCmdArgs if you need to pass additional parameters to TGI for performance +# Refer to https://huggingface.co/docs/text-generation-inference/en/reference/launcher for more options. +# extraCmdArgs: ["--dtype","bfloat16"] + +image: + repository: ghcr.io/huggingface/text-generation-inference + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "2.2.0" + +# empty for CPU +accelDevice: "" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +podAnnotations: {} + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + runAsNonRoot: true + runAsUser: 1000 + capabilities: + drop: + - ALL + seccompProfile: + type: RuntimeDefault + +service: + type: ClusterIP + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. 
This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. + # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +# Use TCP probe instead of HTTP due to bug #483 +# https://github.com/opea-project/GenAIExamples/issues/483 +livenessProbe: + tcpSocket: + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 24 +readinessProbe: + tcpSocket: + port: http + initialDelaySeconds: 5 + periodSeconds: 5 +startupProbe: + tcpSocket: + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 120 +#livenessProbe: +# httpGet: +# path: /health +# port: http +# initialDelaySeconds: 5 +# periodSeconds: 5 +# failureThreshold: 24 +#readinessProbe: +# httpGet: +# path: /health +# port: http +# initialDelaySeconds: 5 +# periodSeconds: 5 +#startupProbe: +# httpGet: +# path: /health +# port: http +# initialDelaySeconds: 5 +# periodSeconds: 5 +# failureThreshold: 120 + +nodeSelector: {} + +tolerations: [] + +affinity: {} + +LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + +MAX_INPUT_LENGTH: "" +MAX_TOTAL_TOKENS: "" +CUDA_GRAPHS: "0" + +global: + http_proxy: "" + https_proxy: "" + no_proxy: "" + HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" + + # Choose where to save your downloaded models + # Set modelUseHostPath for local directory, this is good for one node test. Example: + # modelUseHostPath: /mnt/opea-models + # Set modelUsePVC for PersistentVolumeClaim(PVC), which is suitable for multinode deployment. Example: + # modelUsePVC: model-volume + # You can only set one of the following var, the behavior is not defined is both are set. + # By default, both var are set to empty, the model will be downloaded and saved to a tmp volume. 
+ modelUseHostPath: "" + modelUsePVC: "" + + # Prometheus Helm installation info for serviceMonitor + prometheusRelease: prometheus-stack