Merge branch 'main' into liangan1/xpu-support
liangan1 committed Sep 24, 2024
2 parents 5749dbb + 2854a5e commit ef880b7
Showing 30 changed files with 451 additions and 160 deletions.
37 changes: 36 additions & 1 deletion .github/workflows/pr-test.yml
@@ -228,11 +228,46 @@ jobs:
cd human-eval
pip install -e .
- name: Evaluate Accuracy
- name: Evaluate Accuracy (TP=2)
timeout-minutes: 20
run: |
cd test/srt
python3 test_moe_eval_accuracy_large.py
- name: Evaluate MLA Accuracy (TP=2)
timeout-minutes: 10
run: |
cd test/srt
python3 test_mla.py
python3 test_mla_fp8.py
- name: Evaluate Data Parallelism Accuracy (TP=2)
timeout-minutes: 10
run: |
cd test/srt
python3 test_data_parallelism.py
accuracy-test-1-gpu-amd:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
runs-on: 1-gpu-runner-amd
steps:
- name: Checkout code
uses: actions/checkout@v3

- name: Install dependencies
run: |
pip install --upgrade pip
pip install -e "python[all]" --no-deps
git clone https://github.com/merrymercy/human-eval.git
cd human-eval
pip install -e .
- name: Evaluate Accuracy
timeout-minutes: 20
run: |
cd test/srt
python3 test_eval_accuracy_large.py
finish:
needs: [
10 changes: 5 additions & 5 deletions README.md
@@ -60,7 +60,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
### Method 2: From source
```
# Use the last release branch
git clone -b v0.3.1.post2 https://github.com/sgl-project/sglang.git
git clone -b v0.3.1.post3 https://github.com/sgl-project/sglang.git
cd sglang
pip install --upgrade pip
@@ -90,9 +90,9 @@ docker run --gpus all \
<summary>More</summary>

> This method is recommended if you plan to serve it as a service.
> A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
> A better approach is to use the [k8s-sglang-service.yaml](docker/k8s-sglang-service.yaml).
1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
1. Copy the [compose.yml](docker/compose.yaml) to your local machine
2. Execute the command `docker compose up -d` in your terminal.
</details>

@@ -271,7 +271,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
- gte-Qwen2
- `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`

Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
Instructions for supporting a new model are [here](docs/en/model_support.md).

#### Use Models From ModelScope
<details>
@@ -566,7 +566,7 @@ def chat_example(s):
Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).

## Roadmap
[Development Roadmap (2024 Q3)](https://github.com/sgl-project/sglang/issues/634)
[Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)

## Citation And Acknowledgment
Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
3 changes: 3 additions & 0 deletions docker/Dockerfile
@@ -18,6 +18,9 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& rm -rf /var/lib/apt/lists/* \
&& apt clean

# For openbmb/MiniCPM models
RUN pip3 install datamodel_code_generator

WORKDIR /sgl-workspace

RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six \
12 changes: 6 additions & 6 deletions docs/en/backend.md
@@ -19,7 +19,7 @@ curl http://localhost:30000/generate \
}
}'
```
Learn more about the argument format [here](docs/en/sampling_params.md).
Learn more about the argument format [here](https://sglang.readthedocs.io/en/latest/sampling_params.html).
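The same request can also be issued from Python. The snippet below is a minimal sketch that mirrors the curl call above; the server address, prompt, and parameter values are illustrative assumptions.

```
import requests

# Minimal sketch: POST to the native /generate endpoint with sampling
# parameters, mirroring the curl example above.
response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "Once upon a time,",
        "sampling_params": {"max_new_tokens": 16, "temperature": 0},
    },
)
print(response.json())
```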

### OpenAI Compatible API
In addition, the server supports OpenAI-compatible APIs.
@@ -73,15 +73,15 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
```
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --mem-fraction-static 0.7
```
- See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
- See [hyperparameter tuning](https://sglang.readthedocs.io/en/latest/hyperparameter_tuning.html) on tuning hyperparameters for better performance.
- If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
```
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
```
- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
- To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
- To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
- If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](https://sglang.readthedocs.io/en/latest/custom_chat_template.html).
- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
```
# Node 0
@@ -102,11 +102,11 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
- `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
- `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
- Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
- Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](https://github.com/sgl-project/sglang/blob/main/test/srt/test_vision_openai_server.py)
- LLaVA 1.5 / 1.6 / NeXT
- `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
- `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 --chat-template=chatml-llava`
- Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
- Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](https://github.com/sgl-project/sglang/blob/main/test/srt/test_vision_openai_server.py)
- Yi-VL
- StableLM
- Command-R
@@ -122,7 +122,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
- gte-Qwen2
- `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`

Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
Instructions for supporting a new model are [here](https://sglang.readthedocs.io/en/latest/model_support.html).

#### Use Models From ModelScope
<details>
8 changes: 4 additions & 4 deletions docs/en/frontend.md
@@ -70,7 +70,7 @@ print(state["answer_1"])
#### More Examples

Anthropic and VertexAI (Gemini) models are also supported.
You can find more examples at [examples/quick_start](examples/frontend_language/quick_start).
You can find more examples at [examples/quick_start](https://github.com/sgl-project/sglang/tree/main/examples/frontend_language/quick_start).

### Language Feature
To begin with, import sglang.
@@ -83,7 +83,7 @@ You can implement your prompt flow in a function decorated by `sgl.function`.
You can then invoke the function with `run` or `run_batch`.
The system will manage the state, chat template, parallelism and batching for you.

The complete code for the examples below can be found at [readme_examples.py](examples/frontend_language/usage/readme_examples.py)
The complete code for the examples below can be found at [readme_examples.py](https://github.com/sgl-project/sglang/blob/main/examples/frontend_language/usage/readme_examples.py)

#### Control Flow
You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
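As a rough sketch of what this looks like in practice, here is a small tool-use flow in the spirit of the repository's quick-start examples; the prompt wording and tool names are illustrative assumptions.

```
import sglang as sgl

@sgl.function
def tool_use(s, question):
    # Ask the model to choose a tool, then branch on its choice with plain Python.
    s += "To answer this question: " + question + ". "
    s += "I need to use a " + sgl.gen("tool", choices=["calculator", "search engine"]) + ". "
    if s["tool"] == "calculator":
        s += "The math expression is: " + sgl.gen("expression")
    elif s["tool"] == "search engine":
        s += "The key word to search is: " + sgl.gen("word")
```
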
@@ -132,7 +132,7 @@ def image_qa(s, image_file, question):
s += sgl.assistant(sgl.gen("answer", max_tokens=256))
```

See also [srt_example_llava.py](examples/frontend_language/quick_start/local_example_llava_next.py).
See also [local_example_llava_next.py](https://github.com/sgl-project/sglang/blob/main/examples/frontend_language/quick_start/local_example_llava_next.py).

#### Constrained Decoding
Use `regex` to specify a regular expression as a decoding constraint.
@@ -176,7 +176,7 @@ def character_gen(s, name):
s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
```

See also [json_decode.py](examples/frontend_language/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
See also [json_decode.py](https://github.com/sgl-project/sglang/blob/main/examples/frontend_language/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.

#### Batching
Use `run_batch` to run a batch of requests with continuous batching.
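A minimal sketch of batched execution follows; the questions and `progress_bar` setting are illustrative.

```
import sglang as sgl

@sgl.function
def text_qa(s, question):
    s += "Q: " + question + "\n"
    s += "A:" + sgl.gen("answer", stop="\n")

# run_batch submits all prompts at once; the runtime batches them continuously.
states = text_qa.run_batch(
    [
        {"question": "What is the capital of the United Kingdom?"},
        {"question": "What is the capital of France?"},
        {"question": "What is the capital of Japan?"},
    ],
    progress_bar=True,
)
print(states[0]["answer"])
```
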
3 changes: 1 addition & 2 deletions docs/en/index.rst
@@ -8,8 +8,7 @@ The core features include:
- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
- **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
- **Active Community**: SGLang is open-source and backed by an active community with industry adoption, welcoming contributions to improve LLM and VLM serving.

- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.

.. toctree::
:maxdepth: 1
33 changes: 26 additions & 7 deletions docs/en/model_support.md
@@ -1,16 +1,35 @@
# How to Support a New Model

To support a new model in SGLang, you only need to add a single file under [SGLang Models Directory](https://github.com/sgl-project/sglang/tree/main/python/sglang/srt/models). You can learn from existing model implementations and create new files for the new models. Most models are based on the transformer architecture, making them very similar.
To support a new model in SGLang, you only need to add a single file under [SGLang Models Directory](https://github.com/sgl-project/sglang/tree/main/python/sglang/srt/models).
You can learn from existing model implementations and create new files for the new models.
For most models, you should be able to find a similar model to start with (e.g., starting from Llama).

Another valuable resource is the [vLLM Models Directory](https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models). vLLM has extensive coverage of models, and SGLang has reused vLLM for most parts of the model implementations. This similarity makes it easy to port many models from vLLM to SGLang.
## Test the correctness

To port a model from vLLM to SGLang, you can compare these two files [SGLang LLaMA Implementation](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/llama2.py) and [vLLM LLaMA Implementation](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llama.py). This comparison will help you understand how to convert a model implementation from vLLM to SGLang. The major difference is the replacement of PagedAttention with RadixAttention. The other parts are almost identical. Specifically,
### Interactive debugging
For interactive debugging, you can compare the outputs of huggingface/transformers and SGLang.
The following two commands should give the same text output and very similar prefill logits.

- Get the reference output by `python3 scripts/playground/reference_hf.py --model [new model]`
- Get the SGLang output by `python3 -m sglang.bench_latency --correct --model [new model]`

### Add the model to the test suite
To make sure the new model is well maintained in the future, it is better to add it to the test suite.
You can add it to the `ALL_OTHER_MODELS` list in the [test_generation_models.py](https://github.com/sgl-project/sglang/blob/main/test/srt/models/test_generation_models.py) and run the following command to test it.

For example, if the model is Qwen/Qwen2-1.5B
```
ONLY_RUN=Qwen/Qwen2-1.5B python3 -m unittest test_generation_models.TestGenerationModels.test_others
```

## Port a model from vLLM to SGLang
Another valuable resource is the [vLLM Models Directory](https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models). vLLM has extensive coverage of models, and SGLang reuses vLLM's interface and some layers to implement the models. This similarity makes it easy to port many models from vLLM to SGLang.

To port a model from vLLM to SGLang, you can compare these two files [SGLang Llama Implementation](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/llama.py) and [vLLM Llama Implementation](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llama.py). This comparison will help you understand how to convert a model implementation from vLLM to SGLang. The major difference is the replacement of Attention with RadixAttention. The other parts are almost identical. Specifically,
- Replace vllm's `Attention` with `RadixAttention`. Note that you need to pass `layer_id` all the way to `RadixAttention`.
- Replace vllm's `LogitsProcessor` with SGLang's `LogitsProcessor`.
- Replace other vLLM layers with SGLang layers (e.g., `RMSNorm`, `SiluAndMul`).
- Remove `Sample`.
- Change `forward()` functions, and add `input_metadata`.
- Add `EntryClass` at the end.
- Test correctness by comparing the final logits and outputs of the two following commands:
- `python3 scripts/playground/reference_hf.py --model [new model]`
- `python3 -m sglang.bench_latency --model [new model] --correct --output-len 16 --trust-remote-code`
- Update [Supported Models](https://github.com/sgl-project/sglang/tree/main?tab=readme-ov-file#supported-models) at [README](../README.md).
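
To make the checklist above concrete, here is a rough skeleton of a ported decoder-only model. The import paths, constructor arguments, and helper names are assumptions for illustration; copy the real ones from the Llama implementation linked above when porting.

```
from torch import nn

# Assumed import paths -- mirror whatever llama.py actually uses.
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.radix_attention import RadixAttention


class MyAttention(nn.Module):
    def __init__(self, config, layer_id: int):
        super().__init__()
        head_dim = config.hidden_size // config.num_attention_heads
        # vLLM's Attention becomes RadixAttention; layer_id must be passed down.
        self.attn = RadixAttention(
            config.num_attention_heads,
            head_dim,
            head_dim**-0.5,
            num_kv_heads=config.num_key_value_heads,
            layer_id=layer_id,
        )

    def forward(self, positions, hidden_states, input_metadata):
        q, k, v = self.project_qkv(hidden_states)  # placeholder QKV projection
        return self.attn(q, k, v, input_metadata)


class MyModelForCausalLM(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.model = build_decoder_layers(config)  # placeholder decoder stack
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        # SGLang's LogitsProcessor replaces vLLM's LogitsProcessor/Sampler.
        self.logits_processor = LogitsProcessor(config)

    def forward(self, input_ids, positions, input_metadata):
        hidden_states = self.model(input_ids, positions, input_metadata)
        return self.logits_processor(
            input_ids, hidden_states, self.lm_head.weight, input_metadata
        )


# The module-level entry point SGLang looks for when loading the model.
EntryClass = MyModelForCausalLM
```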

5 changes: 4 additions & 1 deletion docs/en/setup_github_runner.md
@@ -8,7 +8,10 @@ You can mount a folder for the shared huggingface model weights cache. The comma

```
docker pull nvidia/cuda:12.1.1-devel-ubuntu22.04
# Nvidia
docker run --shm-size 64g -it -v /tmp/huggingface:/hf_home --gpus all nvidia/cuda:12.1.1-devel-ubuntu22.04 /bin/bash
# AMD
docker run --rm --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 64g -it -v /tmp/huggingface:/hf_home henryx/haisgl:sgl0.3.1.post3_vllm0.6.0_triton3.0.0_rocm6.2.1 /bin/bash
```

### Step 2: Configure the runner by `config.sh`
@@ -41,4 +44,4 @@ export CUDA_VISIBLE_DEVICES=0
- Run it forever
```
while true; do ./run.sh; echo "Restarting..."; sleep 2; done
```
```
34 changes: 34 additions & 0 deletions examples/runtime/openai_chat_with_response_prefill.py
@@ -0,0 +1,34 @@
"""
Usage:
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000
python openai_chat_with_response_prefill.py
"""

import openai
from openai import OpenAI

client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

response = client.chat.completions.create(
model="meta-llama/Meta-Llama-3.1-8B-Instruct",
messages=[
{"role": "system", "content": "You are a helpful AI assistant"},
{
"role": "user",
"content": """
Extract the name, size, price, and color from this product description as a JSON object:
<description>
The SmartHome Mini is a compact smart home assistant available in black or white for only $49.99. At just 5 inches wide, it lets you control lights, thermostats, and other connected devices via voice or app—no matter where you place it in your home. This affordable little hub brings convenient hands-free control to your smart devices.
</description>
""",
},
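        # Prefilling the assistant turn with "{\n" steers the model to continue
        # the JSON object directly instead of starting a fresh prose reply.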
{
"role": "assistant",
"content": "{\n",
},
],
temperature=0,
)

print(response.choices[0].message.content)
2 changes: 1 addition & 1 deletion python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "sglang"
version = "0.3.1.post2"
version = "0.3.1.post3"
description = "SGLang is yet another fast serving framework for large language models and vision language models."
readme = "README.md"
requires-python = ">=3.8"
4 changes: 2 additions & 2 deletions python/sglang/bench_latency.py
@@ -261,7 +261,7 @@ def correctness_test(

# Decode
output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
for _ in range(bench_args.output_len[0]):
for _ in range(bench_args.output_len[0] - 1):
next_token_ids, _ = decode(next_token_ids, batch, model_runner)
for i in range(len(reqs)):
output_ids[i].append(next_token_ids[i])
@@ -317,7 +317,7 @@ def latency_test_run_once(

# Decode
decode_latencies = []
for i in range(output_len):
for i in range(output_len - 1):
synchronize(device)
tic = time.time()
next_token_ids, _ = decode(next_token_ids, batch, model_runner)
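
The reasoning behind both `- 1` changes above, with assumed numbers: the prefill/extend step already emits the first output token, so the decode loop only has to produce the remaining ones.

```
# Illustrative only; output_len is an assumed value.
output_len = 16
tokens_from_prefill = 1                 # extend/prefill yields the first token
decode_iterations = output_len - tokens_from_prefill
assert decode_iterations == 15          # total generated tokens == output_len
```
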
5 changes: 3 additions & 2 deletions python/sglang/srt/layers/attention_backend.py
@@ -346,8 +346,9 @@ def __init__(self, model_runner: ModelRunner):

self.decode_attention_fwd = decode_attention_fwd
self.extend_attention_fwd = extend_attention_fwd
self.num_head = model_runner.model_config.num_attention_heads
self.model_runner = model_runner
self.num_head = (
model_runner.model_config.num_attention_heads // model_runner.tp_size
)

if global_server_args_dict.get("triton_attention_reduce_in_fp32", False):
self.reduce_dtype = torch.float32
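
The intent of this change, shown with assumed numbers (not taken from the diff): with tensor parallelism, attention heads are sharded across ranks, so the Triton backend should be sized for the per-rank head count rather than the global one.

```
# Illustrative arithmetic only; the head count and TP degree are assumptions.
num_attention_heads = 32   # hypothetical model config value
tp_size = 4                # hypothetical tensor-parallel degree
num_head_per_rank = num_attention_heads // tp_size
assert num_head_per_rank == 8   # each rank computes 8 of the 32 heads
```
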
8 changes: 3 additions & 5 deletions python/sglang/srt/managers/schedule_batch.py
@@ -433,7 +433,7 @@ def alloc_token_slots(self, num_tokens: int):
def prepare_for_extend(self, vocab_size: int):
self.forward_mode = ForwardMode.EXTEND

bs = self.batch_size()
bs = len(self.reqs)
reqs = self.reqs
input_ids = [r.fill_ids[len(r.prefix_indices) :] for r in reqs]
extend_num_tokens = sum(len(ids) for ids in input_ids)
@@ -513,7 +513,7 @@ def mix_with_running(self, running_batch: "ScheduleBatch"):
self.extend_logprob_start_lens_cpu.extend([0] * running_bs)

def check_decode_mem(self):
bs = self.batch_size()
bs = len(self.reqs)
if self.token_to_kv_pool.available_size() >= bs:
return True

@@ -684,14 +684,12 @@ def prepare_for_decode(self, input_ids=None):
r.output_ids[-1] if r.output_ids else r.origin_input_ids[-1]
for r in self.reqs
]
else:
self.sampling_info.penalizer_orchestrator.cumulate_input_tokens(input_ids)

self.input_ids = torch.tensor(input_ids, dtype=torch.int32, device=self.device)
self.seq_lens.add_(1)

# Alloc mem
bs = self.batch_size()
bs = len(self.reqs)
self.out_cache_loc = self.alloc_token_slots(bs)

self.req_to_token_pool.req_to_token[
1 change: 1 addition & 0 deletions python/sglang/srt/managers/tokenizer_manager.py
@@ -123,6 +123,7 @@ def __init__(
initializer=init_global_processor,
mp_context=mp.get_context("fork"),
initargs=(server_args,),
max_workers=os.environ.get("SGLANG_CPU_COUNT", os.cpu_count()),
)
else:
self.tokenizer = get_tokenizer(