From 4aa5dd2c5f1e386e8bf7d9c6309dc414e2fded7e Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 11 Mar 2024 05:49:27 -0700 Subject: [PATCH] Update version to v0.1.13 (#280) --- docs/benchmark_results.md | 3 +-- docs/flashinfer.md | 9 +-------- docs/test_process.md | 20 +++++++++++++++++++ python/pyproject.toml | 2 +- python/sglang/__init__.py | 2 +- python/sglang/srt/layers/radix_attention.py | 2 +- python/sglang/srt/layers/token_attention.py | 2 +- .../srt/managers/router/model_runner.py | 2 +- test/srt/model/test_llama_extend.py | 6 +++--- test/srt/model/test_llama_low_api.py | 3 ++- test/srt/model/test_llava_low_api.py | 5 +++-- 11 files changed, 35 insertions(+), 21 deletions(-) diff --git a/docs/benchmark_results.md b/docs/benchmark_results.md index 4259821134..519dfec3fc 100644 --- a/docs/benchmark_results.md +++ b/docs/benchmark_results.md @@ -11,8 +11,7 @@ We tested our system on the following common LLM workloads and reported the achi - **[DSPy RAG](https://github.com/stanfordnlp/dspy)**: A retrieval-augmented generation pipeline in the DSPy tutorial. - **[LLaVA Bench](https://github.com/haotian-liu/LLaVA)**: Running LLaVA v1.5, a vision language model on the LLaVA-in-the-wild benchmark. -We tested both Llama-7B on one NVIDIA A10G GPU (24GB) and Mixtral-8x7B on 8 NVIDIA A10G GPUs with tensor parallelism, using FP16 precision. We used vllm v0.2.5, guidance v0.1.8, and Hugging Face TGI v1.3.0 as baseline systems. - +We tested both Llama-7B on one NVIDIA A10G GPU (24GB) and Mixtral-8x7B on 8 NVIDIA A10G GPUs with tensor parallelism, using FP16 precision. We used vllm v0.2.5, guidance v0.1.8, Hugging Face TGI v1.3.0, and SGLang v0.1.5. - Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1 ![llama_7b](../assets/llama_7b.jpg) diff --git a/docs/flashinfer.md b/docs/flashinfer.md index 2f1fd2dc1e..7acd083020 100644 --- a/docs/flashinfer.md +++ b/docs/flashinfer.md @@ -5,14 +5,7 @@ It can be used in SGLang runtime to accelerate attention computation. ### Install flashinfer -You can install flashinfer via pip as follows for CUDA 12.1. - -```bash -pip install flashinfer -i https://flashinfer.ai/whl/cu121/ -``` - -You can look for other CUDA versions in https://github.com/flashinfer-ai/flashinfer?tab=readme-ov-file#installation. If there is no desire version for your environment, -please build it from source (the compilation takes a long time). +See https://docs.flashinfer.ai/installation.html. ### Run a Server With Flashinfer Mode diff --git a/docs/test_process.md b/docs/test_process.md index fff46fd143..fcb03ad6fa 100644 --- a/docs/test_process.md +++ b/docs/test_process.md @@ -37,6 +37,23 @@ python3 bench_sglang.py --nsub 3 # Average accuracy: 0.413 ``` +#### GSM-8K +``` +cd benchmark/gsm8k +``` +Follow README.md to download the data. + +``` +python3 bench_sglang.py --num-q 200 + +# Expected performance on A10G +# Latency: 32.103 +# Accuracy: 0.250 +``` + +#### More +Please also test `benchmark/hellaswag`, `benchmark/latency_throughput`. 
+ ### More Models #### LLaVA @@ -48,6 +65,9 @@ python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenize ``` cd benchmark/llava_bench python3 bench_sglang.py + +# Expected performance on A10G +# Latency: 50.031 ``` ## SGLang Unit Tests diff --git a/python/pyproject.toml b/python/pyproject.toml index caf41a2d80..35d82a2d6f 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "sglang" -version = "0.1.12" +version = "0.1.13" description = "A structured generation langauge for LLMs." readme = "README.md" requires-python = ">=3.8" diff --git a/python/sglang/__init__.py b/python/sglang/__init__.py index 1f3304756e..ec06cd3ea6 100644 --- a/python/sglang/__init__.py +++ b/python/sglang/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.1.12" +__version__ = "0.1.13" from sglang.api import * from sglang.global_config import global_config diff --git a/python/sglang/srt/layers/radix_attention.py b/python/sglang/srt/layers/radix_attention.py index 464327eedd..5adc31d3ef 100644 --- a/python/sglang/srt/layers/radix_attention.py +++ b/python/sglang/srt/layers/radix_attention.py @@ -17,7 +17,7 @@ def __init__(self, num_heads, head_dim, scaling, num_kv_heads, layer_id): from sglang.srt.managers.router.model_runner import global_server_args_dict - if global_server_args_dict["enable_flashinfer"]: + if global_server_args_dict.get("enable_flashinfer", False): self.prefill_forward = self.prefill_forward_flashinfer self.extend_forward = self.prefill_forward_flashinfer self.decode_forward = self.decode_forward_flashinfer diff --git a/python/sglang/srt/layers/token_attention.py b/python/sglang/srt/layers/token_attention.py index a4a57fbe79..b0dac1759e 100644 --- a/python/sglang/srt/layers/token_attention.py +++ b/python/sglang/srt/layers/token_attention.py @@ -7,7 +7,7 @@ from sglang.srt.managers.router.model_runner import global_server_args_dict from sglang.srt.utils import wrap_kernel_launcher -if global_server_args_dict["attention_reduce_in_fp32"]: +if global_server_args_dict.get("attention_reduce_in_fp32", False): REDUCE_TRITON_TYPE = tl.float32 REDUCE_TORCH_TYPE = torch.float32 else: diff --git a/python/sglang/srt/managers/router/model_runner.py b/python/sglang/srt/managers/router/model_runner.py index 4ec7946c6a..1f07286ed8 100644 --- a/python/sglang/srt/managers/router/model_runner.py +++ b/python/sglang/srt/managers/router/model_runner.py @@ -222,7 +222,7 @@ def create( if forward_mode == ForwardMode.EXTEND: ret.init_extend_args() - if global_server_args_dict["enable_flashinfer"]: + if global_server_args_dict.get("enable_flashinfer", False): ret.init_flashinfer_args(tp_size) return ret diff --git a/test/srt/model/test_llama_extend.py b/test/srt/model/test_llama_extend.py index ae8df9d054..2931dfa5dc 100644 --- a/test/srt/model/test_llama_extend.py +++ b/test/srt/model/test_llama_extend.py @@ -28,8 +28,8 @@ def test_generate_worker(model_path, tp_rank, tp_size): reqs = [] for i in range(len(prompts)): - req = Req(i, None, None) - req.input_ids = tokenizer.encode(prompts[i])[:cut_num] + input_ids = tokenizer.encode(prompts[i])[:cut_num] + req = Req(i, prompts[i], input_ids) req.sampling_params = sampling_params reqs.append(req) @@ -60,7 +60,7 @@ def test_generate_worker(model_path, tp_rank, tp_size): # Decode for i in range(6): batch.prepare_for_decode(next_token_ids.cpu().numpy()) - logits = model.forward(batch, ForwardMode.DECODE) + logits, _ = model.forward(batch, ForwardMode.DECODE) next_token_ids, 
next_token_probs = batch.sample(logits) print( diff --git a/test/srt/model/test_llama_low_api.py b/test/srt/model/test_llama_low_api.py index e556ec7ebf..a8917ee4a8 100644 --- a/test/srt/model/test_llama_low_api.py +++ b/test/srt/model/test_llama_low_api.py @@ -71,7 +71,7 @@ def decode(print_logits): ) = model.token_to_kv_pool.alloc_contiguous(batch_size) model.req_to_token_pool.req_to_token[req_pool_indices, seq_lens] = out_cache_loc seq_lens.add_(1) - logits = model.forward_decode( + logits, _ = model.forward_decode( torch.from_numpy(predict_ids).cuda().reshape(-1), req_pool_indices, seq_lens, @@ -80,6 +80,7 @@ def decode(print_logits): None, out_cache_cont_start, out_cache_cont_end, + False, ) prob_out = torch.softmax(logits, dim=-1) predict_ids = torch.argmax(prob_out, dim=1, keepdim=True) diff --git a/test/srt/model/test_llava_low_api.py b/test/srt/model/test_llava_low_api.py index f6a77a74dd..322ba4855f 100644 --- a/test/srt/model/test_llava_low_api.py +++ b/test/srt/model/test_llava_low_api.py @@ -63,7 +63,7 @@ def decode(step, model, tp_rank, batch_size, predict_ids, params, print_logits): ) = model.token_to_kv_pool.alloc_contiguous(batch_size) model.req_to_token_pool.req_to_token[req_pool_indices, seq_lens] = out_cache_loc seq_lens.add_(1) - logits = model.forward_decode( + logits, _ = model.forward_decode( torch.from_numpy(predict_ids).cuda().reshape(-1), req_pool_indices, seq_lens, @@ -72,6 +72,7 @@ def decode(step, model, tp_rank, batch_size, predict_ids, params, print_logits): None, out_cache_cont_start, out_cache_cont_end, + False, ) prob_out = torch.softmax(logits, dim=-1) predict_ids = torch.argmax(prob_out, dim=1, keepdim=True) @@ -92,7 +93,7 @@ def test_generate_worker( # Prepare data prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nDescribe this picture ASSISTANT:" - image_path = "/home/ubuntu/sglang/test/lang/image.png" + image_path = "/home/ubuntu/sglang/test/lang/test_image.png" image = load_image(image_path) processor = get_processor("llava-hf/llava-1.5-7b-hf")
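
Note on the `global_server_args_dict` changes: several hunks above replace direct indexing such as `global_server_args_dict["enable_flashinfer"]` with `dict.get(..., False)`. The sketch below is a minimal, self-contained illustration of why that pattern is safer when a module reads the dict at import time. It is not code from this repository: the empty-dict scenario is an assumption for illustration only, while the key names and the `False` defaults are taken from the hunks above.

```python
# Minimal sketch (not repository code): why dict.get() with a default is
# safer than direct indexing for a module-level configuration dict.
import torch

# Assumption for illustration: the shared args dict has not been populated
# yet, e.g. the module is imported before launch-time configuration runs.
global_server_args_dict = {}

# Direct indexing would raise KeyError here:
#   global_server_args_dict["attention_reduce_in_fp32"]
# .get() with a default keeps import-time code working.
if global_server_args_dict.get("attention_reduce_in_fp32", False):
    REDUCE_TORCH_TYPE = torch.float32
else:
    REDUCE_TORCH_TYPE = torch.float16

enable_flashinfer = global_server_args_dict.get("enable_flashinfer", False)
print(REDUCE_TORCH_TYPE, enable_flashinfer)  # torch.float16 False
```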