From 4aa5dd2c5f1e386e8bf7d9c6309dc414e2fded7e Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 11 Mar 2024 05:49:27 -0700 Subject: [PATCH] Update version to v0.1.13 (#280) --- docs/benchmark_results.md | 3 +-- docs/flashinfer.md | 9 +-------- docs/test_process.md | 20 +++++++++++++++++++ python/pyproject.toml | 2 +- python/sglang/__init__.py | 2 +- python/sglang/srt/layers/radix_attention.py | 2 +- python/sglang/srt/layers/token_attention.py | 2 +- .../srt/managers/router/model_runner.py | 2 +- test/srt/model/test_llama_extend.py | 6 +++--- test/srt/model/test_llama_low_api.py | 3 ++- test/srt/model/test_llava_low_api.py | 5 +++-- 11 files changed, 35 insertions(+), 21 deletions(-) diff --git a/docs/benchmark_results.md b/docs/benchmark_results.md index 4259821134..519dfec3fc 100644 --- a/docs/benchmark_results.md +++ b/docs/benchmark_results.md @@ -11,8 +11,7 @@ We tested our system on the following common LLM workloads and reported the achi - **[DSPy RAG](https://github.com/stanfordnlp/dspy)**: A retrieval-augmented generation pipeline in the DSPy tutorial. - **[LLaVA Bench](https://github.com/haotian-liu/LLaVA)**: Running LLaVA v1.5, a vision language model on the LLaVA-in-the-wild benchmark. -We tested both Llama-7B on one NVIDIA A10G GPU (24GB) and Mixtral-8x7B on 8 NVIDIA A10G GPUs with tensor parallelism, using FP16 precision. We used vllm v0.2.5, guidance v0.1.8, and Hugging Face TGI v1.3.0 as baseline systems. - +We tested both Llama-7B on one NVIDIA A10G GPU (24GB) and Mixtral-8x7B on 8 NVIDIA A10G GPUs with tensor parallelism, using FP16 precision. We used vllm v0.2.5, guidance v0.1.8, Hugging Face TGI v1.3.0, and SGLang v0.1.5. - Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1 ![llama_7b](../assets/llama_7b.jpg) diff --git a/docs/flashinfer.md b/docs/flashinfer.md index 2f1fd2dc1e..7acd083020 100644 --- a/docs/flashinfer.md +++ b/docs/flashinfer.md @@ -5,14 +5,7 @@ It can be used in SGLang runtime to accelerate attention computation. ### Install flashinfer -You can install flashinfer via pip as follows for CUDA 12.1. - -```bash -pip install flashinfer -i https://flashinfer.ai/whl/cu121/ -``` - -You can look for other CUDA versions in https://github.com/flashinfer-ai/flashinfer?tab=readme-ov-file#installation. If there is no desire version for your environment, -please build it from source (the compilation takes a long time). +See https://docs.flashinfer.ai/installation.html. ### Run a Server With Flashinfer Mode diff --git a/docs/test_process.md b/docs/test_process.md index fff46fd143..fcb03ad6fa 100644 --- a/docs/test_process.md +++ b/docs/test_process.md @@ -37,6 +37,23 @@ python3 bench_sglang.py --nsub 3 # Average accuracy: 0.413 ``` +#### GSM-8K +``` +cd benchmark/gsm8k +``` +Follow README.md to download the data. + +``` +python3 bench_sglang.py --num-q 200 + +# Expected performance on A10G +# Latency: 32.103 +# Accuracy: 0.250 +``` + +#### More +Please also test `benchmark/hellaswag`, `benchmark/latency_throughput`. 
+ ### More Models #### LLaVA @@ -48,6 +65,9 @@ python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenize ``` cd benchmark/llava_bench python3 bench_sglang.py + +# Expected performance on A10G +# Latency: 50.031 ``` ## SGLang Unit Tests diff --git a/python/pyproject.toml b/python/pyproject.toml index caf41a2d80..35d82a2d6f 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "sglang" -version = "0.1.12" +version = "0.1.13" description = "A structured generation langauge for LLMs." readme = "README.md" requires-python = ">=3.8" diff --git a/python/sglang/__init__.py b/python/sglang/__init__.py index 1f3304756e..ec06cd3ea6 100644 --- a/python/sglang/__init__.py +++ b/python/sglang/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.1.12" +__version__ = "0.1.13" from sglang.api import * from sglang.global_config import global_config diff --git a/python/sglang/srt/layers/radix_attention.py b/python/sglang/srt/layers/radix_attention.py index 464327eedd..5adc31d3ef 100644 --- a/python/sglang/srt/layers/radix_attention.py +++ b/python/sglang/srt/layers/radix_attention.py @@ -17,7 +17,7 @@ def __init__(self, num_heads, head_dim, scaling, num_kv_heads, layer_id): from sglang.srt.managers.router.model_runner import global_server_args_dict - if global_server_args_dict["enable_flashinfer"]: + if global_server_args_dict.get("enable_flashinfer", False): self.prefill_forward = self.prefill_forward_flashinfer self.extend_forward = self.prefill_forward_flashinfer self.decode_forward = self.decode_forward_flashinfer diff --git a/python/sglang/srt/layers/token_attention.py b/python/sglang/srt/layers/token_attention.py index a4a57fbe79..b0dac1759e 100644 --- a/python/sglang/srt/layers/token_attention.py +++ b/python/sglang/srt/layers/token_attention.py @@ -7,7 +7,7 @@ from sglang.srt.managers.router.model_runner import global_server_args_dict from sglang.srt.utils import wrap_kernel_launcher -if global_server_args_dict["attention_reduce_in_fp32"]: +if global_server_args_dict.get("attention_reduce_in_fp32", False): REDUCE_TRITON_TYPE = tl.float32 REDUCE_TORCH_TYPE = torch.float32 else: diff --git a/python/sglang/srt/managers/router/model_runner.py b/python/sglang/srt/managers/router/model_runner.py index 4ec7946c6a..1f07286ed8 100644 --- a/python/sglang/srt/managers/router/model_runner.py +++ b/python/sglang/srt/managers/router/model_runner.py @@ -222,7 +222,7 @@ def create( if forward_mode == ForwardMode.EXTEND: ret.init_extend_args() - if global_server_args_dict["enable_flashinfer"]: + if global_server_args_dict.get("enable_flashinfer", False): ret.init_flashinfer_args(tp_size) return ret diff --git a/test/srt/model/test_llama_extend.py b/test/srt/model/test_llama_extend.py index ae8df9d054..2931dfa5dc 100644 --- a/test/srt/model/test_llama_extend.py +++ b/test/srt/model/test_llama_extend.py @@ -28,8 +28,8 @@ def test_generate_worker(model_path, tp_rank, tp_size): reqs = [] for i in range(len(prompts)): - req = Req(i, None, None) - req.input_ids = tokenizer.encode(prompts[i])[:cut_num] + input_ids = tokenizer.encode(prompts[i])[:cut_num] + req = Req(i, prompts[i], input_ids) req.sampling_params = sampling_params reqs.append(req) @@ -60,7 +60,7 @@ def test_generate_worker(model_path, tp_rank, tp_size): # Decode for i in range(6): batch.prepare_for_decode(next_token_ids.cpu().numpy()) - logits = model.forward(batch, ForwardMode.DECODE) + logits, _ = model.forward(batch, ForwardMode.DECODE) next_token_ids, 
next_token_probs = batch.sample(logits) print( diff --git a/test/srt/model/test_llama_low_api.py b/test/srt/model/test_llama_low_api.py index e556ec7ebf..a8917ee4a8 100644 --- a/test/srt/model/test_llama_low_api.py +++ b/test/srt/model/test_llama_low_api.py @@ -71,7 +71,7 @@ def decode(print_logits): ) = model.token_to_kv_pool.alloc_contiguous(batch_size) model.req_to_token_pool.req_to_token[req_pool_indices, seq_lens] = out_cache_loc seq_lens.add_(1) - logits = model.forward_decode( + logits, _ = model.forward_decode( torch.from_numpy(predict_ids).cuda().reshape(-1), req_pool_indices, seq_lens, @@ -80,6 +80,7 @@ def decode(print_logits): None, out_cache_cont_start, out_cache_cont_end, + False, ) prob_out = torch.softmax(logits, dim=-1) predict_ids = torch.argmax(prob_out, dim=1, keepdim=True) diff --git a/test/srt/model/test_llava_low_api.py b/test/srt/model/test_llava_low_api.py index f6a77a74dd..322ba4855f 100644 --- a/test/srt/model/test_llava_low_api.py +++ b/test/srt/model/test_llava_low_api.py @@ -63,7 +63,7 @@ def decode(step, model, tp_rank, batch_size, predict_ids, params, print_logits): ) = model.token_to_kv_pool.alloc_contiguous(batch_size) model.req_to_token_pool.req_to_token[req_pool_indices, seq_lens] = out_cache_loc seq_lens.add_(1) - logits = model.forward_decode( + logits, _ = model.forward_decode( torch.from_numpy(predict_ids).cuda().reshape(-1), req_pool_indices, seq_lens, @@ -72,6 +72,7 @@ def decode(step, model, tp_rank, batch_size, predict_ids, params, print_logits): None, out_cache_cont_start, out_cache_cont_end, + False, ) prob_out = torch.softmax(logits, dim=-1) predict_ids = torch.argmax(prob_out, dim=1, keepdim=True) @@ -92,7 +93,7 @@ def test_generate_worker( # Prepare data prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nDescribe this picture ASSISTANT:" - image_path = "/home/ubuntu/sglang/test/lang/image.png" + image_path = "/home/ubuntu/sglang/test/lang/test_image.png" image = load_image(image_path) processor = get_processor("llava-hf/llava-1.5-7b-hf")
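
Note on the `global_server_args_dict` changes: several hunks above replace direct indexing such as `global_server_args_dict["enable_flashinfer"]` with `dict.get(..., False)`. The sketch below is a minimal, self-contained illustration of why that pattern is safer when a module reads the dict at import time. It is not code from this repository: the empty-dict scenario is an assumption for illustration only, while the key names and the `False` defaults are taken from the hunks above.

```python
# Minimal sketch (not repository code): why dict.get() with a default is
# safer than direct indexing for a module-level configuration dict.
import torch

# Assumption for illustration: the shared args dict has not been populated
# yet, e.g. the module is imported before launch-time configuration runs.
global_server_args_dict = {}

# Direct indexing would raise KeyError here:
#   global_server_args_dict["attention_reduce_in_fp32"]
# .get() with a default keeps import-time code working.
if global_server_args_dict.get("attention_reduce_in_fp32", False):
    REDUCE_TORCH_TYPE = torch.float32
else:
    REDUCE_TORCH_TYPE = torch.float16

enable_flashinfer = global_server_args_dict.get("enable_flashinfer", False)
print(REDUCE_TORCH_TYPE, enable_flashinfer)  # torch.float16 False
```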