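update: remove `@torch.inference_mode()` from forward_decode, forward_extend, and forward_extend_multi_modal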

sgl-project · Sep 23, 2024 · 82c2fd0 · 82c2fd0
1 parent 0c5d609
commit 82c2fd0
Showing 1 changed file with 0 additions and 3 deletions.
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
@@ -467,7 +467,6 @@ def init_cuda_graphs(self):
         logger.info("Capture cuda graph begin. This can take up to several minutes.")
         self.cuda_graph_runner = CudaGraphRunner(self)
 
-    @torch.inference_mode()
     def forward_decode(self, batch: ScheduleBatch):
         if self.server_args.lora_paths is not None:
             self.lora_manager.prepare_lora_batch(batch)
@@ -481,7 +480,6 @@ def forward_decode(self, batch: ScheduleBatch):
             batch.input_ids, input_metadata.positions, input_metadata
         )
 
-    @torch.inference_mode()
     def forward_extend(self, batch: ScheduleBatch):
         input_metadata = InputMetadata.from_schedule_batch(self, batch)
         if self.server_args.lora_paths is not None:
@@ -500,7 +498,6 @@ def forward_extend(self, batch: ScheduleBatch):
                 get_embedding=True,
             )
 
-    @torch.inference_mode()
     def forward_extend_multi_modal(self, batch: ScheduleBatch):
         input_metadata = InputMetadata.from_schedule_batch(self, batch)
         return self.model.forward(