Skip to content

Commit

Permalink
Revert "kernel: use tensor cores for flashinfer gqa kernels" (#1511)
Browse files: browse the repository at this point in the history
  • Loading branch information
Ying1123 committed Sep 25, 2024
1 parent 3c93187 commit f39a019
Showing 1 changed file with 3 additions and 11 deletions.
14 changes: 3 additions & 11 deletions python/sglang/srt/layers/attention_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,17 +86,9 @@ def __init__(self, model_runner: ModelRunner):
         super().__init__()
         self.model_runner = model_runner

-        local_num_qo_heads = (
-            model_runner.model_config.num_attention_heads // model_runner.tp_size
-        )
-        local_num_kv_heads = model_runner.model_config.get_num_kv_heads(
-            model_runner.tp_size
-        )
-        if (
-            not _grouped_size_compiled_for_decode_kernels(
-                local_num_qo_heads, local_num_kv_heads
-            )
-            or local_num_qo_heads // local_num_kv_heads > 4
+        if not _grouped_size_compiled_for_decode_kernels(
+            model_runner.model_config.num_attention_heads // model_runner.tp_size,
+            model_runner.model_config.get_num_kv_heads(model_runner.tp_size),
         ):
             self.decode_use_tensor_cores = True
         else:
Expand Down

0 comments on commit f39a019

Please sign in to comment.