Update TensorRT-LLM (#1763)
* Update TensorRT-LLM

---------

Co-authored-by: Kota Tsuyuzaki <[email protected]>
Co-authored-by: Pzzzzz <[email protected]>
Co-authored-by: Patrick Reiter Horn <[email protected]>
4 people committed Jun 11, 2024
1 parent b777bd6 commit db4edea
Showing 301 changed files with 556,832 additions and 523,784 deletions.
2 changes: 1 addition & 1 deletion benchmarks/cpp/README.md
@@ -159,7 +159,7 @@ mpirun -n 2 ./benchmarks/gptManagerBenchmark \
--max_num_samples 500
```

`gptManagerBenchmark` can also be used with the high-level C++ API defined by the `executor::Executor` class (see `cpp/include/tensorrt_llm/executor/executor.h`). This can be done by passing the argument `--api executor`. Note that the Executor class is still under development and currently does not support models with tp or pp > 1.
`gptManagerBenchmark` by default uses the high-level C++ API defined by the `executor::Executor` class (see `cpp/include/tensorrt_llm/executor/executor.h`).
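
For reference, here is a minimal, editorial sketch of that `executor::Executor` flow (not part of this commit): the engine path, prompt token ids, and beam width are placeholders, and the authoritative signatures live in `cpp/include/tensorrt_llm/executor/executor.h`.

```cpp
#include "tensorrt_llm/executor/executor.h"

#include <iostream>

namespace tle = tensorrt_llm::executor;

int main()
{
    // Build an executor around a prebuilt decoder-only engine directory (placeholder path).
    tle::ExecutorConfig executorConfig(/*maxBeamWidth=*/1);
    tle::Executor executor("/path/to/engine_dir", tle::ModelType::kDECODER_ONLY, executorConfig);

    // One request: prompt token ids plus the number of new tokens to generate.
    tle::Request request({1, 2, 3, 4}, /*maxNewTokens=*/32);
    auto const requestId = executor.enqueueRequest(request);

    // Block until the request finishes, then read back the generated tokens.
    for (auto const& response : executor.awaitResponses(requestId))
    {
        if (!response.hasError())
        {
            auto const result = response.getResult();
            std::cout << "generated " << result.outputTokenIds.front().size() << " tokens" << std::endl;
        }
    }
    return 0;
}
```

In this non-streaming form, `awaitResponses` returns a single final response per request; because the Executor path is now the default, passing `--api executor` explicitly is no longer necessary.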

#### Emulated static batching

163 changes: 86 additions & 77 deletions benchmarks/cpp/gptManagerBenchmark.cpp

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions benchmarks/python/allowed_configs.py
@@ -93,6 +93,7 @@ class EncDecBuildConfig:
builder_opt: Optional[int] = None
n_mels: Optional[int] = None
skip_cross_qkv: bool = False
use_implicit_relative_attention: Optional[bool] = False

def __post_init__(self) -> None:
assert self.head_size is not None
@@ -584,6 +585,25 @@ class ModelConfig:
builder_opt=None,
remove_input_padding=False,
)),
"glm_10b":
ModelConfig(name="glm_10b",
family="glm",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=48,
num_heads=64,
num_kv_heads=64,
hidden_size=4096,
inter_size=16384,
vocab_size=50304,
hidden_act='gelu',
n_positions=1024,
max_batch_size=128,
max_input_len=1024,
max_output_len=256,
builder_opt=None,
remove_input_padding=False,
)),
"bloom_560m":
ModelConfig(name="bloom_560m",
family="bloom",
52 changes: 47 additions & 5 deletions benchmarks/python/build.py
@@ -273,7 +273,7 @@ def build_gpt(args):
raise Exception(
f'--opt_num_tokens does not support ootb mode. Please use --opt_batch_size instead.'
)

max_num_tokens = max_batch_size * max(max_input_len, max_beam_width)
quant_config = get_quant_config(args.quantization)
quant_algo = quant_config.quant_algo
kv_cache_quant_algo = quant_config.kv_cache_quant_algo
@@ -309,6 +309,7 @@ def build_gpt(args):
max_beam_width=max_beam_width,
max_input_len=max_input_len,
max_output_len=max_output_len,
max_num_tokens=max_num_tokens,
int8=(quant_mode.has_act_and_weight_quant()
or quant_mode.is_int8_weight_only()),
quant_mode=quant_mode,
@@ -572,6 +573,39 @@ def build_gpt(args):
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.ChatGLMForCausalLM(config)

elif family == "glm":
config = {
'architecture': 'ChatGLMForCausalLM',
'dtype': args.dtype,
'num_hidden_layers': build_config['num_layers'],
'num_attention_heads': build_config['num_heads'],
'num_key_value_heads': build_config['num_kv_heads'],
'hidden_size': build_config['hidden_size'],
'intermediate_size': build_config['inter_size'],
'norm_epsilon': 1e-5,
'vocab_size': build_config['vocab_size'],
'position_embedding_type': 'learned_absolute',
'max_position_embeddings': build_config['n_positions'],
'hidden_act': build_config['hidden_act'],
'quantization': {
'quant_algo': quant_algo,
'kv_cache_quant_algo': kv_cache_quant_algo
},
'mapping': {
'world_size': world_size,
'tp_size': world_size
},
'chatglm_version': 'glm',
'add_bias_linear': True,
'add_qkv_bias': True,
'apply_query_key_layer_scaling': False,
'apply_residual_connection_post_layernorm': False,
'rmsnorm': False,
'rope_ratio': 1.0,
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.ChatGLMForCausalLM(config)

elif family == "bloom":
config = {
'architecture': 'BloomForCausalLM',
@@ -871,6 +905,7 @@ def build_gpt(args):
'layer_types': build_config['layer_types'],
'rnn_hidden_size': build_config['rnn_hidden_size'],
'logits_soft_cap': build_config['logits_soft_cap'],
'rotary_pct': build_config['rotary_pct'],
}
config = PretrainedConfig.from_dict(config)
tensorrt_llm_model = tensorrt_llm.models.RecurrentGemmaForCausalLM(
@@ -935,10 +970,13 @@ def build_gpt(args):
print(
f"max_batch_size: {max_batch_size}, max_input_len: {max_input_len}, max_output_len: {max_output_len}, max_beam_width: {max_beam_width}"
)
# NOTE: all other models use PretrainedModel.prepare_inputs(...)
# except RecurrentGemmaForCausalLM and MambaForCausalLM
inputs = tensorrt_llm_model.prepare_inputs(
max_batch_size=max_batch_size,
max_input_len=max_input_len,
max_seq_len=max_input_len + max_output_len,
max_num_tokens=max_num_tokens,
use_cache=True,
max_beam_width=max_beam_width,
opt_batch_size=opt_batch_size,
@@ -1293,7 +1331,7 @@ def enc_dec_build_helper(component, config, args):
has_embedding_layernorm,
'has_embedding_scale':
config.get('has_embedding_scale', False),
'ffn_hidden_size':
'intermediate_size':
config['ffn_hidden_size'],
'q_scaling':
q_scaling,
@@ -1358,7 +1396,7 @@
has_embedding_layernorm,
'has_embedding_scale':
config.get('has_embedding_scale', False),
'ffn_hidden_size':
'intermediate_size':
config['ffn_hidden_size'],
'q_scaling':
q_scaling,
@@ -1381,12 +1419,16 @@
'encoder_head_size':
config['head_size'],
'skip_cross_qkv':
config['skip_cross_qkv']
config['skip_cross_qkv'],
'use_implicit_relative_attention':
config['use_implicit_relative_attention']
})
tllm_model = tensorrt_llm.models.DecoderModel(pretrained_config)
if use_weight_only and family == 'whisper':
tllm_model = quantize(tllm_model, quant_config)

tllm_model.precompute_relative_attention_bias(builder_config)

# Module -> Network
engine_name = get_engine_name(args.model, args.dtype, world_size,
runtime_rank)
@@ -1418,7 +1460,7 @@ def enc_dec_build_helper(component, config, args):
if family == 'whisper':
inputs = tllm_model.prepare_inputs(
max_batch_size=config['max_batch_size'], )
tllm_model(*inputs)
tllm_model(**inputs)
else:
inputs = tllm_model.prepare_inputs(
max_batch_size=config['max_batch_size'],
9 changes: 9 additions & 0 deletions benchmarks/python/gpt_benchmark.py
@@ -174,6 +174,15 @@ def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents,
top_p=args.top_p)
self.decoder = tensorrt_llm.runtime.GenerationSession(
model_config, engine_buffer, self.runtime_mapping)
if args.model == 'glm_10b':
self.sampling_config = tensorrt_llm.runtime.SamplingConfig(
end_id=50258,
pad_id=50256,
num_beams=self.num_beams,
top_k=args.top_k,
top_p=args.top_p)
self.decoder = tensorrt_llm.runtime.ChatGLMGenerationSession(
model_config, engine_buffer, self.runtime_mapping)
else:
end_id = 50256
pad_id = 50256
28 changes: 25 additions & 3 deletions cpp/CMakeLists.txt
@@ -180,9 +180,31 @@ if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
endif()
endif()

message(STATUS "GPU architectures: ${CMAKE_CUDA_ARCHITECTURES}")
# Store CMAKE_CUDA_ARCHITECTURES for later use since torch sets this to "OFF"
set(CMAKE_CUDA_ARCHITECTURES_ORIG "${CMAKE_CUDA_ARCHITECTURES}")
if(CMAKE_CUDA_ARCHITECTURES STREQUAL "native")
# Detect highest available compute capability
set(OUTPUTFILE ${PROJECT_BINARY_DIR}/detect_cuda_arch)
set(CUDAFILE ${CMAKE_SOURCE_DIR}/cmake/utils/detect_cuda_arch.cu)
execute_process(COMMAND ${CMAKE_CUDA_COMPILER} -lcuda ${CUDAFILE} -o
${OUTPUTFILE})
message(VERBOSE "Detecting native CUDA compute capability")
execute_process(
COMMAND ${OUTPUTFILE}
RESULT_VARIABLE CUDA_RETURN_CODE
OUTPUT_VARIABLE CUDA_ARCH_OUTPUT)
if(NOT ${CUDA_RETURN_CODE} EQUAL 0)
message(WARNING "Detecting native CUDA compute capability - fail")
message(
WARNING "CUDA compute capability detection failed, compiling for 'all'")
set(CMAKE_CUDA_ARCHITECTURES_ORIG "all")
else()
message(STATUS "Detecting native CUDA compute capability - done")
set(CMAKE_CUDA_ARCHITECTURES_ORIG "${CUDA_ARCH_OUTPUT}")
endif()
else()
# Store CMAKE_CUDA_ARCHITECTURES for later use since torch sets this to "OFF"
set(CMAKE_CUDA_ARCHITECTURES_ORIG "${CMAKE_CUDA_ARCHITECTURES}")
endif()
message(STATUS "GPU architectures: ${CMAKE_CUDA_ARCHITECTURES_ORIG}")

enable_language(C CXX CUDA)

39 changes: 39 additions & 0 deletions cpp/cmake/utils/detect_cuda_arch.cu
@@ -0,0 +1,39 @@
#include <algorithm>
#include <cuda_runtime.h>
#include <iomanip>
#include <iostream>
#include <vector>

// Prints the highest compute capability among the visible GPUs (e.g. "90" on Hopper),
// which CMakeLists.txt captures as the native value of CMAKE_CUDA_ARCHITECTURES_ORIG.
int main(int argc, char* argv[])
{
    int n_devices = 0;
    int rc = cudaGetDeviceCount(&n_devices);
    if (rc != cudaSuccess)
    {
        cudaError_t error = cudaGetLastError();
        std::cout << "CUDA error: " << cudaGetErrorString(error) << std::endl;
        return rc;
    }

    // Collect the (major, minor) compute capability of every device.
    std::vector<std::pair<int, int>> arch(n_devices);
    for (int cd = 0; cd < n_devices; ++cd)
    {
        cudaDeviceProp dev;
        int rc = cudaGetDeviceProperties(&dev, cd);
        if (rc != cudaSuccess)
        {
            cudaError_t error = cudaGetLastError();
            std::cout << "CUDA error: " << cudaGetErrorString(error) << std::endl;
            return rc;
        }
        else
        {
            arch[cd] = {dev.major, dev.minor};
        }
    }

    // Emit the highest capability found, concatenated as <major><minor> with no newline.
    std::pair<int, int> best_cc = *std::max_element(begin(arch), end(arch));
    std::cout << best_cc.first << best_cc.second;

    return 0;
}
6 changes: 3 additions & 3 deletions cpp/include/tensorrt_llm/batch_manager/GptManager.h
@@ -48,9 +48,9 @@ class GptManager
using RequestList = std::list<std::shared_ptr<LlmRequest>>;
using TensorPtr = runtime::ITensor::SharedPtr;

GptManager(std::filesystem::path const& trtEnginePath, TrtGptModelType modelType, SizeType32 maxBeamWidth,
executor::SchedulerConfig const& schedulerConfig, GetInferenceRequestsCallback getInferenceRequestsCb,
SendResponseCallback sendResponseCb, PollStopSignalCallback pollStopSignalCb = nullptr,
GptManager(std::filesystem::path const& trtEnginePath, TrtGptModelType modelType,
GetInferenceRequestsCallback getInferenceRequestsCb, SendResponseCallback sendResponseCb,
PollStopSignalCallback pollStopSignalCb = nullptr,
ReturnBatchManagerStatsCallback returnBatchManagerStatsCb = nullptr,
TrtGptModelOptionalParams const& optionalParams = TrtGptModelOptionalParams(),
std::optional<uint64_t> terminateReqId = std::nullopt, bool excludeInputInOutput = false);
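
As an editorial aside (not shown in the diff), here is a minimal sketch of constructing a `GptManager` against the reduced signature, assuming the optional-parameter members remain publicly assignable as declared; the engine path and the stubbed callbacks are placeholders, and the beam-width and scheduler settings now travel inside `TrtGptModelOptionalParams` (see the `trtGptModelOptionalParams.h` hunk below).

```cpp
#include "tensorrt_llm/batch_manager/GptManager.h"
#include "tensorrt_llm/batch_manager/inferenceRequest.h"
#include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h"

#include <list>
#include <memory>

using namespace tensorrt_llm::batch_manager;

int main()
{
    // maxBeamWidth and schedulerConfig are no longer constructor arguments; they are
    // carried by the optional parameters instead (fields added in this commit).
    TrtGptModelOptionalParams optionalParams;
    optionalParams.maxBeamWidth = 1;

    // Stub callbacks: a real server would dequeue client requests here and push
    // responses back out. The callback aliases come from the batch_manager headers.
    GetInferenceRequestsCallback getRequestsCb
        = [](auto /*maxNbRequests*/) { return std::list<std::shared_ptr<InferenceRequest>>{}; };
    SendResponseCallback sendResponseCb = [](auto&&... /*responseArgs*/) {};

    GptManager manager("/path/to/engine_dir", // placeholder engine directory
        TrtGptModelType::InflightFusedBatching, getRequestsCb, sendResponseCb,
        /*pollStopSignalCb=*/nullptr, /*returnBatchManagerStatsCb=*/nullptr, optionalParams);
    return 0;
}
```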
19 changes: 19 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -699,6 +699,12 @@ class GenericLlmRequest
runtime::ITensor::makeShape({mSamplingConfig.beamWidth, mMaxNewTokens, vocabSizePadded}), logitsDataType);
}

void allocTargetModelAcceptedTokenLogitsHost(SizeType32 vocabSizePadded, nvinfer1::DataType logitsDataType)
{
mGenerationLogitsHost = runtime::BufferManager::pinned(
runtime::ITensor::makeShape({getNumDraftTokens() + 1, vocabSizePadded}), logitsDataType);
}

[[nodiscard]] std::vector<TensorPtr> const& getGenerationLogitsFragments() const
{
return mGenerationLogitsFragments;
@@ -901,6 +907,18 @@ class GenericLlmRequest
result.generationLogits = executor::detail::ofITensor(getGenerationLogitsHost());
}

if (getReturnTargetModelAcceptedLogits())
{
auto targetModelAcceptedTokenLogitsShape = getGenerationLogitsHost()->getShape();
TLLM_CHECK(targetModelAcceptedTokenLogitsShape.nbDims == 2);
auto numAcceptedToken = targetModelAcceptedTokenLogitsShape.d[0];
auto vocabSizePadded = targetModelAcceptedTokenLogitsShape.d[1];
// Align the shape of accepted token logits and generation logits
TensorPtr targetModelAcceptedTokenLogitsHostView = runtime::ITensor::view(
getGenerationLogitsHost(), runtime::ITensor::makeShape({1, numAcceptedToken, vocabSizePadded}));
result.generationLogits = executor::detail::ofITensor(targetModelAcceptedTokenLogitsHostView);
}

if (getReturnEncoderOutput())
{
result.encoderOutput = executor::detail::ofITensor(getEncoderOutputHost());
@@ -1023,6 +1041,7 @@ class GenericLlmRequest
auto data = runtime::bufferCast<int32_t>(*tensor);
std::memcpy(data, words.data(), numWords * sizeof(int32_t));
std::memcpy(data + numWords, offsets.data(), numWords * sizeof(int32_t));

// Add leading dim of 1
tensor->unsqueeze(0);

14 changes: 11 additions & 3 deletions cpp/include/tensorrt_llm/batch_manager/trtGptModelOptionalParams.h
@@ -23,6 +23,7 @@
#include "tensorrt_llm/runtime/common.h"

#include <optional>
#include <utility>
#include <vector>

namespace tensorrt_llm::batch_manager
@@ -39,15 +40,19 @@ class TrtGptModelOptionalParams
bool enableTrtOverlap = false, std::optional<std::vector<SizeType32>> const& deviceIds = std::nullopt,
bool normalizeLogProbs = true, bool enableChunkedContext = false,
PeftCacheManagerConfig const& peftCacheManagerConfig = PeftCacheManagerConfig{},
executor::DecodingConfig const& decodingConfig = executor::DecodingConfig{}, float gpuWeightsPercent = 1)
executor::DecodingConfig decodingConfig = executor::DecodingConfig{}, float gpuWeightsPercent = 1,
std::optional<SizeType32> maxBeamWidth = std::nullopt,
executor::SchedulerConfig const& schedulerConfig = executor::SchedulerConfig{})
: kvCacheConfig{kvCacheConfig}
, enableTrtOverlap{enableTrtOverlap}
, deviceIds(deviceIds)
, normalizeLogProbs{normalizeLogProbs}
, enableChunkedContext{enableChunkedContext}
, peftCacheManagerConfig(peftCacheManagerConfig)
, decodingConfig(decodingConfig)
, decodingConfig(std::move(decodingConfig))
, gpuWeightsPercent(gpuWeightsPercent)
, maxBeamWidth(maxBeamWidth)
, schedulerConfig{schedulerConfig}
{
}

@@ -57,7 +62,8 @@ class TrtGptModelOptionalParams
executorConfig.getNormalizeLogProbs(), executorConfig.getEnableChunkedContext(),
PeftCacheManagerConfig(executorConfig.getPeftCacheConfig().value_or(executor::PeftCacheConfig())),
executorConfig.getDecodingConfig().value_or(executor::DecodingConfig{}),
executorConfig.getGpuWeightsPercent())
executorConfig.getGpuWeightsPercent(), executorConfig.getMaxBeamWidth(),
executorConfig.getSchedulerConfig())
{
}

@@ -80,6 +86,8 @@
executor::DecodingConfig decodingConfig;
// Percentage of weights on the gpu at runtime
float gpuWeightsPercent;
std::optional<SizeType32> maxBeamWidth;
executor::SchedulerConfig schedulerConfig;
};

} // namespace tensorrt_llm::batch_manager
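
A companion sketch (editorial, again assuming the members above stay publicly assignable) of populating the two new fields directly; when the optional parameters are built from an `executor::ExecutorConfig` instead, the delegating constructor in this hunk forwards `getMaxBeamWidth()` and `getSchedulerConfig()` automatically.

```cpp
#include "tensorrt_llm/batch_manager/trtGptModelOptionalParams.h"
#include "tensorrt_llm/executor/executor.h"

namespace tb = tensorrt_llm::batch_manager;
namespace tle = tensorrt_llm::executor;

int main()
{
    tb::TrtGptModelOptionalParams optionalParams; // every field keeps its default

    // Both settings used to be dedicated GptManager constructor arguments; they are
    // now carried here for the batch manager to consume.
    optionalParams.maxBeamWidth = 4;
    optionalParams.schedulerConfig = tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION);
    return 0;
}
```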
