diff --git a/llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp index 581c2f4866c9..1454c10fc420 100644 --- a/llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp +++ b/llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp @@ -21,7 +21,9 @@ #include "compiler/utils/builtin_info.h" #include "compiler/utils/define_mux_builtins_pass.h" #include "compiler/utils/device_info.h" +#include "compiler/utils/encode_kernel_metadata_pass.h" #include "compiler/utils/prepare_barriers_pass.h" +#include "compiler/utils/replace_local_module_scope_variables_pass.h" #include "compiler/utils/sub_group_analysis.h" #include "compiler/utils/work_item_loops_pass.h" #include "vecz/pass.h" @@ -60,6 +62,7 @@ void llvm::sycl::utils::addSYCLNativeCPUBackendPasses( OptimizationLevel OptLevel) { MPM.addPass(ConvertToMuxBuiltinsSYCLNativeCPUPass()); #ifdef NATIVECPU_USE_OCK + MPM.addPass(compiler::utils::TransferKernelMetadataPass()); // Always enable vectorizer, unless explictly disabled or -O0 is set. if (OptLevel != OptimizationLevel::O0 && !SYCLNativeCPUNoVecz) { MAM.registerPass([] { return vecz::TargetInfoAnalysis(); }); @@ -87,6 +90,7 @@ void llvm::sycl::utils::addSYCLNativeCPUBackendPasses( MAM.registerPass([] { return compiler::utils::SubgroupAnalysis(); }); MPM.addPass(compiler::utils::PrepareBarriersPass()); MPM.addPass(compiler::utils::WorkItemLoopsPass(Opts)); + MPM.addPass(compiler::utils::ReplaceLocalModuleScopeVariablesPass()); MPM.addPass(AlwaysInlinerPass()); #endif MPM.addPass(PrepareSYCLNativeCPUPass()); diff --git a/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp index c5625217bdfd..b3888db8a7b5 100644 --- a/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp +++ b/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp @@ -338,47 +338,21 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M, SmallSet RemovableFuncs; SmallVector WrapperFuncs; - // Retrieve the wrapper functions created by the WorkItemLoop pass. for (auto &OldF : OldKernels) { - std::optional VeczR = - compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF); - if (VeczR && VeczR.value().first) { - WrapperFuncs.push_back(OldF); - } else { - auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF); - if (Name != OldF->getName()) { - WrapperFuncs.push_back(OldF); - } - } - } - - for (auto &OldF : WrapperFuncs) { // If vectorization occurred, at this point we have a wrapper function - // that runs the vectorized kernel and peels using the scalar kernel. We - // make it so this wrapper steals the original kernel name. - std::optional VeczR = - compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF); - if (VeczR && VeczR.value().first) { - auto ScalarF = VeczR.value().first; - OldF->takeName(ScalarF); - if (ScalarF->use_empty()) - RemovableFuncs.insert(ScalarF); - } else { - // The WorkItemLoops pass created a wrapper function for the original - // kernel. If we have a kernel named foo(), the wrapper will be called - // foo-wrapper(), and will have the original kernel name retrieved by - // getBaseFnNameOrFnName. We set the name of the wrapper function - // to the original kernel name and add the original kernel to the - // list of functions that can be removed from the module. - auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF); - Function *OrigF = M.getFunction(Name); + // that runs the vectorized kernel and peels using the scalar kernel. + // There may also be a wrapper for local variables replacement. We make it + // so this wrapper steals the original kernel name. Otherwise we will have + // a wrapper function from the work item loops. In this case we also steal + // the original kernel name. + auto Name = compiler::utils::getOrigFnName(*OldF); + Function *OrigF = M.getFunction(Name); + if (Name != OldF->getName()) { if (OrigF != nullptr) { - // The original kernel is inlined by the WorkItemLoops - // pass if it contained barriers or group collectives, otherwise - // we don't want to (and can't) remove it. - if (OrigF->use_empty()) - RemovableFuncs.insert(OrigF); OldF->takeName(OrigF); + if (OrigF->use_empty()) { + RemovableFuncs.insert(OrigF); + } } else { OldF->setName(Name); } diff --git a/sycl/test/check_device_code/native_cpu/local_module_scope.cpp b/sycl/test/check_device_code/native_cpu/local_module_scope.cpp new file mode 100644 index 000000000000..bb1ea27a115b --- /dev/null +++ b/sycl/test/check_device_code/native_cpu/local_module_scope.cpp @@ -0,0 +1,39 @@ +// REQUIRES: native_cpu_ock + +// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -mllvm -sycl-native-cpu-no-vecz -mllvm -sycl-native-dump-device-ir %s | FileCheck %s + +// Check that local types structure is created and placed on the stack +// We also check that the attribute mux-orig-fn is created as this is needed to +// find the original function after this pass is run + +// CHECK: %localVarTypes = type { ptr addrspace(1) } +// CHECK: define void @_ZTS4TestILi1ELi4EiE.NativeCPUKernel{{.*}} #[[ATTR:[0-9]*]] +// CHECK: alloca %localVarTypes +// CHECK: attributes #[[ATTR]] = {{.*}} "mux-orig-fn"="_ZTS4TestILi1ELi4EiE" + +#include "sycl.hpp" + +template struct Test; + +int main() { + sycl::queue queue; + + constexpr int dims = 1; + constexpr int size = 4; + + std::array data; + + const auto range = sycl::range(size); + const auto range_wg = sycl::range(1); + { + sycl::buffer buf(data.data(), range); + + queue.submit([&](sycl::handler &cgh) { + auto acc = sycl::accessor(buf, cgh, sycl::write_only); + cgh.parallel_for_work_group>( + range, range_wg, [=](auto group) { acc[group.get_group_id()] = 42; }); + }); + queue.wait_and_throw(); + } + return 0; +}