diff --git a/llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp b/llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp
index 581c2f4866c9..1454c10fc420 100644
--- a/llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp
@@ -21,7 +21,9 @@
 #include "compiler/utils/builtin_info.h"
 #include "compiler/utils/define_mux_builtins_pass.h"
 #include "compiler/utils/device_info.h"
+#include "compiler/utils/encode_kernel_metadata_pass.h"
 #include "compiler/utils/prepare_barriers_pass.h"
+#include "compiler/utils/replace_local_module_scope_variables_pass.h"
 #include "compiler/utils/sub_group_analysis.h"
 #include "compiler/utils/work_item_loops_pass.h"
 #include "vecz/pass.h"
@@ -60,6 +62,7 @@ void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(
     OptimizationLevel OptLevel) {
   MPM.addPass(ConvertToMuxBuiltinsSYCLNativeCPUPass());
 #ifdef NATIVECPU_USE_OCK
+  MPM.addPass(compiler::utils::TransferKernelMetadataPass());
   // Always enable vectorizer, unless explictly disabled or -O0 is set.
   if (OptLevel != OptimizationLevel::O0 && !SYCLNativeCPUNoVecz) {
     MAM.registerPass([] { return vecz::TargetInfoAnalysis(); });
@@ -87,6 +90,7 @@ void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(
   MAM.registerPass([] { return compiler::utils::SubgroupAnalysis(); });
   MPM.addPass(compiler::utils::PrepareBarriersPass());
   MPM.addPass(compiler::utils::WorkItemLoopsPass(Opts));
+  MPM.addPass(compiler::utils::ReplaceLocalModuleScopeVariablesPass());
   MPM.addPass(AlwaysInlinerPass());
 #endif
   MPM.addPass(PrepareSYCLNativeCPUPass());
diff --git a/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp
index c5625217bdfd..b3888db8a7b5 100644
--- a/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp
+++ b/llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp
@@ -338,47 +338,21 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
     SmallSet<Function *, 5> RemovableFuncs;
     SmallVector<Function *, 5> WrapperFuncs;
 
-    // Retrieve the wrapper functions created by the WorkItemLoop pass.
     for (auto &OldF : OldKernels) {
-      std::optional<compiler::utils::LinkMetadataResult> VeczR =
-          compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF);
-      if (VeczR && VeczR.value().first) {
-        WrapperFuncs.push_back(OldF);
-      } else {
-        auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF);
-        if (Name != OldF->getName()) {
-          WrapperFuncs.push_back(OldF);
-        }
-      }
-    }
-
-    for (auto &OldF : WrapperFuncs) {
       // If vectorization occurred, at this point we have a wrapper function
-      // that runs the vectorized kernel and peels using the scalar kernel. We
-      // make it so this wrapper steals the original kernel name.
-      std::optional<compiler::utils::LinkMetadataResult> VeczR =
-          compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF);
-      if (VeczR && VeczR.value().first) {
-        auto ScalarF = VeczR.value().first;
-        OldF->takeName(ScalarF);
-        if (ScalarF->use_empty())
-          RemovableFuncs.insert(ScalarF);
-      } else {
-        // The WorkItemLoops pass created a wrapper function for the original
-        // kernel. If we have a kernel named foo(), the wrapper will be called
-        // foo-wrapper(), and will have the original kernel name retrieved by
-        // getBaseFnNameOrFnName. We set the name of the wrapper function
-        // to the original kernel name and add the original kernel to the
-        // list of functions that can be removed from the module.
-        auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF);
-        Function *OrigF = M.getFunction(Name);
+      // that runs the vectorized kernel and peels using the scalar kernel.
+      // There may also be a wrapper for local variables replacement. We make it
+      // so this wrapper steals the original kernel name. Otherwise we will have
+      // a wrapper function from the work item loops. In this case we also steal
+      // the original kernel name.
+      auto Name = compiler::utils::getOrigFnName(*OldF);
+      Function *OrigF = M.getFunction(Name);
+      if (Name != OldF->getName()) {
         if (OrigF != nullptr) {
-          // The original kernel is inlined by the WorkItemLoops
-          // pass if it contained barriers or group collectives, otherwise
-          // we don't want to (and can't) remove it.
-          if (OrigF->use_empty())
-            RemovableFuncs.insert(OrigF);
           OldF->takeName(OrigF);
+          if (OrigF->use_empty()) {
+            RemovableFuncs.insert(OrigF);
+          }
         } else {
           OldF->setName(Name);
         }
diff --git a/sycl/test/check_device_code/native_cpu/local_module_scope.cpp b/sycl/test/check_device_code/native_cpu/local_module_scope.cpp
new file mode 100644
index 000000000000..bb1ea27a115b
--- /dev/null
+++ b/sycl/test/check_device_code/native_cpu/local_module_scope.cpp
@@ -0,0 +1,39 @@
+// REQUIRES: native_cpu_ock
+
+// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -mllvm -sycl-native-cpu-no-vecz -mllvm -sycl-native-dump-device-ir %s | FileCheck %s
+
+// Check that local types structure is created and placed on the stack
+// We also check that the attribute mux-orig-fn is created as this is needed to
+// find the original function after this pass is run
+
+// CHECK: %localVarTypes = type { ptr addrspace(1) }
+// CHECK: define void @_ZTS4TestILi1ELi4EiE.NativeCPUKernel{{.*}} #[[ATTR:[0-9]*]]
+// CHECK: alloca %localVarTypes
+// CHECK: attributes #[[ATTR]] = {{.*}} "mux-orig-fn"="_ZTS4TestILi1ELi4EiE"
+
+#include "sycl.hpp"
+
+template <int dims, int size, typename T = int> struct Test;
+
+int main() {
+  sycl::queue queue;
+
+  constexpr int dims = 1;
+  constexpr int size = 4;
+
+  std::array<int, size> data;
+
+  const auto range = sycl::range<dims>(size);
+  const auto range_wg = sycl::range<dims>(1);
+  {
+    sycl::buffer<int, dims> buf(data.data(), range);
+
+    queue.submit([&](sycl::handler &cgh) {
+      auto acc = sycl::accessor(buf, cgh, sycl::write_only);
+      cgh.parallel_for_work_group<Test<dims, size>>(
+          range, range_wg, [=](auto group) { acc[group.get_group_id()] = 42; });
+    });
+    queue.wait_and_throw();
+  }
+  return 0;
+}