diff --git a/sycl/test-e2e/BFloat16/bfloat16_example.cpp b/sycl/test-e2e/BFloat16/bfloat16_example.cpp
index 7a125adf134fe..84ea36fd363a5 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_example.cpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_example.cpp
@@ -1,80 +1,15 @@
 ///
-/// Check if bfloat16 example works using fallback libraries
+/// Checks a simple case of bfloat16, also employed for AOT library fallback.
 ///
 
-// REQUIRES: opencl-aot, ocloc, gpu-intel-gen9
-
 // CUDA is not compatible with SPIR.
 // UNSUPPORTED: cuda
 
 // RUN: %clangxx -fsycl %s -o %t.out
 // RUN: %{run} %t.out
 
-// RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -o %t.out
-// RUN: %{run} %t.out
-
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" %s -o %t.out
-// RUN: %{run} %t.out
-
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out
-// RUN: %if gpu %{ %{run} %t.out %}
-
-// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -o %t.out
-// RUN: %{run} %t.out
-
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -o %t.out
-// RUN: %{run} %t.out
-
-// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out
-// RUN: %if cpu %{ %{run} %t.out %}
-
-// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out
-// RUN: %if cpu %{ %{run} %t.out %}
-
-#include <sycl/detail/core.hpp>
-#include <sycl/ext/oneapi/bfloat16.hpp>
-
-using namespace sycl;
-using sycl::ext::oneapi::bfloat16;
-
-float foo(float a, float b) {
-  // Convert from float to bfloat16.
-  bfloat16 A{a};
-  bfloat16 B{b};
-
-  // Convert A and B from bfloat16 to float, do addition on floating-point
-  // numbers, then convert the result to bfloat16 and store it in C.
-  bfloat16 C = A + B;
-
-  // Return the result converted from bfloat16 to float.
-  return C;
-}
-
-int main(int argc, char *argv[]) {
-  float data[3] = {7.0f, 8.1f, 0.0f};
-
-  float result_host = foo(7.0f, 8.1f);
-  std::cout << "CPU Result = " << result_host << std::endl;
-  if (std::abs(15.1f - result_host) > 0.1f) {
-    std::cout << "Test failed. Expected CPU Result ~= 15.1" << std::endl;
-    return 1;
-  }
-
-  queue deviceQueue;
-  buffer<float, 1> buf{data, 3};
-
-  deviceQueue.submit([&](handler &cgh) {
-    accessor numbers{buf, cgh, read_write};
-    cgh.single_task([=]() { numbers[2] = foo(numbers[0], numbers[1]); });
-  });
-
-  host_accessor hostOutAcc{buf, read_only};
-  float result_device = hostOutAcc[2];
-  std::cout << "GPU Result = " << result_device << std::endl;
-  if (std::abs(result_host - result_device) > 0.1f) {
-    std::cout << "Test failed. CPU Result !~= GPU result" << std::endl;
-    return 1;
-  }
+#include "bfloat16_example.hpp"
 
-  return 0;
+int main() {
+  return runTest();
 }
diff --git a/sycl/test-e2e/BFloat16/bfloat16_example.hpp b/sycl/test-e2e/BFloat16/bfloat16_example.hpp
new file mode 100644
index 0000000000000..85c9172ed174f
--- /dev/null
+++ b/sycl/test-e2e/BFloat16/bfloat16_example.hpp
@@ -0,0 +1,47 @@
+#include <sycl/detail/core.hpp>
+#include <sycl/ext/oneapi/bfloat16.hpp>
+
+using namespace sycl;
+using sycl::ext::oneapi::bfloat16;
+
+float foo(float a, float b) {
+  // Convert from float to bfloat16.
+  bfloat16 A{a};
+  bfloat16 B{b};
+
+  // Convert A and B from bfloat16 to float, do addition on floating-point
+  // numbers, then convert the result to bfloat16 and store it in C.
+  bfloat16 C = A + B;
+
+  // Return the result converted from bfloat16 to float.
+  return C;
+}
+
+int runTest() {
+  float data[3] = {7.0f, 8.1f, 0.0f};
+
+  float result_host = foo(7.0f, 8.1f);
+  std::cout << "Host Result = " << result_host << std::endl;
+  if (std::abs(15.1f - result_host) > 0.1f) {
+    std::cout << "Test failed. Expected Host Result ~= 15.1" << std::endl;
+    return 1;
+  }
+
+  queue deviceQueue;
+  buffer<float, 1> buf{data, 3};
+
+  deviceQueue.submit([&](handler &cgh) {
+    accessor numbers{buf, cgh, read_write};
+    cgh.single_task([=]() { numbers[2] = foo(numbers[0], numbers[1]); });
+  });
+
+  host_accessor hostOutAcc{buf, read_only};
+  float result_device = hostOutAcc[2];
+  std::cout << "Device Result = " << result_device << std::endl;
+  if (std::abs(result_host - result_device) > 0.1f) {
+    std::cout << "Test failed. Host Result !~= Device result" << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/BFloat16/bfloat16_example_aot.cpp b/sycl/test-e2e/BFloat16/bfloat16_example_aot.cpp
new file mode 100644
index 0000000000000..8337716c3191e
--- /dev/null
+++ b/sycl/test-e2e/BFloat16/bfloat16_example_aot.cpp
@@ -0,0 +1,21 @@
+///
+/// Check if bfloat16 example works using fallback libraries AOT compiled for
+/// both GPU and CPU.
+///
+
+// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, any-device-is-cpu
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -o %t.out
+// RUN: %{run} %t.out
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" %s -o %t.out
+// RUN: %{run} %t.out
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" %s -o %t.out
+// RUN: %{run} %t.out
+
+#include "bfloat16_example.hpp"
+
+int main() {
+  return runTest();
+}
diff --git a/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp b/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp
new file mode 100644
index 0000000000000..2f6d893768c4e
--- /dev/null
+++ b/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp
@@ -0,0 +1,18 @@
+///
+/// Check if bfloat16 example works using fallback libraries AOT compiled for
+/// CPU.
+///
+
+// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, any-device-is-cpu
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device dg1" %s -o %t.out
+// RUN: %if cpu %{ %{run} %t.out %}
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device dg1" %s -o %t.out
+// RUN: %if cpu %{ %{run} %t.out %}
+
+#include "bfloat16_example.hpp"
+
+int main() {
+  return runTest();
+}
diff --git a/sycl/test-e2e/BFloat16/bfloat16_example_aot_gpu.cpp b/sycl/test-e2e/BFloat16/bfloat16_example_aot_gpu.cpp
new file mode 100644
index 0000000000000..8262d3f655b37
--- /dev/null
+++ b/sycl/test-e2e/BFloat16/bfloat16_example_aot_gpu.cpp
@@ -0,0 +1,18 @@
+///
+/// Check if bfloat16 example works using fallback libraries AOT compiled for
+/// GPU.
+///
+
+// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, any-device-is-gpu
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12lp" %s -o %t.out
+// RUN: %if gpu %{%{run} %t.out %}
+
+// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out
+// RUN: %if gpu %{%{run} %t.out %}
+
+#include "bfloat16_example.hpp"
+
+int main() {
+  return runTest();
+}
diff --git a/sycl/test-e2e/ESIMD/ext_math_ieee_sqrt_div.cpp b/sycl/test-e2e/ESIMD/ext_math_ieee_sqrt_div.cpp
index 2d609213ce035..bc3033b49bc21 100644
--- a/sycl/test-e2e/ESIMD/ext_math_ieee_sqrt_div.cpp
+++ b/sycl/test-e2e/ESIMD/ext_math_ieee_sqrt_div.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-gen9 || arch-intel_gpu_pvc
+// REQUIRES: arch-intel_gpu_pvc
 
 // DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%}
 // RUN: %{build} -fsycl-device-code-split=per_kernel %{mathflags} -o %t.out
diff --git a/sycl/test-e2e/ESIMD/histogram_raw_send.cpp b/sycl/test-e2e/ESIMD/histogram_raw_send.cpp
deleted file mode 100644
index c82bac47c9253..0000000000000
--- a/sycl/test-e2e/ESIMD/histogram_raw_send.cpp
+++ /dev/null
@@ -1,297 +0,0 @@
-//==-histogram_raw_send.cpp  - DPC++ ESIMD on-device test-==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------===//
-// REQUIRES: gpu-intel-gen9
-// UNSUPPORTED: gpu-intel-dg1,gpu-intel-dg2,arch-intel_gpu_pvc
-// UNSUPPORTED: ze_debug
-// RUN: %{build} -o %t1.out
-// RUN: %{run} %t1.out
-// RUN: %{build} -DUSE_CONSTEXPR_API -o %t2.out
-// RUN: %{run} %t2.out
-// RUN: %{build} -DUSE_SUPPORTED_API -o %t3.out
-// RUN: %{run} %t3.out
-
-// The test checks raw send functionality with atomic write implementation
-// on SKL. It does not work on DG1 due to send instruction incompatibility.
-
-#include "esimd_test_utils.hpp"
-
-#include <sycl/accessor_image.hpp>
-
-#include <array>
-
-using namespace sycl;
-
-#define NUM_BINS 256
-#define IMG_WIDTH 1024
-#define IMG_HEIGHT 1024
-//
-// each parallel_for handles 64x32 bytes
-//
-#define BLOCK_WIDTH 32
-#define BLOCK_HEIGHT 64
-
-void histogram_CPU(unsigned int width, unsigned int height, uint8_t *srcY,
-                   unsigned int *cpuHistogram) {
-  int i;
-  for (i = 0; i < width * height; i++) {
-    cpuHistogram[srcY[i]] += 1;
-  }
-}
-
-void writeHist(unsigned int *hist) {
-  int total = 0;
-
-  std::cerr << "\nHistogram: \n";
-  for (int i = 0; i < NUM_BINS; i += 8) {
-    std::cerr << "\n  [" << i << " - " << i + 7 << "]:";
-    for (int j = 0; j < 8; j++) {
-      std::cerr << "\t" << hist[i + j];
-      total += hist[i + j];
-    }
-  }
-  std::cerr << "\nTotal = " << total << " \n";
-}
-
-int checkHistogram(unsigned int *refHistogram, unsigned int *hist) {
-
-  for (int i = 0; i < NUM_BINS; i++) {
-    if (refHistogram[i] != hist[i]) {
-      return 0;
-    }
-  }
-  return 1;
-}
-
-using namespace sycl::ext::intel;
-using namespace sycl::ext::intel::esimd;
-
-template <atomic_op Op, typename T, int n>
-ESIMD_INLINE void atomic_write(T *bins, simd<unsigned, n> offset,
-                               simd<T, n> src0) {
-  simd<T, n> oldDst;
-  simd<uintptr_t, n> vAddr(reinterpret_cast<uintptr_t>(bins));
-  simd<uintptr_t, n> vOffset = offset;
-  vAddr += vOffset;
-
-  uint32_t exDesc = 0x4C;
-  uint32_t desc = 0x414A7FF;
-  constexpr uint8_t execSize = 0x83;
-  constexpr uint8_t sfid = 0x1;
-  constexpr uint8_t numDst = 0x1;
-  constexpr uint8_t numSrc0 = 0x2;
-  constexpr uint8_t numSrc1 = 0x1;
-  constexpr uint8_t isEOT = 0;
-  constexpr uint8_t isSendc = 0;
-
-#ifdef USE_CONSTEXPR_API
-  experimental::esimd::raw_sends<execSize, sfid, numSrc0, numSrc1, numDst,
-                                 isEOT, isSendc>(oldDst, vAddr, src0, exDesc,
-                                                 desc);
-#elif defined(USE_SUPPORTED_API)
-  esimd::raw_sends<execSize, sfid, numSrc0, numSrc1, numDst,
-                   raw_send_eot::not_eot, raw_send_sendc::not_sendc>(
-      oldDst, vAddr, src0, exDesc, desc);
-
-#else
-  experimental::esimd::raw_sends(oldDst, vAddr, src0, exDesc, desc, execSize,
-                                 sfid, numSrc0, numSrc1, numDst, isEOT,
-                                 isSendc);
-#endif
-}
-
-int main(int argc, char *argv[]) {
-
-  const char *input_file = nullptr;
-  unsigned int width = IMG_WIDTH * sizeof(unsigned int);
-  unsigned int height = IMG_HEIGHT;
-
-  if (argc == 2) {
-    input_file = argv[1];
-  } else {
-    std::cerr << "Usage: Histogram.exe input_file" << std::endl;
-    std::cerr << "No input file specificed. Use default random value ...."
-              << std::endl;
-  }
-
-  // ------------------------------------------------------------------------
-  // Read in image luma plane
-
-  // Allocate Input Buffer
-  queue q = esimd_test::createQueue();
-  esimd_test::printTestLabel(q);
-
-  esimd_test::shared_vector<uint8_t> srcY_vec(
-      width * height, esimd_test::shared_allocator<uint8_t>{q});
-  esimd_test::shared_vector<unsigned int> bins_vec(
-      NUM_BINS, esimd_test::shared_allocator<unsigned int>{q});
-  uint8_t *srcY = srcY_vec.data();
-  ;
-  unsigned int *bins = bins_vec.data();
-
-  uint range_width = width / BLOCK_WIDTH;
-  uint range_height = height / BLOCK_HEIGHT;
-
-  // Initializes input.
-  unsigned int input_size = width * height;
-  std::cerr << "Processing inputs\n";
-
-  if (input_file != nullptr) {
-    FILE *f = fopen(input_file, "rb");
-    if (f == NULL) {
-      std::cerr << "Error opening file " << input_file;
-      std::exit(1);
-    }
-
-    unsigned int cnt = fread(srcY, sizeof(unsigned char), input_size, f);
-    if (cnt != input_size) {
-      std::cerr << "Error reading input from " << input_file;
-      std::exit(1);
-    }
-  } else {
-    srand(2009);
-    for (int i = 0; i < input_size; ++i) {
-      srcY[i] = rand() % 256;
-    }
-  }
-
-  for (int i = 0; i < NUM_BINS; i++) {
-    bins[i] = 0;
-  }
-
-  // ------------------------------------------------------------------------
-  // CPU Execution:
-
-  unsigned int cpuHistogram[NUM_BINS];
-  memset(cpuHistogram, 0, sizeof(cpuHistogram));
-  histogram_CPU(width, height, srcY, cpuHistogram);
-
-  sycl::image<2> Img(srcY, image_channel_order::rgba,
-                     image_channel_type::unsigned_int32,
-                     range<2>{width / sizeof(uint4), height});
-
-  // Start Timer
-  esimd_test::Timer timer;
-  double start;
-
-  double kernel_times = 0;
-  unsigned num_iters = 10;
-  const bool profiling =
-      q.has_property<sycl::property::queue::enable_profiling>();
-  try {
-    // num_iters + 1, iteration#0 is for warmup
-    for (int iter = 0; iter <= num_iters; ++iter) {
-      double etime = 0;
-      for (int b = 0; b < NUM_BINS; b++)
-        bins[b] = 0;
-      // create ranges
-      // We need that many task groups
-      auto GlobalRange = range<1>(range_width * range_height);
-      // We need that many tasks in each group
-      auto LocalRange = range<1>(1);
-      nd_range<1> Range(GlobalRange, LocalRange);
-
-      auto e = q.submit([&](handler &cgh) {
-        auto readAcc = Img.get_access<uint4, sycl::access::mode::read>(cgh);
-
-        cgh.parallel_for<class Hist>(
-            Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
-              // Get thread origin offsets
-              uint tid = ndi.get_group(0);
-              uint h_pos = (tid % range_width) * BLOCK_WIDTH;
-              uint v_pos = (tid / range_width) * BLOCK_HEIGHT;
-
-              // Declare a 8x32 uchar matrix to store the input block pixel
-              // value
-              simd<unsigned char, 8 * 32> in;
-
-              // Declare a vector to store the local histogram
-              simd<unsigned int, NUM_BINS> histogram(0);
-
-              // Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block
-              for (int y = 0; y < BLOCK_HEIGHT / 8; y++) {
-                // Perform 2D media block read to load 8x32 pixel block
-                in = media_block_load<unsigned char, 8, 32>(readAcc, h_pos,
-                                                            v_pos);
-
-            // Accumulate local histogram for each pixel value
-#pragma unroll
-                for (int i = 0; i < 8; i++) {
-#pragma unroll
-                  for (int j = 0; j < 32; j++) {
-                    histogram.select<1, 1>(in[i * 32 + j]) += 1;
-                  }
-                }
-
-                // Update starting offset for the next work block
-                v_pos += 8;
-              }
-
-              // Declare a vector to store the offset for atomic write operation
-              simd<unsigned int, 8> offset(0, 1); // init to 0, 1, 2, ..., 7
-              offset *= sizeof(unsigned int);
-
-          // Update global sum by atomically adding each local histogram
-#pragma unroll
-              for (int i = 0; i < NUM_BINS; i += 8) {
-                // Declare a vector to store the source for atomic write
-                // operation
-                simd<unsigned int, 8> src;
-                src = histogram.select<8, 1>(i);
-
-#ifdef __SYCL_DEVICE_ONLY__
-                // flat_atomic<atomic_op::add, unsigned int,
-                // 8>(bins, offset, src, 1);
-                atomic_write<atomic_op::add, unsigned int, 8>(bins, offset,
-                                                              src);
-                offset += 8 * sizeof(unsigned int);
-#else
-                simd<unsigned int, 8> vals;
-                vals.copy_from(bins + i);
-                vals = vals + src;
-                vals.copy_to(bins + i);
-#endif
-              }
-            });
-      });
-      e.wait();
-      if (profiling) {
-        etime = esimd_test::report_time("kernel time", e, e);
-        if (iter > 0)
-          kernel_times += etime;
-      }
-      if (iter == 0)
-        start = timer.Elapsed();
-    }
-
-    // SYCL will enqueue and run the kernel. Recall that the buffer's data is
-    // given back to the host at the end of scope.
-    // make sure data is given back to the host at the end of this scope
-  } catch (sycl::exception const &e) {
-    std::cout << "SYCL exception caught: " << e.what() << '\n';
-    return 1;
-  }
-
-  // End timer.
-  double end = timer.Elapsed();
-
-  esimd_test::display_timing_stats(profiling ? &kernel_times : nullptr,
-                                   num_iters, (end - start) * 1000);
-
-  writeHist(bins);
-  writeHist(cpuHistogram);
-  // Checking Histogram
-  if (checkHistogram(cpuHistogram, bins)) {
-    std::cerr << "PASSED\n";
-    return 0;
-  } else {
-    std::cerr << "FAILED\n";
-    return 1;
-  }
-
-  return 0;
-}
diff --git a/sycl/test-e2e/ESIMD/lit.local.cfg b/sycl/test-e2e/ESIMD/lit.local.cfg
index b4164a1b42238..41191cc92a6c5 100644
--- a/sycl/test-e2e/ESIMD/lit.local.cfg
+++ b/sycl/test-e2e/ESIMD/lit.local.cfg
@@ -3,9 +3,6 @@ import platform
 config.unsupported_features += ['cuda', 'hip']
 config.required_features += ['gpu']
 
-if 'gpu-intel-gen9' in config.available_features and platform.system() == 'Windows':
-  config.unsupported = True
-
 # We need this to fix failures when run on OCL.
 # The current DG2 postcommit job only runs L0 anyway,
 # so there's no difference in coverage.
diff --git a/sycl/test-e2e/ESIMD/vadd_raw_send.cpp b/sycl/test-e2e/ESIMD/vadd_raw_send.cpp
deleted file mode 100644
index 072e94947b76d..0000000000000
--- a/sycl/test-e2e/ESIMD/vadd_raw_send.cpp
+++ /dev/null
@@ -1,186 +0,0 @@
-//==---------------- vadd_raw_send.cpp  - DPC++ ESIMD on-device test--------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-gen9
-// UNSUPPORTED: gpu-intel-dg1,gpu-intel-dg2,arch-intel_gpu_pvc
-// RUN: %{build} -fno-sycl-esimd-force-stateless-mem -o %t1.out
-// RUN: %{run} %t1.out
-// RUN: %{build} -fno-sycl-esimd-force-stateless-mem -DUSE_CONSTEXPR_API -o %t2.out
-// RUN: %{run} %t2.out
-// RUN: %{build} -fno-sycl-esimd-force-stateless-mem -DUSE_SUPPORTED_API -o %t3.out
-// RUN: %{run} %t3.out
-// The test checks raw send functionality with block read/write implementation
-// on SKL. It does not work on DG1 due to send instruction incompatibility.
-
-#include "esimd_test_utils.hpp"
-
-using namespace sycl;
-
-using namespace sycl::ext::intel;
-using namespace sycl::ext::intel::esimd;
-
-template <typename T, int N, typename AccessorTy>
-ESIMD_INLINE simd<T, N> dwaligned_block_read(AccessorTy acc,
-                                             unsigned int offset) {
-  simd<unsigned int, 8> src0;
-  simd<T, N> oldDst;
-
-  src0.select<1, 1>(2) = offset;
-  uint32_t exDesc = 0xA;
-  SurfaceIndex desc = esimd::get_surface_index(acc);
-  desc += 0x2284300;
-  constexpr uint8_t execSize = 0x84;
-  constexpr uint8_t sfid = 0x0;
-  constexpr uint8_t numSrc0 = 0x1;
-  constexpr uint8_t numDst = 0x2;
-#ifdef USE_CONSTEXPR_API
-  return experimental::esimd::raw_send<execSize, sfid, numSrc0, numDst>(
-      oldDst, src0, exDesc, desc);
-#elif defined(USE_SUPPORTED_API)
-  return esimd::raw_send<execSize, sfid, numSrc0, numDst>(oldDst, src0, exDesc,
-                                                          desc);
-#else
-  return experimental::esimd::raw_send(oldDst, src0, exDesc, desc, execSize,
-                                       sfid, numSrc0, numDst);
-#endif
-}
-
-template <typename T, int N, typename AccessorTy>
-ESIMD_INLINE void block_write1(AccessorTy acc, unsigned int offset,
-                               simd<T, N> data) {
-  simd<unsigned int, 8> src0;
-
-  src0.template select<1, 1>(2) = offset >> 4;
-  uint32_t exDesc = 0x4A;
-  SurfaceIndex desc = esimd::get_surface_index(acc);
-  desc += 0x20A0200;
-  constexpr uint8_t execSize = 0x83;
-  constexpr uint8_t sfid = 0x0;
-  constexpr uint8_t numSrc0 = 0x1;
-  constexpr uint8_t numSrc1 = 0x1;
-#ifdef USE_CONSTEXPR_API
-  return experimental::esimd::raw_sends<execSize, sfid, numSrc0, numSrc1>(
-      src0, data, exDesc, desc);
-#elif defined(USE_SUPPORTED_API)
-  return esimd::raw_sends<execSize, sfid, numSrc0, numSrc1>(src0, data, exDesc,
-                                                            desc);
-#else
-  return experimental::esimd::raw_sends(src0, data, exDesc, desc, execSize,
-                                        sfid, numSrc0, numSrc1);
-#endif
-}
-
-template <typename T, int N, typename AccessorTy>
-ESIMD_INLINE void block_write2(AccessorTy acc, unsigned int offset,
-                               simd<T, N> data) {
-  simd<uint32_t, 16> src0;
-  auto src0_ref1 =
-      src0.template select<8, 1>(0).template bit_cast_view<unsigned int>();
-  auto src0_ref2 = src0.template select<8, 1>(8);
-
-  src0_ref1.template select<1, 1>(2) = offset >> 4;
-  src0_ref2 = data.template bit_cast_view<uint32_t>();
-  uint32_t exDesc = 0xA;
-  SurfaceIndex desc = esimd::get_surface_index(acc);
-  desc += 0x40A0200;
-  constexpr uint8_t execSize = 0x83;
-  constexpr uint8_t sfid = 0x0;
-  constexpr uint8_t numSrc0 = 0x2;
-#ifdef USE_CONSTEXPR_API
-  return experimental::esimd::raw_send<execSize, sfid, numSrc0>(src0, exDesc,
-                                                                desc);
-#elif defined(USE_SUPPORTED_API)
-  return esimd::raw_send<execSize, sfid, numSrc0>(src0, exDesc, desc);
-#else
-  return experimental::esimd::raw_send(src0, exDesc, desc, execSize, sfid,
-                                       numSrc0);
-#endif
-}
-
-template <typename T> int test(queue q) {
-  constexpr unsigned Size = 1024 * 128;
-  constexpr unsigned VL = sizeof(T) == 4 ? 16 : 32;
-  T *A = new T[Size];
-  T *B = new T[Size];
-  T *C = new T[Size];
-
-  for (unsigned i = 0; i < Size; ++i) {
-    A[i] = B[i] = i;
-    C[i] = 0;
-  }
-
-  try {
-    buffer<T, 1> bufa(A, range<1>(Size));
-    buffer<T, 1> bufb(B, range<1>(Size));
-    buffer<T, 1> bufc(C, range<1>(Size));
-
-    // We need that many workgroups
-    range<1> GlobalRange{Size / VL};
-
-    // We need that many threads in each group
-    range<1> LocalRange{1};
-
-    auto e = q.submit([&](handler &cgh) {
-      auto PA = bufa.template get_access<access::mode::read>(cgh);
-      auto PB = bufb.template get_access<access::mode::read>(cgh);
-      auto PC = bufc.template get_access<access::mode::write>(cgh);
-      cgh.parallel_for(
-          GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
-            unsigned int offset = i * VL * sizeof(T);
-            simd<T, VL> va = dwaligned_block_read<T, VL>(PA, offset);
-            simd<T, VL> vb = dwaligned_block_read<T, VL>(PB, offset);
-            simd<T, VL> vc = va + vb;
-            constexpr int SIZE = VL / 2;
-            block_write1(PC, offset, vc.template select<SIZE, 1>(0).read());
-            offset += SIZE * sizeof(T);
-            block_write2(PC, offset, vc.template select<SIZE, 1>(SIZE).read());
-          });
-    });
-    e.wait();
-  } catch (sycl::exception const &e) {
-    std::cout << "SYCL exception caught: " << e.what() << '\n';
-
-    delete[] A;
-    delete[] B;
-    delete[] C;
-    return 1;
-  }
-
-  int err_cnt = 0;
-
-  for (unsigned i = 0; i < Size; ++i) {
-    if (A[i] + B[i] != C[i]) {
-      if (++err_cnt < 10) {
-        std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i]
-                  << " + " << B[i] << "\n";
-      }
-    }
-  }
-
-  delete[] A;
-  delete[] B;
-  delete[] C;
-
-  std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
-  return err_cnt;
-}
-
-int main(void) {
-
-  queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
-
-  auto dev = q.get_device();
-  std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
-            << "\n";
-  int err_cnt = 0;
-
-  err_cnt += test<float>(q);
-  if (dev.has(sycl::aspect::fp16)) {
-    err_cnt += test<sycl::half>(q);
-  }
-  return err_cnt > 0 ? 1 : 0;
-}
diff --git a/sycl/test-e2e/Plugin/level_zero_device_free_mem.cpp b/sycl/test-e2e/Plugin/level_zero_device_free_mem.cpp
index 560f427e88b13..d9423cfc82a35 100644
--- a/sycl/test-e2e/Plugin/level_zero_device_free_mem.cpp
+++ b/sycl/test-e2e/Plugin/level_zero_device_free_mem.cpp
@@ -5,7 +5,7 @@
 // so requiring DG2. There may be more devices in our CI supporting this aspect.
 // REQUIRES: gpu-intel-dg2
 // REQUIRES: level_zero, level_zero_dev_kit
-// UNSUPPORTED: gpu-intel-gen9, gpu-intel-gen12
+// UNSUPPORTED: gpu-intel-gen12
 // The query of free memory is not supported on integrated devices
 //
 // RUN: %{build} %level_zero_options -o %t.out
diff --git a/sycl/test-e2e/Properties/cache_config.cpp b/sycl/test-e2e/Properties/cache_config.cpp
index 72d5d68d3837e..0cda3e97a5d1f 100644
--- a/sycl/test-e2e/Properties/cache_config.cpp
+++ b/sycl/test-e2e/Properties/cache_config.cpp
@@ -1,9 +1,4 @@
-
 // REQUIRES: gpu, level_zero
-// TODO: There is a bug on Windows Gen 9 with reductions
-// which is not related to tested feature. Enable back when
-// bug is fixed on Windows Gen9
-// UNSUPPORTED: gpu-intel-gen9 && windows
 
 // RUN: %{build} -o %t.out
 // RUN: env UR_L0_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s
diff --git a/sycl/test-e2e/README.md b/sycl/test-e2e/README.md
index 662d8d36cd608..44f96af580f7e 100644
--- a/sycl/test-e2e/README.md
+++ b/sycl/test-e2e/README.md
@@ -241,7 +241,6 @@ section below). All these features are related to HW detection and they should
 be considered deprecated, because we have HW auto-detection functionality in
 place. No new tests should use these features:
 
- * **gpu-intel-gen9**  - Intel GPU Gen9  availability;
  * **gpu-intel-gen11** - Intel GPU Gen11 availability;
  * **gpu-intel-gen12** - Intel GPU Gen12 availability;
  * **gpu-intel-dg1** - Intel GPU DG1 availability;
diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py
index c2582467dd4cf..9e2c6da4d044d 100644
--- a/sycl/test-e2e/lit.cfg.py
+++ b/sycl/test-e2e/lit.cfg.py
@@ -140,8 +140,6 @@
 config.substitutions.append(("%sycl_include", config.sycl_include))
 
 # Intel GPU FAMILY availability
-if lit_config.params.get("gpu-intel-gen9", False):
-    config.available_features.add("gpu-intel-gen9")
 if lit_config.params.get("gpu-intel-gen11", False):
     config.available_features.add("gpu-intel-gen11")
 if lit_config.params.get("gpu-intel-gen12", False):