diff --git a/sycl/test-e2e/BFloat16/bfloat16_example.cpp b/sycl/test-e2e/BFloat16/bfloat16_example.cpp index 7a125adf134fe..84ea36fd363a5 100644 --- a/sycl/test-e2e/BFloat16/bfloat16_example.cpp +++ b/sycl/test-e2e/BFloat16/bfloat16_example.cpp @@ -1,80 +1,15 @@ /// -/// Check if bfloat16 example works using fallback libraries +/// Checks a simple case of bfloat16, also employed for AOT library fallback. /// -// REQUIRES: opencl-aot, ocloc, gpu-intel-gen9 - // CUDA is not compatible with SPIR. // UNSUPPORTED: cuda // RUN: %clangxx -fsycl %s -o %t.out // RUN: %{run} %t.out -// RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -o %t.out -// RUN: %{run} %t.out - -// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" %s -o %t.out -// RUN: %{run} %t.out - -// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out -// RUN: %if gpu %{ %{run} %t.out %} - -// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -o %t.out -// RUN: %{run} %t.out - -// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen9" %s -o %t.out -// RUN: %{run} %t.out - -// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out -// RUN: %if cpu %{ %{run} %t.out %} - -// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device pvc" %s -o %t.out -// RUN: %if cpu %{ %{run} %t.out %} - -#include -#include - -using namespace sycl; -using sycl::ext::oneapi::bfloat16; - -float foo(float a, float b) { - // Convert from float to bfloat16. - bfloat16 A{a}; - bfloat16 B{b}; - - // Convert A and B from bfloat16 to float, do addition on floating-point - // numbers, then convert the result to bfloat16 and store it in C. - bfloat16 C = A + B; - - // Return the result converted from bfloat16 to float. - return C; -} - -int main(int argc, char *argv[]) { - float data[3] = {7.0f, 8.1f, 0.0f}; - - float result_host = foo(7.0f, 8.1f); - std::cout << "CPU Result = " << result_host << std::endl; - if (std::abs(15.1f - result_host) > 0.1f) { - std::cout << "Test failed. Expected CPU Result ~= 15.1" << std::endl; - return 1; - } - - queue deviceQueue; - buffer buf{data, 3}; - - deviceQueue.submit([&](handler &cgh) { - accessor numbers{buf, cgh, read_write}; - cgh.single_task([=]() { numbers[2] = foo(numbers[0], numbers[1]); }); - }); - - host_accessor hostOutAcc{buf, read_only}; - float result_device = hostOutAcc[2]; - std::cout << "GPU Result = " << result_device << std::endl; - if (std::abs(result_host - result_device) > 0.1f) { - std::cout << "Test failed. CPU Result !~= GPU result" << std::endl; - return 1; - } +#include "bfloat16_example.hpp" - return 0; +int main() { + return runTest(); } diff --git a/sycl/test-e2e/BFloat16/bfloat16_example.hpp b/sycl/test-e2e/BFloat16/bfloat16_example.hpp new file mode 100644 index 0000000000000..85c9172ed174f --- /dev/null +++ b/sycl/test-e2e/BFloat16/bfloat16_example.hpp @@ -0,0 +1,47 @@ +#include +#include + +using namespace sycl; +using sycl::ext::oneapi::bfloat16; + +float foo(float a, float b) { + // Convert from float to bfloat16. + bfloat16 A{a}; + bfloat16 B{b}; + + // Convert A and B from bfloat16 to float, do addition on floating-point + // numbers, then convert the result to bfloat16 and store it in C. + bfloat16 C = A + B; + + // Return the result converted from bfloat16 to float. + return C; +} + +int runTest() { + float data[3] = {7.0f, 8.1f, 0.0f}; + + float result_host = foo(7.0f, 8.1f); + std::cout << "Host Result = " << result_host << std::endl; + if (std::abs(15.1f - result_host) > 0.1f) { + std::cout << "Test failed. Expected Host Result ~= 15.1" << std::endl; + return 1; + } + + queue deviceQueue; + buffer buf{data, 3}; + + deviceQueue.submit([&](handler &cgh) { + accessor numbers{buf, cgh, read_write}; + cgh.single_task([=]() { numbers[2] = foo(numbers[0], numbers[1]); }); + }); + + host_accessor hostOutAcc{buf, read_only}; + float result_device = hostOutAcc[2]; + std::cout << "Device Result = " << result_device << std::endl; + if (std::abs(result_host - result_device) > 0.1f) { + std::cout << "Test failed. Host Result !~= Device result" << std::endl; + return 1; + } + + return 0; +} diff --git a/sycl/test-e2e/BFloat16/bfloat16_example_aot.cpp b/sycl/test-e2e/BFloat16/bfloat16_example_aot.cpp new file mode 100644 index 0000000000000..8337716c3191e --- /dev/null +++ b/sycl/test-e2e/BFloat16/bfloat16_example_aot.cpp @@ -0,0 +1,21 @@ +/// +/// Check if bfloat16 example works using fallback libraries AOT compiled for +/// both GPU and CPU. +/// + +// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, any-device-is-cpu + +// RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -o %t.out +// RUN: %{run} %t.out + +// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" %s -o %t.out +// RUN: %{run} %t.out + +// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device gen12lp" %s -o %t.out +// RUN: %{run} %t.out + +#include "bfloat16_example.hpp" + +int main() { + return runTest(); +} diff --git a/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp b/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp new file mode 100644 index 0000000000000..2f6d893768c4e --- /dev/null +++ b/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp @@ -0,0 +1,18 @@ +/// +/// Check if bfloat16 example works using fallback libraries AOT compiled for +/// CPU. +/// + +// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, any-device-is-cpu + +// RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device dg1" %s -o %t.out +// RUN: %if cpu %{ %{run} %t.out %} + +// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64,spir64_gen -Xsycl-target-backend=spir64_gen "-device dg1" %s -o %t.out +// RUN: %if cpu %{ %{run} %t.out %} + +#include "bfloat16_example.hpp" + +int main() { + return runTest(); +} diff --git a/sycl/test-e2e/BFloat16/bfloat16_example_aot_gpu.cpp b/sycl/test-e2e/BFloat16/bfloat16_example_aot_gpu.cpp new file mode 100644 index 0000000000000..8262d3f655b37 --- /dev/null +++ b/sycl/test-e2e/BFloat16/bfloat16_example_aot_gpu.cpp @@ -0,0 +1,18 @@ +/// +/// Check if bfloat16 example works using fallback libraries AOT compiled for +/// GPU. +/// + +// REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, any-device-is-gpu + +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12lp" %s -o %t.out +// RUN: %if gpu %{%{run} %t.out %} + +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %s -o %t.out +// RUN: %if gpu %{%{run} %t.out %} + +#include "bfloat16_example.hpp" + +int main() { + return runTest(); +} diff --git a/sycl/test-e2e/ESIMD/ext_math_ieee_sqrt_div.cpp b/sycl/test-e2e/ESIMD/ext_math_ieee_sqrt_div.cpp index 2d609213ce035..bc3033b49bc21 100644 --- a/sycl/test-e2e/ESIMD/ext_math_ieee_sqrt_div.cpp +++ b/sycl/test-e2e/ESIMD/ext_math_ieee_sqrt_div.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu-intel-gen9 || arch-intel_gpu_pvc +// REQUIRES: arch-intel_gpu_pvc // DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%} // RUN: %{build} -fsycl-device-code-split=per_kernel %{mathflags} -o %t.out diff --git a/sycl/test-e2e/ESIMD/histogram_raw_send.cpp b/sycl/test-e2e/ESIMD/histogram_raw_send.cpp deleted file mode 100644 index c82bac47c9253..0000000000000 --- a/sycl/test-e2e/ESIMD/histogram_raw_send.cpp +++ /dev/null @@ -1,297 +0,0 @@ -//==-histogram_raw_send.cpp - DPC++ ESIMD on-device test-==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------===// -// REQUIRES: gpu-intel-gen9 -// UNSUPPORTED: gpu-intel-dg1,gpu-intel-dg2,arch-intel_gpu_pvc -// UNSUPPORTED: ze_debug -// RUN: %{build} -o %t1.out -// RUN: %{run} %t1.out -// RUN: %{build} -DUSE_CONSTEXPR_API -o %t2.out -// RUN: %{run} %t2.out -// RUN: %{build} -DUSE_SUPPORTED_API -o %t3.out -// RUN: %{run} %t3.out - -// The test checks raw send functionality with atomic write implementation -// on SKL. It does not work on DG1 due to send instruction incompatibility. - -#include "esimd_test_utils.hpp" - -#include - -#include - -using namespace sycl; - -#define NUM_BINS 256 -#define IMG_WIDTH 1024 -#define IMG_HEIGHT 1024 -// -// each parallel_for handles 64x32 bytes -// -#define BLOCK_WIDTH 32 -#define BLOCK_HEIGHT 64 - -void histogram_CPU(unsigned int width, unsigned int height, uint8_t *srcY, - unsigned int *cpuHistogram) { - int i; - for (i = 0; i < width * height; i++) { - cpuHistogram[srcY[i]] += 1; - } -} - -void writeHist(unsigned int *hist) { - int total = 0; - - std::cerr << "\nHistogram: \n"; - for (int i = 0; i < NUM_BINS; i += 8) { - std::cerr << "\n [" << i << " - " << i + 7 << "]:"; - for (int j = 0; j < 8; j++) { - std::cerr << "\t" << hist[i + j]; - total += hist[i + j]; - } - } - std::cerr << "\nTotal = " << total << " \n"; -} - -int checkHistogram(unsigned int *refHistogram, unsigned int *hist) { - - for (int i = 0; i < NUM_BINS; i++) { - if (refHistogram[i] != hist[i]) { - return 0; - } - } - return 1; -} - -using namespace sycl::ext::intel; -using namespace sycl::ext::intel::esimd; - -template -ESIMD_INLINE void atomic_write(T *bins, simd offset, - simd src0) { - simd oldDst; - simd vAddr(reinterpret_cast(bins)); - simd vOffset = offset; - vAddr += vOffset; - - uint32_t exDesc = 0x4C; - uint32_t desc = 0x414A7FF; - constexpr uint8_t execSize = 0x83; - constexpr uint8_t sfid = 0x1; - constexpr uint8_t numDst = 0x1; - constexpr uint8_t numSrc0 = 0x2; - constexpr uint8_t numSrc1 = 0x1; - constexpr uint8_t isEOT = 0; - constexpr uint8_t isSendc = 0; - -#ifdef USE_CONSTEXPR_API - experimental::esimd::raw_sends(oldDst, vAddr, src0, exDesc, - desc); -#elif defined(USE_SUPPORTED_API) - esimd::raw_sends( - oldDst, vAddr, src0, exDesc, desc); - -#else - experimental::esimd::raw_sends(oldDst, vAddr, src0, exDesc, desc, execSize, - sfid, numSrc0, numSrc1, numDst, isEOT, - isSendc); -#endif -} - -int main(int argc, char *argv[]) { - - const char *input_file = nullptr; - unsigned int width = IMG_WIDTH * sizeof(unsigned int); - unsigned int height = IMG_HEIGHT; - - if (argc == 2) { - input_file = argv[1]; - } else { - std::cerr << "Usage: Histogram.exe input_file" << std::endl; - std::cerr << "No input file specificed. Use default random value ...." - << std::endl; - } - - // ------------------------------------------------------------------------ - // Read in image luma plane - - // Allocate Input Buffer - queue q = esimd_test::createQueue(); - esimd_test::printTestLabel(q); - - esimd_test::shared_vector srcY_vec( - width * height, esimd_test::shared_allocator{q}); - esimd_test::shared_vector bins_vec( - NUM_BINS, esimd_test::shared_allocator{q}); - uint8_t *srcY = srcY_vec.data(); - ; - unsigned int *bins = bins_vec.data(); - - uint range_width = width / BLOCK_WIDTH; - uint range_height = height / BLOCK_HEIGHT; - - // Initializes input. - unsigned int input_size = width * height; - std::cerr << "Processing inputs\n"; - - if (input_file != nullptr) { - FILE *f = fopen(input_file, "rb"); - if (f == NULL) { - std::cerr << "Error opening file " << input_file; - std::exit(1); - } - - unsigned int cnt = fread(srcY, sizeof(unsigned char), input_size, f); - if (cnt != input_size) { - std::cerr << "Error reading input from " << input_file; - std::exit(1); - } - } else { - srand(2009); - for (int i = 0; i < input_size; ++i) { - srcY[i] = rand() % 256; - } - } - - for (int i = 0; i < NUM_BINS; i++) { - bins[i] = 0; - } - - // ------------------------------------------------------------------------ - // CPU Execution: - - unsigned int cpuHistogram[NUM_BINS]; - memset(cpuHistogram, 0, sizeof(cpuHistogram)); - histogram_CPU(width, height, srcY, cpuHistogram); - - sycl::image<2> Img(srcY, image_channel_order::rgba, - image_channel_type::unsigned_int32, - range<2>{width / sizeof(uint4), height}); - - // Start Timer - esimd_test::Timer timer; - double start; - - double kernel_times = 0; - unsigned num_iters = 10; - const bool profiling = - q.has_property(); - try { - // num_iters + 1, iteration#0 is for warmup - for (int iter = 0; iter <= num_iters; ++iter) { - double etime = 0; - for (int b = 0; b < NUM_BINS; b++) - bins[b] = 0; - // create ranges - // We need that many task groups - auto GlobalRange = range<1>(range_width * range_height); - // We need that many tasks in each group - auto LocalRange = range<1>(1); - nd_range<1> Range(GlobalRange, LocalRange); - - auto e = q.submit([&](handler &cgh) { - auto readAcc = Img.get_access(cgh); - - cgh.parallel_for( - Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL { - // Get thread origin offsets - uint tid = ndi.get_group(0); - uint h_pos = (tid % range_width) * BLOCK_WIDTH; - uint v_pos = (tid / range_width) * BLOCK_HEIGHT; - - // Declare a 8x32 uchar matrix to store the input block pixel - // value - simd in; - - // Declare a vector to store the local histogram - simd histogram(0); - - // Each thread handles BLOCK_HEIGHTxBLOCK_WIDTH pixel block - for (int y = 0; y < BLOCK_HEIGHT / 8; y++) { - // Perform 2D media block read to load 8x32 pixel block - in = media_block_load(readAcc, h_pos, - v_pos); - - // Accumulate local histogram for each pixel value -#pragma unroll - for (int i = 0; i < 8; i++) { -#pragma unroll - for (int j = 0; j < 32; j++) { - histogram.select<1, 1>(in[i * 32 + j]) += 1; - } - } - - // Update starting offset for the next work block - v_pos += 8; - } - - // Declare a vector to store the offset for atomic write operation - simd offset(0, 1); // init to 0, 1, 2, ..., 7 - offset *= sizeof(unsigned int); - - // Update global sum by atomically adding each local histogram -#pragma unroll - for (int i = 0; i < NUM_BINS; i += 8) { - // Declare a vector to store the source for atomic write - // operation - simd src; - src = histogram.select<8, 1>(i); - -#ifdef __SYCL_DEVICE_ONLY__ - // flat_atomic(bins, offset, src, 1); - atomic_write(bins, offset, - src); - offset += 8 * sizeof(unsigned int); -#else - simd vals; - vals.copy_from(bins + i); - vals = vals + src; - vals.copy_to(bins + i); -#endif - } - }); - }); - e.wait(); - if (profiling) { - etime = esimd_test::report_time("kernel time", e, e); - if (iter > 0) - kernel_times += etime; - } - if (iter == 0) - start = timer.Elapsed(); - } - - // SYCL will enqueue and run the kernel. Recall that the buffer's data is - // given back to the host at the end of scope. - // make sure data is given back to the host at the end of this scope - } catch (sycl::exception const &e) { - std::cout << "SYCL exception caught: " << e.what() << '\n'; - return 1; - } - - // End timer. - double end = timer.Elapsed(); - - esimd_test::display_timing_stats(profiling ? &kernel_times : nullptr, - num_iters, (end - start) * 1000); - - writeHist(bins); - writeHist(cpuHistogram); - // Checking Histogram - if (checkHistogram(cpuHistogram, bins)) { - std::cerr << "PASSED\n"; - return 0; - } else { - std::cerr << "FAILED\n"; - return 1; - } - - return 0; -} diff --git a/sycl/test-e2e/ESIMD/lit.local.cfg b/sycl/test-e2e/ESIMD/lit.local.cfg index b4164a1b42238..41191cc92a6c5 100644 --- a/sycl/test-e2e/ESIMD/lit.local.cfg +++ b/sycl/test-e2e/ESIMD/lit.local.cfg @@ -3,9 +3,6 @@ import platform config.unsupported_features += ['cuda', 'hip'] config.required_features += ['gpu'] -if 'gpu-intel-gen9' in config.available_features and platform.system() == 'Windows': - config.unsupported = True - # We need this to fix failures when run on OCL. # The current DG2 postcommit job only runs L0 anyway, # so there's no difference in coverage. diff --git a/sycl/test-e2e/ESIMD/vadd_raw_send.cpp b/sycl/test-e2e/ESIMD/vadd_raw_send.cpp deleted file mode 100644 index 072e94947b76d..0000000000000 --- a/sycl/test-e2e/ESIMD/vadd_raw_send.cpp +++ /dev/null @@ -1,186 +0,0 @@ -//==---------------- vadd_raw_send.cpp - DPC++ ESIMD on-device test--------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: gpu-intel-gen9 -// UNSUPPORTED: gpu-intel-dg1,gpu-intel-dg2,arch-intel_gpu_pvc -// RUN: %{build} -fno-sycl-esimd-force-stateless-mem -o %t1.out -// RUN: %{run} %t1.out -// RUN: %{build} -fno-sycl-esimd-force-stateless-mem -DUSE_CONSTEXPR_API -o %t2.out -// RUN: %{run} %t2.out -// RUN: %{build} -fno-sycl-esimd-force-stateless-mem -DUSE_SUPPORTED_API -o %t3.out -// RUN: %{run} %t3.out -// The test checks raw send functionality with block read/write implementation -// on SKL. It does not work on DG1 due to send instruction incompatibility. - -#include "esimd_test_utils.hpp" - -using namespace sycl; - -using namespace sycl::ext::intel; -using namespace sycl::ext::intel::esimd; - -template -ESIMD_INLINE simd dwaligned_block_read(AccessorTy acc, - unsigned int offset) { - simd src0; - simd oldDst; - - src0.select<1, 1>(2) = offset; - uint32_t exDesc = 0xA; - SurfaceIndex desc = esimd::get_surface_index(acc); - desc += 0x2284300; - constexpr uint8_t execSize = 0x84; - constexpr uint8_t sfid = 0x0; - constexpr uint8_t numSrc0 = 0x1; - constexpr uint8_t numDst = 0x2; -#ifdef USE_CONSTEXPR_API - return experimental::esimd::raw_send( - oldDst, src0, exDesc, desc); -#elif defined(USE_SUPPORTED_API) - return esimd::raw_send(oldDst, src0, exDesc, - desc); -#else - return experimental::esimd::raw_send(oldDst, src0, exDesc, desc, execSize, - sfid, numSrc0, numDst); -#endif -} - -template -ESIMD_INLINE void block_write1(AccessorTy acc, unsigned int offset, - simd data) { - simd src0; - - src0.template select<1, 1>(2) = offset >> 4; - uint32_t exDesc = 0x4A; - SurfaceIndex desc = esimd::get_surface_index(acc); - desc += 0x20A0200; - constexpr uint8_t execSize = 0x83; - constexpr uint8_t sfid = 0x0; - constexpr uint8_t numSrc0 = 0x1; - constexpr uint8_t numSrc1 = 0x1; -#ifdef USE_CONSTEXPR_API - return experimental::esimd::raw_sends( - src0, data, exDesc, desc); -#elif defined(USE_SUPPORTED_API) - return esimd::raw_sends(src0, data, exDesc, - desc); -#else - return experimental::esimd::raw_sends(src0, data, exDesc, desc, execSize, - sfid, numSrc0, numSrc1); -#endif -} - -template -ESIMD_INLINE void block_write2(AccessorTy acc, unsigned int offset, - simd data) { - simd src0; - auto src0_ref1 = - src0.template select<8, 1>(0).template bit_cast_view(); - auto src0_ref2 = src0.template select<8, 1>(8); - - src0_ref1.template select<1, 1>(2) = offset >> 4; - src0_ref2 = data.template bit_cast_view(); - uint32_t exDesc = 0xA; - SurfaceIndex desc = esimd::get_surface_index(acc); - desc += 0x40A0200; - constexpr uint8_t execSize = 0x83; - constexpr uint8_t sfid = 0x0; - constexpr uint8_t numSrc0 = 0x2; -#ifdef USE_CONSTEXPR_API - return experimental::esimd::raw_send(src0, exDesc, - desc); -#elif defined(USE_SUPPORTED_API) - return esimd::raw_send(src0, exDesc, desc); -#else - return experimental::esimd::raw_send(src0, exDesc, desc, execSize, sfid, - numSrc0); -#endif -} - -template int test(queue q) { - constexpr unsigned Size = 1024 * 128; - constexpr unsigned VL = sizeof(T) == 4 ? 16 : 32; - T *A = new T[Size]; - T *B = new T[Size]; - T *C = new T[Size]; - - for (unsigned i = 0; i < Size; ++i) { - A[i] = B[i] = i; - C[i] = 0; - } - - try { - buffer bufa(A, range<1>(Size)); - buffer bufb(B, range<1>(Size)); - buffer bufc(C, range<1>(Size)); - - // We need that many workgroups - range<1> GlobalRange{Size / VL}; - - // We need that many threads in each group - range<1> LocalRange{1}; - - auto e = q.submit([&](handler &cgh) { - auto PA = bufa.template get_access(cgh); - auto PB = bufb.template get_access(cgh); - auto PC = bufc.template get_access(cgh); - cgh.parallel_for( - GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL { - unsigned int offset = i * VL * sizeof(T); - simd va = dwaligned_block_read(PA, offset); - simd vb = dwaligned_block_read(PB, offset); - simd vc = va + vb; - constexpr int SIZE = VL / 2; - block_write1(PC, offset, vc.template select(0).read()); - offset += SIZE * sizeof(T); - block_write2(PC, offset, vc.template select(SIZE).read()); - }); - }); - e.wait(); - } catch (sycl::exception const &e) { - std::cout << "SYCL exception caught: " << e.what() << '\n'; - - delete[] A; - delete[] B; - delete[] C; - return 1; - } - - int err_cnt = 0; - - for (unsigned i = 0; i < Size; ++i) { - if (A[i] + B[i] != C[i]) { - if (++err_cnt < 10) { - std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i] - << " + " << B[i] << "\n"; - } - } - } - - delete[] A; - delete[] B; - delete[] C; - - std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n"); - return err_cnt; -} - -int main(void) { - - queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); - - auto dev = q.get_device(); - std::cout << "Running on " << dev.get_info() - << "\n"; - int err_cnt = 0; - - err_cnt += test(q); - if (dev.has(sycl::aspect::fp16)) { - err_cnt += test(q); - } - return err_cnt > 0 ? 1 : 0; -} diff --git a/sycl/test-e2e/Plugin/level_zero_device_free_mem.cpp b/sycl/test-e2e/Plugin/level_zero_device_free_mem.cpp index 560f427e88b13..d9423cfc82a35 100644 --- a/sycl/test-e2e/Plugin/level_zero_device_free_mem.cpp +++ b/sycl/test-e2e/Plugin/level_zero_device_free_mem.cpp @@ -5,7 +5,7 @@ // so requiring DG2. There may be more devices in our CI supporting this aspect. // REQUIRES: gpu-intel-dg2 // REQUIRES: level_zero, level_zero_dev_kit -// UNSUPPORTED: gpu-intel-gen9, gpu-intel-gen12 +// UNSUPPORTED: gpu-intel-gen12 // The query of free memory is not supported on integrated devices // // RUN: %{build} %level_zero_options -o %t.out diff --git a/sycl/test-e2e/Properties/cache_config.cpp b/sycl/test-e2e/Properties/cache_config.cpp index 72d5d68d3837e..0cda3e97a5d1f 100644 --- a/sycl/test-e2e/Properties/cache_config.cpp +++ b/sycl/test-e2e/Properties/cache_config.cpp @@ -1,9 +1,4 @@ - // REQUIRES: gpu, level_zero -// TODO: There is a bug on Windows Gen 9 with reductions -// which is not related to tested feature. Enable back when -// bug is fixed on Windows Gen9 -// UNSUPPORTED: gpu-intel-gen9 && windows // RUN: %{build} -o %t.out // RUN: env UR_L0_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s diff --git a/sycl/test-e2e/README.md b/sycl/test-e2e/README.md index 662d8d36cd608..44f96af580f7e 100644 --- a/sycl/test-e2e/README.md +++ b/sycl/test-e2e/README.md @@ -241,7 +241,6 @@ section below). All these features are related to HW detection and they should be considered deprecated, because we have HW auto-detection functionality in place. No new tests should use these features: - * **gpu-intel-gen9** - Intel GPU Gen9 availability; * **gpu-intel-gen11** - Intel GPU Gen11 availability; * **gpu-intel-gen12** - Intel GPU Gen12 availability; * **gpu-intel-dg1** - Intel GPU DG1 availability; diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py index c2582467dd4cf..9e2c6da4d044d 100644 --- a/sycl/test-e2e/lit.cfg.py +++ b/sycl/test-e2e/lit.cfg.py @@ -140,8 +140,6 @@ config.substitutions.append(("%sycl_include", config.sycl_include)) # Intel GPU FAMILY availability -if lit_config.params.get("gpu-intel-gen9", False): - config.available_features.add("gpu-intel-gen9") if lit_config.params.get("gpu-intel-gen11", False): config.available_features.add("gpu-intel-gen11") if lit_config.params.get("gpu-intel-gen12", False):