Integrate llvm-project at 2dc1a27449a9 (#14532)

* Reset third_party/llvm-project: 2dc1a27449a98cf18214174d626c29c7bb72c88f * AMDGPU: Some AMDGPULibCalls cleanups * remove builder from foldDynamicIndexList() * change vector type for SetVector * get int64_t value directly from a dense int attribute * remove LinalgToLLVM --------- Co-authored-by: Quinn Dawkins <[email protected]> Co-authored-by: Okwan Kwon <[email protected]>
iree-org · Aug 1, 2023 · 24e9133 · 24e9133
1 parent c888ed5
commit 24e9133
Show file tree

Hide file tree

Showing 32 changed files with 88 additions and 125 deletions.
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistribute.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistribute.cpp
@@ -32,12 +32,18 @@ struct GPUDistributePass : public GPUDistributeBase<GPUDistributePass> {
         getEntryPoint(funcOp)->getWorkgroupSize().value(),
         [&](Attribute attr) { return llvm::cast<IntegerAttr>(attr).getInt(); });
 
+    // TODO: Thread through subgroup size everywhere.
+    std::optional<llvm::APInt> maybeSubgroupSize =
+        getEntryPoint(funcOp)->getSubgroupSize();
+    // TODO: Don't hard code kCudaWarpSize here.
+    int64_t subgroupSize =
+        maybeSubgroupSize ? maybeSubgroupSize->getSExtValue() : 32;
+
     IRRewriter rewriter(funcOp->getContext());
     rewriter.setInsertionPointToStart(&funcOp.getBody().front());
     DiagnosedSilenceableFailure result =
         mlir::transform::gpu::mapNestedForallToThreadsImpl(
-            rewriter, std::nullopt, funcOp, workgroupSize, /*warpDims=*/{},
-            false);
+            rewriter, std::nullopt, funcOp, workgroupSize, subgroupSize, false);
     if (!result.succeeded())
       return signalPassFailure();
   }

diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTensorTile.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTensorTile.cpp
@@ -227,9 +227,9 @@ static LogicalResult tileParallelDims(func::FuncOp funcOp,
     SmallVector<Attribute> idDims;
     auto getThreadMapping = [&](int64_t dim) {
       return mlir::gpu::GPUThreadMappingAttr::get(
-          tilingOp->getContext(), dim == 0   ? mlir::gpu::Threads::DimX
-                                  : dim == 1 ? mlir::gpu::Threads::DimY
-                                             : mlir::gpu::Threads::DimZ);
+          tilingOp->getContext(), dim == 0   ? mlir::gpu::MappingId::DimX
+                                  : dim == 1 ? mlir::gpu::MappingId::DimY
+                                             : mlir::gpu::MappingId::DimZ);
     };
     for (unsigned loop : llvm::reverse(partitionedLoops)) {
       int64_t num = elementPerWorkgroup[id++];

diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp
@@ -458,9 +458,9 @@ LogicalResult rewriteForallToWorkgroup(RewriterBase &rewriter,
   MLIRContext *ctx = forallOp->getContext();
   Location loc = forallOp->getLoc();
   // TODO iree should have own device mapping like #hal.workgroup<x/y/z>
-  Attribute bX = gpu::GPUBlockMappingAttr::get(ctx, gpu::Blocks::DimX);
-  Attribute bY = gpu::GPUBlockMappingAttr::get(ctx, gpu::Blocks::DimY);
-  Attribute bZ = gpu::GPUBlockMappingAttr::get(ctx, gpu::Blocks::DimZ);
+  Attribute bX = gpu::GPUBlockMappingAttr::get(ctx, gpu::MappingId::DimX);
+  Attribute bY = gpu::GPUBlockMappingAttr::get(ctx, gpu::MappingId::DimY);
+  Attribute bZ = gpu::GPUBlockMappingAttr::get(ctx, gpu::MappingId::DimZ);
   if (forallOp.getNumResults() > 0)
     return forallOp->emitError(
         "only bufferized scf.forall lowers to workgroup");

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
@@ -135,7 +135,6 @@ iree_compiler_cc_library(
         "@llvm-project//mlir:LLVMCommonConversion",
         "@llvm-project//mlir:LLVMDialect",
         "@llvm-project//mlir:LinalgDialect",
-        "@llvm-project//mlir:LinalgToLLVM",
         "@llvm-project//mlir:LinalgTransforms",
         "@llvm-project//mlir:LinalgUtils",
         "@llvm-project//mlir:MathDialect",

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
@@ -107,7 +107,6 @@ iree_cc_library(
     MLIRLLVMCommonConversion
     MLIRLLVMDialect
     MLIRLinalgDialect
-    MLIRLinalgToLLVM
     MLIRLinalgTransforms
     MLIRLinalgUtils
     MLIRMathDialect

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
@@ -29,7 +29,6 @@
 #include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
 #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
-#include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h"
 #include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
 #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
 #include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
@@ -1051,7 +1050,6 @@ void ConvertToLLVMPass::runOnOperation() {
   populateVectorToLLVMMatrixConversionPatterns(typeConverter, patterns);
   populateVectorToLLVMConversionPatterns(
       typeConverter, patterns, targetReassociateFpReductions.getValue());
-  populateLinalgToLLVMConversionPatterns(typeConverter, patterns);
   populateReconcileUnrealizedCastsPatterns(patterns);
 
   HALDispatchABI abi(&typeConverter);

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
@@ -102,35 +102,19 @@ transform_dialect::MapNestedForallToGpuThreadsOp::applyToOne(
   rewriter.setInsertionPointToStart(&target.getBody().front());
   DiagnosedSilenceableFailure diag =
       mlir::transform::gpu::mapNestedForallToThreadsImpl(
-          rewriter, transformOp, target, getWorkgroupDims(), getWarpDims(),
+          rewriter, transformOp, target, getWorkgroupDims(), getSubgroupSize(),
           true);
   if (!diag.succeeded())
     return diag;
   auto newAttr = rewriter.getIndexArrayAttr(getWorkgroupDims());
+  auto subgroupSizeAttr = rewriter.getIndexAttr(getSubgroupSize());
   rewriter.startRootUpdate(exportOp);
   exportOp->setAttr(exportOp.getWorkgroupSizeAttrName(), newAttr);
-  if (std::optional<int64_t> subgroupSize = getSubgroupSize()) {
-    auto subgroupSizeAttr = rewriter.getIndexAttr(*subgroupSize);
-    exportOp->setAttr(exportOp.getSubgroupSizeAttrName(), subgroupSizeAttr);
-  }
+  exportOp->setAttr(exportOp.getSubgroupSizeAttrName(), subgroupSizeAttr);
   rewriter.finalizeRootUpdate(exportOp);
   return DiagnosedSilenceableFailure::success();
 }
 
-void transform_dialect::MapNestedForallToGpuThreadsOp::build(
-    OpBuilder &builder, OperationState &state, Value target,
-    ArrayRef<int64_t> workgroupDims, ArrayRef<int64_t> warpDims) {
-  build(builder, state, {}, target, workgroupDims, warpDims, IntegerAttr());
-}
-
-void transform_dialect::MapNestedForallToGpuThreadsOp::build(
-    OpBuilder &builder, OperationState &state, Value target,
-    ArrayRef<int64_t> workgroupDims, ArrayRef<int64_t> warpDims,
-    int64_t subgroupSize) {
-  build(builder, state, {}, target, workgroupDims, warpDims,
-        builder.getI64IntegerAttr(subgroupSize));
-}
-
 void transform_dialect::MapNestedForallToGpuThreadsOp::getEffects(
     SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
   transform::onlyReadsHandle(getTarget(), effects);

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
@@ -89,29 +89,18 @@ def MapNestedForallToGpuThreadsOp :
 
   let arguments = (ins TransformHandleTypeInterface:$target,
                    DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$workgroup_dims,
-                   DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$warp_dims,
-                   OptionalAttr<I64Attr>:$subgroup_size);
+                   DefaultValuedOptionalAttr<I64Attr, "32">:$subgroup_size);
   let results = (outs);
 
   let assemblyFormat = [{
     $target
     `workgroup_dims` `=` $workgroup_dims
-    (`warp_dims` `=` $warp_dims^)?
     (`subgroup_size` `=` $subgroup_size^)?
     attr-dict
     `:` functional-type($target, results)
   }];
   let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect";
 
-  let builders = [
-    OpBuilder<(ins "Value":$target,
-                   "ArrayRef<int64_t>":$workgroup_dims,
-                   "ArrayRef<int64_t>":$warp_dims)>,
-    OpBuilder<(ins "Value":$target,
-                   "ArrayRef<int64_t>":$workgroup_dims,
-                   "ArrayRef<int64_t>":$warp_dims,
-                   "int64_t":$subgroupSize)>
-  ];
   let extraClassDeclaration = [{
     ::mlir::DiagnosedSilenceableFailure applyToOne(
         ::mlir::transform::TransformRewriter &rewriter,

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.cpp
@@ -114,8 +114,7 @@ static Value getMaskValue(RewriterBase &rewriter, Operation *op) {
   vector::ExtractOp maybeExtractOp = maskResult.maybeExtractOp;
   if (maybeExtractOp) {
     assert(maybeExtractOp.getPosition().size() == 1 && "expected single pos");
-    int64_t sliceNum =
-        llvm::cast<IntegerAttr>(maybeExtractOp.getPosition()[0]).getInt();
+    int64_t sliceNum = maybeExtractOp.getPosition()[0];
     // TODO: to support >2-D mask + extract, and all the cmp.
     Location loc = op->getLoc();
     Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention.mlir
@@ -76,7 +76,7 @@ transform.sequence failures(propagate) {
 
   // Tile and fuse attention ops
   // ==========================================
-  %forall, %tiled_matmul = transform.structured.tile_to_forall_op %promoted_second_matmul tile_sizes [32] (mapping = [#gpu.warp<x>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+  %forall, %tiled_matmul = transform.structured.tile_to_forall_op %promoted_second_matmul tile_sizes [32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
   %f0, %loop0 = transform.structured.fuse_into_containing_op %scale_acc into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
   %f1, %loop1 = transform.structured.fuse_into_containing_op %truncate into %loop0 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
@@ -101,7 +101,7 @@ transform.sequence failures(propagate) {
   // Distribute fills and last truncate
   // ==========================================
   %fills = transform.merge_handles %acc_fill, %max_fill, %sum_fill, %last_truncate : !transform.any_op
-  %fill_grid, %tiled_fill = transform.structured.tile_to_forall_op %fills tile_sizes[32] (mapping = [#gpu.warp<x>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+  %fill_grid, %tiled_fill = transform.structured.tile_to_forall_op %fills tile_sizes[32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
 
   // Vectorize function
   // ==========================================
@@ -137,7 +137,7 @@ transform.sequence failures(propagate) {
   // ===========================================================================
   %func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
   transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
-  transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 4] warp_dims = [4, 1, 1] : (!transform.any_op) -> ()
+  transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 4] subgroup_size = 32 : (!transform.any_op) -> ()
 
   transform.apply_patterns to %func_7 {
      transform.apply_patterns.memref.fold_memref_alias_ops
@@ -158,7 +158,7 @@ transform.sequence failures(propagate) {
 
 // CHECK-DAG:  #[[MAP:.+]] = affine_map<()[s0] -> (s0 * 128)>
 // CHECK-DAG:  #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-// CHECK-DAG:  #[[MAP2:.+]] = affine_map<()[s0, s1, s2] -> (s2 * 32 + ((s0 + s1 * 4) floordiv 32) * 32 - ((s2 + (s0 + s1 * 4) floordiv 32) floordiv 4) * 128)>
+// CHECK-DAG:  #[[MAP2:.+]] = affine_map<()[s0, s1, s2] -> (s2 * 32 + ((s0 + s1 * 4) floordiv 32) * 32)>
 // CHECK-DAG:  #[[MAP3:.+]] = affine_map<(d0, d1) -> (d0, d1)>
 // CHECK-DAG:  #[[MAP4:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
 // CHECK-DAG:  #[[MAP5:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/linalg_transform.mlir
@@ -34,7 +34,7 @@ module attributes {hal.device.targets = [#device_target_cuda]} {
           // CHECK-NEXT: return
 
           // workgroup_size is explicitly set to [10, 11].
-          // FOREACH-TO-GPU-DAG: hal.executable.export {{.*}}{translation_info = #translation, workgroup_size = [10 : index, 11 : index, 1 : index]}
+          // FOREACH-TO-GPU-DAG: hal.executable.export {{.*}}{subgroup_size = 32 : index, translation_info = #translation, workgroup_size = [10 : index, 11 : index, 1 : index]}
           // FOREACH-TO-GPU-DAG: %[[C0:.*]] = arith.constant 0 : index
           // FOREACH-TO-GPU-DAG: %[[C1:.*]] = arith.constant 1 : index
           // FOREACH-TO-GPU-DAG: %[[C5:.*]] = arith.constant 5 : index

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_batch_matmul.mlir
@@ -114,24 +114,24 @@ module attributes {hal.device.targets = [#device_target_cuda]} {
 // CHECK:   %[[RHS_DPS:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RHS]]
 
 // CHECK:   transform.structured.tile_to_forall_op %[[LHS]] 
-// DEFAULT:  num_threads [1, 32, 4] tile_sizes [](mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>])
-// OPTIONS:  num_threads [1, 64, 2] tile_sizes [](mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>])
+// DEFAULT:  num_threads [1, 32, 4] tile_sizes [](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
+// OPTIONS:  num_threads [1, 64, 2] tile_sizes [](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
 // CHECK:   apply_patterns
 // CHECK:   transform.iree.apply_licm
 // CHECK:   transform.iree.apply_cse
 // CHECK:   transform.structured.match ops{["scf.if"]}
 // CHECK:   transform.scf.take_assumed_branch %{{.*}} take_else_branch
 
 // CHECK:   transform.structured.tile_to_forall_op %[[RHS_DPS]]  
-// DEFAULT:  num_threads [8, 16, 1] tile_sizes [](mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>])
-// OPTIONS:  num_threads [2, 8, 8] tile_sizes [](mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>])
+// DEFAULT:  num_threads [8, 16, 1] tile_sizes [](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
+// OPTIONS:  num_threads [2, 8, 8] tile_sizes [](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
 // CHECK:   apply_patterns 
 // CHECK:   transform.iree.apply_licm
 // CHECK:   transform.iree.apply_cse
 
 // CHECK:   transform.structured.tile_to_forall_op
-// DEFAULT:  num_threads [2, 64, 1] tile_sizes [](mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>])
-// OPTIONS:  num_threads [1, 16, 8] tile_sizes [](mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>])
+// DEFAULT:  num_threads [2, 64, 1] tile_sizes [](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
+// OPTIONS:  num_threads [1, 16, 8] tile_sizes [](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
 // CHECK:   apply_patterns
 // CHECK:   transform.iree.apply_licm
 // CHECK:   transform.iree.apply_cse
@@ -175,8 +175,8 @@ module attributes {hal.device.targets = [#device_target_cuda]} {
 // CHECK:   transform.iree.apply_buffer_optimizations
 // CHECK:   transform.iree.forall_to_workgroup
 // CHECK:   transform.iree.map_nested_forall_to_gpu_threads
-// DEFAULT:  workgroup_dims = [64, 2, 1] warp_dims = [2, 2, 1]
-// OPTIONS:  workgroup_dims = [32, 4, 1] warp_dims = [1, 4, 1]
+// DEFAULT:  workgroup_dims = [64, 2, 1]
+// OPTIONS:  workgroup_dims = [32, 4, 1]
 // CHECK:   transform.iree.eliminate_gpu_barriers
 // CHECK:   apply_patterns
 // CHECK:   transform.iree.apply_licm

diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_convolution.mlir
@@ -49,8 +49,8 @@ hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb",
 // CHECK: %[[LHS:.+]] = get_producer_of_operand %{{.*}}[0]
 // CHECK: %[[RHS:.+]] = get_producer_of_operand %{{.*}}[1]
 // CHECK: transform.structured.rewrite_in_destination_passing_style %[[LHS]]
-// CHECK: transform.structured.tile_to_forall_op %{{.*}}   num_threads [32, 4] tile_sizes [](mapping = [#gpu.linear<y>, #gpu.linear<x>])
-// CHECK: transform.structured.tile_to_forall_op %[[RHS]]   num_threads [1, 4, 32] tile_sizes [](mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>])
+// CHECK: transform.structured.tile_to_forall_op %{{.*}}   num_threads [32, 4] tile_sizes [](mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
+// CHECK: transform.structured.tile_to_forall_op %[[RHS]]   num_threads [1, 4, 32] tile_sizes [](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
 // CHECK: transform.structured.tile_to_forall_op %{{.*}}   num_threads [1, 2, 2] tile_sizes [](mapping = [#gpu.warp<z>, #gpu.warp<y>, #gpu.warp<x>])
 // CHECK: transform.structured.tile_to_forall_op %{{.*}}   num_threads [1, 2, 2] tile_sizes [](mapping = [#gpu.warp<z>, #gpu.warp<y>, #gpu.warp<x>])
 // CHECK: transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
@@ -61,7 +61,7 @@ hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb",
 // CHECK: transform.iree.bufferize {target_gpu}
 // CHECK: transform.iree.apply_buffer_optimizations
 // CHECK: transform.iree.forall_to_workgroup
-// CHECK: transform.iree.map_nested_forall_to_gpu_threads %{{.*}} workgroup_dims = [64, 2, 1] warp_dims = [2, 2, 1]
+// CHECK: transform.iree.map_nested_forall_to_gpu_threads %{{.*}} workgroup_dims = [64, 2, 1]
 // CHECK: transform.iree.hoist_static_alloc %{{.*}}
 // CHECK: transform.apply_patterns.memref.fold_memref_alias_ops
 // CHECK: transform.apply_patterns.memref.extract_address_computations
@@ -108,11 +108,11 @@ hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb",
 // CHECK: %[[LHS:.+]] = get_producer_of_operand %{{.*}}[0]
 // CHECK: %[[RHS:.+]] = get_producer_of_operand %{{.*}}[1]
 // CHECK: transform.structured.rewrite_in_destination_passing_style %[[RHS]]
-// CHECK: transform.structured.tile_to_forall_op %[[LHS]]   num_threads [1, 32, 4] tile_sizes [](mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>])
-// CHECK: transform.structured.tile_to_forall_op %{{.*}}   num_threads [4, 32] tile_sizes [](mapping = [#gpu.linear<y>, #gpu.linear<x>])
+// CHECK: transform.structured.tile_to_forall_op %[[LHS]]   num_threads [1, 32, 4] tile_sizes [](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
+// CHECK: transform.structured.tile_to_forall_op %{{.*}}   num_threads [4, 32] tile_sizes [](mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
 // CHECK: transform.structured.tile_to_forall_op %{{.*}}   num_threads [1, 2, 2] tile_sizes [](mapping = [#gpu.warp<z>, #gpu.warp<y>, #gpu.warp<x>])
 // CHECK: transform.structured.tile_to_forall_op %{{.*}}   num_threads [1, 2, 2] tile_sizes [](mapping = [#gpu.warp<z>, #gpu.warp<y>, #gpu.warp<x>])
-// CHECK: transform.iree.map_nested_forall_to_gpu_threads %{{.*}} workgroup_dims = [64, 2, 1] warp_dims = [2, 2, 1]
+// CHECK: transform.iree.map_nested_forall_to_gpu_threads %{{.*}} workgroup_dims = [64, 2, 1]
 
 
 // -----