Skip to content

Commit

Permalink
Integrate llvm-project at 2dc1a27449a9 (#14532)
Browse files Browse the repository at this point in the history
* Reset third_party/llvm-project:
2dc1a27449a98cf18214174d626c29c7bb72c88f
* AMDGPU: Some AMDGPULibCalls cleanups
* remove builder from foldDynamicIndexList()
* change vector type for SetVector
* get int64_t value directly from a dense int attribute
* remove LinalgToLLVM

---------

Co-authored-by: Quinn Dawkins <[email protected]>
Co-authored-by: Okwan Kwon <[email protected]>
  • Loading branch information
3 people committed Aug 1, 2023
1 parent c888ed5 commit 24e9133
Show file tree
Hide file tree
Showing 32 changed files with 88 additions and 125 deletions.
10 changes: 8 additions & 2 deletions compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistribute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,18 @@ struct GPUDistributePass : public GPUDistributeBase<GPUDistributePass> {
getEntryPoint(funcOp)->getWorkgroupSize().value(),
[&](Attribute attr) { return llvm::cast<IntegerAttr>(attr).getInt(); });

// TODO: Thread through subgroup size everywhere.
std::optional<llvm::APInt> maybeSubgroupSize =
getEntryPoint(funcOp)->getSubgroupSize();
// TODO: Don't hard code kCudaWarpSize here.
int64_t subgroupSize =
maybeSubgroupSize ? maybeSubgroupSize->getSExtValue() : 32;

IRRewriter rewriter(funcOp->getContext());
rewriter.setInsertionPointToStart(&funcOp.getBody().front());
DiagnosedSilenceableFailure result =
mlir::transform::gpu::mapNestedForallToThreadsImpl(
rewriter, std::nullopt, funcOp, workgroupSize, /*warpDims=*/{},
false);
rewriter, std::nullopt, funcOp, workgroupSize, subgroupSize, false);
if (!result.succeeded())
return signalPassFailure();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -227,9 +227,9 @@ static LogicalResult tileParallelDims(func::FuncOp funcOp,
SmallVector<Attribute> idDims;
auto getThreadMapping = [&](int64_t dim) {
return mlir::gpu::GPUThreadMappingAttr::get(
tilingOp->getContext(), dim == 0 ? mlir::gpu::Threads::DimX
: dim == 1 ? mlir::gpu::Threads::DimY
: mlir::gpu::Threads::DimZ);
tilingOp->getContext(), dim == 0 ? mlir::gpu::MappingId::DimX
: dim == 1 ? mlir::gpu::MappingId::DimY
: mlir::gpu::MappingId::DimZ);
};
for (unsigned loop : llvm::reverse(partitionedLoops)) {
int64_t num = elementPerWorkgroup[id++];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -458,9 +458,9 @@ LogicalResult rewriteForallToWorkgroup(RewriterBase &rewriter,
MLIRContext *ctx = forallOp->getContext();
Location loc = forallOp->getLoc();
// TODO iree should have own device mapping like #hal.workgroup<x/y/z>
Attribute bX = gpu::GPUBlockMappingAttr::get(ctx, gpu::Blocks::DimX);
Attribute bY = gpu::GPUBlockMappingAttr::get(ctx, gpu::Blocks::DimY);
Attribute bZ = gpu::GPUBlockMappingAttr::get(ctx, gpu::Blocks::DimZ);
Attribute bX = gpu::GPUBlockMappingAttr::get(ctx, gpu::MappingId::DimX);
Attribute bY = gpu::GPUBlockMappingAttr::get(ctx, gpu::MappingId::DimY);
Attribute bZ = gpu::GPUBlockMappingAttr::get(ctx, gpu::MappingId::DimZ);
if (forallOp.getNumResults() > 0)
return forallOp->emitError(
"only bufferized scf.forall lowers to workgroup");
Expand Down
1 change: 0 additions & 1 deletion compiler/src/iree/compiler/Codegen/LLVMCPU/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,6 @@ iree_compiler_cc_library(
"@llvm-project//mlir:LLVMCommonConversion",
"@llvm-project//mlir:LLVMDialect",
"@llvm-project//mlir:LinalgDialect",
"@llvm-project//mlir:LinalgToLLVM",
"@llvm-project//mlir:LinalgTransforms",
"@llvm-project//mlir:LinalgUtils",
"@llvm-project//mlir:MathDialect",
Expand Down
1 change: 0 additions & 1 deletion compiler/src/iree/compiler/Codegen/LLVMCPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,6 @@ iree_cc_library(
MLIRLLVMCommonConversion
MLIRLLVMDialect
MLIRLinalgDialect
MLIRLinalgToLLVM
MLIRLinalgTransforms
MLIRLinalgUtils
MLIRMathDialect
Expand Down
2 changes: 0 additions & 2 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/ConvertToLLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h"
#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
Expand Down Expand Up @@ -1051,7 +1050,6 @@ void ConvertToLLVMPass::runOnOperation() {
populateVectorToLLVMMatrixConversionPatterns(typeConverter, patterns);
populateVectorToLLVMConversionPatterns(
typeConverter, patterns, targetReassociateFpReductions.getValue());
populateLinalgToLLVMConversionPatterns(typeConverter, patterns);
populateReconcileUnrealizedCastsPatterns(patterns);

HALDispatchABI abi(&typeConverter);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,35 +102,19 @@ transform_dialect::MapNestedForallToGpuThreadsOp::applyToOne(
rewriter.setInsertionPointToStart(&target.getBody().front());
DiagnosedSilenceableFailure diag =
mlir::transform::gpu::mapNestedForallToThreadsImpl(
rewriter, transformOp, target, getWorkgroupDims(), getWarpDims(),
rewriter, transformOp, target, getWorkgroupDims(), getSubgroupSize(),
true);
if (!diag.succeeded())
return diag;
auto newAttr = rewriter.getIndexArrayAttr(getWorkgroupDims());
auto subgroupSizeAttr = rewriter.getIndexAttr(getSubgroupSize());
rewriter.startRootUpdate(exportOp);
exportOp->setAttr(exportOp.getWorkgroupSizeAttrName(), newAttr);
if (std::optional<int64_t> subgroupSize = getSubgroupSize()) {
auto subgroupSizeAttr = rewriter.getIndexAttr(*subgroupSize);
exportOp->setAttr(exportOp.getSubgroupSizeAttrName(), subgroupSizeAttr);
}
exportOp->setAttr(exportOp.getSubgroupSizeAttrName(), subgroupSizeAttr);
rewriter.finalizeRootUpdate(exportOp);
return DiagnosedSilenceableFailure::success();
}

void transform_dialect::MapNestedForallToGpuThreadsOp::build(
OpBuilder &builder, OperationState &state, Value target,
ArrayRef<int64_t> workgroupDims, ArrayRef<int64_t> warpDims) {
build(builder, state, {}, target, workgroupDims, warpDims, IntegerAttr());
}

void transform_dialect::MapNestedForallToGpuThreadsOp::build(
OpBuilder &builder, OperationState &state, Value target,
ArrayRef<int64_t> workgroupDims, ArrayRef<int64_t> warpDims,
int64_t subgroupSize) {
build(builder, state, {}, target, workgroupDims, warpDims,
builder.getI64IntegerAttr(subgroupSize));
}

void transform_dialect::MapNestedForallToGpuThreadsOp::getEffects(
SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
transform::onlyReadsHandle(getTarget(), effects);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,29 +89,18 @@ def MapNestedForallToGpuThreadsOp :

let arguments = (ins TransformHandleTypeInterface:$target,
DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$workgroup_dims,
DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$warp_dims,
OptionalAttr<I64Attr>:$subgroup_size);
DefaultValuedOptionalAttr<I64Attr, "32">:$subgroup_size);
let results = (outs);

let assemblyFormat = [{
$target
`workgroup_dims` `=` $workgroup_dims
(`warp_dims` `=` $warp_dims^)?
(`subgroup_size` `=` $subgroup_size^)?
attr-dict
`:` functional-type($target, results)
}];
let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect";

let builders = [
OpBuilder<(ins "Value":$target,
"ArrayRef<int64_t>":$workgroup_dims,
"ArrayRef<int64_t>":$warp_dims)>,
OpBuilder<(ins "Value":$target,
"ArrayRef<int64_t>":$workgroup_dims,
"ArrayRef<int64_t>":$warp_dims,
"int64_t":$subgroupSize)>
];
let extraClassDeclaration = [{
::mlir::DiagnosedSilenceableFailure applyToOne(
::mlir::transform::TransformRewriter &rewriter,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,7 @@ static Value getMaskValue(RewriterBase &rewriter, Operation *op) {
vector::ExtractOp maybeExtractOp = maskResult.maybeExtractOp;
if (maybeExtractOp) {
assert(maybeExtractOp.getPosition().size() == 1 && "expected single pos");
int64_t sliceNum =
llvm::cast<IntegerAttr>(maybeExtractOp.getPosition()[0]).getInt();
int64_t sliceNum = maybeExtractOp.getPosition()[0];
// TODO: to support >2-D mask + extract, and all the cmp.
Location loc = op->getLoc();
Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ transform.sequence failures(propagate) {

// Tile and fuse attention ops
// ==========================================
%forall, %tiled_matmul = transform.structured.tile_to_forall_op %promoted_second_matmul tile_sizes [32] (mapping = [#gpu.warp<x>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
%forall, %tiled_matmul = transform.structured.tile_to_forall_op %promoted_second_matmul tile_sizes [32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

%f0, %loop0 = transform.structured.fuse_into_containing_op %scale_acc into %forall : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
%f1, %loop1 = transform.structured.fuse_into_containing_op %truncate into %loop0 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
Expand All @@ -101,7 +101,7 @@ transform.sequence failures(propagate) {
// Distribute fills and last truncate
// ==========================================
%fills = transform.merge_handles %acc_fill, %max_fill, %sum_fill, %last_truncate : !transform.any_op
%fill_grid, %tiled_fill = transform.structured.tile_to_forall_op %fills tile_sizes[32] (mapping = [#gpu.warp<x>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
%fill_grid, %tiled_fill = transform.structured.tile_to_forall_op %fills tile_sizes[32] (mapping = [#gpu.warp<linear_dim_0>]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

// Vectorize function
// ==========================================
Expand Down Expand Up @@ -137,7 +137,7 @@ transform.sequence failures(propagate) {
// ===========================================================================
%func_7 = transform.structured.match ops{["func.func"]} in %variant_op_3 : (!transform.any_op) -> !transform.any_op
transform.iree.forall_to_workgroup %func_7 : (!transform.any_op) -> ()
transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 4] warp_dims = [4, 1, 1] : (!transform.any_op) -> ()
transform.iree.map_nested_forall_to_gpu_threads %func_7 workgroup_dims = [4, 8, 4] subgroup_size = 32 : (!transform.any_op) -> ()

transform.apply_patterns to %func_7 {
transform.apply_patterns.memref.fold_memref_alias_ops
Expand All @@ -158,7 +158,7 @@ transform.sequence failures(propagate) {

// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0] -> (s0 * 128)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
// CHECK-DAG: #[[MAP2:.+]] = affine_map<()[s0, s1, s2] -> (s2 * 32 + ((s0 + s1 * 4) floordiv 32) * 32 - ((s2 + (s0 + s1 * 4) floordiv 32) floordiv 4) * 128)>
// CHECK-DAG: #[[MAP2:.+]] = affine_map<()[s0, s1, s2] -> (s2 * 32 + ((s0 + s1 * 4) floordiv 32) * 32)>
// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0, d1) -> (d0, d1)>
// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ module attributes {hal.device.targets = [#device_target_cuda]} {
// CHECK-NEXT: return

// workgroup_size is explicitly set to [10, 11].
// FOREACH-TO-GPU-DAG: hal.executable.export {{.*}}{translation_info = #translation, workgroup_size = [10 : index, 11 : index, 1 : index]}
// FOREACH-TO-GPU-DAG: hal.executable.export {{.*}}{subgroup_size = 32 : index, translation_info = #translation, workgroup_size = [10 : index, 11 : index, 1 : index]}
// FOREACH-TO-GPU-DAG: %[[C0:.*]] = arith.constant 0 : index
// FOREACH-TO-GPU-DAG: %[[C1:.*]] = arith.constant 1 : index
// FOREACH-TO-GPU-DAG: %[[C5:.*]] = arith.constant 5 : index
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,24 +114,24 @@ module attributes {hal.device.targets = [#device_target_cuda]} {
// CHECK: %[[RHS_DPS:.+]] = transform.structured.rewrite_in_destination_passing_style %[[RHS]]

// CHECK: transform.structured.tile_to_forall_op %[[LHS]]
// DEFAULT: num_threads [1, 32, 4] tile_sizes [](mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>])
// OPTIONS: num_threads [1, 64, 2] tile_sizes [](mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>])
// DEFAULT: num_threads [1, 32, 4] tile_sizes [](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
// OPTIONS: num_threads [1, 64, 2] tile_sizes [](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
// CHECK: apply_patterns
// CHECK: transform.iree.apply_licm
// CHECK: transform.iree.apply_cse
// CHECK: transform.structured.match ops{["scf.if"]}
// CHECK: transform.scf.take_assumed_branch %{{.*}} take_else_branch

// CHECK: transform.structured.tile_to_forall_op %[[RHS_DPS]]
// DEFAULT: num_threads [8, 16, 1] tile_sizes [](mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>])
// OPTIONS: num_threads [2, 8, 8] tile_sizes [](mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>])
// DEFAULT: num_threads [8, 16, 1] tile_sizes [](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
// OPTIONS: num_threads [2, 8, 8] tile_sizes [](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
// CHECK: apply_patterns
// CHECK: transform.iree.apply_licm
// CHECK: transform.iree.apply_cse

// CHECK: transform.structured.tile_to_forall_op
// DEFAULT: num_threads [2, 64, 1] tile_sizes [](mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>])
// OPTIONS: num_threads [1, 16, 8] tile_sizes [](mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>])
// DEFAULT: num_threads [2, 64, 1] tile_sizes [](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
// OPTIONS: num_threads [1, 16, 8] tile_sizes [](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
// CHECK: apply_patterns
// CHECK: transform.iree.apply_licm
// CHECK: transform.iree.apply_cse
Expand Down Expand Up @@ -175,8 +175,8 @@ module attributes {hal.device.targets = [#device_target_cuda]} {
// CHECK: transform.iree.apply_buffer_optimizations
// CHECK: transform.iree.forall_to_workgroup
// CHECK: transform.iree.map_nested_forall_to_gpu_threads
// DEFAULT: workgroup_dims = [64, 2, 1] warp_dims = [2, 2, 1]
// OPTIONS: workgroup_dims = [32, 4, 1] warp_dims = [1, 4, 1]
// DEFAULT: workgroup_dims = [64, 2, 1]
// OPTIONS: workgroup_dims = [32, 4, 1]
// CHECK: transform.iree.eliminate_gpu_barriers
// CHECK: apply_patterns
// CHECK: transform.iree.apply_licm
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb",
// CHECK: %[[LHS:.+]] = get_producer_of_operand %{{.*}}[0]
// CHECK: %[[RHS:.+]] = get_producer_of_operand %{{.*}}[1]
// CHECK: transform.structured.rewrite_in_destination_passing_style %[[LHS]]
// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [32, 4] tile_sizes [](mapping = [#gpu.linear<y>, #gpu.linear<x>])
// CHECK: transform.structured.tile_to_forall_op %[[RHS]] num_threads [1, 4, 32] tile_sizes [](mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>])
// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [32, 4] tile_sizes [](mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
// CHECK: transform.structured.tile_to_forall_op %[[RHS]] num_threads [1, 4, 32] tile_sizes [](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [1, 2, 2] tile_sizes [](mapping = [#gpu.warp<z>, #gpu.warp<y>, #gpu.warp<x>])
// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [1, 2, 2] tile_sizes [](mapping = [#gpu.warp<z>, #gpu.warp<y>, #gpu.warp<x>])
// CHECK: transform.apply_patterns.iree.fold_reshape_into_tensor_hal_interface
Expand All @@ -61,7 +61,7 @@ hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb",
// CHECK: transform.iree.bufferize {target_gpu}
// CHECK: transform.iree.apply_buffer_optimizations
// CHECK: transform.iree.forall_to_workgroup
// CHECK: transform.iree.map_nested_forall_to_gpu_threads %{{.*}} workgroup_dims = [64, 2, 1] warp_dims = [2, 2, 1]
// CHECK: transform.iree.map_nested_forall_to_gpu_threads %{{.*}} workgroup_dims = [64, 2, 1]
// CHECK: transform.iree.hoist_static_alloc %{{.*}}
// CHECK: transform.apply_patterns.memref.fold_memref_alias_ops
// CHECK: transform.apply_patterns.memref.extract_address_computations
Expand Down Expand Up @@ -108,11 +108,11 @@ hal.executable.variant public @cuda_nvptx_fb, target = <"cuda", "cuda-nvptx-fb",
// CHECK: %[[LHS:.+]] = get_producer_of_operand %{{.*}}[0]
// CHECK: %[[RHS:.+]] = get_producer_of_operand %{{.*}}[1]
// CHECK: transform.structured.rewrite_in_destination_passing_style %[[RHS]]
// CHECK: transform.structured.tile_to_forall_op %[[LHS]] num_threads [1, 32, 4] tile_sizes [](mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>])
// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [4, 32] tile_sizes [](mapping = [#gpu.linear<y>, #gpu.linear<x>])
// CHECK: transform.structured.tile_to_forall_op %[[LHS]] num_threads [1, 32, 4] tile_sizes [](mapping = [#gpu.thread<linear_dim_2>, #gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [4, 32] tile_sizes [](mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>])
// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [1, 2, 2] tile_sizes [](mapping = [#gpu.warp<z>, #gpu.warp<y>, #gpu.warp<x>])
// CHECK: transform.structured.tile_to_forall_op %{{.*}} num_threads [1, 2, 2] tile_sizes [](mapping = [#gpu.warp<z>, #gpu.warp<y>, #gpu.warp<x>])
// CHECK: transform.iree.map_nested_forall_to_gpu_threads %{{.*}} workgroup_dims = [64, 2, 1] warp_dims = [2, 2, 1]
// CHECK: transform.iree.map_nested_forall_to_gpu_threads %{{.*}} workgroup_dims = [64, 2, 1]


// -----
Expand Down
Loading

0 comments on commit 24e9133

Please sign in to comment.