28 changes: 28 additions & 0 deletions test/TritonGPU/amd/amd-update-async-wait-count.mlir
@@ -29,6 +29,34 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ

// -----

// Simple case with amdgpu.buffer_load_to_local

#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [32, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 8, perPhase = 8, maxPhase = 2, order = [1, 0]}>
#shared1 = #ttg.padded_shared<[4:+4] {order = [1, 0], shape = [16, 256]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
// CHECK-LABEL: simple_buffer_load_to_local_waitcnt
tt.func public @simple_buffer_load_to_local_waitcnt(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: tensor<128x16xi32, #blocked> {tt.contiguity = dense<16> : tensor<2xi32>, tt.divisibility = dense<16> : tensor<2xi32>}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg3: tensor<16x256xi32, #blocked1> {tt.contiguity = dense<16> : tensor<2xi32>, tt.divisibility = dense<16> : tensor<2xi32>}, %arg4: !ttg.memdesc<128x16xf16, #shared, #smem, mutable>, %arg5: !ttg.memdesc<16x256xf16, #shared1, #smem, mutable>) {
// Emits 1 direct to lds instruction
%0 = amdgpu.buffer_load_to_local %arg0[%arg1] into %arg4 : <f16>[tensor<128x16xi32, #blocked>] -> <128x16xf16, #shared, #smem, mutable>
%1 = ttg.async_commit_group tokens %0
// Emits 2 direct to lds instructions
%2 = amdgpu.buffer_load_to_local %arg2[%arg3] into %arg5 : <f16>[tensor<16x256xi32, #blocked1>] -> <16x256xf16, #shared1, #smem, mutable>
// Do not wait on the second buffer_load_to_local => waitcnt 2
// CHECK: amdgpu.async_wait {{.*}} {num_inst = 2
%3 = ttg.async_commit_group tokens %2
%4 = ttg.async_wait %1 {num = 0 : i32}
// No buffer_load_to_local in between => waitcnt 0
// CHECK: amdgpu.async_wait {{.*}} {num_inst = 0
%5 = ttg.async_wait %3 {num = 0 : i32}
tt.return
}
}

// -----

// Same as simple_waitcnt but swapped async_waits

#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [32, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
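The num_inst values checked above follow from which direct-to-LDS instructions a given wait is allowed to leave outstanding: the first wait only consumes the token of the 1-instruction load, so the 2 instructions of the second load may still be in flight (num_inst = 2); the second wait has nothing uncovered left (num_inst = 0). A rough standalone model of that counting, written as an illustration of the intent rather than the pass's actual implementation:

#include <vector>

// Hypothetical model: each async load contributes its direct-to-LDS
// instruction count; a wait's num_inst is the total contributed by loads
// whose tokens the wait does not consume (they may stay outstanding).
struct ModelLoad {
  int numInstructions; // e.g. 1 for the 128x16 load, 2 for the 16x256 load
  bool awaitedHere;    // true if this wait consumes the load's token
};

int modelNumInst(const std::vector<ModelLoad> &inFlight) {
  int numInst = 0;
  for (const ModelLoad &load : inFlight)
    if (!load.awaitedHere)
      numInst += load.numInstructions;
  return numInst;
}

// First wait in the test: {{1, true}, {2, false}} -> 2.
// Second wait: the remaining load {2, true} is now awaited -> 0.
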
@@ -50,14 +50,13 @@ namespace mlir {

namespace {

// Returns the number of individual async load memory transactions when copy
// data from the given |srcTy| in global memory to the given |dstTy| in shared
// memory. This takes into account the mask and ptrs alignment and contiguoutiy
// as well as the layouts mapping from global to shared memory addresses
int getNumberOfLoadInstructions(TypedValue<RankedTensorType> ptrs,
ttg::MemDescType dstTy, Value mask,
// Returns the number of individual async load memory transactions required when
// copying data from |srcTy| to |dstTy|, accounting for data contiguity, mask
// alignment, and the layout mapping from global to shared memory addresses.
int getNumberOfLoadInstructions(RankedTensorType srcTy, ttg::MemDescType dstTy,
Value mask, int contig,
ModuleAxisInfoAnalysis &axisInfo) {
LinearLayout srcLayout = tt::gpu::toLinearLayout(ptrs.getType());
LinearLayout srcLayout = tt::gpu::toLinearLayout(srcTy);
LinearLayout sharedLayout;
if (auto paddedEnc = dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(
dstTy.getEncoding())) {
@@ -66,21 +65,14 @@ int getNumberOfLoadInstructions(TypedValue<RankedTensorType> ptrs,
sharedLayout = triton::gpu::toLinearLayout(dstTy);
}
LinearLayout srcToSharedLayout = srcLayout.invertAndCompose(sharedLayout);
contig = std::min(contig, srcToSharedLayout.getNumConsecutiveInOut());

// On GFX9 we cannot split direct to lds loads into multiple ones because we
// need coalesced writes. So we can divide the number of registers by the
// contiguity to get the number of load instructions.
int contig = srcToSharedLayout.getNumConsecutiveInOut();

// Further restrict by contiguity information for ptr and mask
auto order = tt::gpu::getOrder(ptrs.getType());
auto *ptrInfo = axisInfo.getAxisInfo(ptrs);
contig = std::min<int>(contig, LLVM::AMD::getVectorSize(ptrs, axisInfo));
if (mask)
contig = std::min<int>(contig, axisInfo.getMaskAlignment(mask));

// Divide number of registers by contig to get the number of async intrinsics
int numberOfRegisters = srcToSharedLayout.getInDimSize(
StringAttr::get(ptrs.getContext(), "register"));
StringAttr::get(srcTy.getContext(), "register"));
int loadInstructionCount = std::max(1, numberOfRegisters / contig);
return loadInstructionCount;
}
@@ -93,9 +85,17 @@ int getOpNumberOfAsyncLoadInstructions(Operation *op,
ModuleAxisInfoAnalysis &axisInfo,
bool emitRemarkOnNonAsyncOp) {
if (auto copyOp = dyn_cast<ttg::AsyncCopyGlobalToLocalOp>(op)) {
return getNumberOfLoadInstructions(copyOp.getSrc(),
int contig = LLVM::AMD::getVectorSize(copyOp.getSrc(), axisInfo);
return getNumberOfLoadInstructions(copyOp.getSrc().getType(),
copyOp.getResult().getType(),
copyOp.getMask(), axisInfo);
copyOp.getMask(), contig, axisInfo);
} else if (auto bufferOp = dyn_cast<amdgpu::BufferLoadToLocalOp>(op)) {
auto ptrType = cast<RankedTensorType>(LLVM::AMD::getPointerTypeWithShape(
bufferOp.getPtr(), bufferOp.getOffsets()));
int contig = LLVM::AMD::getVectorSize(bufferOp.getPtr(),
bufferOp.getOffsets(), axisInfo);
return getNumberOfLoadInstructions(ptrType, bufferOp.getDest().getType(),
bufferOp.getMask(), contig, axisInfo);
} else if (emitRemarkOnNonAsyncOp) {
SmallVector<mlir::MemoryEffects::EffectInstance> effects;
if (auto memEffectIface = dyn_cast<MemoryEffectOpInterface>(op))
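Taken together, the helper now receives the contiguity from the caller (the vector size of the pointer for ttg.async_copy_global_to_local, or of the pointer/offset pair for amdgpu.buffer_load_to_local) and clamps it by the global-to-shared layout and the mask alignment before dividing the per-thread register count. A minimal sketch of that arithmetic with made-up inputs; the 8/16 register and 8-element contiguity values are assumptions chosen to match the 1 and 2 instruction counts in the test above, not values read out of the compiler:

#include <algorithm>
#include <cstdio>

// Sketch of the count computed by getNumberOfLoadInstructions after this
// change: contig starts from the caller-provided vector size and is further
// limited by the layout's consecutive elements and, if a mask exists, by the
// mask alignment; the per-thread register count is then divided by it.
static int modelLoadInstructionCount(int numRegisters, int vectorSize,
                                     int layoutConsecutive, int maskAlignment) {
  int contig = std::min(vectorSize, layoutConsecutive);
  if (maskAlignment > 0) // 0 means no mask operand
    contig = std::min(contig, maskAlignment);
  return std::max(1, numRegisters / contig);
}

int main() {
  // Assumed values for the two loads in the test: 8 and 16 registers per
  // thread, vector size and layout contiguity of 8, no mask.
  std::printf("%d\n", modelLoadInstructionCount(8, 8, 8, 0));  // -> 1
  std::printf("%d\n", modelLoadInstructionCount(16, 8, 8, 0)); // -> 2
  return 0;
}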