28 changes: 28 additions & 0 deletions test/TritonGPU/amd/amd-update-async-wait-count.mlir
@@ -29,6 +29,34 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ

// -----

// Simple case with amdgpu.buffer_load_to_local

#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [32, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
#blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 8, perPhase = 8, maxPhase = 2, order = [1, 0]}>
#shared1 = #ttg.padded_shared<[4:+4] {order = [1, 0], shape = [16, 256]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
// CHECK-LABEL: simple_buffer_load_to_local_waitcnt
tt.func public @simple_buffer_load_to_local_waitcnt(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: tensor<128x16xi32, #blocked> {tt.contiguity = dense<16> : tensor<2xi32>, tt.divisibility = dense<16> : tensor<2xi32>}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg3: tensor<16x256xi32, #blocked1> {tt.contiguity = dense<16> : tensor<2xi32>, tt.divisibility = dense<16> : tensor<2xi32>}, %arg4: !ttg.memdesc<128x16xf16, #shared, #smem, mutable>, %arg5: !ttg.memdesc<16x256xf16, #shared1, #smem, mutable>) {
// Emits 1 direct to lds instruction
%0 = amdgpu.buffer_load_to_local %arg0[%arg1] into %arg4 : <f16>[tensor<128x16xi32, #blocked>] -> <128x16xf16, #shared, #smem, mutable>
%1 = ttg.async_commit_group tokens %0
// Emits 2 direct to lds instructions
%2 = amdgpu.buffer_load_to_local %arg2[%arg3] into %arg5 : <f16>[tensor<16x256xi32, #blocked1>] -> <16x256xf16, #shared1, #smem, mutable>
// Do not wait on the second buffer_load_to_local => waitcnt 2
// CHECK: amdgpu.async_wait {{.*}} {num_inst = 2
%3 = ttg.async_commit_group tokens %2
%4 = ttg.async_wait %1 {num = 0 : i32}
// No buffer_load_to_local in between => waitcnt 0
// CHECK: amdgpu.async_wait {{.*}} {num_inst = 0
%5 = ttg.async_wait %3 {num = 0 : i32}
tt.return
}
}

// -----

// Same as simple_waitcnt but swapped async_waits

#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [32, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
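The num_inst values checked above follow from which direct-to-LDS instructions a given wait is allowed to leave outstanding: the first wait only consumes the token of the 1-instruction load, so the 2 instructions of the second load may still be in flight (num_inst = 2); the second wait has nothing uncovered left (num_inst = 0). A rough standalone model of that counting, written as an illustration of the intent rather than the pass's actual implementation:

#include <vector>

// Hypothetical model: each async load contributes its direct-to-LDS
// instruction count; a wait's num_inst is the total contributed by loads
// whose tokens the wait does not consume (they may stay outstanding).
struct ModelLoad {
  int numInstructions; // e.g. 1 for the 128x16 load, 2 for the 16x256 load
  bool awaitedHere;    // true if this wait consumes the load's token
};

int modelNumInst(const std::vector<ModelLoad> &inFlight) {
  int numInst = 0;
  for (const ModelLoad &load : inFlight)
    if (!load.awaitedHere)
      numInst += load.numInstructions;
  return numInst;
}

// First wait in the test: {{1, true}, {2, false}} -> 2.
// Second wait: the remaining load {2, true} is now awaited -> 0.
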
@@ -50,14 +50,13 @@ namespace mlir {

namespace {

// Returns the number of individual async load memory transactions when copy
// data from the given |srcTy| in global memory to the given |dstTy| in shared
// memory. This takes into account the mask and ptrs alignment and contiguoutiy
// as well as the layouts mapping from global to shared memory addresses
int getNumberOfLoadInstructions(TypedValue<RankedTensorType> ptrs,
ttg::MemDescType dstTy, Value mask,
// Returns the number of individual async load memory transactions required when
// copying data from |srcTy| to |dstTy|, accounting for data contiguity, mask
// alignment, and the layout mapping from global to shared memory addresses.
int getNumberOfLoadInstructions(RankedTensorType srcTy, ttg::MemDescType dstTy,
Value mask, int contig,
ModuleAxisInfoAnalysis &axisInfo) {
LinearLayout srcLayout = tt::gpu::toLinearLayout(ptrs.getType());
LinearLayout srcLayout = tt::gpu::toLinearLayout(srcTy);
LinearLayout sharedLayout;
if (auto paddedEnc = dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(
dstTy.getEncoding())) {
@@ -66,21 +65,14 @@ int getNumberOfLoadInstructions(TypedValue<RankedTensorType> ptrs,
sharedLayout = triton::gpu::toLinearLayout(dstTy);
}
LinearLayout srcToSharedLayout = srcLayout.invertAndCompose(sharedLayout);
contig = std::min(contig, srcToSharedLayout.getNumConsecutiveInOut());

// On GFX9 we cannot split direct to lds loads into multiple ones because we
// need coalesced writes. So we can divide the number of registers by the
// contiguity to get the number of load instructions.
int contig = srcToSharedLayout.getNumConsecutiveInOut();

// Further restrict by contiguity information for ptr and mask
auto order = tt::gpu::getOrder(ptrs.getType());
auto *ptrInfo = axisInfo.getAxisInfo(ptrs);
contig = std::min<int>(contig, LLVM::AMD::getVectorSize(ptrs, axisInfo));
if (mask)
contig = std::min<int>(contig, axisInfo.getMaskAlignment(mask));

// Divide number of registers by contig to get the number of async intrinsics
int numberOfRegisters = srcToSharedLayout.getInDimSize(
StringAttr::get(ptrs.getContext(), "register"));
StringAttr::get(srcTy.getContext(), "register"));
int loadInstructionCount = std::max(1, numberOfRegisters / contig);
return loadInstructionCount;
}
@@ -93,9 +85,17 @@ int getOpNumberOfAsyncLoadInstructions(Operation *op,
ModuleAxisInfoAnalysis &axisInfo,
bool emitRemarkOnNonAsyncOp) {
if (auto copyOp = dyn_cast<ttg::AsyncCopyGlobalToLocalOp>(op)) {
return getNumberOfLoadInstructions(copyOp.getSrc(),
int contig = LLVM::AMD::getVectorSize(copyOp.getSrc(), axisInfo);
return getNumberOfLoadInstructions(copyOp.getSrc().getType(),
copyOp.getResult().getType(),
copyOp.getMask(), axisInfo);
copyOp.getMask(), contig, axisInfo);
} else if (auto bufferOp = dyn_cast<amdgpu::BufferLoadToLocalOp>(op)) {
auto ptrType = cast<RankedTensorType>(LLVM::AMD::getPointerTypeWithShape(
bufferOp.getPtr(), bufferOp.getOffsets()));
int contig = LLVM::AMD::getVectorSize(bufferOp.getPtr(),
bufferOp.getOffsets(), axisInfo);
return getNumberOfLoadInstructions(ptrType, bufferOp.getDest().getType(),
bufferOp.getMask(), contig, axisInfo);
} else if (emitRemarkOnNonAsyncOp) {
SmallVector<mlir::MemoryEffects::EffectInstance> effects;
if (auto memEffectIface = dyn_cast<MemoryEffectOpInterface>(op))
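Taken together, the helper now receives the contiguity from the caller (the vector size of the pointer for ttg.async_copy_global_to_local, or of the pointer/offset pair for amdgpu.buffer_load_to_local) and clamps it by the global-to-shared layout and the mask alignment before dividing the per-thread register count. A minimal sketch of that arithmetic with made-up inputs; the 8/16 register and 8-element contiguity values are assumptions chosen to match the 1 and 2 instruction counts in the test above, not values read out of the compiler:

#include <algorithm>
#include <cstdio>

// Sketch of the count computed by getNumberOfLoadInstructions after this
// change: contig starts from the caller-provided vector size and is further
// limited by the layout's consecutive elements and, if a mask exists, by the
// mask alignment; the per-thread register count is then divided by it.
static int modelLoadInstructionCount(int numRegisters, int vectorSize,
                                     int layoutConsecutive, int maskAlignment) {
  int contig = std::min(vectorSize, layoutConsecutive);
  if (maskAlignment > 0) // 0 means no mask operand
    contig = std::min(contig, maskAlignment);
  return std::max(1, numRegisters / contig);
}

int main() {
  // Assumed values for the two loads in the test: 8 and 16 registers per
  // thread, vector size and layout contiguity of 8, no mask.
  std::printf("%d\n", modelLoadInstructionCount(8, 8, 8, 0));  // -> 1
  std::printf("%d\n", modelLoadInstructionCount(16, 8, 8, 0)); // -> 2
  return 0;
}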