@@ -23,7 +23,7 @@ def fp8_attention_kernel(

     # Output tensor with 4D shape in FP8 format
     out = torch.empty(
-        [batch, heads, seq_len, head_dim], dtype=torch.float8_e5m2, device=q.device
+        [batch, heads, seq_len, head_dim], dtype=torch.float8_e4m3fn, device=q.device
     )

     # Scale factor for attention
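Why e4m3fn rather than e5m2 for the output: e4m3fn spends one more bit on mantissa and one fewer on exponent, so it resolves values more finely at the cost of range, and the comments removed further down note that e5m2 is not accepted by torch._scaled_mm for these matmuls. A minimal sketch of the trade-off, assuming a PyTorch build with the float8 dtypes and finfo support for them:

```python
import torch

# Compare the two FP8 formats touched by this diff.
for dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
    info = torch.finfo(dtype)
    print(dtype, "max:", info.max, "eps:", info.eps)

# Round-trip a tensor of moderate magnitude through each format;
# e4m3fn keeps roughly one extra bit of precision per value.
x = torch.randn(1024) * 4.0
for dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
    err = (x - x.to(dtype).to(torch.float32)).abs().mean()
    print(dtype, "mean abs round-trip error:", err.item())
```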
@@ -54,9 +54,7 @@ def fp8_attention_kernel(
                 k_tile_t = k_tile.transpose(0, 1)  # [dim, tile_n]

                 # Compute Q @ K^T with FP8 inputs, result in FP32
-                qk = torch.matmul(q_tile, k_tile_t).to(
-                    torch.float32
-                )  # [tile_m, tile_n]
+                qk = hl.dot(q_tile, k_tile_t)  # [tile_m, tile_n]

                 # Scale QK scores first
                 qk_scaled = qk * sm_scale  # [tile_m, tile_n]
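The hl.dot call replaces an explicit matmul-then-cast; per the surrounding comment it takes FP8 tiles and produces an FP32 result. For intuition, a plain-PyTorch reference of that contract (not Helion's implementation, just the math the tile-level call should match; the tile sizes below are made up):

```python
import torch

def fp8_dot_reference(a_fp8: torch.Tensor, b_fp8: torch.Tensor) -> torch.Tensor:
    """Reference semantics assumed for the FP8 dot: multiply-accumulate in
    float32 and return float32."""
    return torch.matmul(a_fp8.to(torch.float32), b_fp8.to(torch.float32))

# Shapes mirroring the kernel's tiles (hypothetical tile sizes).
q_tile = torch.randn(64, 64).to(torch.float8_e4m3fn)     # [tile_m, dim]
k_tile_t = torch.randn(64, 64).to(torch.float8_e4m3fn)   # [dim, tile_n]
qk = fp8_dot_reference(q_tile, k_tile_t)                  # [tile_m, tile_n], float32
```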
@@ -90,28 +88,28 @@ def fp8_attention_kernel(
                 p_fp8 = p.to(v.dtype)  # Convert to same FP8 type as V

                 # Accumulate attention @ V with FP8 GEMM
-                v_t = v_tile.transpose(0, 1)  # [tile_n, dim]
-                pv = torch.matmul(p_fp8, v_t).to(torch.float32)  # [tile_m, dim]
-                acc = acc + pv
+                # v_tile is [dim, tile_n], we need to transpose for P @ V^T
+                v_t = v_tile.t()  # [tile_n, dim]
+                acc = hl.dot(p_fp8, v_t, acc=acc)  # [tile_m, dim]

                 # Update max tracker
                 m_i = m_new

             # Final normalization
             acc = acc / l_i[:, None]
             # Convert to FP8 before writing to output
-            out[b, h, tile_m, :] = acc.to(torch.float8_e5m2)
+            out[b, h, tile_m, :] = acc.to(torch.float8_e4m3fn)

     return out


 def preprocess_fp8_attention_inputs(
     q: torch.Tensor, k: torch.Tensor, v: torch.Tensor
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    q_fp8 = q.to(torch.float8_e5m2)
-    k_fp8 = k.to(torch.float8_e5m2)
+    q_fp8 = q.to(torch.float8_e4m3fn)
+    k_fp8 = k.to(torch.float8_e4m3fn)
     v = v.permute(0, 1, 3, 2)
-    v_fp8 = v.to(torch.float8_e5m2)
+    v_fp8 = v.to(torch.float8_e4m3fn)
     batch, heads, seq_len, head_dim = q.shape
     q_fp8_reshaped = q_fp8.reshape(batch * heads, seq_len, head_dim)
     k_fp8_reshaped = k_fp8.reshape(batch * heads, seq_len, head_dim)
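The preprocessing keeps Q and K in a [batch*heads, seq, dim] layout and permutes V so the baseline can read it as [dim, seq]. A small shape/dtype check, assuming preprocess_fp8_attention_inputs is importable from this module; the final reshape of V is outside this hunk, so its expected shape is inferred from how v_fp8[i] is indexed later:

```python
import torch

batch, heads, seq_len, head_dim = 2, 4, 128, 64
q = torch.randn(batch, heads, seq_len, head_dim, dtype=torch.float16)
k = torch.randn_like(q)
v = torch.randn_like(q)

q_fp8, k_fp8, v_fp8 = preprocess_fp8_attention_inputs(q, k, v)

assert q_fp8.shape == (batch * heads, seq_len, head_dim)
assert k_fp8.shape == (batch * heads, seq_len, head_dim)
assert q_fp8.dtype == k_fp8.dtype == v_fp8.dtype == torch.float8_e4m3fn
# Assumed from its use as v_fp8[i] -> [dim, seq] in _fp8_attention_pytorch_impl:
print(v_fp8.shape)  # expected (batch * heads, head_dim, seq_len)
```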
@@ -147,13 +145,25 @@ def _fp8_attention_pytorch_impl(
         k_i = k_fp8[i]  # [seq, dim] - already FP8
         v_i = v_fp8[i]  # [dim, seq] - pre-transposed, already FP8

-        # For Q @ K^T, we need K^T to be column-major
-        kt_fp8 = k_i.t()  # column-major [dim, seq]
-
-        # Q @ K^T - dequantize and use regular matmul since e5m2 not supported by _scaled_mm
-        q_deq = q_i.to(torch.float32)
-        kt_deq = kt_fp8.to(torch.float32)
-        qk = torch.matmul(q_deq, kt_deq)
+        # For Q @ K^T using torch._scaled_mm
+        # torch._scaled_mm requires column-major for second operand
+        # k_i is [seq, dim], we need K^T as [dim, seq] in column-major
+        # Direct conversion: k_i -> contiguous -> transpose view
+        kt_fp8_col_major = k_i.contiguous().t()  # [dim, seq] in column-major
+
+        # Create scale tensors
+        scale_q = torch.tensor(1.0, device=q_i.device)
+        scale_k = torch.tensor(1.0, device=k_i.device)
+
+        # Q @ K^T using torch._scaled_mm
+        qk = torch._scaled_mm(
+            q_i,
+            kt_fp8_col_major,
+            scale_q,
+            scale_k,
+            use_fast_accum=False,
+            out_dtype=torch.float32,
+        )

         # Compute max before scaling
         qk_max = torch.amax(qk, dim=-1, keepdim=True)
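As a standalone illustration of the torch._scaled_mm call pattern used above: row-major first operand, column-major second operand, scalar float32 scale tensors, FP32 output. A sketch assuming a GPU with FP8 support (e.g. H100) and a recent PyTorch where the scale arguments are positional; the dimensions are multiples of 16, which the FP8 GEMM backend generally requires:

```python
import torch

device = "cuda"  # FP8 _scaled_mm needs a GPU with FP8 support
M, K, N = 128, 64, 128

a = torch.randn(M, K, device=device).to(torch.float8_e4m3fn)      # row-major [M, K]
b = torch.randn(N, K, device=device).to(torch.float8_e4m3fn).t()  # column-major [K, N]

scale_a = torch.tensor(1.0, device=device)
scale_b = torch.tensor(1.0, device=device)

out = torch._scaled_mm(
    a,
    b,
    scale_a,
    scale_b,
    use_fast_accum=False,
    out_dtype=torch.float32,
)
print(out.shape)  # torch.Size([128, 128])
```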
@@ -168,16 +178,26 @@ def _fp8_attention_pytorch_impl(
         # Step 2: Attention @ V using FP8
         # P is [seq, seq], V is [dim, seq]
         # We want P @ V^T = [seq, seq] @ [seq, dim] = [seq, dim]
-        p_fp8 = p_norm.to(torch.float8_e5m2)  # row-major [seq, seq]
+        p_fp8 = p_norm.to(torch.float8_e4m3fn)  # row-major [seq, seq]

         # v_i is [dim, seq], already FP8
-        vt_fp8 = v_i.t()  # column-major [seq, dim]
-
-        # P @ V^T - dequantize and use regular matmul since e5m2 not supported by torch._scaled_mm
-        p_deq = p_fp8.to(torch.float32)
-        vt_deq = vt_fp8.to(torch.float32)
-        out_i = torch.matmul(p_deq, vt_deq)
-        out_i = out_i.to(torch.float8_e5m2)  # convert back to FP8
+        # Direct conversion: v_i -> contiguous -> transpose view
+        vt_fp8_col_major = v_i.contiguous().t()  # [seq, dim] in column-major
+
+        # Create scale tensors for P @ V^T
+        scale_p = torch.tensor(1.0, device=p_fp8.device)
+        scale_v = torch.tensor(1.0, device=v_i.device)
+
+        # P @ V^T using torch._scaled_mm
+        out_i = torch._scaled_mm(
+            p_fp8,
+            vt_fp8_col_major,
+            scale_p,
+            scale_v,
+            use_fast_accum=False,
+            out_dtype=torch.float32,
+        )
+        out_i = out_i.to(torch.float8_e4m3fn)  # convert back to FP8 to match kernel

         outputs.append(out_i)

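One way to sanity-check this switch is to compare the _scaled_mm result against the dequantize-and-matmul path the hunk removes; with unit scales the two should agree closely, since both accumulate in FP32. A sketch under the same FP8-capable-GPU assumption:

```python
import torch

device = "cuda"
p_fp8 = torch.rand(128, 128, device=device).to(torch.float8_e4m3fn)              # row-major [seq, seq]
vt_fp8 = torch.randn(64, 128, device=device).to(torch.float8_e4m3fn).t()         # column-major [seq, dim]

scale_p = torch.tensor(1.0, device=device)
scale_v = torch.tensor(1.0, device=device)
out_scaled = torch._scaled_mm(
    p_fp8, vt_fp8, scale_p, scale_v, use_fast_accum=False, out_dtype=torch.float32
)

# Dequantize-and-matmul reference (the path removed above).
out_ref = torch.matmul(p_fp8.to(torch.float32), vt_fp8.to(torch.float32))

print((out_scaled - out_ref).abs().max())  # expected to be small; only accumulation order differs
```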
@@ -192,7 +212,7 @@ def fp8_attention_pytorch(
     v: torch.Tensor,  # [batch, heads, seq, dim]
 ) -> Callable[[], torch.Tensor]:
     """
-    Baseline PyTorch implementation of FP8 attention using FP8 e5m2.
+    Baseline PyTorch implementation of FP8 attention using torch._scaled_mm.
     """
     batch, heads, seq_len, head_dim = q.shape
     q_fp8, k_fp8, v_fp8 = preprocess_fp8_attention_inputs(q, k, v)
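Since fp8_attention_pytorch returns a callable rather than the result itself, a typical (assumed) way to exercise the baseline for comparison or benchmarking would be:

```python
import torch

batch, heads, seq_len, head_dim = 2, 4, 256, 64
q = torch.randn(batch, heads, seq_len, head_dim, device="cuda", dtype=torch.float16)
k = torch.randn_like(q)
v = torch.randn_like(q)

baseline_fn = fp8_attention_pytorch(q, k, v)  # builds the closure over the FP8-converted inputs
out = baseline_fn()                           # runs the _scaled_mm-based baseline
print(out.shape, out.dtype)
```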