@@ -43,7 +43,7 @@ def int4_row_quantize(
     scales = scales.view(x.shape[0], -1).t().contiguous()
     zeros = zeros.view(x.shape[0], -1).t().contiguous()
 
-    return out, scales, zeros
+    return out, scales.to(x.dtype), zeros.to(x.dtype)
 
 
 def pack_int4(x: torch.Tensor) -> torch.Tensor:
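# --- Illustrative sketch, not part of the diff: why scales/zeros are cast back to x.dtype ---
# The min/max math typically runs in float32, so without the .to(x.dtype) cast the
# returned scales/zeros would not match a bf16 input. Simplified per-row asymmetric
# 4-bit quantizer; all names below are illustrative only.
import torch

x = torch.randn(4, 8, dtype=torch.bfloat16)
max_val = x.amax(dim=1, keepdim=True).float()
min_val = x.amin(dim=1, keepdim=True).float()
scales = (max_val - min_val).clamp(min=1e-6) / 15.0            # 4-bit range 0..15
zeros = min_val
q = torch.round((x.float() - zeros) / scales).clamp(0, 15).to(torch.uint8)
scales, zeros = scales.to(x.dtype), zeros.to(x.dtype)           # cast back, as in the change above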
@@ -68,6 +68,7 @@ def __new__(cls, packed_weight, scale, zero_point, group_size):
         shape = packed_weight.shape
         kwargs = {}
         kwargs["device"] = packed_weight.device
+        kwargs["dtype"] = scale.dtype
         kwargs["requires_grad"] = False
         return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs)  # type: ignore[attr-defined]
 
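# --- Illustrative sketch, not part of the diff: effect of passing dtype to _make_wrapper_subclass ---
# With kwargs["dtype"] = scale.dtype, the wrapper tensor reports the scale's dtype
# (e.g. bf16) rather than the framework default. Simplified, hypothetical wrapper;
# the real class in this file carries more state plus dispatch logic.
import torch

class _Int4Wrapper(torch.Tensor):
    @staticmethod
    def __new__(cls, packed_weight, scale):
        kwargs = {
            "device": packed_weight.device,
            "dtype": scale.dtype,          # wrapper now reports the scale dtype
            "requires_grad": False,
        }
        return torch.Tensor._make_wrapper_subclass(cls, packed_weight.shape, **kwargs)

    def __init__(self, packed_weight, scale):
        self.packed_weight = packed_weight
        self.scale = scale

w = _Int4Wrapper(torch.zeros(8, 4, dtype=torch.int8), torch.ones(8, dtype=torch.bfloat16))
print(w.dtype)   # torch.bfloat16, not the int8 storage dtype of packed_weight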
@@ -98,7 +99,10 @@ def _apply_fn_to_data(self, fn):
         )
 
     def __repr__(self):
-        raise NotImplementedError("Subclasses must implement __repr__")
+        return (
+            f"{self.__class__.__name__}(weight={self.packed_weight}, group_size={self.group_size}, "
+            f"shape={self.shape}, device={self.device}, dtype={self.dtype}, requires_grad={self.requires_grad})"
+        )
 
     @classmethod
     def from_float(cls, w: torch.Tensor, group_size: int = 128):
@@ -136,6 +140,9 @@ def _(func, types, args, kwargs):
             f"{func} is not implemented for non floating point input"
         )
 
+    orig_act_size = input_tensor.size()
+    orig_out_features = weight_tensor.shape[-2]
+
     res = torch.ops.fbgemm.bf16i4bf16_rowwise(
         input_tensor,
         weight_tensor.packed_weight,
@@ -144,7 +151,7 @@ def _(func, types, args, kwargs):
     )
     if bias is not None:
         res = res + bias
-    return res
+    return res.reshape(*orig_act_size[:-1], orig_out_features)
 
 
 @implements([aten.detach.default, aten.alias.default])
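# --- Illustrative sketch, not part of the diff: what the added reshape restores ---
# Capturing orig_act_size/orig_out_features and reshaping the result folds a 2D
# matmul output back onto the original leading activation dims. Plain-matmul
# stand-in, not the fbgemm kernel:
import torch

x = torch.randn(2, 5, 16)                  # (batch, seq, in_features)
w = torch.randn(8, 16)                     # (out_features, in_features)

orig_act_size = x.size()
orig_out_features = w.shape[-2]            # 8

res = x.reshape(-1, x.shape[-1]) @ w.t()   # 2D result: (2 * 5, 8)
res = res.reshape(*orig_act_size[:-1], orig_out_features)
assert res.shape == (2, 5, 8)              # leading activation dims restored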