Commit 2db6f07

iseeyuan authored and facebook-github-bot committed
Use a symmetric quantization with no clipping error to improve llama perplexity (#5163)
Summary: Refer to pytorch/ao#805 for the details. With this change, the perplexity of a Llama model improves by 4% on wikitext.

Reviewed By: mergennachin, helunwencser

Differential Revision: D62342523

Pulled By: iseeyuan
1 parent: f9da675 · commit: 2db6f07

File tree: 2 files changed (+5, −2 lines)
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-0916b5b29b092afcbf2b898caae49abe80662bac
+c6abf2bd576828dc8ed175fba2c4c1d0d3681a1d

examples/models/llama2/source_transformation/quantize.py

Lines changed: 4 additions & 1 deletion
@@ -73,9 +73,12 @@ def quantize(
         if group_size is None:
             raise Exception("For 8da4w quantization, group size must be specified.")
         from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer
+        from torchao.quantization.quant_primitives import MappingType
 
         model = Int8DynActInt4WeightQuantizer(
-            precision=torch_dtype, groupsize=group_size
+            precision=torch_dtype,
+            groupsize=group_size,
+            mapping_type=MappingType.SYMMETRIC_NO_CLIPPING_ERR,
         ).quantize(model)
         if verbose:
             print("quantized model:", model)
