Commit a6de35a

Make the kernel fail for sm75 + bfloat16 inputs
1 parent 379bd5e · commit a6de35a

File tree

1 file changed: +4 -5 lines changed

torchao/csrc/cuda/fp6_llm/fp6_linear.cu

Lines changed: 4 additions & 5 deletions
@@ -109,12 +109,11 @@ void fpx_linear_kernel(cudaStream_t stream,
     CHECK_CUDA(cudaGetDevice(&device));
     CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
     CHECK_CUDA(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device));
-
-    if ((major < 7) || (major == 7 && minor < 5)) {
-        TORCH_CHECK(false, "FP6LLM_API Error: FP6LLM requires GPU with SM75 or higher!\n");
-    }
-
     const bool is_sm75_gpu = (major == 7) && (minor == 5);
+    if (is_sm75_gpu && std::is_same<InputDataType, __nv_bfloat16>::value)
+        TORCH_CHECK(false, "Bfloat16 inputs are not supported for SM75");
+    if ((major < 7) || (major == 7 && minor < 5))
+        TORCH_CHECK(false, "FP6LLM_API Error: FP6LLM requires GPU with SM75 or higher!\n");

     if (is_sm75_gpu && (N_PowerOf2 == 64 || N_PowerOf2 == 128 || N_PowerOf2 % 128 == 0)) {
         // For SM75 and N >= 64, we use a different TilingConfig to deal with smaller shared memory.
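Note: the new guard pairs a runtime compute-capability query with a compile-time type test. Because std::is_same<InputDataType, __nv_bfloat16>::value is a compile-time constant per template instantiation, the compiler can drop the whole bfloat16 branch when the kernel is instantiated for half inputs. Below is a minimal, self-contained sketch of the same pattern as a stand-alone program, not the torchao code itself: MY_CHECK and check_arch_support are hypothetical names standing in for TORCH_CHECK and the checks inside fpx_linear_kernel.

// Sketch: reject bfloat16 on SM75 and reject anything older than SM75,
// mirroring the guard added in this commit. Helper names are hypothetical.
#include <cuda_bf16.h>     // __nv_bfloat16
#include <cuda_fp16.h>     // half
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <type_traits>

// Stand-in for TORCH_CHECK: print a message and abort when cond is false.
#define MY_CHECK(cond, msg)                        \
    do {                                           \
        if (!(cond)) {                             \
            std::fprintf(stderr, "%s\n", msg);     \
            std::exit(EXIT_FAILURE);               \
        }                                          \
    } while (0)

template <typename InputDataType>
void check_arch_support() {
    int device = 0, major = 0, minor = 0;
    MY_CHECK(cudaGetDevice(&device) == cudaSuccess, "cudaGetDevice failed");
    MY_CHECK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device) == cudaSuccess,
             "compute capability (major) query failed");
    MY_CHECK(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device) == cudaSuccess,
             "compute capability (minor) query failed");

    // Exactly SM75 (Turing): the kernel's bfloat16 path is unsupported there.
    const bool is_sm75_gpu = (major == 7) && (minor == 5);
    constexpr bool is_bf16 = std::is_same<InputDataType, __nv_bfloat16>::value;
    MY_CHECK(!(is_sm75_gpu && is_bf16),
             "Bfloat16 inputs are not supported for SM75");

    // Anything older than SM75 is rejected outright.
    MY_CHECK(!((major < 7) || (major == 7 && minor < 5)),
             "FP6LLM requires GPU with SM75 or higher!");
}

int main() {
    check_arch_support<half>();           // passes on any SM75+ GPU
    check_arch_support<__nv_bfloat16>();  // aborts on SM75, passes on SM80+
    std::puts("arch checks passed");
    return 0;
}

Built with, e.g., nvcc -o arch_check arch_check.cu, this aborts with the bfloat16 message on a Turing (SM75) device and passes on SM80 and newer, matching the behavior the commit introduces.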
