2 changes: 1 addition & 1 deletion examples/00_basic_gemm/basic_gemm.cu
@@ -47,7 +47,7 @@
or utilities within CUTLASS. Such utilities are demonstrated elsewhere in other examples and are
prevalent in the CUTLASS unit tests.
This example has delibrately been kept similar to the basic_gemm example from cutlass-1.3 to
This example has deliberately been kept similar to the basic_gemm example from cutlass-1.3 to
highlight the minimum amount of differences needed to transition to cutlass-2.0.
Cutlass-1.3 sgemm: https://github.com/NVIDIA/cutlass/blob/master/examples/00_basic_gemm/basic_gemm.cu
2 changes: 1 addition & 1 deletion examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu
@@ -45,7 +45,7 @@ composed from lower level ones. Multiple thread-tiles (tile size each thread com
to form warp-tiles (tile size each warp computes) and multiple warp tiles can be used to compute
threadblock-tile (tile size computed by a threadblock).

In thie example, we split variable initialization into
In this example, we split variable initialization into
1. Setting up data properties : describes how matrices are laid out in the memory and how the kernel
can view them (logical to physical mapping)
2. Setting up computation properties : describes how the above set matrices will be used to compute
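As a rough sketch of how these two groups of properties map onto the device-level GEMM template in examples like this one (the element types, layouts, and tile sizes below are representative Volta tensor-op choices, not necessarily the exact ones used in this file):

```cpp
#include "cutlass/gemm/device/gemm.h"

// Data properties: element types and layouts for A, B, and C/D.
// Computation properties: opcode class, target architecture, and the
// threadblock / warp / instruction tile sizes forming the hierarchy above.
using Gemm = cutlass::gemm::device::Gemm<
    cutlass::half_t, cutlass::layout::ColumnMajor,   // A: type and layout
    cutlass::half_t, cutlass::layout::ColumnMajor,   // B: type and layout
    float, cutlass::layout::ColumnMajor,             // C/D: type and layout
    float,                                           // accumulator type
    cutlass::arch::OpClassTensorOp,                  // use Tensor Cores
    cutlass::arch::Sm70,                             // target architecture (Volta)
    cutlass::gemm::GemmShape<128, 128, 32>,          // threadblock-tile
    cutlass::gemm::GemmShape<64, 64, 32>,            // warp-tile
    cutlass::gemm::GemmShape<8, 8, 4>>;              // instruction (MMA) shape
```

The remaining template parameters (epilogue, threadblock swizzle, stage count) take their defaults in this sketch; the example itself spells them out.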
2 changes: 1 addition & 1 deletion examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu
@@ -45,7 +45,7 @@ composed from lower level ones. Multiple thread-tiles (tile size each thread com
to form warp-tiles (tile size each warp computes) and multiple warp tiles can be used to compute
threadblock-tile (tile size computed by a threadblock).
In thie example, we split variable initialization into
In this example, we split variable initialization into
1. Setting up data properties : describes how matrices are laid out in the memory and how the kernel
can view them (logical to physical mapping)
2. Setting up computation properties : describes how the above set matrices will be used to compute
@@ -47,7 +47,7 @@ composed from lower level ones. Multiple thread-tiles (tile size each thread com
to form warp-tiles (tile size each warp computes) and multiple warp tiles can be used to compute
threadblock-tile (tile size computed by a threadblock).
In thie example, we split variable initialization into
In this example, we split variable initialization into
1. Setting up data properties : describes how tensors are laid out in the memory and how the kernel
can view them (logical to physical mapping)
2. Setting up computation properties : describes how the above set tensors will be used to compute
2 changes: 1 addition & 1 deletion examples/13_two_tensor_op_fusion/b2b_conv2d_run.h
@@ -560,7 +560,7 @@ class B2bFusedConv2dRun {

if(status != cutlass::Status::kSuccess) {
std::cout << "Problem sizes not supported.\n"
<< "Requirments:\n"
<< "Requirements:\n"
<< " problem_size_0.N*P*Q = problem_size_1.N*P*Q\n"
<< " problem_size_0.K = problem_size_1.C\n"
<< " problem_size_1.R = problem_size_1.S = 1\n"
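To make the constraint in this message concrete, the sketch below mirrors just the requirements listed here using cutlass::conv::Conv2dProblemSize fields (the example's full validation may check more than this):

```cpp
#include "cutlass/conv/conv2d_problem_size.h"

// The first conv's output must feed the second conv directly: matching output
// extent, conv0 output channels equal to conv1 input channels, and a 1x1
// second convolution.
bool b2b_conv2d_requirements_met(cutlass::conv::Conv2dProblemSize const &p0,
                                 cutlass::conv::Conv2dProblemSize const &p1) {
  return (p0.N * p0.P * p0.Q) == (p1.N * p1.P * p1.Q)  // problem_size_0.N*P*Q = problem_size_1.N*P*Q
      && p0.K == p1.C                                    // problem_size_0.K = problem_size_1.C
      && p1.R == 1 && p1.S == 1;                         // problem_size_1.R = problem_size_1.S = 1
}
```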
2 changes: 1 addition & 1 deletion examples/13_two_tensor_op_fusion/b2b_gemm_run.h
@@ -604,7 +604,7 @@ struct B2bFusedGemmRun

if(status != cutlass::Status::kSuccess) {
std::cout << "Problem sizes not supported.\n"
<< "Requirments:\n"
<< "Requirements:\n"
<< " problem_size_0.M = problem_size_1.M\n"
<< " problem_size_0.N = problem_size_1.K\n"
<< " ThreadblockShape0::kN = problem_size_0.N\n"
2 changes: 1 addition & 1 deletion examples/13_two_tensor_op_fusion/b2b_grouped_gemm_run.h
@@ -302,7 +302,7 @@ struct B2bFusedGroupedGemmRun

if(status != cutlass::Status::kSuccess) {
std::cout << "Problem sizes not supported.\n"
<< "Requirments:\n"
<< "Requirements:\n"
<< " problem_size_0.M = problem_size_1.M\n"
<< " problem_size_0.N = problem_size_1.K\n"
<< " ThreadblockShape0::kN = problem_size_0.N\n"
@@ -586,7 +586,7 @@ class B2bInterleavedFusedConv2dRun {

if(status != cutlass::Status::kSuccess) {
std::cout << "Problem sizes not supported.\n"
<< "Requirments:\n"
<< "Requirements:\n"
<< " problem_size_0.N*P*Q = problem_size_1.N*P*Q\n"
<< " problem_size_0.K = problem_size_1.C\n"
<< " problem_size_1.R = problem_size_1.S = 1\n"
@@ -638,7 +638,7 @@ struct B2bInterleavedFusedGemmRun

if(status != cutlass::Status::kSuccess) {
std::cout << "Problem sizes not supported.\n"
<< "Requirments:\n"
<< "Requirements:\n"
<< " problem_size_0.M = problem_size_1.M\n"
<< " problem_size_0.N = problem_size_1.K\n"
<< " ThreadblockShape0::kN = problem_size_0.N\n"
@@ -97,7 +97,7 @@ class B2bMmaBase {
Shape1::kN / WarpGemm1::kN,
Shape1::kK / WarpGemm1::kK>;

/// Number of warp-level GEMM oeprations
/// Number of warp-level GEMM operations
static int const kWarpGemmIterations0 =
(WarpGemm0::kK / Operator0::Policy::MmaShape::kK);
static int const kWarpGemmIterations1 =
@@ -41,7 +41,7 @@ change to method to visit the global memory:
address
addr = base_pointer + coord1 * stride1 + coord2 * stride2
The rest part of GEMM which includes shared memory load/store, mma comutation
The rest part of GEMM which includes shared memory load/store, mma computation
is the same.
This example uses Ampere fp64 tensore core Affine2 GEMM as an example. SIMT
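A minimal sketch of the addressing scheme described above (an illustration of the affine rank-2 idea, not the CUTLASS layout class itself):

```cpp
// Affine rank-2 addressing: each of the two logical coordinates carries its own
// stride, so neither dimension has to be contiguous in memory.
struct AffineRank2 {
  long long stride1;  // elements skipped when coord1 increases by 1
  long long stride2;  // elements skipped when coord2 increases by 1

  // addr = base_pointer + coord1 * stride1 + coord2 * stride2
  template <typename T>
  T *address(T *base_pointer, long long coord1, long long coord2) const {
    return base_pointer + coord1 * stride1 + coord2 * stride2;
  }
};
```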
2 changes: 1 addition & 1 deletion examples/39_gemm_permute/layouts.h
@@ -71,7 +71,7 @@ class TensorCWHN {
// Data members
//

/// Stride data member - [n, hn, whn]
/// Stride data member - [n, hn, when]
Stride stride_;

public:
@@ -87,7 +87,7 @@ class CustomMmaBase {
Shape::kN / WarpGemm::kN,
Shape::kK / WarpGemm::kK>;

/// Number of warp-level GEMM oeprations
/// Number of warp-level GEMM operations
static int const kWarpGemmIterations =
(WarpGemm::kK / Operator::Policy::MmaShape::kK);

@@ -167,7 +167,7 @@ class MmaBaseFromSharedMemory {
Shape::kK / WarpGemm::kK>;
using WarpCount1 = WarpCount;

/// Number of warp-level GEMM oeprations
/// Number of warp-level GEMM operations
static int const kWarpGemmIterations =
(WarpGemm::kK / Operator::Policy::MmaShape::kK);
static int const kWarpGemmIterations1 = kWarpGemmIterations;
@@ -67,7 +67,7 @@ namespace threadblock {
/// ForwardTileIterator
///
template <
typename ThreadMap_, ///< Thread map (conept: OutputTileThreadMap)
typename ThreadMap_, ///< Thread map (concept: OutputTileThreadMap)
typename Element_, ///< Element data type
bool ScatterD = false, ///< Scatter D operand or not
bool UseCUDAStore = false>
2 changes: 1 addition & 1 deletion examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py
@@ -45,7 +45,7 @@ def __init__(self, fuse_gemm_info, gen_class_name, user_header_file, cutlass_dep
self.b2b_num = len(fuse_gemm_info)
self.user_header_file = user_header_file
self.args = {}
# device arg struct memebr
# device arg struct member
self.arg_member = []
self.gen_class_name = gen_class_name
self.gen_kernel_name = gen_class_name + "Kernel"
2 changes: 1 addition & 1 deletion examples/45_dual_gemm/threadblock/dual_mma_base.h
@@ -91,7 +91,7 @@ class DualMmaBase {
Shape::kN / WarpGemm::kN,
Shape::kK / WarpGemm::kK>;

/// Number of warp-level GEMM oeprations
/// Number of warp-level GEMM operations
static int const kWarpGemmIterations =
(WarpGemm::kK / Operator0::Policy::MmaShape::kK);

@@ -238,7 +238,7 @@ bool initialize_block(
// to select an appropriate value on its own. The CollectiveBuilder will attempt to select
// configurations that will result in the most-performant kernel, but this is not a guarantee.
//
// If relying on 'Auto' schedules, all builders must use the 'Auto' schedule to ensure compatiblity.
// If relying on 'Auto' schedules, all builders must use the 'Auto' schedule to ensure compatibility.
// For example, if `KernelScheduleAuto` is used for the mainloop builder, `EpilogueScheduleAuto` must
// be used for the epilogue builder.
//
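A sketch of the pairing described above: both builders are handed their 'Auto' schedule tags. The element types, layouts, alignments, and tile/cluster shapes below are illustrative placeholders; see the example's own builder invocations for the exact argument lists.

```cpp
#include "cute/tensor.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"

using ElementA = cutlass::half_t;  using LayoutA = cutlass::layout::RowMajor;     // assumed
using ElementB = cutlass::half_t;  using LayoutB = cutlass::layout::ColumnMajor;  // assumed
using ElementC = float;            using LayoutC = cutlass::layout::ColumnMajor;  // assumed
using ElementAccumulator = float;
using TileShape    = cute::Shape<cute::_128, cute::_128, cute::_64>;
using ClusterShape = cute::Shape<cute::_1, cute::_2, cute::_1>;

using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
    cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
    TileShape, ClusterShape,
    cutlass::epilogue::collective::EpilogueTileAuto,
    ElementAccumulator, ElementAccumulator,
    ElementC, LayoutC, 4,
    ElementC, LayoutC, 4,
    cutlass::epilogue::collective::EpilogueScheduleAuto      // 'Auto' epilogue schedule
  >::CollectiveOp;

using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
    cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
    ElementA, LayoutA, 8,
    ElementB, LayoutB, 8,
    ElementAccumulator,
    TileShape, ClusterShape,
    cutlass::gemm::collective::StageCountAutoCarveout<
        static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
    cutlass::gemm::collective::KernelScheduleAuto             // 'Auto' mainloop schedule
  >::CollectiveOp;
```

Mixing an explicit schedule in one builder with 'Auto' in the other is exactly what the comment warns against.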
2 changes: 1 addition & 1 deletion examples/51_hopper_gett/51_hopper_gett.cu
@@ -58,7 +58,7 @@
In the following example, we illustrate how every Hopper GEMM in CUTLASS 3.0 is a GETT in disguise.
We begin by defining the four modes detailed above as Row, Col (column), Red (reduction), and
Bat (batch) strides, which we then nest for each of the in/out tensors to create our rank-3 stride
tuples. Note that although we do not define the problem shape type explicitely, it too remains a
tuples. Note that although we do not define the problem shape type explicitly, it too remains a
rank-4 shape tuple just like any other batched GEMM, but instead with multi-mode shapes for each
of the four corresponding multi-modes within it. After this, the same CollectiveMma and
CollectiveBuilder we describe in examples 50 and 49 are used to create our kernel type. Nothing
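A sketch of the stride nesting described above (mode counts and integer types are illustrative, not the exact aliases defined in the example):

```cpp
#include "cute/layout.hpp"

// Each tensor gets a rank-3 stride tuple whose entries are themselves tuples,
// one per multi-mode: row, column, reduction, and batch modes can each carry
// several sub-modes.
using RowStride = cute::Stride<int64_t, int64_t>;  // two row sub-modes (assumed)
using ColStride = cute::Stride<int64_t, int64_t>;  // two column sub-modes (assumed)
using RedStride = cute::Stride<int64_t>;           // one reduction sub-mode (assumed)
using BatStride = cute::Stride<int64_t>;           // one batch sub-mode (assumed)

using StrideA = cute::Stride<RowStride, RedStride, BatStride>;  // A: (Row, Red, Bat)
using StrideB = cute::Stride<ColStride, RedStride, BatStride>;  // B: (Col, Red, Bat)
using StrideC = cute::Stride<RowStride, ColStride, BatStride>;  // C/D: (Row, Col, Bat)
```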
2 changes: 1 addition & 1 deletion examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu
@@ -180,7 +180,7 @@ struct Options {
" --alpha=<float> GEMM alpha parameter\n"
" --beta=<float> GEMM beta parameter\n"
" --iterations=<int> Number of profiling iterations to perform.\n"
" --check=<bool> Validate results against a reference (unfused) imlementation"
" --check=<bool> Validate results against a reference (unfused) implementation"
" --verbose=<bool> Enable verbose output"
"\n"
"Examples:\n"
4 changes: 2 additions & 2 deletions examples/59_ampere_gather_scatter_conv/README.md
@@ -85,8 +85,8 @@ Now that we have transformed our problem in such a way that allows us to dispatc
we can reuse much of the machinery CUTLASS offers to implement this forward pass convolution
operator. CUTLASS decomposes these "moving parts" of GPU linear algebra into reusable,
modular software components abstracted by C++ template classes. This example
demonstrates how some of the lower layers of the hierarchy can be re-used for custom kernels
by writing a custom kernel for convolution that re-uses the Ampere/Ada GEMM collectives
demonstrates how some of the lower layers of the hierarchy can be reused for custom kernels
by writing a custom kernel for convolution that reuses the Ampere/Ada GEMM collectives
from CUTLASS 3.

A kernel author is free to compose their custom components with any of the existing templates
@@ -302,7 +302,7 @@ bool initialize_tensor(
cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
}
else {
throw std::runtime_error("Not implementated.");
throw std::runtime_error("Not implemented.");
}

return true;
@@ -340,7 +340,7 @@ bool initialize_scale_tensor(
cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
}
else {
throw std::runtime_error("Not implementated.");
throw std::runtime_error("Not implemented.");
}

return true;
@@ -312,7 +312,7 @@ bool initialize_tensor(
cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
}
else {
throw std::runtime_error("Not implementated.");
throw std::runtime_error("Not implemented.");
}

return true;
@@ -350,7 +350,7 @@ bool initialize_scale_tensor(
cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
}
else {
throw std::runtime_error("Not implementated.");
throw std::runtime_error("Not implemented.");
}

return true;
2 changes: 1 addition & 1 deletion examples/77_blackwell_fmha/README.md
@@ -8,7 +8,7 @@ For generation usage, use an M-blocking (Num-Groups) of 128 (although the limit

Context loads are done via TMA, whereas generation usage utilized `cp.async` and is thus more amenable to complex load patterns.

For variable sequence lenght, the code requires a batch of valid (but never used) padding memory ahead of the first input batch. This is achieved with least overhead by leaving one batch free and then arranging QKV consecutively.
For variable sequence length, the code requires a batch of valid (but never used) padding memory ahead of the first input batch. This is achieved with least overhead by leaving one batch free and then arranging QKV consecutively.

The approach of this implementation is to reuse the selection logic of the collective gemm builder and recombine the result into an FMHA kernel.
The kernel and collective layer are then formulated to be fmha-specific.
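A sketch of the padding arrangement mentioned above for variable sequence length; the buffer sizing, helper name, and pointer math are illustrative assumptions, not the example's actual allocation code:

```cpp
#include "cutlass/numeric_types.h"
#include "cutlass/util/device_memory.h"

// Allocate one extra batch worth of Q storage and start the tensor one batch
// into the buffer: the spare batch in front is valid memory that may be read
// as padding but never contributes to the result.
cutlass::half_t *allocate_q_with_padding_batch(
    cutlass::DeviceAllocation<cutlass::half_t> &storage,
    int num_batches, int max_seq_len, int num_heads, int head_dim) {
  size_t batch_elems = size_t(max_seq_len) * num_heads * head_dim;
  storage.reset((num_batches + 1) * batch_elems);  // one spare batch up front
  return storage.get() + batch_elems;              // batch 0 begins one batch in
}
```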
@@ -118,7 +118,7 @@ struct Sm100FmhaFwdEpilogueTmaWarpspecialized {
auto cumulative_length_q = get<0>(problem_shape).cumulative_length;
if (cumulative_length_q != nullptr) {
int max_length_q = get<0>(problem_shape).max_length;
// for variable sequence lenght, the batch is in units of row_stride
// for variable sequence length, the batch is in units of row_stride
get<2,1>(dO) = get<0>(dO);
get<2,1>(problem_shape_O) = max_length_q * (1 + get<2,1>(problem_shape_O));
// offset ptr by the amount we add back in later
@@ -101,7 +101,7 @@ struct Sm100FmhaLoadTmaWarpspecialized {
auto cumulative_length_q = get<0>(problem_shape).cumulative_length;
if (cumulative_length_q != nullptr) {
int max_length_q = get<0>(problem_shape).max_length;
// for variable sequence lenght, the batch is in units of row_stride
// for variable sequence length, the batch is in units of row_stride
get<2,1>(dQ) = get<0>(dQ);
get<3,1>(problem_shape_qk) = std::max(get<3,1>(problem_shape_qk), max_length_q * (1 + get<3,1>(problem_shape)));
// offset ptr by the amount we add back in later
@@ -113,7 +113,7 @@ struct Sm100FmhaLoadTmaWarpspecialized {
auto cumulative_length_kv = get<1>(problem_shape).cumulative_length;
if (cumulative_length_kv != nullptr) {
int max_length_kv = get<1>(problem_shape).max_length;
// for variable sequence lenght, the batch is in units of row_stride
// for variable sequence length, the batch is in units of row_stride
get<2,1>(dK) = get<0>(dK);
get<2,1>(dV) = get<0>(dV);
get<3,1>(problem_shape_qk) = std::max(get<3,1>(problem_shape_qk), max_length_kv * (1 + get<3,1>(problem_shape)));
@@ -108,7 +108,7 @@ struct Sm100MlaFwdLoadTmaWarpspecialized {
auto cumulative_length_q = get<0>(problem_shape).cumulative_length;
if (cumulative_length_q != nullptr) {
int max_length_q = get<0>(problem_shape).max_length;
// for variable sequence lenght, the batch is in units of row_stride
// for variable sequence length, the batch is in units of row_stride
get<2,1>(dQ) = get<0>(dQ);
get<3,1>(problem_shape_qk) = std::max(get<3,1>(problem_shape_qk), max_length_q * (1 + get<3,1>(problem_shape)));
// offset ptr by the amount we add back in later
@@ -120,7 +120,7 @@ struct Sm100MlaFwdLoadTmaWarpspecialized {
auto cumulative_length_kv = get<1>(problem_shape).cumulative_length;
if (cumulative_length_kv != nullptr) {
int max_length_kv = get<1>(problem_shape).max_length;
// for variable sequence lenght, the batch is in units of row_stride
// for variable sequence length, the batch is in units of row_stride
get<2,1>(dK) = get<0>(dK);
get<2,1>(dV) = get<0>(dV);
get<3,1>(problem_shape_qk) = std::max(get<3,1>(problem_shape_qk), max_length_kv * (1 + get<3,1>(problem_shape)));
2 changes: 1 addition & 1 deletion examples/77_blackwell_fmha/common/pipeline_mla.hpp
@@ -224,7 +224,7 @@ class PipelineTmaAsyncMla {
static constexpr bool is_2sm_mma = size(AtomThrShape_MNK{}) > 1;

// Consumer signalling Producer of completion
// Ensures all blocks in the Same Row and Column get notifed.
// Ensures all blocks in the Same Row and Column get notified.
CUTLASS_DEVICE
void consumer_release(uint32_t stage, uint32_t skip) {
detail::pipeline_check_is_consumer(params_.role);
@@ -435,7 +435,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized {
size_t workspace_size {0};
if (args.is_fused_reduction && args.split_kv > 1) {
// one exchange buffer for LSE max and another buffer for total LSE
// two locks per batch, frist lock is for CTA0 / H=0..63 and the second is for CTA1 / H=64..127
// two locks per batch, first lock is for CTA0 / H=0..63 and the second is for CTA1 / H=64..127
workspace_size = H * B * (sizeof(int) + sizeof(ElementLSE)) + 2 * B * sizeof(int);
} else if (!args.is_fused_reduction && args.split_kv > 1) {
workspace_size = (sizeof(ElementAcc) * D_latent + sizeof(ElementLSE)) * H * split_kv * B;
@@ -186,7 +186,7 @@ void fmha_mla_reference(
shared_mem);
if (cudaSuccess != result) {
result = cudaGetLastError(); // to clear the error bit
throw std::runtime_error("couldn't perform smem optin");
throw std::runtime_error("couldn't perform smem option");
}
}
fmha_mla_reference_kernel<<<grid, block, shared_mem>>>(
@@ -301,7 +301,7 @@ bool initialize_tensor(
cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
}
else {
throw std::runtime_error("Not implementated.");
throw std::runtime_error("Not implemented.");
}

return true;
@@ -339,7 +339,7 @@ bool initialize_scale_tensor(
cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
}
else {
throw std::runtime_error("Not implementated.");
throw std::runtime_error("Not implemented.");
}

return true;
@@ -308,7 +308,7 @@ bool initialize_tensor(
cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
}
else {
throw std::runtime_error("Not implementated.");
throw std::runtime_error("Not implemented.");
}

return true;
@@ -346,7 +346,7 @@ bool initialize_scale_tensor(
cutlass::reference::host::BlockFillSequential(view.data(), view.capacity());
}
else {
throw std::runtime_error("Not implementated.");
throw std::runtime_error("Not implemented.");
}

return true;
@@ -349,7 +349,7 @@ bool initialize_tensor(
cutlass::reference::host::TensorFill(view, Element(1));
}
else {
throw std::runtime_error("Not implementated.");
throw std::runtime_error("Not implemented.");
}

return true;
@@ -390,7 +390,7 @@ bool initialize_scale_tensor(
cutlass::reference::host::TensorFill(view, Element(1));
}
else {
throw std::runtime_error("Not implementated.");
throw std::runtime_error("Not implemented.");
}

return true;
@@ -356,7 +356,7 @@ bool initialize_tensor(
cutlass::reference::host::TensorFill(view, Element(1));
}
else {
throw std::runtime_error("Not implementated.");
throw std::runtime_error("Not implemented.");
}

return true;
@@ -397,7 +397,7 @@ bool initialize_scale_tensor(
cutlass::reference::host::TensorFill(view, Element(1));
}
else {
throw std::runtime_error("Not implementated.");
throw std::runtime_error("Not implemented.");
}

return true;