intel · AdityaKulshrestha · Dec 21, 2025 · Dec 21, 2025
diff --git a/examples/cpu/features/int8_recipe_tuning/int8_autotune.py b/examples/cpu/features/int8_recipe_tuning/int8_autotune.py
@@ -84,7 +84,7 @@ def train(dataloader, model, loss_fn, optimizer):
 
 epochs = 5
 for t in range(epochs):
-    print(f"Epoch {t+1}\n-------------------------------")
+    print(f"Epoch {t + 1}\n-------------------------------")
     train(train_dataloader, model, loss_fn, optimizer)
 print("Done!")
 

diff --git a/examples/cpu/inference/python/models/bert_large/training/cpu/run_pretrain_mlperf.py b/examples/cpu/inference/python/models/bert_large/training/cpu/run_pretrain_mlperf.py
@@ -1158,8 +1158,8 @@ def main():
                 print(
                     f"Step {training_steps:5d}: loss: {gloss:6.3f} lm_acc: {lm_acc:.3f} \
                     seq_acc: {seq_acc:.3f} lbs: {args.train_batch_size} gbs: {total_batch_size} \
-                    DT: {(t1-t0)*1000.0:.1f} XT: {(t2-t1)*1000.0:.1f} FT: {(t3-t2)*1000.0:.1f} \
-                    BT: {(t4-t3)*1000.0:.1f} OT: {(t5-t4)*1000.0:.1f} TT: {(t5-t0)*1000.0:.1f}"
+                    DT: {(t1 - t0) * 1000.0:.1f} XT: {(t2 - t1) * 1000.0:.1f} FT: {(t3 - t2) * 1000.0:.1f} \
+                    BT: {(t4 - t3) * 1000.0:.1f} OT: {(t5 - t4) * 1000.0:.1f} TT: {(t5 - t0) * 1000.0:.1f}"
                 )
 
                 update_step = training_steps % args.gradient_accumulation_steps == 0

diff --git a/examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py b/examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py
@@ -1064,7 +1064,7 @@ def trace_handler(prof):
     generated, _ = generate()
     t_generate_span = time.time() - t_generate_start
     for i, o, _ in generated:
-        print_rank0(f"{'-'*60}\nin={i}\nout={o}\n")
+        print_rank0(f"{'-' * 60}\nin={i}\nout={o}\n")
 
 # benchmark it!
 else:

diff --git a/intel_extension_for_pytorch/llm/modules/mha_fusion.py b/intel_extension_for_pytorch/llm/modules/mha_fusion.py
@@ -553,12 +553,12 @@ class PagedAttention:
         alibi_slopes (torch.Tensor, optinal): which is the alibi slope with the shape of (num_heads).
         softcap (float): the positive softcap value to apply on the attention weights, default is -1.
 
-    [class method]: flash_atten_varlen
+    [class method]: flash_attn_varlen_func
 
     .. highlight:: python
     .. code-block:: python
 
-        ipex.llm.modules.PagedAttention.flash_atten_varlen(
+        ipex.llm.modules.PagedAttention.flash_attn_varlen_func(
             out,
             query,
             key_cache,
@@ -573,8 +573,8 @@ class PagedAttention:
             alibi_slopes,
             window_size_left,
             window_size_right,
-            k_scale,
-            v_scale
+            k_scale=k_scale,
+            v_scale=v_scale
         )
 
     Args: