From ed413217e4abe559f54ef883f03f1c5eb3251792 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 18 Dec 2025 03:40:02 -0500 Subject: [PATCH 01/24] initial implementation Signed-off-by: He, Xin3 --- test/README.md | 0 test/{test_hpu => }/conftest.py | 14 ++ test/fixtures.py | 63 ++++++++ .../{test_hpu/_test_helpers.py => helpers.py} | 22 +++ test/test_ark/test_model.py | 78 +++------- test/test_cpu/__init__.py | 0 test/test_cpu/_test_helpers.py | 32 ---- test/test_cpu/test_act_quantization.py | 135 ++++++----------- test/test_cpu/test_alg_ext.py | 21 +-- test/test_cpu/test_auto_scheme.py | 52 +++---- test/test_cpu/test_autoopt.py | 43 ++---- test/test_cpu/test_autoround.py | 141 ++++++++---------- test/test_cpu/test_autoround_acc.py | 44 ++---- .../test_autoround_export_to_itrex.py | 42 ++---- test/test_cpu/test_block_names.py | 35 ++--- test/test_cpu/test_calib_dataset.py | 26 +--- test/test_cpu/test_cli_usage.py | 13 +- test/test_cpu/test_conv1d.py | 31 +--- test/test_cpu/test_export.py | 81 ++++------ test/test_cpu/test_generation.py | 28 +--- test/test_cpu/test_gguf_format.py | 49 ++---- test/test_cpu/test_llmcompressor.py | 27 ++-- test/test_cpu/test_load_awq_gptq.py | 35 +---- test/test_cpu/test_mix_bits.py | 51 +++---- test/test_cpu/test_mllm.py | 22 +-- test/test_cpu/test_model_scope.py | 27 +--- test/test_cpu/test_mxfp_nvfp.py | 71 ++++----- test/test_cpu/test_scheme.py | 90 +++++------ test/test_cpu/test_script.py | 10 +- test/test_cpu/test_torch_backend.py | 65 ++------ test/test_cpu/test_utils.py | 2 - test/test_cpu/test_woq_linear.py | 3 - test/test_cuda/__init__.py | 0 test/test_cuda/_test_helpers.py | 32 ---- test/test_cuda/test_2_3bits.py | 44 +----- test/test_cuda/test_alg_ext.py | 14 +- test/test_cuda/test_auto_round_format.py | 87 +++-------- test/test_cuda/test_auto_scheme.py | 45 +++--- test/test_cuda/test_calib_dataset.py | 15 +- test/test_cuda/test_conv1d.py | 29 +--- test/test_cuda/test_diffusion.py | 16 +- test/test_cuda/test_exllamav2_backend.py | 68 ++------- test/test_cuda/test_export.py | 64 +++----- test/test_cuda/test_fp8_input.py | 14 +- test/test_cuda/test_get_block_name.py | 14 +- test/test_cuda/test_gguf.py | 28 +--- test/test_cuda/test_main_func.py | 14 +- test/test_cuda/test_marlin_backend.py | 74 ++------- test/test_cuda/test_mix_bits.py | 60 +++----- test/test_cuda/test_multiple_card.py | 32 ++-- test/test_cuda/test_multiple_card_calib.py | 13 +- test/test_cuda/test_mxfp_nvfp.py | 44 ++---- test/test_cuda/test_qbits.py | 42 +----- test/test_cuda/test_scheme.py | 73 ++++----- test/test_cuda/test_support_vlms.py | 14 +- test/test_cuda/test_torch_backend.py | 66 ++------ test/test_cuda/test_transformers.py | 10 +- test/test_cuda/test_triton_backend.py | 78 +++------- test/test_cuda/test_vlms.py | 16 +- test/test_hpu/__init__.py | 0 test/test_hpu/test_auto_round.py | 3 +- test/test_hpu/test_inference.py | 24 +-- test/test_xpu/__init__.py | 0 test/test_xpu/test_autoround.py | 28 +--- 64 files changed, 782 insertions(+), 1632 deletions(-) create mode 100644 test/README.md rename test/{test_hpu => }/conftest.py (72%) create mode 100644 test/fixtures.py rename test/{test_hpu/_test_helpers.py => helpers.py} (63%) create mode 100644 test/test_cpu/__init__.py delete mode 100644 test/test_cpu/_test_helpers.py create mode 100644 test/test_cuda/__init__.py delete mode 100644 test/test_cuda/_test_helpers.py create mode 100644 test/test_hpu/__init__.py create mode 100644 test/test_xpu/__init__.py diff --git a/test/README.md b/test/README.md new file mode 100644 
index 000000000..e69de29bb
diff --git a/test/test_hpu/conftest.py b/test/conftest.py
similarity index 72%
rename from test/test_hpu/conftest.py
rename to test/conftest.py
index f4e9675bf..ebe377e48 100644
--- a/test/test_hpu/conftest.py
+++ b/test/conftest.py
@@ -1,9 +1,23 @@
 import os
+import sys
 from typing import Mapping
 
 import pytest
 
+from .fixtures import (
+    dataloader,
+    model,
+    tiny_opt_model,
+    tiny_opt_model_path,
+    tokenizer,
+)
+from .helpers import model_infer
 
+# Allow easy debugging without installing auto-round.
+sys.path.insert(0, "..")
+
+
+### HPU-related configuration, usage: `pytest --mode=compile/lazy`
 def pytest_addoption(parser):
     parser.addoption(
         "--mode",
diff --git a/test/fixtures.py b/test/fixtures.py
new file mode 100644
index 000000000..615e579a8
--- /dev/null
+++ b/test/fixtures.py
@@ -0,0 +1,55 @@
+import shutil
+
+import pytest
+import torch
+import transformers
+
+from .helpers import opt_name_or_path
+
+
+class DataLoader:
+    def __init__(self):
+        self.batch_size = 1
+
+    def __iter__(self):
+        for i in range(2):
+            yield torch.ones([1, 10], dtype=torch.long)
+
+
+@pytest.fixture(scope="session")
+def tiny_opt_model_path():
+    tiny_opt_model_path = "./tmp_tiny_opt_model_path"
+    model = transformers.AutoModelForCausalLM.from_pretrained(opt_name_or_path, dtype="auto", trust_remote_code=True)
+    model.config.num_hidden_layers = 3
+    setattr(model.model.decoder, "layers", model.model.decoder.layers[:3])
+    tokenizer = transformers.AutoTokenizer.from_pretrained(opt_name_or_path, trust_remote_code=True)
+    model.save_pretrained(tiny_opt_model_path)
+    tokenizer.save_pretrained(tiny_opt_model_path)
+    print("[Fixture]: built tiny model path for testing in session")
+    yield tiny_opt_model_path
+    shutil.rmtree(tiny_opt_model_path)
+
+
+@pytest.fixture(scope="function")
+def tiny_opt_model():
+    model = transformers.AutoModelForCausalLM.from_pretrained(opt_name_or_path, dtype="auto", trust_remote_code=True)
+    model.config.num_hidden_layers = 3
+    setattr(model.model.decoder, "layers", model.model.decoder.layers[:3])
+    return model
+
+
+@pytest.fixture(scope="function")
+def model():
+    model = transformers.AutoModelForCausalLM.from_pretrained(opt_name_or_path, dtype="auto", trust_remote_code=True)
+    return model
+
+
+@pytest.fixture(scope="session")
+def tokenizer():
+    tokenizer = transformers.AutoTokenizer.from_pretrained(opt_name_or_path, trust_remote_code=True)
+    return tokenizer
+
+
+@pytest.fixture(scope="session")
+def dataloader():
+    return DataLoader()
diff --git a/test/test_hpu/_test_helpers.py b/test/helpers.py
similarity index 63%
rename from test/test_hpu/_test_helpers.py
rename to test/helpers.py
index 48e8398d7..97870eba6 100644
--- a/test/test_hpu/_test_helpers.py
+++ b/test/helpers.py
@@ -1,6 +1,27 @@
+import os
+
 import pytest
 
+# Automatically choose the local model path or the model name.
+opt_name_or_path = "/tf_dataset/auto_round/models/facebook/opt-125m" +if not os.path.exists(opt_name_or_path): + opt_name_or_path = "facebook/opt-125m" + +qwen_name_or_path = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" +if not os.path.exists(qwen_name_or_path): + qwen_name_or_path = "Qwen/Qwen3-0.6B" + +lamini_name_or_path = "/tf_dataset/auto_round/models/MBZUAI/LaMini-GPT-124M" +if not os.path.exists(lamini_name_or_path): + lamini_name_or_path = "MBZUAI/LaMini-GPT-124M" + +gptj_name_or_path = "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM" +if not os.path.exists(gptj_name_or_path): + gptj_name_or_path = "hf-internal-testing/tiny-random-GPTJForCausalLM" + + +# HPU mode checking def is_pytest_mode_compile(): return pytest.mode == "compile" @@ -9,6 +30,7 @@ def is_pytest_mode_lazy(): return pytest.mode == "lazy" +# General model inference code def model_infer(model, tokenizer, apply_chat_template=False): prompts = [ "Hello,my name is", diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py index 911a186c0..622f4a6dd 100644 --- a/test/test_ark/test_model.py +++ b/test/test_ark/test_model.py @@ -1,11 +1,6 @@ import shutil -import sys -import unittest import pytest - -sys.path.insert(0, "../..") - import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -13,58 +8,25 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel - -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +from ..helpers import model_infer -class TestAutoRoundTorchBackend(unittest.TestCase): +class TestAutoRoundTorchBackend: - @classmethod - def setUpClass(self): - self.model_name = "facebook/opt-125m" - self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] + # Yield to hand control to the test methods + yield - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] - - @classmethod - def tearDownClass(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_torch_4bits_sym_cpu(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_torch_4bits_sym_cpu(self, 
model, tokenizer, dataloader): bits, group_size, sym = 4, 32, True autoround = AutoRound( model, @@ -74,7 +36,7 @@ def test_torch_4bits_sym_cpu(self): sym=sym, iters=0, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") @@ -85,16 +47,14 @@ def test_torch_4bits_sym_cpu(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000) print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28) shutil.rmtree("./saved", ignore_errors=True) - def test_torch_4bits_sym_xpu(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_torch_4bits_sym_xpu(self, model, tokenizer, dataloader): bits, group_size, sym = 4, 32, True autoround = AutoRound( model, @@ -104,7 +64,7 @@ def test_torch_4bits_sym_xpu(self): sym=sym, iters=0, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model @@ -115,13 +75,9 @@ def test_torch_4bits_sym_xpu(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000) print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28) torch.xpu.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/__init__.py b/test/test_cpu/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_cpu/_test_helpers.py b/test/test_cpu/_test_helpers.py deleted file mode 100644 index b4b8a5955..000000000 --- a/test/test_cpu/_test_helpers.py +++ /dev/null @@ -1,32 +0,0 @@ -def model_infer(model, tokenizer, apply_chat_template=False): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - if apply_chat_template: - texts = [] - for prompt in prompts: - messages = [{"role": "user", "content": prompt}] - text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - texts.append(text) - prompts = texts - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] diff --git a/test/test_cpu/test_act_quantization.py b/test/test_cpu/test_act_quantization.py 
index 31ba51f1b..0483c027d 100644 --- a/test/test_cpu/test_act_quantization.py +++ b/test/test_cpu/test_act_quantization.py @@ -1,87 +1,72 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 +class TestAutoRoundAct: + save_dir = "./saved" - def __iter__(self): - for i in range(3): - yield torch.ones([1, 10], dtype=torch.long) + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + # Yield to hand control to the test methods + yield -class TestAutoRoundAct(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - self.save_dir = "./saved" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() - - @classmethod - def tearDownClass(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_mx_fp4(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + def test_mx_fp4(self, tiny_opt_model, tokenizer, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, + tiny_opt_model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, act_bits=4, data_type="mx_fp", ) autoround.quantize() - def test_wint4fp8_dynamic(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + def test_wint4fp8_dynamic(self, tiny_opt_model, tokenizer, dataloader): bits, group_size = 4, 128 autoround = AutoRound( - model, + tiny_opt_model, tokenizer, bits=bits, group_size=group_size, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, act_bits=8, data_type="fp8", act_data_type="fp8", ) autoround.quantize() - def test_wint4fp8_static(self): + def test_wint4fp8_static(self, tiny_opt_model, tokenizer, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( - self.model, - self.tokenizer, + tiny_opt_model, + tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, act_bits=8, data_type="fp8_to_int_sym", act_dynamic=False, @@ -89,66 +74,42 @@ def test_wint4fp8_static(self): ) autoround.quantize() - def test_wfp8afp8_static(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + @pytest.mark.parametrize("act_group_size", [-1, 128]) + def test_wfp8afp8_static(self, act_group_size, tiny_opt_model, tokenizer, dataloader): from auto_round.wrapper import WrapperWALayer - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = 
AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) autoround = AutoRound( - model, + tiny_opt_model, tokenizer, group_size=128, - act_group_size=-1, + act_group_size=act_group_size, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, data_type="fp8", act_dynamic=False, act_data_type="fp8", ) autoround.quantize() - self.assertTrue(isinstance(autoround.model.model.decoder.layers[2].self_attn.k_proj, WrapperWALayer)) - self.assertEqual(autoround.model.model.decoder.layers[2].self_attn.k_proj.orig_layer.act_scale.shape[0], 30) - self.assertEqual(autoround.model.model.decoder.layers[2].self_attn.k_proj.orig_layer.act_max.shape[0], 30) + k_proj = autoround.model.model.decoder.layers[2].self_attn.k_proj + assert isinstance(k_proj, WrapperWALayer), "k_proj should be WrapperWALayer" + if act_group_size == -1: + assert k_proj.orig_layer.act_scale.shape[0] == 20, "act_scale shape[0] should be 20" + assert k_proj.orig_layer.act_max.shape[0] == 20, "act_max shape[0] should be 20" + else: + assert k_proj.orig_layer.act_scale.shape[0] == int(2 * 10 * 768 / 128), "act_scale shape[0] is incorrect" + assert k_proj.orig_layer.act_max.shape[0] == int(2 * 10 * 768 / 128), "act_max shape[0] is incorrect" - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - autoround = AutoRound( - model, - tokenizer, - group_size=128, - act_group_size=128, - iters=0, - seqlen=2, - dataset=self.llm_dataloader, - data_type="fp8", - act_dynamic=False, - act_data_type="fp8", - ) - autoround.quantize() - self.assertTrue(isinstance(autoround.model.model.decoder.layers[2].self_attn.k_proj, WrapperWALayer)) - - self.assertEqual( - autoround.model.model.decoder.layers[2].self_attn.k_proj.orig_layer.act_scale.shape[0], - int(3 * 10 * 768 / 128), - ) - self.assertEqual( - autoround.model.model.decoder.layers[2].self_attn.k_proj.orig_layer.act_max.shape[0], - int(3 * 10 * 768 / 128), - ) - - def test_act_config_MXFP4_saving(self): + def test_act_config_MXFP4_saving(self, tiny_opt_model_path, dataloader): scheme = "MXFP4" layer_config = {"lm_head": {"act_bits": 8, "bits": 8}, "k_proj": {"act_bits": 8, "bits": 8}} autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -168,15 +129,15 @@ def test_act_config_MXFP4_saving(self): assert "sym" in kproj_config.keys() and kproj_config["sym"] shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_act_config_NVFP4_saving(self): + def test_act_config_NVFP4_saving(self, tiny_opt_model_path, dataloader): scheme = "NVFP4" layer_config = {"k_proj": {"act_bits": 16, "bits": 16}} autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -193,16 +154,16 @@ def test_act_config_NVFP4_saving(self): assert "sym" in kproj_config.keys() and kproj_config["sym"] shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_WOQ_config_INT_saving(self): + def test_WOQ_config_INT_saving(self, tiny_opt_model_path, dataloader): scheme = "W4A16" layer_config = {"k_proj": {"bits": 8}} autoround = AutoRound( - self.model_name, + tiny_opt_model_path, 
scheme=scheme, iters=2, seqlen=2, sym=False, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -223,7 +184,7 @@ def test_WOQ_config_INT_saving(self): assert "act_dynamic" in kproj_config.keys() and kproj_config["act_dynamic"] shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_act_config_FP8_saving(self): + def test_act_config_FP8_saving(self, tiny_opt_model_path, dataloader): scheme = "FP8_STATIC" layer_config = { "lm_head": {"act_bits": 8, "bits": 8}, @@ -237,11 +198,11 @@ def test_act_config_FP8_saving(self): }, } autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -262,7 +223,3 @@ def test_act_config_FP8_saving(self): assert "group_size" in kproj_config.keys() and kproj_config["group_size"] == 0 assert "sym" in kproj_config.keys() and kproj_config["sym"] shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_alg_ext.py b/test/test_cpu/test_alg_ext.py index b0c909bd3..504b7d0f8 100644 --- a/test/test_cpu/test_alg_ext.py +++ b/test/test_cpu/test_alg_ext.py @@ -1,29 +1,22 @@ -import copy -import shutil -import sys -import unittest - -from parameterized import parameterized - -sys.path.insert(0, "../..") - from auto_round import AutoRound +from ..helpers import opt_name_or_path, qwen_name_or_path + -class TestAlgExt(unittest.TestCase): +class TestAlgExt: def test_alg_ext(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path ar = AutoRound(model_name, scheme="W2A16", iters=1, nsamples=1, enable_alg_ext=True) ar.quantize() - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = qwen_name_or_path ar = AutoRound(model_name, scheme="gguf:q4_k_s", iters=1, nsamples=1, enable_alg_ext=True) ar.quantize() from auto_round.auto_scheme import AutoScheme scheme = AutoScheme(options=["mxfp4", "mxfp8"], avg_bits=5.5, ignore_scale_zp_bits=True) - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = qwen_name_or_path ar = AutoRound(model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True) ar.quantize() @@ -31,7 +24,7 @@ def test_alg_ext_import(self): from auto_round.alg_ext import wrapper_autoround def test_all_support_dtype(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path for scheme in ["MXFP4", "NVFP4", "W2A16G64"]: ar = AutoRound( model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True diff --git a/test/test_cpu/test_auto_scheme.py b/test/test_cpu/test_auto_scheme.py index cd38b220d..b6c20826e 100644 --- a/test/test_cpu/test_auto_scheme.py +++ b/test/test_cpu/test_auto_scheme.py @@ -1,24 +1,28 @@ import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest + from auto_round import AutoRound, AutoRoundConfig, AutoScheme -class TestAutoScheme(unittest.TestCase): - @classmethod - def setUpClass(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" +class TestAutoScheme: + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + 
yield - @classmethod - def tearDownClass(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_auto_scheme_export(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_auto_scheme_export(self, tiny_opt_model_path): + model_name = tiny_opt_model_path scheme = AutoScheme(avg_bits=2, options=("W2A16"), nsamples=1, ignore_scale_zp_bits=True) ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) ar.quantize_and_save(self.save_dir) @@ -29,27 +33,23 @@ def test_auto_scheme_export(self): ar.quantize_and_save(self.save_dir) shutil.rmtree(self.save_dir, ignore_errors=True) - def test_layer_config(self): + def test_layer_config(self, tiny_opt_model_path): from auto_round.auto_scheme.utils import compute_avg_bits_for_model from auto_round.utils import get_module target_bits = 3.0 - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "BF16")) - user_layer_config = {"model.decoder.layers.10.fc1": {"bits": 8, "group_size": 32, "sym": False}} + user_layer_config = {"model.decoder.layers.1.fc1": {"bits": 8, "group_size": 32, "sym": False}} ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["bits"], 8) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["sym"], False) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["group_size"], 32) - layer = get_module(model, "model.decoder.layers.10.fc1") - self.assertEqual(layer.bits, 8) - self.assertEqual(layer.sym, False) - self.assertEqual(layer.group_size, 32) + assert layer_config["model.decoder.layers.1.fc1"]["bits"] == 8 + assert layer_config["model.decoder.layers.1.fc1"]["sym"] == False + assert layer_config["model.decoder.layers.1.fc1"]["group_size"] == 32 + layer = get_module(model, "model.decoder.layers.1.fc1") + assert layer.bits == 8 + assert layer.sym == False + assert layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_autoopt.py b/test/test_cpu/test_autoopt.py index f9801217e..472711155 100644 --- a/test/test_cpu/test_autoopt.py +++ b/test/test_cpu/test_autoopt.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer @@ -11,48 +9,37 @@ from auto_round import AutoRoundAdam -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 +class TestAutoRound: - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + # Yield to hand control to the test methods + yield -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - self.tokenizer = 
AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() - - @classmethod - def tearDownClass(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_Adam(self): + def test_Adam(self, tiny_opt_model, tokenizer, dataloader): bits, group_size, sym = 4, 128, False from auto_round.utils import get_block_names - llm_block_names = get_block_names(self.model, quant_vision=True) + llm_block_names = get_block_names(tiny_opt_model, quant_vision=True) bits, group_size, sym, batch_size = 4, 128, False, 20 adamround = AutoRoundAdam( - self.model, - self.tokenizer, + tiny_opt_model, + tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, batch_size=batch_size, - dataset=self.llm_dataloader, + dataset=dataloader, to_quant_block_names=llm_block_names, ) adamround.quantize() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_autoround.py b/test/test_cpu/test_autoround.py index 2790f8817..1f1f85f55 100644 --- a/test/test_cpu/test_autoround.py +++ b/test/test_cpu/test_autoround.py @@ -1,41 +1,28 @@ import copy import shutil -import sys -import unittest - -from parameterized import parameterized - -sys.path.insert(0, "../..") +import pytest import torch -from _test_helpers import model_infer +from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.utils import get_module +from ..helpers import model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(3): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() self.save_folder = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -49,7 +36,7 @@ def test_bits_setting(self): if module.bits != 8: raise ValueError(f"Expected bits to be 8, but got {module.bits}") - def test_layer_config(self): + def test_layer_config(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" layer_config = {"self_attn": {"bits": 4, "data_type": "nv_fp", "act_bits": 16, "group_size": 16}} autoround = AutoRound( @@ -58,14 +45,14 @@ def test_layer_config(self): scheme="NVFP4", iters=0, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, amp=False, ) autoround.quantize_and_save(self.save_folder, inplace=False, format="fake") shutil.rmtree(self.save_folder) - def test_remove_whole_block(self): + def test_remove_whole_block(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" layer_config = { "model.decoder.layers.0.self_attn.k_proj": {"bits": 32}, @@ -83,12 +70,12 @@ def test_remove_whole_block(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, 
layer_config=layer_config, ) autoround.quantize() - def test_consecutive_quant(self): + def test_consecutive_quant(self, dataloader): bits, group_size, sym = 4, -1, False autoround = AutoRound( self.model, @@ -98,7 +85,7 @@ def test_consecutive_quant(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() @@ -116,11 +103,11 @@ def test_consecutive_quant(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_mx_fp4(self): + def test_mx_fp4(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, 32, False autoround = AutoRound( @@ -142,7 +129,7 @@ def test_mx_fp4(self): print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.3) # 0.375 - def test_nv_fp4(self): + def test_nv_fp4(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, 16, False autoround = AutoRound( @@ -152,7 +139,7 @@ def test_nv_fp4(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, data_type="nv_fp4", ) model, _ = autoround.quantize() @@ -162,7 +149,7 @@ def test_nv_fp4(self): print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) - def test_w4g1(self): + def test_w4g1(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, -1, True autoround = AutoRound( @@ -172,12 +159,12 @@ def test_w4g1(self): sym=sym, iters=2, seqlen=10, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() @parameterized.expand([(2,), (3,), (4,)]) - def test_g128(self, bits): + def test_g128(self, bits, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" group_size, sym = 128, True autoround = AutoRound( @@ -187,7 +174,7 @@ def test_g128(self, bits): sym=sym, iters=2, seqlen=10, - dataset=self.llm_dataloader, + dataset=dataloader, ) model, _ = autoround.quantize() if bits > 2: @@ -197,7 +184,7 @@ def test_g128(self, bits): print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.3) - def test_disable_quanted_input(self): + def test_disable_quanted_input(self, dataloader): bits, group_size, sym = 4, -1, True autoround = AutoRound( self.model, @@ -208,11 +195,11 @@ def test_disable_quanted_input(self): iters=2, seqlen=10, enable_quanted_input=False, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_enable_norm_bias_tuning_qwen3(self): + def test_enable_norm_bias_tuning_qwen3(self, dataloader): bits, group_size, sym = 4, 128, True model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -226,11 +213,11 @@ def test_enable_norm_bias_tuning_qwen3(self): iters=2, seqlen=10, enable_norm_bias_tuning=True, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_enable_norm_bias_tuning(self): + def test_enable_norm_bias_tuning(self, dataloader): bits, group_size, sym = 4, -1, True autoround = AutoRound( self.model, @@ -242,11 +229,11 @@ def test_enable_norm_bias_tuning(self): seqlen=10, enable_quanted_input=False, enable_norm_bias_tuning=True, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def 
test_disable_minmax_tuning(self): + def test_disable_minmax_tuning(self, dataloader): bits, group_size, sym = 4, -1, True autoround = AutoRound( self.model, @@ -257,12 +244,12 @@ def test_disable_minmax_tuning(self): iters=2, seqlen=10, enable_minmax_tuning=False, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() # - def test_signround(self): + def test_signround(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, -1, False autoround = AutoRound( @@ -274,11 +261,11 @@ def test_signround(self): seqlen=10, enable_minmax_tuning=False, enable_quanted_input=False, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_lm_head_layer_config_way(self): + def test_lm_head_layer_config_way(self, dataloader): bits, group_size, sym = 4, -1, False layer_config = {"lm_head": {"data_type": "int"}} autoround = AutoRound( @@ -291,12 +278,12 @@ def test_lm_head_layer_config_way(self): seqlen=10, enable_minmax_tuning=False, enable_quanted_input=False, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() - def test_wa_quant(self): + def test_wa_quant(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym, act_bits = 4, 128, False, 4 autoround = AutoRound( @@ -306,12 +293,12 @@ def test_wa_quant(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, act_bits=act_bits, ) autoround.quantize() - def test_auto_device_map(self): + def test_auto_device_map(self, dataloader): bits, group_size, sym = 4, 128, False model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained( @@ -325,11 +312,11 @@ def test_auto_device_map(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_device_map_dict(self): + def test_device_map_dict(self, dataloader): bits, group_size, sym = 4, 128, False device_map = {".*": "cpu"} autoround = AutoRound( @@ -340,7 +327,7 @@ def test_device_map_dict(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, device_map=device_map, ) autoround.quantize() @@ -355,12 +342,12 @@ def test_device_map_dict(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, device_map=device_map, ) autoround.quantize() - def test_fp32(self): + def test_fp32(self, dataloader): bits, group_size, sym = 4, 128, False model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained( @@ -374,12 +361,12 @@ def test_fp32(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, amp=False, ) autoround.quantize() - def test_tensor_reshape(self): + def test_tensor_reshape(self, dataloader): bits, group_size, sym = 4, 100, False autoround = AutoRound( self.model, @@ -389,7 +376,7 @@ def test_tensor_reshape(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() @@ -412,7 +399,7 @@ def test_rtn(self): model_infer(model, tokenizer) shutil.rmtree(self.save_folder) - def test_embed_quant(self): + def test_embed_quant(self, dataloader): bits, group_size, sym = 4, 128, True model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" layer_config = { @@ -426,12 +413,12 @@ def test_embed_quant(self): iters=2, seqlen=2, nsamples=3, - dataset=self.llm_dataloader, + dataset=dataloader, 
layer_config=layer_config, ) autoround.quantize() - def test_fallback_layers(self): + def test_fallback_layers(self, dataloader): bits, group_size, sym = 4, 128, True model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained( @@ -450,7 +437,7 @@ def test_fallback_layers(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -520,7 +507,7 @@ def test_not_convert_modules(self): ) print(output_text) - def test_fallback_layers_regex_awq(self): + def test_fallback_layers_regex_awq(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -537,7 +524,7 @@ def test_fallback_layers_regex_awq(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -556,7 +543,7 @@ def test_fallback_layers_regex_awq(self): print(res) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_fallback_layers_regex_gptq(self): + def test_fallback_layers_regex_gptq(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -573,7 +560,7 @@ def test_fallback_layers_regex_gptq(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -592,7 +579,7 @@ def test_fallback_layers_regex_gptq(self): print(res) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_fallback_layers_regex_round(self): + def test_fallback_layers_regex_round(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -609,7 +596,7 @@ def test_fallback_layers_regex_round(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -628,7 +615,7 @@ def test_fallback_layers_regex_round(self): print(res) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_fallback_layers_regex_exception(self): + def test_fallback_layers_regex_exception(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -643,7 +630,7 @@ def test_fallback_layers_regex_exception(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -666,16 +653,16 @@ def test_dequant_fp8_weight(self): weight_scale = torch.randn(5, 56) block_size = [128, 128] dequant_weight = dequant_block_fp8_weight(weight, weight_scale, block_size) - self.assertEqual(dequant_weight.shape.numel(), 4207616) + assert dequant_weight.shape.numel() == 4207616 # test experts are stacked. 
weight = torch.randn([32, 5760, 1440]) weight_scale = torch.randn([32, 5760, 90]) block_size = [1, 16] dequant_weight = dequant_block_fp8_weight(weight, weight_scale, block_size) - self.assertEqual(len(dequant_weight.shape), 3) - self.assertEqual(dequant_weight.shape[0], 32) - self.assertEqual(dequant_weight.shape.numel(), 32 * 5760 * 1440) + assert len(dequant_weight.shape) == 3 + assert dequant_weight.shape[0] == 32 + assert dequant_weight.shape.numel() == 32 * 5760 * 1440 def test_mixed_bit_setting(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" @@ -754,7 +741,7 @@ def test_quant_lm_head_layer_config(self): def test_compressor(self): model_name = "Qwen/Qwen2-VL-2B-Instruct" ar = AutoRound(model_name, enable_adam=True) - self.assertEqual(ar.optimizer, torch.optim.AdamW) + assert ar.optimizer == torch.optim.AdamW self.assertTrue(ar.mllm) # test old api @@ -801,7 +788,7 @@ def test_attention_mask_via_tokenize_in_dataset(self): ar = AutoRound(model_name, iters=1, dataset=data, seqlen=8) ar.quantize() - def test_low_cpu_mem_usage(self): + def test_low_cpu_mem_usage(self, dataloader): bits, group_size = 4, 32 model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -814,7 +801,7 @@ def test_low_cpu_mem_usage(self): group_size=group_size, iters=2, seqlen=10, - dataset=self.llm_dataloader, + dataset=dataloader, low_cpu_mem_usage=True, device_map="cpu", ) @@ -826,7 +813,3 @@ def test_create_adam(self): from auto_round import AutoRound ar = AutoRound(model=model_name, enable_adam=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_autoround_acc.py b/test/test_cpu/test_autoround_acc.py index 41b28e663..721a5c8ed 100644 --- a/test/test_cpu/test_autoround_acc.py +++ b/test/test_cpu/test_autoround_acc.py @@ -1,42 +1,29 @@ import copy import shutil -import sys -import unittest - -from auto_round.eval.evaluation import simple_evaluate - -sys.path.insert(0, "../..") from math import isclose +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound # pylint: disable=E0401 - -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +from ..helpers import gptj_name_or_path -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): - self.llm_dataloader = LLMDataLoader() + def setup_class(self): self.save_dir = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_dir, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_default_acc(self): - model_name = "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM" + def test_default_acc(self, dataloader): + model_name = gptj_name_or_path model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -50,7 +37,7 @@ def test_default_acc(self): sym=sym, iters=2, seqlen=10, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() out0 = model(inp) @@ -66,7 +53,7 @@ def test_default_acc(self): device="cpu", iters=2, seqlen=10, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround_1.quantize() out1 = 
model_tmp(inp)
@@ -74,20 +61,11 @@
         assert out0[0].equal(out1[0])
         self.assertTrue(isclose(float(out0[0][0][0][0]), -0.021002087742090225, rel_tol=5e-04))
 
-    def test_3bits_asym_autoround(self):
-        model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
+    def test_3bits_asym_autoround(self, tiny_opt_model_path):
+        model_name = tiny_opt_model_path
         bits, sym = 3, False
         autoround = AutoRound(model_name, bits=bits, sym=sym, iters=0)
         autoround.quantize_and_save(self.save_dir, format="auto_round", inplace=False)
         model_args = f"pretrained={self.save_dir}"
-        # res = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto", limit=10)
-
-        # accuracy = res["results"]["lambada_openai"]["acc,none"]
-        # print(f"accuracy = {accuracy}")
-        # assert accuracy > 0.15
 
         shutil.rmtree(self.save_dir, ignore_errors=True)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/test/test_cpu/test_autoround_export_to_itrex.py b/test/test_cpu/test_autoround_export_to_itrex.py
index d9b4f42c6..d4cc2a73c 100644
--- a/test/test_cpu/test_autoround_export_to_itrex.py
+++ b/test/test_cpu/test_autoround_export_to_itrex.py
@@ -1,15 +1,15 @@
 import copy
 import shutil
-import sys
-import unittest
 
-sys.path.insert(0, "../..")
+import pytest
 import torch
 import transformers
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from auto_round import AutoRound
+from ..helpers import gptj_name_or_path
+
 
 class SimpleDataLoader:
     def __init__(self):
@@ -20,35 +20,23 @@ def __iter__(self):
             yield torch.randn([1, 30])
 
 
-class LLMDataLoader:
-    def __init__(self):
-        self.batch_size = 1
-
-    def __iter__(self):
-        for i in range(2):
-            yield torch.ones([1, 10], dtype=torch.long)
-
-
-class TestAutoroundExport(unittest.TestCase):
+class TestAutoroundExport:
     approach = "weight_only"
 
     @classmethod
-    def setUpClass(self):
+    def setup_class(self):
         self.gptj = transformers.AutoModelForCausalLM.from_pretrained(
-            "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM",
+            gptj_name_or_path,
             torchscript=True,
         )
-        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-            "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM", trust_remote_code=True
-        )
+        self.tokenizer = transformers.AutoTokenizer.from_pretrained(gptj_name_or_path, trust_remote_code=True)
         self.gptj_no_jit = transformers.AutoModelForCausalLM.from_pretrained(
-            "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM",
+            gptj_name_or_path,
         )
-        self.llm_dataloader = LLMDataLoader()
         self.lm_input = torch.ones([1, 10], dtype=torch.long)
 
     @classmethod
-    def tearDownClass(self):
+    def teardown_class(self):
         shutil.rmtree("./saved", ignore_errors=True)
         shutil.rmtree("runs", ignore_errors=True)
 
@@ -87,10 +75,10 @@ def test_config(self):
         config = QuantConfig.from_pretrained("/tf_dataset/auto_round/models/TheBloke/Llama-2-7B-Chat-GPTQ")
         config.save_pretrained("quantization_config_dir")
         loaded_config = QuantConfig.from_pretrained("quantization_config_dir")
-        self.assertEqual(config.group_size, loaded_config.group_size)
-        self.assertEqual(config.desc_act, loaded_config.desc_act)
-        self.assertEqual(config.bits, loaded_config.bits)
-        self.assertEqual(config.sym, loaded_config.sym)
+        assert config.group_size == loaded_config.group_size
+        assert config.desc_act == loaded_config.desc_act
+        assert config.bits == loaded_config.bits
+        assert config.sym == loaded_config.sym
 
     def test_xpu_export(self):
         model = copy.deepcopy(self.gptj)
@@ -111,7 +99,3 @@ def test_xpu_export(self):
         self.assertTrue(torch.all(out2[0] == out3[0]))
         self.assertTrue(torch.all(torch.isclose(out3[0], out4[0], atol=1e-3)))
         self.assertTrue(torch.all(torch.isclose(out4[0], out5[0], atol=1e-5)))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/test/test_cpu/test_block_names.py b/test/test_cpu/test_block_names.py
index 501caee25..8d5f935d9 100644
--- a/test/test_cpu/test_block_names.py
+++ b/test/test_cpu/test_block_names.py
@@ -1,25 +1,15 @@
 import os
 import shutil
-import sys
-import unittest
+
+import pytest
 
-sys.path.insert(0, ".")
-sys.path.insert(0, "../..")
 import torch
 import torch.nn as nn
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 
 from auto_round import AutoRound
-
-class LLMDataLoader:
-    def __init__(self, input_size=10):
-        self.batch_size = 1
-        self.input_size = input_size
-
-    def __iter__(self):
-        for i in range(2):
-            yield torch.ones([1, self.input_size], dtype=torch.long)
+from ..helpers import lamini_name_or_path
 
 
 # ================= simple multimodal model =================
@@ -116,24 +107,22 @@ def forward(self, x):
         return output
 
 
-class TestQuantizationBlocks(unittest.TestCase):
+class TestQuantizationBlocks:
     @classmethod
-    def setUpClass(self):
-        self.model_name = "/tf_dataset/auto_round/models/MBZUAI/LaMini-GPT-124M"
+    def setup_class(self):
+        self.model_name = lamini_name_or_path
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
-        self.llm_dataloader = LLMDataLoader()
 
     @classmethod
-    def tearDownClass(self):
+    def teardown_class(self):
         shutil.rmtree("./saved", ignore_errors=True)
         shutil.rmtree("runs", ignore_errors=True)
 
-    def test_moe_quant(self):
+    def test_moe_quant(self, dataloader):
         input_size = 10
         hidden_size = 10
         num_groups = 2
         experts_per_group = 2
-        self.llm_dataloader = LLMDataLoader(input_size)
         self.model = NestedMoEModel(input_size, hidden_size, num_groups, experts_per_group)
 
         from auto_round.utils import get_block_names
@@ -159,7 +148,7 @@ def test_multimodal_quant(self):
         assert block_names_wo_vision == llm_block_names
         assert len(block_names_wo_vision) != (block_names_with_vision)
 
-    def test_block_name_quant(self):
+    def test_block_name_quant(self, dataloader):
         self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True)
         from auto_round.utils import get_block_names
 
@@ -174,7 +163,7 @@ def test_block_name_quant(self):
             iters=2,
             seqlen=2,
             batch_size=batch_size,
-            dataset=self.llm_dataloader,
+            dataset=dataloader,
             to_quant_block_names=llm_block_names,
         )
         autoround.quantize()
@@ -217,7 +206,3 @@ def test_moe(self):
         self.assertTrue(block_name == block_name_2)
         self.assertTrue(len(block_name_2) == 1)
         self.assertTrue("model.layers.23" == block_name_2[0][-1])
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/test/test_cpu/test_calib_dataset.py b/test/test_cpu/test_calib_dataset.py
index 689cc705c..fc95966b6 100644
--- a/test/test_cpu/test_calib_dataset.py
+++ b/test/test_cpu/test_calib_dataset.py
@@ -1,29 +1,17 @@
+import json
 import os
 import shutil
-import sys
-import unittest
-
-sys.path.insert(0, "../..")
-import json
+import pytest
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from auto_round import AutoRound
 
 
-class LLMDataLoader:
-    def __init__(self):
-        self.batch_size = 1
-
-    def __iter__(self):
-        for i in range(2):
-            yield torch.ones([1, 10], dtype=torch.long)
-
-
-class TestLocalCalibDataset(unittest.TestCase):
+class TestLocalCalibDataset:
     @classmethod
-    def setUpClass(self):
+ def setup_class(self): json_data = [{"text": "awefdsfsddfd"}, {"text": "fdfdfsdfdfdfd"}, {"text": "dfdsfsdfdfdfdf"}] os.makedirs("./saved", exist_ok=True) self.json_file = "./saved/tmp.json" @@ -130,10 +118,6 @@ def test_combine_dataset2(self): # autoround.quantize() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_cli_usage.py b/test/test_cpu/test_cli_usage.py index 2b93f5131..ffc04d8f1 100644 --- a/test/test_cpu/test_cli_usage.py +++ b/test/test_cpu/test_cli_usage.py @@ -1,18 +1,17 @@ import os import shutil import sys -import unittest -sys.path.insert(0, "../..") +import pytest -class TestAutoRoundCmd(unittest.TestCase): +class TestAutoRoundCmd: @classmethod - def setUpClass(self): + def setup_class(self): pass @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) shutil.rmtree("../../saved", ignore_errors=True) @@ -68,7 +67,3 @@ def test_auto_round_cmd(self): ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_conv1d.py b/test/test_cpu/test_conv1d.py index edd28110f..1997026b3 100644 --- a/test/test_cpu/test_conv1d.py +++ b/test/test_cpu/test_conv1d.py @@ -1,38 +1,27 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch -from _test_helpers import model_infer from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound +from ..helpers import lamini_name_or_path, model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestQuantizationConv1d(unittest.TestCase): +class TestQuantizationConv1d: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/MBZUAI/LaMini-GPT-124M" + def setup_class(self): + self.model_name = lamini_name_or_path self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_quant(self): + def test_quant(self, dataloader): self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -43,7 +32,7 @@ def test_quant(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() @@ -51,7 +40,3 @@ def test_quant(self): model = AutoModelForCausalLM.from_pretrained("./saved", device_map="cpu", trust_remote_code=True) model_infer(model, self.tokenizer) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index 57b30354b..866a7d396 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -1,12 +1,9 @@ import os import shutil -import sys -import unittest -from parameterized import parameterized - -sys.path.insert(0, "../..") +import pytest import torch +from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoRoundConfig, 
AutoTokenizer from auto_round import AutoRound @@ -23,30 +20,20 @@ def _get_folder_size(path: str) -> float: return total_size / (1024**3) # convert to GB -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" self.save_dir = "./saved" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_autogptq_format(self): + def test_autogptq_format(self, dataloader): for group_size in [-1, 32, 128]: bits, sym = 4, False model_name = self.model_name @@ -57,7 +44,7 @@ def test_autogptq_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" @@ -76,7 +63,7 @@ def test_autogptq_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_format(self): + def test_autoround_format(self, dataloader): for group_size in [-1, 32, 128]: bits, sym = 4, True model_name = self.model_name @@ -87,7 +74,7 @@ def test_autoround_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") @@ -102,7 +89,7 @@ def test_autoround_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_awq_format(self): + def test_autoround_awq_format(self, dataloader): for group_size in [-1, 32, 128]: bits, sym = 4, False model_name = self.model_name @@ -113,7 +100,7 @@ def test_autoround_awq_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" @@ -132,7 +119,7 @@ def test_autoround_awq_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoawq_format(self): + def test_autoawq_format(self, dataloader): for group_size in [-1, 32, 128]: bits, sym = 4, False autoround = AutoRound( @@ -143,7 +130,7 @@ def test_autoawq_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -163,7 +150,7 @@ def test_autoawq_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_3bit_asym_format(self): + def test_autoround_3bit_asym_format(self, dataloader): bits, group_size, sym = 3, 128, False autoround = AutoRound( self.model, @@ -173,7 +160,7 @@ def test_autoround_3bit_asym_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = self.save_dir @@ -187,7 +174,7 @@ def test_autoround_3bit_asym_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) 
shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_autoround_3bit_sym_format(self): + def test_autoround_3bit_sym_format(self, dataloader): bits, group_size, sym = 3, 128, True autoround = AutoRound( self.model, @@ -197,7 +184,7 @@ def test_autoround_3bit_sym_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = self.save_dir @@ -239,8 +226,8 @@ def test_static_afp8_export(self, static_kv_dtype): f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") self.assertIn("model.decoder.layers.8.self_attn.k_proj.input_scale", f.keys()) self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape, torch.Size([1])) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, torch.float8_e4m3fn) + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape == torch.Size([1]) + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.float8_e4m3fn if static_kv_dtype is None: with torch.no_grad(): import transformers @@ -272,9 +259,9 @@ def test_static_afp8_export(self, static_kv_dtype): if static_kv_dtype == "fp8": self.assertIn("model.decoder.layers.8.self_attn.k_scale", f.keys()) self.assertIn("model.decoder.layers.8.self_attn.v_scale", f.keys()) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_scale").shape, torch.Size([1])) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.k_scale").shape, torch.Size([1])) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype, torch.float32) + assert f.get_tensor("model.decoder.layers.5.self_attn.v_scale").shape == torch.Size([1]) + assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").shape == torch.Size([1]) + assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype == torch.float32 shutil.rmtree(quantized_model_path, ignore_errors=True) model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -298,8 +285,8 @@ def test_static_afp8_export(self, static_kv_dtype): f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") self.assertIn("model.decoder.layers.8.self_attn.k_proj.input_scale", f.keys()) self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape, torch.Size([1])) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, torch.float8_e4m3fn) + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape == torch.Size([1]) + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.float8_e4m3fn shutil.rmtree(quantized_model_path, ignore_errors=True) def test_static_fp8_attn(self): @@ -323,18 +310,18 @@ def test_static_fp8_attn(self): f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") self.assertIn("model.decoder.layers.8.self_attn.k_proj.input_scale", f.keys()) self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape, torch.Size([1])) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, 
torch.float8_e4m3fn) + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape == torch.Size([1]) + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.float8_e4m3fn check_attrs = ["k_scale", "v_scale", "q_scale"] for attr in check_attrs: weight_name = f"model.decoder.layers.8.self_attn.{attr}" self.assertIn(weight_name, f.keys()) - self.assertEqual(f.get_tensor(weight_name).shape, torch.Size([1])) - self.assertEqual(f.get_tensor(weight_name).dtype, torch.float32) + assert f.get_tensor(weight_name).shape == torch.Size([1]) + assert f.get_tensor(weight_name).dtype == torch.float32 shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_awq_lmhead_export(self): + def test_awq_lmhead_export(self, dataloader): bits, sym, group_size = 4, False, 128 model_name = "/tf_dataset/auto_round/models/microsoft/phi-2" layer_config = { @@ -350,7 +337,7 @@ def test_awq_lmhead_export(self): nsamples=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") @@ -368,7 +355,7 @@ def test_awq_lmhead_export(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_gptq_lmhead_export(self): + def test_gptq_lmhead_export(self, dataloader): bits, sym, group_size = 4, True, 128 # Note that, to save UT tuning time, the local model is intentionally kept lightweight, using only 2 hidden layers. model_name = "/tf_dataset/auto_round/models/microsoft/phi-2" @@ -385,7 +372,7 @@ def test_gptq_lmhead_export(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -401,7 +388,3 @@ def test_gptq_lmhead_export(self): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0]) print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_generation.py b/test/test_cpu/test_generation.py index 5018d1610..4c72db93c 100644 --- a/test/test_cpu/test_generation.py +++ b/test/test_cpu/test_generation.py @@ -1,39 +1,27 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRoundFormatGeneration(unittest.TestCase): +class TestAutoRoundFormatGeneration: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() self.save_folder = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_4bits_sym(self): + def test_4bits_sym(self, dataloader): bits = 4 group_size = 128 
sym = True @@ -45,7 +33,7 @@ def test_4bits_sym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder @@ -72,7 +60,7 @@ def test_4bits_sym(self): print(res) assert "!!!" not in res - def test_autoround_sym(self): + def test_autoround_sym(self, dataloader): for bits in [4]: model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -85,7 +73,7 @@ def test_autoround_sym(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index 53b199c41..393e11dba 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -1,36 +1,23 @@ import os import shutil import sys -import unittest - -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -class LLMDataLoader: - - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestGGUF(unittest.TestCase): +class TestGGUF: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -241,7 +228,7 @@ def test_gguf_baseline(self): # print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) # shutil.rmtree("./saved", ignore_errors=True) - def test_q4_k_m(self): + def test_q4_k_m(self, dataloader): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -265,21 +252,21 @@ def test_q4_k_m(self): iters=0, seqlen=1, nsamples=8, - dataset=self.llm_dataloader, + dataset=dataloader, disable_opt_rtn=True, ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") - self.assertEqual(autoround.layer_config["model.layers.11.self_attn.v_proj"]["super_group_size"], 16) - self.assertEqual(autoround.layer_config["model.layers.11.self_attn.v_proj"]["data_type"], "int_sym_dq") - self.assertEqual(autoround.layer_config["model.layers.7.self_attn.v_proj"]["data_type"], "int_asym_dq") - self.assertEqual(autoround.model.model.layers[0].self_attn.v_proj.bits, 6) - self.assertEqual(autoround.model.model.layers[12].self_attn.v_proj.bits, 4) - self.assertEqual(autoround.model.model.embed_tokens.bits, 6) - self.assertEqual(autoround.model.model.embed_tokens.group_size, 16) - self.assertEqual(autoround.model.model.layers[12].mlp.gate_proj.bits, 3) - self.assertEqual(autoround.model.model.layers[10].mlp.gate_proj.bits, 8) - self.assertEqual(autoround.layer_config["model.layers.10.mlp.gate_proj"]["mostly"], "gguf:q8_0") + assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["super_group_size"] == 16 + assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["data_type"] == 
"int_sym_dq" + assert autoround.layer_config["model.layers.7.self_attn.v_proj"]["data_type"] == "int_asym_dq" + assert autoround.model.model.layers[0].self_attn.v_proj.bits == 6 + assert autoround.model.model.layers[12].self_attn.v_proj.bits == 4 + assert autoround.model.model.embed_tokens.bits == 6 + assert autoround.model.model.embed_tokens.group_size == 16 + assert autoround.model.model.layers[12].mlp.gate_proj.bits == 3 + assert autoround.model.model.layers[10].mlp.gate_proj.bits == 8 + assert autoround.layer_config["model.layers.10.mlp.gate_proj"]["mostly"] == "gguf:q8_0" shutil.rmtree("./saved", ignore_errors=True) model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -413,7 +400,3 @@ def test_qtype_setting(self): ar.layer_config["model.embed_tokens"]["bits"] == 6 and ar.layer_config["model.embed_tokens"]["super_bits"] == 8 ) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_llmcompressor.py b/test/test_cpu/test_llmcompressor.py index 051dfb075..ebe531f75 100644 --- a/test/test_cpu/test_llmcompressor.py +++ b/test/test_cpu/test_llmcompressor.py @@ -1,25 +1,22 @@ import os import shutil -import sys -import unittest - -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -class TestLLMC(unittest.TestCase): +class TestLLMC: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/tf_dataset/auto_round/models/stas/tiny-random-llama-2" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -60,9 +57,9 @@ def test_llmcompressor_fp8(self): config = json.load(open("./saved/config.json")) self.assertIn("group_0", config["quantization_config"]["config_groups"]) - self.assertEqual(config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"], 8) - self.assertEqual(config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"], "channel") - self.assertEqual(config["quantization_config"]["quant_method"], "compressed-tensors") + assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8 + assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "channel" + assert config["quantization_config"]["quant_method"] == "compressed-tensors" def test_autoround_llmcompressor_fp8(self): ## quantize the model @@ -81,13 +78,9 @@ def test_autoround_llmcompressor_fp8(self): config = json.load(open("./saved/config.json")) self.assertIn("group_0", config["quantization_config"]["config_groups"]) - self.assertEqual(config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"], 8) - self.assertEqual(config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"], "tensor") + assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8 + assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "tensor" self.assertEqual( config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["strategy"], "tensor" ) - 
self.assertEqual(config["quantization_config"]["quant_method"], "compressed-tensors") - - -if __name__ == "__main__": - unittest.main() + assert config["quantization_config"]["quant_method"] == "compressed-tensors" diff --git a/test/test_cpu/test_load_awq_gptq.py b/test/test_cpu/test_load_awq_gptq.py index 4fb6bb977..e78266182 100644 --- a/test/test_cpu/test_load_awq_gptq.py +++ b/test/test_cpu/test_load_awq_gptq.py @@ -1,40 +1,15 @@ import shutil -import sys -import unittest - -sys.path.insert(0, "../..") +import pytest from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer +from ..helpers import model_infer -class TestAutoRound(unittest.TestCase): - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) +class TestAutoRound: @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("runs", ignore_errors=True) def test_load_gptq_no_dummy_gidx_model(self): @@ -60,4 +35,4 @@ def test_load_awq(self): quantization_config=quantization_config, ) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) diff --git a/test/test_cpu/test_mix_bits.py b/test/test_cpu/test_mix_bits.py index 2c73d42cd..71354feb9 100644 --- a/test/test_cpu/test_mix_bits.py +++ b/test/test_cpu/test_mix_bits.py @@ -1,14 +1,11 @@ import json import os import shutil -import sys -import unittest from pathlib import Path -from parameterized import parameterized - -sys.path.insert(0, "../..") +import pytest import torch +from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound @@ -26,31 +23,21 @@ def _get_folder_size(path: str) -> float: return total_size / (1024**3) # convert to GB -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" self.save_dir = ".saved/" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel - def test_mixed_gptqmodel(self): + def test_mixed_gptqmodel(self, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 4}, # set lm_head quant @@ -64,7 +51,7 @@ 
def test_mixed_gptqmodel(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -79,7 +66,7 @@ def test_mixed_gptqmodel(self): assert "!!!" not in model.tokenizer.decode(result) # string output shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_gptqmodel_convert_to_ar(self): + def test_mixed_gptqmodel_convert_to_ar(self, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 4}, # set lm_head quant @@ -93,7 +80,7 @@ def test_mixed_gptqmodel_convert_to_ar(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -108,7 +95,7 @@ def test_mixed_gptqmodel_convert_to_ar(self): print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_autoround_format(self): + def test_mixed_autoround_format(self, dataloader): layer_config = { "k_proj": {"bits": 8}, "q_proj": {"bits": 3}, @@ -120,7 +107,7 @@ def test_mixed_autoround_format(self): scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "./saved" @@ -134,7 +121,7 @@ def test_mixed_autoround_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_fallback_regex_for_awq_format(self): + def test_fallback_regex_for_awq_format(self, dataloader): layer_config = { "lm_head": {"bits": 16}, "fc1": {"bits": 16}, @@ -144,7 +131,7 @@ def test_fallback_regex_for_awq_format(self): scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "./saved" @@ -159,7 +146,7 @@ def test_fallback_regex_for_awq_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_ar_format_part_name_hf_loading(self): + def test_mixed_ar_format_part_name_hf_loading(self, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 16}, # full name @@ -170,7 +157,7 @@ def test_mixed_ar_format_part_name_hf_loading(self): scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "./saved" @@ -220,7 +207,7 @@ def test_mixed_ar_format_part_name_hf_loading(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_MXFP_autoround_format_loading(self): + def test_mixed_MXFP_autoround_format_loading(self, dataloader): layer_config = { "k_proj": {"bits": 8, "act_bits": 8}, "lm_head": {"bits": 16, "act_bits": 16}, @@ -231,7 +218,7 @@ def test_mixed_MXFP_autoround_format_loading(self): scheme="MXFP4", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -248,7 +235,3 @@ def test_mixed_MXFP_autoround_format_loading(self): print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.14) shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - 
unittest.main() diff --git a/test/test_cpu/test_mllm.py b/test/test_cpu/test_mllm.py index 8510adca5..25f2a209a 100644 --- a/test/test_cpu/test_mllm.py +++ b/test/test_cpu/test_mllm.py @@ -1,10 +1,6 @@ -import sys -import unittest - -sys.path.insert(0, "../..") - import shutil +import pytest from transformers import AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration from auto_round import AutoRoundMLLM @@ -27,18 +23,17 @@ def __iter__(self): yield self.data -class TestAutoRoundMLLM(unittest.TestCase): +class TestAutoRoundMLLM: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct" self.dataset = FakeDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - return super().tearDownClass() def test_tune(self): bits, group_size = 4, 128 @@ -105,11 +100,11 @@ class Myclass: dataset = MLLM_DATASET["liuhaotian/llava"]( template=Myclass(), model=None, tokenzier=None, dataset_path="liuhaotian/llava", seqlen=32, nsamples=32 ) - self.assertEqual(len(dataset.questions), 32) + assert len(dataset.questions) == 32 dataset = MLLM_DATASET["liuhaotian/llava"]( template=Myclass(), model=None, tokenzier=None, dataset_path="liuhaotian/llava", seqlen=2048, nsamples=512 ) - self.assertEqual(len(dataset.questions), 512) + assert len(dataset.questions) == 512 def test_diff_dataset(self): tokenizer = AutoTokenizer.from_pretrained(self.model_name) @@ -265,7 +260,3 @@ def test_qwen2_5(self): generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False ) print(output_text) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_model_scope.py b/test/test_cpu/test_model_scope.py index 6da33cdc3..0097b3584 100644 --- a/test/test_cpu/test_model_scope.py +++ b/test/test_cpu/test_model_scope.py @@ -1,30 +1,17 @@ import copy import os import shutil -import sys -import unittest - -sys.path.insert(0, "../..") +import pytest import torch from auto_round import AutoRound -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(3): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestModelScope(unittest.TestCase): +class TestModelScope: @classmethod - def setUpClass(self): + def setup_class(self): self.saved_path = "./saved" - self.dataset = LLMDataLoader() self.source_path, self.cache_path = "/tf_dataset/auto_round/modelscope", "/home/hostuser/.cache/modelscope" if os.path.exists(self.source_path): @@ -33,13 +20,12 @@ def setUpClass(self): shutil.copytree(self.source_path, self.cache_path, dirs_exist_ok=True) @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) if os.path.exists(self.cache_path): shutil.rmtree(self.cache_path, ignore_errors=True) - return super().tearDownClass() def test_llm(self): model_name = "Qwen/Qwen2.5-0.5B-Instruct" @@ -54,7 +40,3 @@ def test_mllm(self): model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset, batch_size=2 ) autoround.quantize_and_save(self.saved_path) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index ba9d3a1a8..1144d00d6 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++
b/test/test_cpu/test_mxfp_nvfp.py @@ -1,12 +1,9 @@ import os import shutil -import sys -import unittest -from parameterized import parameterized - -sys.path.insert(0, "../..") +import pytest import torch +from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound @@ -23,30 +20,20 @@ def _get_folder_size(path: str) -> float: return total_size / (1024**3) # convert to GB -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRoundFP(unittest.TestCase): +class TestAutoRoundFP: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" self.save_dir = "./saved" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto") self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_nvfp4_moe_actmax_rtn(self): + def test_nvfp4_moe_actmax_rtn(self, dataloader): model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" layer_config = { "self_attn": {"bits": 16, "act_bits": 16}, @@ -62,7 +49,7 @@ def test_nvfp4_moe_actmax_rtn(self): iters=0, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) compressed_model, _ = autoround.quantize() @@ -73,7 +60,7 @@ def test_nvfp4_moe_actmax_rtn(self): ), "Illegal NVFP4 quantization for lm_head layer" shutil.rmtree(self.save_dir, ignore_errors=True) - def test_nvfp4_moe_actmax_ar(self): + def test_nvfp4_moe_actmax_ar(self, dataloader): model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" layer_config = { "q_proj": {"bits": 16, "act_bits": 16}, @@ -89,7 +76,7 @@ def test_nvfp4_moe_actmax_ar(self): iters=1, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) compressed_model, _ = autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") @@ -111,7 +98,7 @@ def test_nvfp4_moe_actmax_ar(self): self.assertGreater(result["results"]["piqa"]["acc,none"], 0.7) shutil.rmtree(self.save_dir, ignore_errors=True) - def test_mxfp4_moe_ar(self): + def test_mxfp4_moe_ar(self, dataloader): model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" layer_config = { "q_proj": {"bits": 16, "act_bits": 16, "data_type": "float"}, @@ -127,7 +114,7 @@ def test_mxfp4_moe_ar(self): iters=1, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) compressed_model, _ = autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") @@ -139,7 +126,7 @@ def test_mxfp4_moe_ar(self): ), "Illegal MXFP4 packing for lm_head layer" shutil.rmtree(self.save_dir, ignore_errors=True) - def test_mxfp4_llmcompressor_format(self): + def test_mxfp4_llmcompressor_format(self, dataloader): model_name = self.model_name from transformers import AutoConfig @@ -151,7 +138,7 @@ def test_mxfp4_llmcompressor_format(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize() @@ -179,7 +166,7 @@ def 
test_mxfp4_llmcompressor_format(self): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_rtn_mxfp4_llmcompressor_format(self): + def test_rtn_mxfp4_llmcompressor_format(self, dataloader): model_name = self.model_name from transformers import AutoConfig @@ -191,7 +178,7 @@ def test_rtn_mxfp4_llmcompressor_format(self): iters=0, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize() @@ -219,7 +206,7 @@ def test_rtn_mxfp4_llmcompressor_format(self): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mxfp8_llmcompressor_format(self): + def test_mxfp8_llmcompressor_format(self, dataloader): model_name = self.model_name from transformers import AutoConfig @@ -229,7 +216,7 @@ def test_mxfp8_llmcompressor_format(self): scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") @@ -256,7 +243,7 @@ def test_mxfp8_llmcompressor_format(self): ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_nvfp4_llmcompressor_format(self): + def test_nvfp4_llmcompressor_format(self, dataloader): model_name = self.model_name from transformers import AutoConfig @@ -266,7 +253,7 @@ def test_nvfp4_llmcompressor_format(self): scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") @@ -293,7 +280,7 @@ def test_nvfp4_llmcompressor_format(self): ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_nvfp4_autoround_format(self): + def test_nvfp4_autoround_format(self, dataloader): model_name = self.model_name from transformers import AutoConfig @@ -303,7 +290,7 @@ def test_nvfp4_autoround_format(self): scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") @@ -318,7 +305,7 @@ def test_nvfp4_autoround_format(self): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_nvfp4_autoround_save_quantized(self): + def test_nvfp4_autoround_save_quantized(self, dataloader): model_name = self.model_name from transformers import AutoConfig @@ -328,7 +315,7 @@ def test_nvfp4_autoround_save_quantized(self): scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize() @@ -344,7 +331,7 @@ def test_nvfp4_autoround_save_quantized(self): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_qwen_moe_quant_infer(self): + def test_qwen_moe_quant_infer(self, dataloader): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen1.5-MoE-A2.7B" layer_config = { "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, @@ 
-356,7 +343,7 @@ def test_qwen_moe_quant_infer(self): iters=1, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -381,7 +368,7 @@ def test_qwen_moe_quant_infer(self): ("NVFP4", "fp8", None), ] ) - def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): + def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype, dataloader): model_name = self.model_name from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from transformers.models.opt.modeling_opt import OPTForCausalLM @@ -397,7 +384,7 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): scheme=scheme, iters=0, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, static_kv_dtype=static_kv_dtype, static_attention_dtype=static_attention_dtype, ) @@ -433,7 +420,3 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): getattr(attn, "q_scale", None) is not None ), f"Missing q_scale in attention for scheme={scheme}, static_attention_dtype={static_attention_dtype}" shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_scheme.py b/test/test_cpu/test_scheme.py index c2d165639..71f02dc96 100644 --- a/test/test_cpu/test_scheme.py +++ b/test/test_cpu/test_scheme.py @@ -1,64 +1,52 @@ import shutil -import sys -import unittest +import pytest import torch -sys.path.insert(0, "../..") from auto_round import AutoRound from auto_round.schemes import QuantizationScheme -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_gguf(self): + def test_gguf(self, dataloader): ar = AutoRound( "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B", scheme="W2A16", nsamples=1, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) ar.quantize_and_save(self.save_folder, format="gguf:q4_k_m") - self.assertEqual(ar.bits, 4) + assert ar.bits == 4 shutil.rmtree(self.save_folder, ignore_errors=True) - def test_w4a16(self): - ar = AutoRound(self.model_name, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader) - self.assertEqual(ar.bits, 4) + def test_w4a16(self, dataloader): + ar = AutoRound(self.model_name, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + assert ar.bits == 4 ar.quantize() - def test_w2a16_rtn(self): - ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=self.llm_dataloader) - self.assertEqual(ar.bits, 2) + def test_w2a16_rtn(self, dataloader): + ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=dataloader) + assert ar.bits == 2 ar.quantize() - def test_mxfp4(self): - ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader) - self.assertEqual(ar.bits, 4) - self.assertEqual(ar.act_bits, 4) - self.assertEqual(ar.data_type, "mx_fp") - 
self.assertEqual(ar.act_data_type, "mx_fp_rceil") + def test_mxfp4(self, dataloader): + ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + assert ar.bits == 4 + assert ar.act_bits == 4 + assert ar.data_type == "mx_fp" + assert ar.act_data_type == "mx_fp_rceil" ar.quantize() def test_vllm(self): @@ -67,18 +55,18 @@ def test_vllm(self): ar = AutoRoundMLLM( "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct", scheme="W2A16", nsamples=1, iters=1, seqlen=2 ) - self.assertEqual(ar.bits, 2) - self.assertEqual(ar.act_bits, 16) - - def test_nvfp4(self): - ar = AutoRound(self.model_name, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader) - self.assertEqual(ar.bits, 4) - self.assertEqual(ar.act_bits, 4) - self.assertEqual(ar.data_type, "nv_fp") - self.assertEqual(ar.act_data_type, "nv_fp4_with_static_gs") + assert ar.bits == 2 + assert ar.act_bits == 16 + + def test_nvfp4(self, dataloader): + ar = AutoRound(self.model_name, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + assert ar.bits == 4 + assert ar.act_bits == 4 + assert ar.data_type == "nv_fp" + assert ar.act_data_type == "nv_fp4_with_static_gs" ar.quantize() - def test_all_scheme(self): + def test_all_scheme(self, dataloader): import copy preset_schemes = ["W8A16", "MXFP8", "FPW8A16", "FP8_STATIC", "GGUF:Q2_K_S", "GGUF:Q4_K_M"] @@ -87,11 +75,11 @@ def test_all_scheme(self): if "gguf" in scheme.lower(): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" print(f"scheme={scheme}") - ar = AutoRound(model_name, scheme=scheme, nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader) + ar = AutoRound(model_name, scheme=scheme, nsamples=1, iters=1, seqlen=2, dataset=dataloader) ar.quantize_and_save(self.save_folder) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_scheme_in_layer_config(self): + def test_scheme_in_layer_config(self, dataloader): layer_config = { "model.decoder.layers.2.self_attn": {"bits": 2}, "model.decoder.layers.3.self_attn.v_proj": "W8A16", @@ -104,19 +92,19 @@ def test_scheme_in_layer_config(self): iters=1, layer_config=layer_config, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) ar.quantize() for n, m in ar.model.named_modules(): if n == "model.decoder.layers.2.self_attn.q_proj": - self.assertEqual(m.bits, 2) + assert m.bits == 2 if n == "model.decoder.layers.2.self_attn.k_proj": - self.assertEqual(m.bits, 2) + assert m.bits == 2 if n == "model.decoder.layers.3.self_attn.v_proj": - self.assertEqual(m.bits, 8) + assert m.bits == 8 if n == "model.decoder.layers.4.self_attn.k_proj": - self.assertEqual(m.group_size, 64) + assert m.group_size == 64 def test_parse_available_devices(self): from auto_round.utils.device import parse_available_devices @@ -125,10 +113,6 @@ def test_parse_available_devices(self): self.assertTrue(len(device_list) == 1 and "cpu" in device_list) device_list = parse_available_devices("a:cuda:0,b:cuda:1,c:cpu") self.assertTrue(len(device_list) == 3) - self.assertEqual(device_list, ["cuda:0", "cuda:1", "cpu"]) + assert device_list == ["cuda:0", "cuda:1", "cpu"] device_list = parse_available_devices("0,1") self.assertTrue(len(device_list) == 1 and "cpu" in device_list) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_script.py b/test/test_cpu/test_script.py index 01bbba644..aa25d7f61 100644 --- a/test/test_cpu/test_script.py +++ b/test/test_cpu/test_script.py @@ -1,11 +1,9 @@ import os -import sys -import unittest 
-sys.path.insert(0, "../..") +import pytest -class TestScript(unittest.TestCase): +class TestScript: def test_default(self): os.system( """ @@ -15,7 +13,3 @@ def test_default(self): --deployment_device fake --output_dir ./tmp_script_test""" ) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_torch_backend.py b/test/test_cpu/test_torch_backend.py index d1e9bd293..e27914d9b 100644 --- a/test/test_cpu/test_torch_backend.py +++ b/test/test_cpu/test_torch_backend.py @@ -1,11 +1,6 @@ import shutil -import sys -import unittest import pytest - -sys.path.insert(0, "../..") - import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -13,56 +8,22 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel +from ..helpers import model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - -class TestAutoRoundTorchBackend(unittest.TestCase): +class TestAutoRoundTorchBackend: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "facebook/opt-125m" self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_torch_4bits_asym(self): + def test_torch_4bits_asym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False @@ -74,7 +35,7 @@ def test_torch_4bits_asym(self): sym=sym, iters=0, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") @@ -85,7 +46,7 @@ def test_torch_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) @@ -96,14 +57,14 @@ def test_torch_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, 
tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) - def test_torch_4bits_sym(self): + def test_torch_4bits_sym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 32, True @@ -115,7 +76,7 @@ def test_torch_4bits_sym(self): sym=sym, iters=0, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model @@ -126,13 +87,9 @@ def test_torch_4bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000) print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28) torch.cuda.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_utils.py b/test/test_cpu/test_utils.py index e70a4b7b4..3dec97010 100644 --- a/test/test_cpu/test_utils.py +++ b/test/test_cpu/test_utils.py @@ -1,7 +1,5 @@ -import sys from unittest.mock import patch -sys.path.insert(0, "../..") import auto_round.utils.device as auto_round_utils diff --git a/test/test_cpu/test_woq_linear.py b/test/test_cpu/test_woq_linear.py index e077c7a21..8f5bedc2c 100644 --- a/test/test_cpu/test_woq_linear.py +++ b/test/test_cpu/test_woq_linear.py @@ -1,9 +1,6 @@ -import sys - import pytest import torch -sys.path.insert(0, "../..") from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear diff --git a/test/test_cuda/__init__.py b/test/test_cuda/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_cuda/_test_helpers.py b/test/test_cuda/_test_helpers.py deleted file mode 100644 index b4b8a5955..000000000 --- a/test/test_cuda/_test_helpers.py +++ /dev/null @@ -1,32 +0,0 @@ -def model_infer(model, tokenizer, apply_chat_template=False): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - if apply_chat_template: - texts = [] - for prompt in prompts: - messages = [{"role": "user", "content": prompt}] - text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - texts.append(text) - prompts = texts - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] diff --git a/test/test_cuda/test_2_3bits.py b/test/test_cuda/test_2_3bits.py index 
2ea407f20..f12bf240c 100644 --- a/test/test_cuda/test_2_3bits.py +++ b/test/test_cuda/test_2_3bits.py @@ -1,10 +1,8 @@ import copy import re import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from lm_eval.utils import make_table # pylint: disable=E0401 @@ -14,6 +12,8 @@ from auto_round.eval.evaluation import simple_evaluate, simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_greater_than_050, require_greater_than_051 +from ..helpers import model_infer + def get_accuracy(data): match = re.search(r"\|acc\s+\|[↑↓]\s+\|\s+([\d.]+)\|", data) @@ -25,43 +25,17 @@ def get_accuracy(data): return 0.0 -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.save_dir = "./saved" self.tasks = "lambada_openai" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] - @require_greater_than_051 def test_3bits_autoround(self): model_name = "/models/opt-125m" @@ -77,7 +51,7 @@ def test_3bits_autoround(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_dir) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.3) ## 0.3130 @@ -145,7 +119,3 @@ def test_2bits_autoround(self): accuracy = get_accuracy(res) assert accuracy > 0.17 shutil.rmtree("./saved", ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_alg_ext.py b/test/test_cuda/test_alg_ext.py index c83d6f3b4..499213c74 100644 --- a/test/test_cuda/test_alg_ext.py +++ b/test/test_cuda/test_alg_ext.py @@ -1,9 +1,7 @@ import shutil import sys -import unittest - -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -11,15 +9,15 @@ from auto_round.eval.evaluation import simple_evaluate_user_model -class TestAlgExt(unittest.TestCase): +class TestAlgExt: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/models/opt-125m" self.save_folder = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -68,7 +66,3 @@ def test_all_support_dtype(self): model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True ) ar.quantize() - 
- -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_auto_round_format.py b/test/test_cuda/test_auto_round_format.py index 55fc1690f..cbc6868f1 100644 --- a/test/test_cuda/test_auto_round_format.py +++ b/test/test_cuda/test_auto_round_format.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer @@ -18,69 +16,24 @@ require_package_version_ut, ) +from ..helpers import model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "facebook/opt-125m" - self.llm_dataloader = LLMDataLoader() self.save_folder = "./saved" - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - ##texts = [] - # for prompt in prompts: - # messages = [ - # {"role": "user", "content": prompt} - # ] - # text = tokenizer.apply_chat_template( - # messages, - # tokenize=False, - # add_generation_prompt=True - # ) - # texts.append(text) - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_greater_than_050 @require_package_version_ut("transformers", "<4.57.0") - def test_autoround_asym(self): + def test_autoround_asym(self, dataloader): for bits in [2, 3, 4, 8]: model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -93,7 +46,7 @@ def test_autoround_asym(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder @@ -132,7 +85,7 @@ def test_mixed_precision(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.32) @@ -161,7 +114,7 @@ def test_awq_backend(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.18) @@ -172,7 
+125,7 @@ def test_awq_backend(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) shutil.rmtree(self.save_folder, ignore_errors=True) @require_greater_than_050 @@ -184,12 +137,12 @@ def test_tritonv2_bf16(self): ) tokenizer = AutoTokenizer.from_pretrained(model_name) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) torch.cuda.empty_cache() @require_ipex - def test_autoround_gptq_sym_format(self): + def test_autoround_gptq_sym_format(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -201,7 +154,7 @@ def test_autoround_gptq_sym_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" @@ -244,7 +197,7 @@ def test_autoround_gptq_sym_format(self): @require_awq @require_ipex @require_package_version_ut("transformers", "<4.57.0") - def test_autoround_awq_sym_format(self): + def test_autoround_awq_sym_format(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -256,7 +209,7 @@ def test_autoround_awq_sym_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" @@ -283,7 +236,7 @@ def test_autoround_awq_sym_format(self): shutil.rmtree("./saved", ignore_errors=True) @require_greater_than_050 - def test_autoround_sym(self): + def test_autoround_sym(self, dataloader): for bits in [2, 3, 4, 8]: model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -296,7 +249,7 @@ def test_autoround_sym(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" @@ -325,8 +278,4 @@ def test_load_gptq_model_3bits(self): quantization_config=quantization_config, ) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.model_infer(model, tokenizer) - - -if __name__ == "__main__": - unittest.main() + model_infer(model, tokenizer) diff --git a/test/test_cuda/test_auto_scheme.py b/test/test_cuda/test_auto_scheme.py index 681e3b29b..b6f5d8066 100644 --- a/test/test_cuda/test_auto_scheme.py +++ b/test/test_cuda/test_auto_scheme.py @@ -1,10 +1,9 @@ import copy import re import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest + from auto_round import AutoRound, AutoRoundConfig, AutoScheme from auto_round.auto_scheme.utils import compute_avg_bits_for_model from auto_round.eval.evaluation import simple_evaluate @@ -12,14 +11,14 @@ from auto_round.utils import get_module -class TestAutoScheme(unittest.TestCase): +class TestAutoScheme: @classmethod - def setUpClass(self): + def setup_class(self): self.save_dir = "./saved" self.tasks = "lambada_openai" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -79,7 +78,7 @@ def test_shared_layers(self): from auto_round.auto_scheme.utils import parse_shared_layers res = parse_shared_layers(model, 
shared_layers) - self.assertEqual(len(res), 24) + assert len(res) == 24 assert [ "model.decoder.layers.2.self_attn.out_proj", "model.decoder.layers.2.self_attn.q_proj", @@ -101,7 +100,7 @@ def test_shared_layers(self): else: bits.append(module.bits) bits = set(bits) - self.assertEqual(len(bits), 1) + assert len(bits) == 1 print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 @@ -187,7 +186,7 @@ def test_patch_scheme(self): model, layer_config = ar.quantize() for n, m in model.named_modules(): if hasattr(m, "group_size"): - self.assertEqual(m.group_size, 32) + assert m.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 @@ -199,13 +198,13 @@ def test_layer_config(self): user_layer_config = {"model.decoder.layers.10.fc1": {"bits": 8, "group_size": 32, "sym": False}} ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["bits"], 8) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["sym"], False) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["group_size"], 32) + assert layer_config["model.decoder.layers.10.fc1"]["bits"] == 8 + assert layer_config["model.decoder.layers.10.fc1"]["sym"] == False + assert layer_config["model.decoder.layers.10.fc1"]["group_size"] == 32 layer = get_module(model, "model.decoder.layers.10.fc1") - self.assertEqual(layer.bits, 8) - self.assertEqual(layer.sym, False) - self.assertEqual(layer.group_size, 32) + assert layer.bits == 8 + assert layer.sym == False + assert layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 @@ -216,13 +215,13 @@ def test_layer_config(self): user_layer_config = {"model.decoder.layers.10.fc1": {"bits": 8, "group_size": 32, "sym": False}} ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["bits"], 8) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["sym"], False) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["group_size"], 32) + assert layer_config["model.decoder.layers.10.fc1"]["bits"] == 8 + assert layer_config["model.decoder.layers.10.fc1"]["sym"] == False + assert layer_config["model.decoder.layers.10.fc1"]["group_size"] == 32 layer = get_module(model, "model.decoder.layers.10.fc1") - self.assertEqual(layer.orig_layer.bits, 8) - self.assertEqual(layer.orig_layer.sym, False) - self.assertEqual(layer.orig_layer.group_size, 32) + assert layer.orig_layer.bits == 8 + assert layer.orig_layer.sym == False + assert layer.orig_layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 @@ -265,7 +264,3 @@ def test_enable_torch_compile(self): print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.10) shutil.rmtree(self.save_dir, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_calib_dataset.py b/test/test_cuda/test_calib_dataset.py index b66f60127..6a36c21b1 100644 --- a/test/test_cuda/test_calib_dataset.py +++ b/test/test_cuda/test_calib_dataset.py @@ -1,20 +1,17 @@ +import json import os import 
shutil -import sys -import unittest - -sys.path.insert(0, "../..") -import json +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -class TestLocalCalibDataset(unittest.TestCase): +class TestLocalCalibDataset: @classmethod - def setUpClass(self): + def setup_class(self): json_data = [{"text": "awefdsfsddfd"}, {"text": "fdfdfsdfdfdfd"}, {"text": "dfdsfsdfdfdfdf"}] os.makedirs("./saved", exist_ok=True) self.json_file = "./saved/tmp.json" @@ -40,7 +37,3 @@ def test_combine_dataset(self): self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset ) autoround.quantize() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_conv1d.py b/test/test_cuda/test_conv1d.py index e617bf55e..c5384a384 100644 --- a/test/test_cuda/test_conv1d.py +++ b/test/test_cuda/test_conv1d.py @@ -1,40 +1,29 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch -from _test_helpers import model_infer from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound from auto_round.testing_utils import require_gptqmodel +from ..helpers import model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestQuantizationConv1d(unittest.TestCase): +class TestQuantizationConv1d: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "MBZUAI/LaMini-GPT-124M" self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel - def test_quant(self): + def test_quant(self, dataloader): self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) bits, group_size, sym = 4, 128, True from auto_round import AutoRoundConfig @@ -47,7 +36,7 @@ def test_quant(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() @@ -55,7 +44,3 @@ def test_quant(self): model = AutoModelForCausalLM.from_pretrained("./saved", device_map="cuda", trust_remote_code=True) model_infer(model, self.tokenizer) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_diffusion.py b/test/test_cuda/test_diffusion.py index 9a5a8bfd3..147a34d47 100644 --- a/test/test_cuda/test_diffusion.py +++ b/test/test_cuda/test_diffusion.py @@ -2,13 +2,9 @@ import os import re import shutil -import sys -import unittest +import pytest import requests - -sys.path.insert(0, "../..") - from diffusers import AutoPipelineForText2Image from PIL import Image @@ -16,13 +12,13 @@ from auto_round.testing_utils import require_gptqmodel, require_optimum, require_vlm_env -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/dataset/FLUX.1-dev" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("runs", ignore_errors=True) @require_optimum @@ -77,7 +73,3 @@ def test_diffusion_model_checker(self): self.assertTrue(is_diffusion_model("/models/stable-diffusion-2-1")) self.assertTrue(is_diffusion_model("/models/stable-diffusion-xl-base-1.0")) 
self.assertFalse(is_diffusion_model("/models/Qwen3-8B")) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py index c489b37b2..e6f78ba90 100644 --- a/test/test_cuda/test_exllamav2_backend.py +++ b/test/test_cuda/test_exllamav2_backend.py @@ -1,12 +1,6 @@ import shutil -import sys -import unittest import pytest - -sys.path.insert(0, "../..") - - import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -14,62 +8,28 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel, require_package_version_ut - -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +from ..helpers import model_infer -class TestAutoRoundexllamaBackend(unittest.TestCase): +class TestAutoRoundexllamaBackend: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/models/opt-125m" self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel - def test_gptqmodel_exllmav2_4bits_asym(self): + def test_gptqmodel_exllmav2_4bits_asym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, dataset=self.llm_dataloader + model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, dataset=dataloader ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") @@ -80,7 +40,7 @@ def test_gptqmodel_exllmav2_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) @@ -91,7 +51,7 @@ def test_gptqmodel_exllmav2_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, 
tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) @@ -100,7 +60,7 @@ def test_gptqmodel_exllmav2_4bits_asym(self): @require_autogptq @require_package_version_ut("torch", "<2.6.0") - def test_gptq_exllamav2_4bits_sym(self): + def test_gptq_exllamav2_4bits_sym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -112,7 +72,7 @@ def test_gptq_exllamav2_4bits_sym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model @@ -123,7 +83,7 @@ def test_gptq_exllamav2_4bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) @@ -158,13 +118,9 @@ def test_gptq_exllamav2_4bits_sym_group_size(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=64, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.15) torch.cuda.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_export.py b/test/test_cuda/test_export.py index 6bb2612e8..3e1171162 100644 --- a/test/test_cuda/test_export.py +++ b/test/test_cuda/test_export.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoConfig, AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer @@ -12,29 +10,19 @@ from auto_round.testing_utils import require_awq, require_optimum, require_package_version_ut -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "facebook/opt-125m" self.save_dir = "./saved" - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_optimum - def test_autogptq_format(self): + def test_autogptq_format(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False @@ -46,7 +34,7 @@ def test_autogptq_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -65,7 +53,7 @@ def test_autogptq_format(self): 
shutil.rmtree("./saved", ignore_errors=True) @require_optimum - def test_autogptq_format_fp_layers(self): + def test_autogptq_format_fp_layers(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) layer_config = {} @@ -82,7 +70,7 @@ def test_autogptq_format_fp_layers(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -102,7 +90,7 @@ def test_autogptq_format_fp_layers(self): # "there there there there there there") shutil.rmtree("./saved", ignore_errors=True) - def test_autogptq_format_qsave_fp_layers(self): + def test_autogptq_format_qsave_fp_layers(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) layer_config = {} @@ -119,7 +107,7 @@ def test_autogptq_format_qsave_fp_layers(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "./saved" @@ -153,7 +141,7 @@ def test_autogptq_format_qsave_fp_layers(self): ##print(res) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_format(self): + def test_autoround_format(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -165,7 +153,7 @@ def test_autoround_format(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -186,7 +174,7 @@ def test_autoround_format(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") - def test_autoawq_format(self): + def test_autoawq_format(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False @@ -198,7 +186,7 @@ def test_autoawq_format(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -220,7 +208,7 @@ def test_autoawq_format(self): @require_optimum @require_awq @require_package_version_ut("transformers", "<4.57.0") - def test_autoawq_format_fp_qsave_layers(self): + def test_autoawq_format_fp_qsave_layers(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) layer_config = { "model.decoder.layers.0.self_attn.k_proj": {"bits": 16}, @@ -236,7 +224,7 @@ def test_autoawq_format_fp_qsave_layers(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "./saved/test_export" @@ -261,7 +249,7 @@ def test_autoawq_format_fp_qsave_layers(self): shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_3bit_asym_torch_format(self): + def test_autoround_3bit_asym_torch_format(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, 
group_size, sym = 3, 128, False @@ -273,7 +261,7 @@ def test_autoround_3bit_asym_torch_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -290,7 +278,7 @@ def test_autoround_3bit_asym_torch_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_3bit_sym_torch_format(self): + def test_autoround_3bit_sym_torch_format(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 3, 128, True @@ -302,7 +290,7 @@ def test_autoround_3bit_sym_torch_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -322,7 +310,7 @@ def test_autoround_3bit_sym_torch_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_awq_lmhead_export(self): + def test_awq_lmhead_export(self, dataloader): bits, sym, group_size = 4, False, 128 model_name = "/models/phi-2" layer_config = { @@ -336,7 +324,7 @@ def test_awq_lmhead_export(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") @@ -354,7 +342,7 @@ def test_awq_lmhead_export(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_gptq_lmhead_export(self): + def test_gptq_lmhead_export(self, dataloader): bits, sym, group_size = 4, True, 128 model_name = "/models/phi-2" layer_config = { @@ -368,7 +356,7 @@ def test_gptq_lmhead_export(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -384,7 +372,3 @@ def test_gptq_lmhead_export(self): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0]) print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_fp8_input.py b/test/test_cuda/test_fp8_input.py index 5258fe183..90a177ef3 100644 --- a/test/test_cuda/test_fp8_input.py +++ b/test/test_cuda/test_fp8_input.py @@ -1,9 +1,7 @@ import os import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer @@ -12,13 +10,13 @@ from auto_round.eval.evaluation import simple_evaluate -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.save_dir = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_dir, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -128,7 +126,3 @@ def test_diff_datatype(self): ar = AutoRound(model=model_name, iters=iters, scheme=scheme) ar.quantize_and_save(output_dir=self.save_dir) shutil.rmtree(self.save_dir, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git 
a/test/test_cuda/test_get_block_name.py b/test/test_cuda/test_get_block_name.py index cc9297653..52f251cb7 100644 --- a/test/test_cuda/test_get_block_name.py +++ b/test/test_cuda/test_get_block_name.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from diffusers import AutoPipelineForText2Image @@ -20,13 +18,13 @@ from auto_round.utils import get_block_names, is_pure_text_model -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): pass @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("runs", ignore_errors=True) def check_block_names(self, block_names, prefixs=[], n_layers=[]): @@ -199,7 +197,3 @@ def test_flux(self): block_names = get_block_names(model, quant_vision=True) self.check_block_names(block_names, ["transformer_blocks", "single_transformer_blocks"], [19, 38]) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_gguf.py b/test/test_cuda/test_gguf.py index 312e561cf..b8ee88d0b 100644 --- a/test/test_cuda/test_gguf.py +++ b/test/test_cuda/test_gguf.py @@ -1,9 +1,8 @@ import os import shutil import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer @@ -12,23 +11,14 @@ from auto_round.testing_utils import require_gguf -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gguf - def test_gguf_format(self): + def test_gguf_format(self, dataloader): model_name = "Qwen/Qwen2.5-0.5B-Instruct" bits, group_size, sym = 4, 32, False autoround = AutoRound( @@ -39,7 +29,7 @@ def test_gguf_format(self): iters=2, seqlen=2, nsamples=2, - dataset=LLMDataLoader(), + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -71,7 +61,7 @@ def test_gguf_format(self): shutil.rmtree("./saved", ignore_errors=True) @require_gguf - def test_q2_k_export(self): + def test_q2_k_export(self, dataloader): bits, group_size, sym = 2, 16, False model_name = "Qwen/Qwen2.5-1.5B-Instruct" autoround = AutoRound( @@ -81,7 +71,7 @@ def test_q2_k_export(self): sym=sym, iters=1, seqlen=1, - dataset=LLMDataLoader(), + dataset=dataloader, data_type="int_asym_dq", ) autoround.quantize() @@ -252,7 +242,3 @@ def test_vlm_gguf(self): # file_size = os.path.getsize(os.path.join(quantized_model_path, "mmproj-model.gguf")) / 1024**2 # self.assertAlmostEqual(file_size, 3326.18, delta=5.0) # shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_main_func.py b/test/test_cuda/test_main_func.py index 571fc10f5..20dc7bdc8 100644 --- a/test/test_cuda/test_main_func.py +++ b/test/test_cuda/test_main_func.py @@ -1,10 +1,8 @@ import copy import re import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from lm_eval.utils import make_table # pylint: disable=E0401 @@ -26,14 +24,14 @@ def get_accuracy(data): return 0.0 -class TestMainFunc(unittest.TestCase): +class TestMainFunc: @classmethod - def 
setUpClass(self): + def setup_class(self): self.save_dir = "./saved" self.tasks = "lambada_openai" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -216,7 +214,3 @@ def test_low_cpu_mem_usage(self): ) autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_marlin_backend.py b/test/test_cuda/test_marlin_backend.py index 26d3ddca2..b920d9478 100644 --- a/test/test_cuda/test_marlin_backend.py +++ b/test/test_cuda/test_marlin_backend.py @@ -1,29 +1,18 @@ import shutil -import sys -import unittest import pytest - -sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate_user_model +from ..helpers import model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +class TestAutoRoundMarlinBackend: -class TestAutoRoundMarlinBackend(unittest.TestCase): - - def test_marlin_group_size(self): + def test_marlin_group_size(self, dataloader): for group_size in [-1, 64]: print(f"{group_size}!!!!!!!!!!!!!!!!!") model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) @@ -37,7 +26,7 @@ def test_marlin_group_size(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -48,7 +37,7 @@ def test_marlin_group_size(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.14) @@ -66,7 +55,7 @@ def test_marlin_group_size(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") @@ -77,49 +66,22 @@ def test_marlin_group_size(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.14) @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/models/opt-125m" self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids 
= [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_marlin_4bits_sym_with_zp_m_1(self): + def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -131,7 +93,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -142,7 +104,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) @@ -153,7 +115,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) @@ -172,7 +134,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self): # sym=sym, # iters=1, # seqlen=2, - # dataset=self.llm_dataloader, + # dataset=dataloader, # ) # quantized_model_path = self.save_folder # autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") @@ -186,7 +148,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self): # ) # # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - # self.model_infer(model, tokenizer) + # model_infer(model, tokenizer) # result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) # self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.27) @@ -200,13 +162,9 @@ def test_marlin_4bits_sym_with_zp_m_1(self): # ) # # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - # self.model_infer(model, tokenizer) + # model_infer(model, tokenizer) # result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) # self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.27) # torch.cuda.empty_cache() # shutil.rmtree("./saved", ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_mix_bits.py b/test/test_cuda/test_mix_bits.py index 4f7d39d8c..b9b7dde5c 100644 --- a/test/test_cuda/test_mix_bits.py +++ b/test/test_cuda/test_mix_bits.py @@ -1,15 +1,11 @@ import json import os import shutil -import sys -import unittest - -from parameterized import parameterized - -sys.path.insert(0, 
"../..") from pathlib import Path +import pytest import torch +from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound @@ -20,31 +16,21 @@ ) -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/models/opt-125m" self.save_dir = "./saved" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel - def test_mixed_gptqmodel(self): + def test_mixed_gptqmodel(self, dataloader): scheme = "W4A16" layer_config = { "k_proj": {"bits": 8}, # part name @@ -59,7 +45,7 @@ def test_mixed_gptqmodel(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -73,7 +59,7 @@ def test_mixed_gptqmodel(self): print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_gptqmodel_convert_to_ar(self): + def test_mixed_gptqmodel_convert_to_ar(self, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 4}, # set lm_head quant @@ -86,7 +72,7 @@ def test_mixed_gptqmodel_convert_to_ar(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -101,7 +87,7 @@ def test_mixed_gptqmodel_convert_to_ar(self): print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_autoround_format(self): + def test_mixed_autoround_format(self, dataloader): layer_config = { "k_proj": {"bits": 8}, "q_proj": {"bits": 3}, @@ -113,7 +99,7 @@ def test_mixed_autoround_format(self): scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "self.save_dir" @@ -129,7 +115,7 @@ def test_mixed_autoround_format(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") - def test_fallback_regex_for_awq_format(self): + def test_fallback_regex_for_awq_format(self, dataloader): model_name = "facebook/opt-125m" layer_config = { "lm_head": {"bits": 16}, @@ -140,7 +126,7 @@ def test_fallback_regex_for_awq_format(self): scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "self.save_dir" @@ -155,7 +141,7 @@ def test_fallback_regex_for_awq_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_ar_format_part_name_hf_loading(self): + def test_mixed_ar_format_part_name_hf_loading(self, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 16}, # full name @@ -166,7 +152,7 @@ def test_mixed_ar_format_part_name_hf_loading(self): scheme="W4A16", 
iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "self.save_dir" @@ -216,7 +202,7 @@ def test_mixed_ar_format_part_name_hf_loading(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_MXFP_autoround_format_loading(self): + def test_mixed_MXFP_autoround_format_loading(self, dataloader): layer_config = { "k_proj": {"bits": 8, "act_bits": 8}, "lm_head": {"bits": 16, "act_bits": 16}, @@ -227,7 +213,7 @@ def test_mixed_MXFP_autoround_format_loading(self): scheme="MXFP4", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -245,7 +231,7 @@ def test_mixed_MXFP_autoround_format_loading(self): self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.32) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_autoround_format_vllm(self): + def test_mixed_autoround_format_vllm(self, dataloader): layer_config = { "self_attn": {"bits": 8}, "lm_head": {"bits": 16}, @@ -256,7 +242,7 @@ def test_mixed_autoround_format_vllm(self): scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -285,7 +271,7 @@ def test_mixed_autoround_format_vllm(self): print(f"{prompt}: {generated_text}") shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_llmcompressor_format_vllm(self): + def test_mixed_llmcompressor_format_vllm(self, dataloader): layer_config = { "self_attn": {"bits": 16, "act_bits": 16}, "lm_head": {"bits": 16, "act_bits": 16}, @@ -296,7 +282,7 @@ def test_mixed_llmcompressor_format_vllm(self): scheme="NVFP4", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -323,7 +309,3 @@ def test_mixed_llmcompressor_format_vllm(self): print(f"{prompt}: {generated_text}") assert "!!!" 
not in generated_text shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_multiple_card.py b/test/test_cuda/test_multiple_card.py index 5dac584fe..2f29f7a37 100644 --- a/test/test_cuda/test_multiple_card.py +++ b/test/test_cuda/test_multiple_card.py @@ -1,11 +1,7 @@ import re import shutil -import sys -import unittest - -sys.path.insert(0, "../..") - +import pytest import torch from lm_eval.utils import make_table # pylint: disable=E0401 from transformers import AutoModelForCausalLM, AutoTokenizer @@ -27,14 +23,14 @@ def get_accuracy(data): # import os # os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.save_dir = "./saved" self.tasks = "lambada_openai" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_dir, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -361,24 +357,20 @@ def test_mllm_device_map(self): device_map = "0,1" ar = AutoRoundMLLM(model_name, device_map=device_map) - self.assertEqual(ar.device, "cuda:0") - self.assertEqual(ar.device_map, device_map) + assert ar.device == "cuda:0" + assert ar.device_map == device_map device_map = 1 ar = AutoRoundMLLM(ar.model, ar.tokenizer, processor=ar.processor, device_map=device_map) - self.assertEqual(ar.device, "cuda:1") - self.assertEqual(ar.device_map, device_map) + assert ar.device == "cuda:1" + assert ar.device_map == device_map device_map = "auto" ar = AutoRoundMLLM(ar.model, ar.tokenizer, processor=ar.processor, device_map=device_map) - self.assertEqual(ar.device, "cuda") - self.assertEqual(ar.device_map, device_map) + assert ar.device == "cuda" + assert ar.device_map == device_map device_map = {"model.language_model.layers": 0, "model.visual.blocks": 1} ar = AutoRoundMLLM(ar.model, ar.tokenizer, processor=ar.processor, device_map=device_map) - self.assertEqual(ar.model.model.language_model.layers[0].self_attn.q_proj.tuning_device, "cuda:0") - self.assertEqual(ar.model.model.visual.blocks[0].mlp.fc1.tuning_device, "cuda:1") - - -if __name__ == "__main__": - unittest.main() + assert ar.model.model.language_model.layers[0].self_attn.q_proj.tuning_device == "cuda:0" + assert ar.model.model.visual.blocks[0].mlp.fc1.tuning_device == "cuda:1" diff --git a/test/test_cuda/test_multiple_card_calib.py b/test/test_cuda/test_multiple_card_calib.py index 490193532..410855c33 100644 --- a/test/test_cuda/test_multiple_card_calib.py +++ b/test/test_cuda/test_multiple_card_calib.py @@ -2,9 +2,8 @@ import re import shutil import sys -import unittest -sys.path.insert(0, "../..") +import pytest from auto_round.testing_utils import multi_card @@ -19,14 +18,14 @@ def get_accuracy(data): return 0.0 -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.save_dir = "./saved" self.tasks = "lambada_openai" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -40,7 +39,3 @@ def test_multiple_card_calib(self): ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_mxfp_nvfp.py b/test/test_cuda/test_mxfp_nvfp.py index 48dd27d9b..357afb0f3 100644 --- a/test/test_cuda/test_mxfp_nvfp.py +++ 
b/test/test_cuda/test_mxfp_nvfp.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer @@ -12,28 +10,18 @@ from auto_round.testing_utils import require_awq, require_optimum -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "facebook/opt-125m" self.save_dir = "./saved" - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_fp8input_mxfp4_llmcompressor_format(self): + def test_fp8input_mxfp4_llmcompressor_format(self, dataloader): model_name = "/models/Qwen3-0.6B-FP8" scheme = "mxfp4" ar = AutoRound( @@ -41,7 +29,7 @@ def test_fp8input_mxfp4_llmcompressor_format(self): iters=2, seqlen=2, scheme=scheme, - dataset=self.llm_dataloader, + dataset=dataloader, ) compressed_model, _ = ar.quantize_and_save(output_dir=self.save_dir, format="llm_compressor") tmp_layer = compressed_model.model.layers[3].self_attn.q_proj @@ -59,14 +47,14 @@ def test_fp8input_mxfp4_llmcompressor_format(self): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree(self.save_dir, ignore_errors=True) - def test_nvfp4_llmcompressor_format(self): + def test_nvfp4_llmcompressor_format(self, dataloader): scheme = "nvfp4" autoround = AutoRound( self.model_name, scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") @@ -110,7 +98,7 @@ def test_nvfp4_llmcompressor_format(self): # if "France" in prompt: # assert "Paris" in generated_text - def test_nvfp4_moe_actmax_rtn(self): + def test_nvfp4_moe_actmax_rtn(self, dataloader): model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" scheme = "nvfp4" autoround = AutoRound( @@ -119,13 +107,13 @@ def test_nvfp4_moe_actmax_rtn(self): iters=0, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = self.save_dir autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") - def test_nvfp4_moe_actmax_ar(self): + def test_nvfp4_moe_actmax_ar(self, dataloader): model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" scheme = "nvfp4" autoround = AutoRound( @@ -134,13 +122,13 @@ def test_nvfp4_moe_actmax_ar(self): iters=1, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = self.save_dir autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") - def test_qwen_moe_quant_infer(self): + def test_qwen_moe_quant_infer(self, dataloader): model_name = "/models/Qwen1.5-MoE-A2.7B" layer_config = { "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, @@ -152,7 +140,7 @@ def test_qwen_moe_quant_infer(self): iters=1, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -165,7 +153,3 @@ def test_qwen_moe_quant_infer(self): 
print(result["results"]["piqa"]["acc,none"]) self.assertGreater(result["results"]["piqa"]["acc,none"], 0.7) shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_qbits.py b/test/test_cuda/test_qbits.py index d73d474d6..0ce3597db 100644 --- a/test/test_cuda/test_qbits.py +++ b/test/test_cuda/test_qbits.py @@ -1,48 +1,22 @@ import shutil -import sys -import unittest - -sys.path.insert(0, "../..") +import pytest from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundConfig from auto_round.testing_utils import require_gptqmodel, require_itrex +from ..helpers import model_infer + -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/models/opt-125m" self.save_folder = "./saved" - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("runs", ignore_errors=True) ## require torch 2.6 @@ -58,7 +32,7 @@ def test_load_gptq_model_8bits(self): quantization_config=quantization_config, ) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) @require_itrex def test_load_gptq_model_2bits(self): @@ -72,7 +46,7 @@ def test_load_gptq_model_2bits(self): quantization_config=quantization_config, ) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) @require_itrex def test_mixed_precision(self): diff --git a/test/test_cuda/test_scheme.py b/test/test_cuda/test_scheme.py index 1c603c7ed..06c5b27e0 100644 --- a/test/test_cuda/test_scheme.py +++ b/test/test_cuda/test_scheme.py @@ -1,22 +1,19 @@ import shutil -import sys -import unittest -from auto_round.schemes import QuantizationScheme - -sys.path.insert(0, "../..") +import pytest from auto_round import AutoRound +from auto_round.schemes import QuantizationScheme -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/models/opt-125m" self.save_folder = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -24,59 +21,59 @@ def tearDownClass(self): def test_gguf(self): ar = AutoRound("/models/Qwen3-0.6B", scheme="W2A16", nsamples=1, iters=1) ar.quantize_and_save(self.save_folder, format="gguf:q4_k_m") - self.assertEqual(ar.bits, 4) + assert ar.bits == 4 shutil.rmtree(self.save_folder, ignore_errors=True) def 
test_w4a16(self): ar = AutoRound(self.model_name, scheme="W4A16", nsamples=1, iters=1) - self.assertEqual(ar.bits, 4) + assert ar.bits == 4 ar.quantize() def test_w2a16(self): ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=1) - self.assertEqual(ar.bits, 2) + assert ar.bits == 2 ar.quantize() def test_mxfp4(self): ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=1) - self.assertEqual(ar.bits, 4) - self.assertEqual(ar.act_bits, 4) - self.assertEqual(ar.data_type, "mx_fp") - self.assertEqual(ar.act_data_type, "mx_fp_rceil") + assert ar.bits == 4 + assert ar.act_bits == 4 + assert ar.data_type == "mx_fp" + assert ar.act_data_type == "mx_fp_rceil" ar.quantize() def test_fp8_static(self): ar = AutoRound(self.model_name, scheme="FP8_STATIC", nsamples=1, iters=1) - self.assertEqual(ar.bits, 8) - self.assertEqual(ar.act_bits, 8) - self.assertEqual(ar.data_type, "fp") - self.assertEqual(ar.act_data_type, "fp") - self.assertEqual(ar.group_size, -1) - self.assertEqual(ar.act_dynamic, False) + assert ar.bits == 8 + assert ar.act_bits == 8 + assert ar.data_type == "fp" + assert ar.act_data_type == "fp" + assert ar.group_size == -1 + assert ar.act_dynamic == False ar.quantize() ## RTN tests def test_w2a16_rtn(self): ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=0) - self.assertEqual(ar.bits, 2) + assert ar.bits == 2 ar.quantize() def test_mxfp4_rtn(self): ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=0) - self.assertEqual(ar.bits, 4) - self.assertEqual(ar.act_bits, 4) - self.assertEqual(ar.data_type, "mx_fp") - self.assertEqual(ar.act_data_type, "mx_fp_rceil") + assert ar.bits == 4 + assert ar.act_bits == 4 + assert ar.data_type == "mx_fp" + assert ar.act_data_type == "mx_fp_rceil" ar.quantize() def test_fp8_static_rtn(self): ar = AutoRound(self.model_name, scheme="FP8_STATIC", nsamples=1, iters=0) - self.assertEqual(ar.bits, 8) - self.assertEqual(ar.act_bits, 8) - self.assertEqual(ar.data_type, "fp") - self.assertEqual(ar.act_data_type, "fp") - self.assertEqual(ar.group_size, -1) - self.assertEqual(ar.act_dynamic, False) + assert ar.bits == 8 + assert ar.act_bits == 8 + assert ar.data_type == "fp" + assert ar.act_data_type == "fp" + assert ar.group_size == -1 + assert ar.act_dynamic == False ar.quantize() def test_scheme_in_layer_config(self): @@ -90,14 +87,10 @@ def test_scheme_in_layer_config(self): ar.quantize() for n, m in ar.model.named_modules(): if n == "model.decoder.layers.2.self_attn.q_proj": - self.assertEqual(m.bits, 2) + assert m.bits == 2 if n == "model.decoder.layers.2.self_attn.k_proj": - self.assertEqual(m.bits, 2) + assert m.bits == 2 if n == "model.decoder.layers.3.self_attn.v_proj": - self.assertEqual(m.bits, 8) + assert m.bits == 8 if n == "model.decoder.layers.4.self_attn.k_proj": - self.assertEqual(m.group_size, 64) - - -if __name__ == "__main__": - unittest.main() + assert m.group_size == 64 diff --git a/test/test_cuda/test_support_vlms.py b/test/test_cuda/test_support_vlms.py index 5a2759021..15c86363b 100644 --- a/test/test_cuda/test_support_vlms.py +++ b/test/test_cuda/test_support_vlms.py @@ -1,10 +1,8 @@ import os import shutil import sys -import unittest - -sys.path.insert(0, "../..") +import pytest import requests from PIL import Image @@ -12,15 +10,15 @@ from auto_round.testing_utils import require_gptqmodel, require_package_version_ut, require_vlm_env -class TestSupportVLMS(unittest.TestCase): +class TestSupportVLMS: @classmethod - def setUpClass(self): + def setup_class(self): self.save_dir = 
os.path.join(os.path.dirname(__file__), "ut_saved") self.python_path = sys.executable self.device = 0 @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_dir, ignore_errors=True) @require_gptqmodel @@ -192,7 +190,3 @@ def test_granite_vision(self): f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}" ) self.assertFalse(res > 0 or res == -1, msg="granite-vision-3.2-2b tuning fail") - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_torch_backend.py b/test/test_cuda/test_torch_backend.py index 3f7cb4141..495da24e3 100644 --- a/test/test_cuda/test_torch_backend.py +++ b/test/test_cuda/test_torch_backend.py @@ -1,12 +1,6 @@ import shutil -import sys -import unittest import pytest - -sys.path.insert(0, "../..") - - import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -14,56 +8,22 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel +from ..helpers import model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - -class TestAutoRoundTorchBackend(unittest.TestCase): +class TestAutoRoundTorchBackend: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/models/opt-125m" self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_torch_4bits_asym(self): + def test_torch_4bits_asym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False @@ -75,7 +35,7 @@ def test_torch_4bits_asym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") @@ -86,7 +46,7 @@ def test_torch_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) @@ -97,14 +57,14 @@ def test_torch_4bits_asym(self): 
) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) - def test_torch_4bits_sym(self): + def test_torch_4bits_sym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -116,7 +76,7 @@ def test_torch_4bits_sym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model @@ -127,13 +87,9 @@ def test_torch_4bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28) torch.cuda.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_transformers.py b/test/test_cuda/test_transformers.py index 6f953339d..0e43a7e70 100644 --- a/test/test_cuda/test_transformers.py +++ b/test/test_cuda/test_transformers.py @@ -14,8 +14,8 @@ import gc import os import tempfile -import unittest +import pytest from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from transformers.testing_utils import ( require_accelerate, @@ -34,7 +34,7 @@ # @slow @require_torch_gpu @require_accelerate -class AutoRoundTest(unittest.TestCase): +class AutoRoundTest: model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc" input_text = "There is a girl who likes adventure," EXPECTED_OUTPUTS = set() @@ -53,7 +53,7 @@ class AutoRoundTest(unittest.TestCase): # called only once for all test in this class @classmethod - def setUpClass(cls): + def setup_class(cls): """ Setup quantized model """ @@ -203,7 +203,3 @@ def test_mixed_bits(self): text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_triton_backend.py b/test/test_cuda/test_triton_backend.py index 7cbc8719d..38958014b 100644 --- a/test/test_cuda/test_triton_backend.py +++ b/test/test_cuda/test_triton_backend.py @@ -1,8 +1,6 @@ import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -10,56 +8,22 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_greater_than_050 +from ..helpers import model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRoundTritonBackend(unittest.TestCase): +class TestAutoRoundTritonBackend: @classmethod - def 
setUpClass(self): + def setup_class(self): self.model_name = "/models/opt-125m" self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_greater_than_050 - def test_tritonv2_4bits_asym(self): + def test_tritonv2_4bits_asym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False @@ -71,7 +35,7 @@ def test_tritonv2_4bits_asym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") @@ -82,7 +46,7 @@ def test_tritonv2_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.34) @@ -93,7 +57,7 @@ def test_tritonv2_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.34) @@ -115,7 +79,7 @@ def test_tritonv2_2bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.19) @@ -126,7 +90,7 @@ def test_tritonv2_2bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.19) @@ -134,7 +98,7 @@ def test_tritonv2_2bits_asym(self): shutil.rmtree("./saved", ignore_errors=True) @require_greater_than_050 - def 
test_tritonv2_4bits_sym(self): + def test_tritonv2_4bits_sym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -146,7 +110,7 @@ def test_tritonv2_4bits_sym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path) @@ -157,7 +121,7 @@ def test_tritonv2_4bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.26) @@ -168,7 +132,7 @@ def test_tritonv2_4bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.26) @@ -191,7 +155,7 @@ def test_tritonv2_8bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) @@ -202,7 +166,7 @@ def test_tritonv2_8bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) @@ -230,7 +194,7 @@ def test_tritonv2_2bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.18) @@ -241,13 +205,9 @@ def test_tritonv2_2bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.18) torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_vlms.py b/test/test_cuda/test_vlms.py index d06c48ff5..bfc7cf52c 100644 --- a/test/test_cuda/test_vlms.py +++ b/test/test_cuda/test_vlms.py @@ -2,26 +2,22 @@ import os import re import shutil -import sys -import unittest +import pytest import requests - -sys.path.insert(0, "../..") - from PIL import Image from auto_round import AutoRoundConfig from auto_round.testing_utils import 
require_gptqmodel, require_optimum, require_vlm_env -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.save_dir = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_dir, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -155,7 +151,3 @@ def test_mllm_detect(self): self.assertFalse(is_mllm_model(model_name)) model, _ = llm_load_model(model_name) self.assertFalse(is_mllm_model(model)) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_hpu/__init__.py b/test/test_hpu/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_hpu/test_auto_round.py b/test/test_hpu/test_auto_round.py index 2bb7983e5..eb6066982 100644 --- a/test/test_hpu/test_auto_round.py +++ b/test/test_hpu/test_auto_round.py @@ -1,9 +1,10 @@ import pytest import torch -from _test_helpers import is_pytest_mode_compile, is_pytest_mode_lazy from auto_round.utils import is_hpex_available +from ..helpers import is_pytest_mode_compile, is_pytest_mode_lazy + def run_opt_125m_on_hpu(): from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_hpu/test_inference.py b/test/test_hpu/test_inference.py index e0a0ef321..95c680c2d 100644 --- a/test/test_hpu/test_inference.py +++ b/test/test_hpu/test_inference.py @@ -1,23 +1,12 @@ import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - def is_hpex_available(): try: import habana_frameworks.torch.core as htcore # pylint: disable=E0401 @@ -28,16 +17,15 @@ def is_hpex_available(): # TODO: This test case is temporarily commented out since it not tested for a long time. We need to add it back and change it into pytest format. 
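To make the TODO above actionable, here is a minimal, hypothetical pytest-style sketch of how the commented-out case below could be restored. It assumes this module's `is_hpex_available()` helper and the shared `dataloader` fixture from `test/fixtures.py`, and only mirrors the AutoRound call pattern already used elsewhere in these tests.

```python
import pytest
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round import AutoRound


@pytest.mark.skipif(not is_hpex_available(), reason="HPEX is required to run this case on HPU")
def test_w4a16_quantize_on_hpu(dataloader):
    # Small settings (iters=2, seqlen=2) keep the run fast, matching the original case below.
    model_name = "facebook/opt-125m"
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    autoround = AutoRound(
        model,
        tokenizer,
        bits=4,
        group_size=128,
        sym=True,
        iters=2,
        seqlen=2,
        dataset=dataloader,
    )
    autoround.quantize()
```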
-# class TestAutoRound(unittest.TestCase): +# class TestAutoRound: # @classmethod -# def setUpClass(self): +# def setup_class(self): # model_name = "facebook/opt-125m" # self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) # self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) -# self.llm_dataloader = LLMDataLoader() # @classmethod -# def tearDownClass(self): +# def teardown_class(self): # shutil.rmtree("./saved", ignore_errors=True) # shutil.rmtree("runs", ignore_errors=True) @@ -57,7 +45,7 @@ def is_hpex_available(): # sym=sym, # iters=2, # seqlen=2, -# dataset=self.llm_dataloader, +# dataset=dataloader, # ) # autoround.quantize() # quantized_model_path = "./saved" @@ -86,7 +74,7 @@ def is_hpex_available(): # sym=sym, # iters=2, # seqlen=2, -# dataset=self.llm_dataloader, +# dataset=dataloader, # ) # autoround.quantize() # quantized_model_path = "./saved" diff --git a/test/test_xpu/__init__.py b/test/test_xpu/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py index 8052a8af0..b9894cecf 100644 --- a/test/test_xpu/test_autoround.py +++ b/test/test_xpu/test_autoround.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer @@ -11,23 +9,13 @@ from auto_round import AutoRound, AutoRoundConfig -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(3): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRoundXPU(unittest.TestCase): +class TestAutoRoundXPU: @classmethod - def setUpClass(self): + def setup_class(self): - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) pass @@ -48,7 +36,7 @@ def test_gptq_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path) @@ -80,7 +68,7 @@ def test_awq_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") @@ -97,7 +85,3 @@ def test_awq_format(self): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) print(res) assert "!!!" not in res - - -if __name__ == "__main__": - unittest.main() From 25694c0ea722cbd920085b149ffb5e10c75b0773 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 18 Dec 2025 04:11:18 -0500 Subject: [PATCH 02/24] add readme Signed-off-by: He, Xin3 --- test/README.md | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/test/README.md b/test/README.md index e69de29bb..f5bc6ddba 100644 --- a/test/README.md +++ b/test/README.md @@ -0,0 +1,46 @@ +# Unit Test (UT) Guide + +This project uses `pytest` for unit testing. All test cases are under the `test/` directory. Below is a simple guide for new users to write and run UTs: + +## 1. Environment Setup +- Recommended Python 3.8 or above. +- Install dependencies: + ```sh + pip install -r ../requirements.txt + pip install pytest + ``` + +## 2. 
Test Structure +- Place your test files in the `test/` directory, and name them starting with `test_`. +- You can refer to existing `test_*.py` files. +- Common fixtures (such as `tiny_opt_model`, `opt_model`, `opt_tokenizer`, `dataloader`) and helper functions (such as `model_infer`) are defined in `confest.py` and `helpers.py` and can be imported directly. +- Example: + ```python + # test_example.py + from ..helper import model_infer + + def test_model_infer(tiny_opt_model, opt_tokenizer): + result = model_infer(tiny_opt_model, opt_tokenizer, input_text="hello world") + assert result is not None + ``` + +## 3. Running Tests +- In the `test/` directory, run: + ```sh + pytest + ``` +- You can specify a single file or test case: + ```sh + pytest test_xxx.py + pytest -k "test_func_name" + ``` + +## 4. Debugging Tips +- `confest.py` adds the parent directory to `sys.path`, so you can debug without installing the local package. +- You can directly import project source code in your test cases. + +## 5. Reference +- Fixtures are defined in `confest.py` and `fixtures.py` +- Helper functions are in `helpers.py` + +If you have any questions, feel free to open an issue. From 0f20e5ffc028cd60125358fc3b2a44e68223728d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 18 Dec 2025 09:17:55 +0000 Subject: [PATCH 03/24] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- test/helpers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/helpers.py b/test/helpers.py index 97870eba6..907507e45 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -2,7 +2,6 @@ import pytest - # Automatic choose local path or model name. opt_name_or_path = "/tf_dataset/auto_round/models/facebook/opt-125m" if not os.path.exists(opt_name_or_path): From 28ccfae32d852449754339c8de83b3b5cca27e11 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 18 Dec 2025 04:22:01 -0500 Subject: [PATCH 04/24] add get_model_path func Signed-off-by: n1ck-guo --- test/helpers.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/helpers.py b/test/helpers.py index 907507e45..e7c40ffe1 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -20,6 +20,18 @@ gptj_name_or_path = "hf-internal-testing/tiny-random-GPTJForCausalLM" +def get_model_path(model_name: str) -> str: + ut_path = f"/tf_dataset/auto_round/models/{model_name}" + local_path = f"/models/{model_name.split('/')[-1]}" + + if os.path.exists(ut_path): + return ut_path + elif os.path.exists(local_path): + return local_path + else: + return model_name + + # HPU mode checking def is_pytest_mode_compile(): return pytest.mode == "compile" From b9a177d01711d3cc91a40e9bc05ea0ce32740743 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 18 Dec 2025 04:39:00 -0500 Subject: [PATCH 05/24] add more fixtures Signed-off-by: He, Xin3 --- test/fixtures.py | 84 +++++++++++++++++++++++++++++++++++------------- test/helpers.py | 47 ++++++++++++++++----------- 2 files changed, 90 insertions(+), 41 deletions(-) diff --git a/test/fixtures.py b/test/fixtures.py index 615e579a8..a96010060 100644 --- a/test/fixtures.py +++ b/test/fixtures.py @@ -4,7 +4,13 @@ import torch import transformers -from .helpers import opt_name_or_path +from .helpers import ( + opt_name_or_path, + qwen_name_or_path, + lamini_name_or_path, + gptj_name_or_path, + get_tiny_model, +) class DataLoader: @@ -16,45 +22,77 @@ def __iter__(self): yield torch.ones([1, 10], dtype=torch.long) +# Create tiny model 
path fixtures for testing @pytest.fixture(scope="session") def tiny_opt_model_path(): - tiny_opt_model_path = "./tmp_tiny_opt_model_path" - model = transformers.AutoModelForCausalLM.from_pretrained(opt_name_or_path, dtype="auto", trust_remote_code=True) - model.config.num_hidden_layers = 3 - setattr(model.model.decoder, "layers", model.model.decoder.layers[:3]) - tokenizer = transformers.AutoTokenizer.from_pretrained(opt_name_or_path, trust_remote_code=True) - model.save_pretrained(tiny_opt_model_path) - tokenizer.save_pretrained(tiny_opt_model_path) - print("[Fixture]: built tiny model path for testing in session") - yield tiny_opt_model_path - shutil.rmtree(tiny_opt_model_path) + model_name_or_path = opt_name_or_path + tiny_model_path = "./tmp_tiny_opt_model_path" + model = get_tiny_model(model_name_or_path, num_layers=3) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) + model.save_pretrained(tiny_model_path) + tokenizer.save_pretrained(tiny_model_path) + print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") + yield tiny_model_path + shutil.rmtree(tiny_model_path) -@pytest.fixture(scope="function") -def tiny_opt_model(): - model = transformers.AutoModelForCausalLM.from_pretrained(opt_name_or_path, dtype="auto", trust_remote_code=True) - model.config.num_hidden_layers = 3 - setattr(model.model.decoder, "layers", model.model.decoder.layers[:3]) - return model +@pytest.fixture(scope="session") +def tiny_qwen_model_path(): + model_name_or_path = qwen_name_or_path + tiny_model_path = "./tmp_tiny_qwen_model_path" + model = get_tiny_model(model_name_or_path, num_layers=3) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) + model.save_pretrained(tiny_model_path) + tokenizer.save_pretrained(tiny_model_path) + print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") + yield tiny_model_path + shutil.rmtree(tiny_model_path) +@pytest.fixture(scope="session") +def tiny_lamini_model_path(): + model_name_or_path = lamini_name_or_path + tiny_model_path = "./tmp_tiny_lamini_model_path" + model = get_tiny_model(model_name_or_path, num_layers=3) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) + model.save_pretrained(tiny_model_path) + tokenizer.save_pretrained(tiny_model_path) + print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_gptj_model_path(): + model_name_or_path = gptj_name_or_path + tiny_model_path = "./tmp_tiny_gptj_model_path" + model = get_tiny_model(model_name_or_path, num_layers=3) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) + model.save_pretrained(tiny_model_path) + tokenizer.save_pretrained(tiny_model_path) + print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +# Create objective fixtures for testing @pytest.fixture(scope="function") def tiny_opt_model(): - model = transformers.AutoModelForCausalLM.from_pretrained(opt_name_or_path, dtype="auto", trust_remote_code=True) - model.config.num_hidden_layers = 3 - setattr(model.model.decoder, "layers", model.model.decoder.layers[:3]) - return model + model_name_or_path = opt_name_or_path + return get_tiny_model(model_name_or_path, num_layers=3) 
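For reference, a hypothetical test consumes these fixtures simply by naming them as parameters and letting pytest inject them; the sketch below combines the `tiny_opt_model`, `opt_tokenizer`, and `dataloader` fixtures with the AutoRound call pattern used throughout this test suite.

```python
from auto_round import AutoRound


def test_tiny_opt_quantize_sketch(tiny_opt_model, opt_tokenizer, dataloader):
    # pytest resolves all three arguments from the fixtures defined in this file.
    autoround = AutoRound(
        tiny_opt_model,
        opt_tokenizer,
        bits=4,
        group_size=128,
        sym=True,
        iters=2,
        seqlen=2,
        dataset=dataloader,
    )
    autoround.quantize()
```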
@pytest.fixture(scope="function") def opt_model(): - model = transformers.AutoModelForCausalLM.from_pretrained(opt_name_or_path, dtype="auto", trust_remote_code=True) + model_name_or_path = opt_name_or_path + model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="auto", trust_remote_code=True) return model @pytest.fixture(scope="session") def opt_tokenizer(): - tokenizer = transformers.AutoTokenizer.from_pretrained(opt_name_or_path, trust_remote_code=True) + model_name_or_path = opt_name_or_path + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) return tokenizer diff --git a/test/helpers.py b/test/helpers.py index e7c40ffe1..2a78551fc 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -1,25 +1,10 @@ import os - import pytest - -# Automatic choose local path or model name. -opt_name_or_path = "/tf_dataset/auto_round/models/facebook/opt-125m" -if not os.path.exists(opt_name_or_path): - opt_name_or_path = "facebook/opt-125m" - -qwen_name_or_path = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" -if not os.path.exists(qwen_name_or_path): - qwen_name_or_path = "Qwen/Qwen3-0.6B" - -lamini_name_or_path = "/tf_dataset/auto_round/models/MBZUAI/LaMini-GPT-124M" -if not os.path.exists(lamini_name_or_path): - lamini_name_or_path = "MBZUAI/LaMini-GPT-124M" - -gptj_name_or_path = "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM" -if not os.path.exists(gptj_name_or_path): - gptj_name_or_path = "hf-internal-testing/tiny-random-GPTJForCausalLM" +import torch +import transformers +# Automatic choose local path or model name. def get_model_path(model_name: str) -> str: ut_path = f"/tf_dataset/auto_round/models/{model_name}" local_path = f"/models/{model_name.split('/')[-1]}" @@ -31,6 +16,32 @@ def get_model_path(model_name: str) -> str: else: return model_name +opt_name_or_path = get_model_path("facebook/opt-125m") +qwen_name_or_path = get_model_path("Qwen/Qwen3-0.6B") +lamini_name_or_path = get_model_path("MBZUAI/LaMini-GPT-124M") +gptj_name_or_path = get_model_path("hf-internal-testing/tiny-random-GPTJForCausalLM") + + +# Slice model into tiny model for speedup +def get_tiny_model(model_name_or_path, num_layers=3): + model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="auto", trust_remote_code=True) + + if hasattr(model.config, "num_hidden_layers"): + model.config.num_hidden_layers = num_layers + + def slice_layers(module): + for name, child in module.named_children(): + if isinstance(child, torch.nn.ModuleList) and len(child) > num_layers: + new_layers = torch.nn.ModuleList(child[:num_layers]) + setattr(module, name, new_layers) + return True + if slice_layers(child): + return True + return False + + slice_layers(model) + return model + # HPU mode checking def is_pytest_mode_compile(): From 07b741e2034b7d625af96c03396caeebf9f42b8f Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 18 Dec 2025 20:30:29 -0500 Subject: [PATCH 06/24] fix bug Signed-off-by: He, Xin3 --- test/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index ebe377e48..bdaf69b8f 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -6,10 +6,10 @@ from .fixtures import ( dataloader, - model, + opt_model, tiny_opt_model, tiny_opt_model_path, - tokenizer, + opt_tokenizer, ) from .helpers import model_infer From 03917948b05e0537badcb5e70a918cdef5180086 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 19 Dec 2025 01:31:39 +0000 Subject: [PATCH 07/24] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- test/conftest.py | 2 +- test/fixtures.py | 6 +++--- test/helpers.py | 4 +++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index bdaf69b8f..4b0f4709f 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -7,9 +7,9 @@ from .fixtures import ( dataloader, opt_model, + opt_tokenizer, tiny_opt_model, tiny_opt_model_path, - opt_tokenizer, ) from .helpers import model_infer diff --git a/test/fixtures.py b/test/fixtures.py index a96010060..9f46196c2 100644 --- a/test/fixtures.py +++ b/test/fixtures.py @@ -5,11 +5,11 @@ import transformers from .helpers import ( + get_tiny_model, + gptj_name_or_path, + lamini_name_or_path, opt_name_or_path, qwen_name_or_path, - lamini_name_or_path, - gptj_name_or_path, - get_tiny_model, ) diff --git a/test/helpers.py b/test/helpers.py index 2a78551fc..a82e1ed28 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -1,4 +1,5 @@ import os + import pytest import torch import transformers @@ -16,6 +17,7 @@ def get_model_path(model_name: str) -> str: else: return model_name + opt_name_or_path = get_model_path("facebook/opt-125m") qwen_name_or_path = get_model_path("Qwen/Qwen3-0.6B") lamini_name_or_path = get_model_path("MBZUAI/LaMini-GPT-124M") @@ -28,7 +30,7 @@ def get_tiny_model(model_name_or_path, num_layers=3): if hasattr(model.config, "num_hidden_layers"): model.config.num_hidden_layers = num_layers - + def slice_layers(module): for name, child in module.named_children(): if isinstance(child, torch.nn.ModuleList) and len(child) > num_layers: From d35b14834e5cea1ebd943081d11fd34169b5f27a Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 18 Dec 2025 21:35:24 -0500 Subject: [PATCH 08/24] fix few bugs Signed-off-by: n1ck-guo --- test/conftest.py | 3 +++ test/fixtures.py | 13 +++++++--- test/helpers.py | 2 ++ test/test_cuda/test_2_3bits.py | 46 ++++++++++++++++------------------ test/test_cuda/test_alg_ext.py | 42 +++++++++++++++++++------------ 5 files changed, 62 insertions(+), 44 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index 4b0f4709f..109b504e8 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -8,8 +8,11 @@ dataloader, opt_model, opt_tokenizer, + tiny_gptj_model_path, + tiny_lamini_model_path, tiny_opt_model, tiny_opt_model_path, + tiny_qwen_model_path, ) from .helpers import model_infer diff --git a/test/fixtures.py b/test/fixtures.py index 9f46196c2..005f321d0 100644 --- a/test/fixtures.py +++ b/test/fixtures.py @@ -1,3 +1,4 @@ +import os import shutil import pytest @@ -26,7 +27,8 @@ def __iter__(self): @pytest.fixture(scope="session") def tiny_opt_model_path(): model_name_or_path = opt_name_or_path - tiny_model_path = "./tmp_tiny_opt_model_path" + test_path = os.path.dirname(__file__) + tiny_model_path = os.path.join(test_path, "tmp_tiny_opt_model_path") model = get_tiny_model(model_name_or_path, num_layers=3) tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) model.save_pretrained(tiny_model_path) @@ -39,7 +41,8 @@ def tiny_opt_model_path(): @pytest.fixture(scope="session") def tiny_qwen_model_path(): model_name_or_path = qwen_name_or_path - tiny_model_path = "./tmp_tiny_qwen_model_path" + test_path = os.path.dirname(__file__) + tiny_model_path = os.path.join(test_path, "tmp_tiny_qwen_model_path") model = 
get_tiny_model(model_name_or_path, num_layers=3) tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) model.save_pretrained(tiny_model_path) @@ -52,7 +55,8 @@ def tiny_qwen_model_path(): @pytest.fixture(scope="session") def tiny_lamini_model_path(): model_name_or_path = lamini_name_or_path - tiny_model_path = "./tmp_tiny_lamini_model_path" + test_path = os.path.dirname(__file__) + tiny_model_path = os.path.join(test_path, "tmp_tiny_lamini_model_path") model = get_tiny_model(model_name_or_path, num_layers=3) tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) model.save_pretrained(tiny_model_path) @@ -65,7 +69,8 @@ def tiny_lamini_model_path(): @pytest.fixture(scope="session") def tiny_gptj_model_path(): model_name_or_path = gptj_name_or_path - tiny_model_path = "./tmp_tiny_gptj_model_path" + test_path = os.path.dirname(__file__) + tiny_model_path = os.path.join(test_path, "tmp_tiny_gptj_model_path") model = get_tiny_model(model_name_or_path, num_layers=3) tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) model.save_pretrained(tiny_model_path) diff --git a/test/helpers.py b/test/helpers.py index a82e1ed28..d67f85599 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -42,6 +42,8 @@ def slice_layers(module): return False slice_layers(model) + if hasattr(model.config, "layer_types"): + model.config.layer_types = model.config.layer_types[:num_layers] return model diff --git a/test/test_cuda/test_2_3bits.py b/test/test_cuda/test_2_3bits.py index f12bf240c..1b305f494 100644 --- a/test/test_cuda/test_2_3bits.py +++ b/test/test_cuda/test_2_3bits.py @@ -12,7 +12,7 @@ from auto_round.eval.evaluation import simple_evaluate, simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_greater_than_050, require_greater_than_051 -from ..helpers import model_infer +from ..helpers import get_model_path, model_infer def get_accuracy(data): @@ -26,22 +26,26 @@ def get_accuracy(data): class TestAutoRound: - @classmethod - def setup_class(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" + save_dir = "./saved" + tasks = "lambada_openai" - @classmethod - def teardown_class(self): + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_greater_than_051 def test_3bits_autoround(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) - autoround = AutoRound(model, tokenizer, bits=3) + model_name = get_model_path("facebook/opt-125m") + autoround = AutoRound(model_name, bits=3) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model @@ -54,15 +58,13 @@ def test_3bits_autoround(self): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.3) ## 
0.3130 + assert result["results"]["lambada_openai"]["acc,none"] > 0.3 @require_greater_than_051 def test_3bits_asym_autoround(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + model_name = get_model_path("facebook/opt-125m") bits, sym = 3, False - autoround = AutoRound(model, tokenizer, bits=bits, sym=sym) + autoround = AutoRound(model_name, bits=bits, sym=sym) autoround.quantize_and_save(self.save_dir, format="auto_round", inplace=False) model_args = f"pretrained={self.save_dir}" res = simple_evaluate( @@ -80,10 +82,8 @@ def test_3bits_asym_autoround(self): @require_greater_than_050 def test_norm_bias_tuning(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) - autoround = AutoRound(model, tokenizer, bits=2, group_size=64, enable_norm_bias_tuning=True) + model_name = get_model_path("facebook/opt-125m") + autoround = AutoRound(model_name, bits=2, group_size=64, enable_norm_bias_tuning=True) autoround.quantize() ##test auto_round format @@ -97,10 +97,8 @@ def test_norm_bias_tuning(self): @require_greater_than_050 def test_2bits_autoround(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) - autoround = AutoRound(model, tokenizer, bits=2, group_size=64) + model_name = get_model_path("facebook/opt-125m") + autoround = AutoRound(model_name, bits=2, group_size=64) autoround.quantize() ##test auto_round format diff --git a/test/test_cuda/test_alg_ext.py b/test/test_cuda/test_alg_ext.py index 499213c74..e13bfac4a 100644 --- a/test/test_cuda/test_alg_ext.py +++ b/test/test_cuda/test_alg_ext.py @@ -8,21 +8,27 @@ from auto_round import AutoRound, AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate_user_model +from ..helpers import get_model_path + class TestAlgExt: + save_folder = "./saved" - @classmethod - def setup_class(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - @classmethod - def teardown_class(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) def test_2bits(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") ar = AutoRound(model=model_name, bits=2, group_size=64, enable_alg_ext=True) ar.quantize_and_save(self.save_folder) model = AutoModelForCausalLM.from_pretrained( @@ -34,35 +40,39 @@ def test_2bits(self): result = simple_evaluate_user_model(model, tokenizer, batch_size=64, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) # wo alg ext 0.2078, with 0.2371 - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.22) + assert result["results"]["lambada_openai"]["acc,none"] > 0.22 shutil.rmtree(self.save_folder, ignore_errors=True) - def test_cli(self): + def test_cli(self, tiny_opt_model_path): 
import os - model_name = "/models/opt-125m" python_path = sys.executable res = os.system( - f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {model_name} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits" + f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits --nsampes 1 --seqlen 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {model_name} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile" + f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile --nsampes 1 --seqlen 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" - def test_all_support_dtype(self): + def test_all_support_dtype(self, tiny_qwen_model_path): from auto_round.auto_scheme import AutoScheme - model_name = "/models/Qwen3-0.6B" for scheme in ["MXFP4", "NVFP4", "W2A16G64", "gguf:q2_k_s,gguf:q4_k_s"]: avg_bits = 2 if scheme == "W2A16G64" else 4 scheme = AutoScheme(options=scheme, avg_bits=avg_bits, ignore_scale_zp_bits=True) ar = AutoRound( - model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True + tiny_qwen_model_path, + scheme=scheme, + iters=1, + nsamples=1, + seqlen=32, + enable_alg_ext=True, + enable_torch_compile=True, ) ar.quantize() From b15adc8dbf4e82090cd39cc1a5406a84388e964f Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Fri, 19 Dec 2025 02:34:33 -0500 Subject: [PATCH 09/24] use get_model_path and remove self.assertTrue/False Signed-off-by: He, Xin3 --- test/README.md | 2 +- test/conftest.py | 12 +- test/fixtures.py | 63 ++++---- test/helpers.py | 24 ++- test/test_ark/test_model.py | 16 +- test/test_cpu/requirements.txt | 1 - test/test_cpu/test_act_quantization.py | 18 +-- test/test_cpu/test_alg_ext.py | 14 +- test/test_cpu/test_auto_scheme.py | 4 +- test/test_cpu/test_autoopt.py | 4 +- test/test_cpu/test_autoround.py | 149 ++++++++---------- test/test_cpu/test_autoround_acc.py | 2 +- .../test_autoround_export_to_itrex.py | 28 ++-- test/test_cpu/test_block_names.py | 24 ++- test/test_cpu/test_calib_dataset.py | 6 +- test/test_cpu/test_cli_usage.py | 6 +- test/test_cpu/test_export.py | 29 ++-- test/test_cpu/test_generation.py | 4 +- test/test_cpu/test_gguf_format.py | 44 +++--- test/test_cpu/test_gpt_oss.py | 4 +- test/test_cpu/test_llmc_integration.py | 2 +- test/test_cpu/test_llmcompressor.py | 16 +- test/test_cpu/test_load_awq_gptq.py | 8 +- test/test_cpu/test_mix_bits.py | 7 +- test/test_cpu/test_mllm.py | 14 +- test/test_cpu/test_model_scope.py | 6 +- test/test_cpu/test_moe_model.py | 6 +- test/test_cpu/test_mxfp_nvfp.py | 22 +-- test/test_cpu/test_mxfp_save_load.py | 4 +- test/test_cpu/test_scheme.py | 20 +-- test/test_cpu/test_torch_backend.py | 6 +- test/test_cuda/requirements.txt | 1 - test/test_cuda/test_auto_round_format.py | 4 +- test/test_cuda/test_auto_scheme.py | 14 +- test/test_cuda/test_diffusion.py | 8 +- test/test_cuda/test_exllamav2_backend.py | 8 +- test/test_cuda/test_fp8_input.py | 8 +- test/test_cuda/test_get_block_name.py | 2 +- 
test/test_cuda/test_gguf.py | 26 +-- test/test_cuda/test_main_func.py | 6 +- test/test_cuda/test_marlin_backend.py | 12 +- test/test_cuda/test_mix_bits.py | 3 +- test/test_cuda/test_mxfp_nvfp.py | 2 +- test/test_cuda/test_scheme.py | 4 +- test/test_cuda/test_support_vlms.py | 10 +- test/test_cuda/test_torch_backend.py | 6 +- test/test_cuda/test_transformers.py | 12 +- test/test_cuda/test_triton_backend.py | 20 +-- test/test_cuda/test_vlms.py | 18 +-- test/test_hpu/test_auto_round.py | 6 +- test/test_xpu/test_autoround.py | 12 +- 51 files changed, 380 insertions(+), 367 deletions(-) diff --git a/test/README.md b/test/README.md index f5bc6ddba..9ccca0017 100644 --- a/test/README.md +++ b/test/README.md @@ -17,7 +17,7 @@ This project uses `pytest` for unit testing. All test cases are under the `test/ - Example: ```python # test_example.py - from ..helper import model_infer + from ..helpers import model_infer def test_model_infer(tiny_opt_model, opt_tokenizer): result = model_infer(tiny_opt_model, opt_tokenizer, input_text="hello world") diff --git a/test/conftest.py b/test/conftest.py index 109b504e8..d21100824 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -4,17 +4,7 @@ import pytest -from .fixtures import ( - dataloader, - opt_model, - opt_tokenizer, - tiny_gptj_model_path, - tiny_lamini_model_path, - tiny_opt_model, - tiny_opt_model_path, - tiny_qwen_model_path, -) -from .helpers import model_infer +from .fixtures import * # Easy debugging without installing auto-round. sys.path.insert(0, "..") diff --git a/test/fixtures.py b/test/fixtures.py index 005f321d0..87d0a5f75 100644 --- a/test/fixtures.py +++ b/test/fixtures.py @@ -10,7 +10,9 @@ gptj_name_or_path, lamini_name_or_path, opt_name_or_path, + phi2_name_or_path, qwen_name_or_path, + save_tiny_model, ) @@ -27,13 +29,8 @@ def __iter__(self): @pytest.fixture(scope="session") def tiny_opt_model_path(): model_name_or_path = opt_name_or_path - test_path = os.path.dirname(__file__) - tiny_model_path = os.path.join(test_path, "tmp_tiny_opt_model_path") - model = get_tiny_model(model_name_or_path, num_layers=3) - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) - model.save_pretrained(tiny_model_path) - tokenizer.save_pretrained(tiny_model_path) - print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") + tiny_model_path = "./tmp_tiny_opt_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -41,13 +38,8 @@ def tiny_opt_model_path(): @pytest.fixture(scope="session") def tiny_qwen_model_path(): model_name_or_path = qwen_name_or_path - test_path = os.path.dirname(__file__) - tiny_model_path = os.path.join(test_path, "tmp_tiny_qwen_model_path") - model = get_tiny_model(model_name_or_path, num_layers=3) - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) - model.save_pretrained(tiny_model_path) - tokenizer.save_pretrained(tiny_model_path) - print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") + tiny_model_path = "./tmp_tiny_qwen_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -55,13 +47,8 @@ def tiny_qwen_model_path(): @pytest.fixture(scope="session") def tiny_lamini_model_path(): model_name_or_path = lamini_name_or_path - test_path = os.path.dirname(__file__) - tiny_model_path = os.path.join(test_path, 
"tmp_tiny_lamini_model_path") - model = get_tiny_model(model_name_or_path, num_layers=3) - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) - model.save_pretrained(tiny_model_path) - tokenizer.save_pretrained(tiny_model_path) - print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") + tiny_model_path = "./tmp_tiny_lamini_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -69,13 +56,17 @@ def tiny_lamini_model_path(): @pytest.fixture(scope="session") def tiny_gptj_model_path(): model_name_or_path = gptj_name_or_path - test_path = os.path.dirname(__file__) - tiny_model_path = os.path.join(test_path, "tmp_tiny_gptj_model_path") - model = get_tiny_model(model_name_or_path, num_layers=3) - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) - model.save_pretrained(tiny_model_path) - tokenizer.save_pretrained(tiny_model_path) - print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") + tiny_model_path = "./tmp_tiny_gptj_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_phi2_model_path(): + model_name_or_path = phi2_name_or_path + tiny_model_path = "./tmp_tiny_phi2_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -84,7 +75,7 @@ def tiny_gptj_model_path(): @pytest.fixture(scope="function") def tiny_opt_model(): model_name_or_path = opt_name_or_path - return get_tiny_model(model_name_or_path, num_layers=3) + return get_tiny_model(model_name_or_path, num_layers=2) @pytest.fixture(scope="function") @@ -101,6 +92,20 @@ def opt_tokenizer(): return tokenizer +@pytest.fixture(scope="function") +def model(): + model_name_or_path = opt_name_or_path + model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="auto", trust_remote_code=True) + return model + + +@pytest.fixture(scope="session") +def tokenizer(): + model_name_or_path = opt_name_or_path + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) + return tokenizer + + @pytest.fixture(scope="session") def dataloader(): return DataLoader() diff --git a/test/helpers.py b/test/helpers.py index d67f85599..77c219452 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -22,14 +22,11 @@ def get_model_path(model_name: str) -> str: qwen_name_or_path = get_model_path("Qwen/Qwen3-0.6B") lamini_name_or_path = get_model_path("MBZUAI/LaMini-GPT-124M") gptj_name_or_path = get_model_path("hf-internal-testing/tiny-random-GPTJForCausalLM") +phi2_name_or_path = get_model_path("microsoft/phi-2") # Slice model into tiny model for speedup -def get_tiny_model(model_name_or_path, num_layers=3): - model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="auto", trust_remote_code=True) - - if hasattr(model.config, "num_hidden_layers"): - model.config.num_hidden_layers = num_layers +def get_tiny_model(model_name_or_path, num_layers=2): def slice_layers(module): for name, child in module.named_children(): @@ -41,12 +38,29 @@ def slice_layers(module): return True return False + model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="auto", trust_remote_code=True) slice_layers(model) + + 
if hasattr(model.config, "num_hidden_layers"): + model.config.num_hidden_layers = num_layers if hasattr(model.config, "layer_types"): model.config.layer_types = model.config.layer_types[:num_layers] + return model +# for fixture usage only +def save_tiny_model(model_name_or_path, tiny_model_path): + model = get_tiny_model(model_name_or_path, num_layers=2) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) + test_path = os.path.dirname(__file__) + tiny_model_path = os.path.join(test_path, tiny_model_path) + model.save_pretrained(tiny_model_path) + tokenizer.save_pretrained(tiny_model_path) + print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") + return tiny_model_path + + # HPU mode checking def is_pytest_mode_compile(): return pytest.mode == "compile" diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py index 622f4a6dd..b8dfdca5c 100644 --- a/test/test_ark/test_model.py +++ b/test/test_ark/test_model.py @@ -26,11 +26,11 @@ def setup_and_teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_torch_4bits_sym_cpu(self, model, tokenizer, dataloader): + def test_torch_4bits_sym_cpu(self, opt_model, opt_tokenizer, dataloader): bits, group_size, sym = 4, 32, True autoround = AutoRound( - model, - tokenizer, + opt_model, + opt_tokenizer, bits=bits, group_size=group_size, sym=sym, @@ -50,15 +50,15 @@ def test_torch_4bits_sym_cpu(self, model, tokenizer, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28) + assert result["results"]["lambada_openai"]["acc,none"] > 0.28 shutil.rmtree("./saved", ignore_errors=True) - def test_torch_4bits_sym_xpu(self, model, tokenizer, dataloader): + def test_torch_4bits_sym_xpu(self, opt_model, opt_tokenizer, dataloader): bits, group_size, sym = 4, 32, True autoround = AutoRound( - model, - tokenizer, + opt_model, + opt_tokenizer, bits=bits, group_size=group_size, sym=sym, @@ -78,6 +78,6 @@ def test_torch_4bits_sym_xpu(self, model, tokenizer, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28) + assert result["results"]["lambada_openai"]["acc,none"] > 0.28 torch.xpu.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) diff --git a/test/test_cpu/requirements.txt b/test/test_cpu/requirements.txt index 219189829..a54cc4e4e 100644 --- a/test/test_cpu/requirements.txt +++ b/test/test_cpu/requirements.txt @@ -3,7 +3,6 @@ modelscope gguf sentencepiece torchvision -parameterized pillow numba llmcompressor @ git+https://github.com/vllm-project/llm-compressor.git@main diff --git a/test/test_cpu/test_act_quantization.py b/test/test_cpu/test_act_quantization.py index 0483c027d..cd41c0985 100644 --- a/test/test_cpu/test_act_quantization.py +++ b/test/test_cpu/test_act_quantization.py @@ -24,11 +24,11 @@ def setup_and_teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_mx_fp4(self, tiny_opt_model, tokenizer, dataloader): + def test_mx_fp4(self, tiny_opt_model, opt_tokenizer, dataloader): bits, 
group_size, sym = 4, 128, True autoround = AutoRound( tiny_opt_model, - tokenizer, + opt_tokenizer, bits=bits, group_size=group_size, sym=sym, @@ -40,11 +40,11 @@ def test_mx_fp4(self, tiny_opt_model, tokenizer, dataloader): ) autoround.quantize() - def test_wint4fp8_dynamic(self, tiny_opt_model, tokenizer, dataloader): + def test_wint4fp8_dynamic(self, tiny_opt_model, opt_tokenizer, dataloader): bits, group_size = 4, 128 autoround = AutoRound( tiny_opt_model, - tokenizer, + opt_tokenizer, bits=bits, group_size=group_size, iters=2, @@ -56,11 +56,11 @@ def test_wint4fp8_dynamic(self, tiny_opt_model, tokenizer, dataloader): ) autoround.quantize() - def test_wint4fp8_static(self, tiny_opt_model, tokenizer, dataloader): + def test_wint4fp8_static(self, tiny_opt_model, opt_tokenizer, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( tiny_opt_model, - tokenizer, + opt_tokenizer, bits=bits, group_size=group_size, sym=sym, @@ -75,12 +75,12 @@ def test_wint4fp8_static(self, tiny_opt_model, tokenizer, dataloader): autoround.quantize() @pytest.mark.parametrize("act_group_size", [-1, 128]) - def test_wfp8afp8_static(self, act_group_size, tiny_opt_model, tokenizer, dataloader): + def test_wfp8afp8_static(self, act_group_size, tiny_opt_model, opt_tokenizer, dataloader): from auto_round.wrapper import WrapperWALayer autoround = AutoRound( tiny_opt_model, - tokenizer, + opt_tokenizer, group_size=128, act_group_size=act_group_size, iters=2, @@ -92,7 +92,7 @@ def test_wfp8afp8_static(self, act_group_size, tiny_opt_model, tokenizer, datalo ) autoround.quantize() - k_proj = autoround.model.model.decoder.layers[2].self_attn.k_proj + k_proj = autoround.model.model.decoder.layers[1].self_attn.k_proj assert isinstance(k_proj, WrapperWALayer), "k_proj should be WrapperWALayer" if act_group_size == -1: assert k_proj.orig_layer.act_scale.shape[0] == 20, "act_scale shape[0] should be 20" diff --git a/test/test_cpu/test_alg_ext.py b/test/test_cpu/test_alg_ext.py index 504b7d0f8..0bfdfba47 100644 --- a/test/test_cpu/test_alg_ext.py +++ b/test/test_cpu/test_alg_ext.py @@ -1,30 +1,30 @@ from auto_round import AutoRound -from ..helpers import opt_name_or_path, qwen_name_or_path +from ..helpers import qwen_name_or_path class TestAlgExt: - def test_alg_ext(self): - model_name = opt_name_or_path + def test_alg_ext(self, tiny_opt_model_path, tiny_qwen_model_path): + model_name = tiny_opt_model_path ar = AutoRound(model_name, scheme="W2A16", iters=1, nsamples=1, enable_alg_ext=True) ar.quantize() - model_name = qwen_name_or_path + model_name = tiny_qwen_model_path ar = AutoRound(model_name, scheme="gguf:q4_k_s", iters=1, nsamples=1, enable_alg_ext=True) ar.quantize() from auto_round.auto_scheme import AutoScheme scheme = AutoScheme(options=["mxfp4", "mxfp8"], avg_bits=5.5, ignore_scale_zp_bits=True) - model_name = qwen_name_or_path + model_name = tiny_qwen_model_path ar = AutoRound(model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True) ar.quantize() def test_alg_ext_import(self): from auto_round.alg_ext import wrapper_autoround - def test_all_support_dtype(self): - model_name = opt_name_or_path + def test_all_support_dtype(self, tiny_opt_model_path): + model_name = tiny_opt_model_path for scheme in ["MXFP4", "NVFP4", "W2A16G64"]: ar = AutoRound( model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True diff --git a/test/test_cpu/test_auto_scheme.py b/test/test_cpu/test_auto_scheme.py index b6c20826e..b38e84dc6 100644 --- 
a/test/test_cpu/test_auto_scheme.py +++ b/test/test_cpu/test_auto_scheme.py @@ -44,11 +44,11 @@ def test_layer_config(self, tiny_opt_model_path): ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() assert layer_config["model.decoder.layers.1.fc1"]["bits"] == 8 - assert layer_config["model.decoder.layers.1.fc1"]["sym"] == False + assert not layer_config["model.decoder.layers.1.fc1"]["sym"] assert layer_config["model.decoder.layers.1.fc1"]["group_size"] == 32 layer = get_module(model, "model.decoder.layers.1.fc1") assert layer.bits == 8 - assert layer.sym == False + assert not layer.sym assert layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) diff --git a/test/test_cpu/test_autoopt.py b/test/test_cpu/test_autoopt.py index 472711155..c14e04c0e 100644 --- a/test/test_cpu/test_autoopt.py +++ b/test/test_cpu/test_autoopt.py @@ -24,7 +24,7 @@ def setup_and_teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_Adam(self, tiny_opt_model, tokenizer, dataloader): + def test_Adam(self, tiny_opt_model, opt_tokenizer, dataloader): bits, group_size, sym = 4, 128, False from auto_round.utils import get_block_names @@ -32,7 +32,7 @@ def test_Adam(self, tiny_opt_model, tokenizer, dataloader): bits, group_size, sym, batch_size = 4, 128, False, 20 adamround = AutoRoundAdam( tiny_opt_model, - tokenizer, + opt_tokenizer, bits=bits, group_size=group_size, sym=sym, diff --git a/test/test_cpu/test_autoround.py b/test/test_cpu/test_autoround.py index 1f1f85f55..d0049765e 100644 --- a/test/test_cpu/test_autoround.py +++ b/test/test_cpu/test_autoround.py @@ -3,20 +3,19 @@ import pytest import torch -from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.utils import get_module -from ..helpers import model_infer +from ..helpers import get_model_path, model_infer, opt_name_or_path, qwen_name_or_path class TestAutoRound: @classmethod def setup_class(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) self.save_folder = "./saved" @@ -26,18 +25,16 @@ def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_bits_setting(self): + def test_bits_setting(self, tiny_opt_model_path): layer_config = {"model.decoder.layers.0.self_attn.k_proj": {"data_type": "mx_fp8", "group_size": 32}} - autoround = AutoRound( - "/tf_dataset/auto_round/models/facebook/opt-125m", iters=2, seqlen=2, nsamples=1, layer_config=layer_config - ) + autoround = AutoRound(tiny_opt_model_path, iters=2, seqlen=2, nsamples=1, layer_config=layer_config) autoround.quantize() module = get_module(autoround.model, "model.decoder.layers.0.self_attn.k_proj") if module.bits != 8: raise ValueError(f"Expected bits to be 8, but got {module.bits}") - def test_layer_config(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_layer_config(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path layer_config = {"self_attn": {"bits": 4, "data_type": 
"nv_fp", "act_bits": 16, "group_size": 16}} autoround = AutoRound( model_name, @@ -52,8 +49,8 @@ def test_layer_config(self, dataloader): autoround.quantize_and_save(self.save_folder, inplace=False, format="fake") shutil.rmtree(self.save_folder) - def test_remove_whole_block(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_remove_whole_block(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path layer_config = { "model.decoder.layers.0.self_attn.k_proj": {"bits": 32}, "model.decoder.layers.0.self_attn.v_proj": {"bits": 32}, @@ -75,11 +72,10 @@ def test_remove_whole_block(self, dataloader): ) autoround.quantize() - def test_consecutive_quant(self, dataloader): + def test_consecutive_quant(self, tiny_opt_model_path, tiny_phi2_model_path, dataloader): bits, group_size, sym = 4, -1, False autoround = AutoRound( - self.model, - self.tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, @@ -89,15 +85,8 @@ def test_consecutive_quant(self, dataloader): ) autoround.quantize() - model = AutoModelForCausalLM.from_pretrained( - "/tf_dataset/auto_round/models/microsoft/phi-2", torch_dtype="auto", trust_remote_code=True - ) - tokenizer = AutoTokenizer.from_pretrained( - "/tf_dataset/auto_round/models/microsoft/phi-2", trust_remote_code=True - ) autoround = AutoRound( - model, - tokenizer, + tiny_phi2_model_path, bits=bits, group_size=group_size, sym=sym, @@ -108,7 +97,7 @@ def test_consecutive_quant(self, dataloader): autoround.quantize() def test_mx_fp4(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path bits, group_size, sym = 4, 32, False autoround = AutoRound( model_name, @@ -127,10 +116,10 @@ def test_mx_fp4(self, dataloader): model, self.tokenizer, batch_size="auto:8", tasks="lambada_openai", limit=32 ) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.3) # 0.375 + assert result["results"]["lambada_openai"]["acc,none"] > 0.3 # 0.375 def test_nv_fp4(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path bits, group_size, sym = 4, 16, False autoround = AutoRound( model_name, @@ -147,10 +136,10 @@ def test_nv_fp4(self, dataloader): model, self.tokenizer, batch_size="auto:8", tasks="lambada_openai", limit=32 ) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 - def test_w4g1(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_w4g1(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, -1, True autoround = AutoRound( model_name, @@ -163,9 +152,9 @@ def test_w4g1(self, dataloader): ) autoround.quantize() - @parameterized.expand([(2,), (3,), (4,)]) + @pytest.mark.parametrize("bits", [2, 3, 4]) def test_g128(self, bits, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path group_size, sym = 128, True autoround = AutoRound( model_name, @@ -182,7 +171,7 @@ def test_g128(self, bits, dataloader): model, self.tokenizer, batch_size="auto:8", tasks="lambada_openai", limit=32 ) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.3) + assert 
result["results"]["lambada_openai"]["acc,none"] > 0.3 def test_disable_quanted_input(self, dataloader): bits, group_size, sym = 4, -1, True @@ -199,9 +188,9 @@ def test_disable_quanted_input(self, dataloader): ) autoround.quantize() - def test_enable_norm_bias_tuning_qwen3(self, dataloader): + def test_enable_norm_bias_tuning_qwen3(self, tiny_qwen_model_path, dataloader): bits, group_size, sym = 4, 128, True - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = tiny_qwen_model_path model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) autoround = AutoRound( @@ -249,8 +238,8 @@ def test_disable_minmax_tuning(self, dataloader): autoround.quantize() # - def test_signround(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_signround(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, -1, False autoround = AutoRound( model_name, @@ -283,8 +272,8 @@ def test_lm_head_layer_config_way(self, dataloader): ) autoround.quantize() - def test_wa_quant(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_wa_quant(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym, act_bits = 4, 128, False, 4 autoround = AutoRound( model_name, @@ -298,9 +287,9 @@ def test_wa_quant(self, dataloader): ) autoround.quantize() - def test_auto_device_map(self, dataloader): + def test_auto_device_map(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, False - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="auto", trust_remote_code=True, device_map="auto" ) @@ -316,7 +305,7 @@ def test_auto_device_map(self, dataloader): ) autoround.quantize() - def test_device_map_dict(self, dataloader): + def test_device_map_dict(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, False device_map = {".*": "cpu"} autoround = AutoRound( @@ -333,7 +322,7 @@ def test_device_map_dict(self, dataloader): autoround.quantize() # test model_name - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path autoround = AutoRound( model_name, self.tokenizer, @@ -347,9 +336,9 @@ def test_device_map_dict(self, dataloader): ) autoround.quantize() - def test_fp32(self, dataloader): + def test_fp32(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, False - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.float32, trust_remote_code=True, device_map="auto" ) @@ -380,8 +369,8 @@ def test_tensor_reshape(self, dataloader): ) autoround.quantize() - def test_rtn(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_rtn(self, tiny_opt_model_path): + model_name = tiny_opt_model_path model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -399,9 +388,9 @@ def test_rtn(self): model_infer(model, tokenizer) shutil.rmtree(self.save_folder) - def test_embed_quant(self, dataloader): + def test_embed_quant(self, tiny_opt_model_path, dataloader): bits, group_size, 
sym = 4, 128, True - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path layer_config = { "model.decoder.embed_tokens": {"bits": 4}, } @@ -418,9 +407,9 @@ def test_embed_quant(self, dataloader): ) autoround.quantize() - def test_fallback_layers(self, dataloader): + def test_fallback_layers(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, True - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.float32, trust_remote_code=True, device_map="auto" ) @@ -462,17 +451,17 @@ def test_not_convert_modules(self): from auto_round_extension.ipex.qlinear_ipex_awq import QuantLinear - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct-AWQ" + model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct-AWQ") quantization_config = AutoRoundConfig() model = Qwen2VLForConditionalGeneration.from_pretrained( model_name, quantization_config=quantization_config, device_map="cpu", torch_dtype=torch.float16 ) - self.assertTrue(isinstance(model.visual.blocks[0].attn.qkv, torch.nn.Linear)) - self.assertFalse(isinstance(model.visual.merger.mlp[0], QuantLinear)) + assert isinstance(model.visual.blocks[0].attn.qkv, torch.nn.Linear) + assert not isinstance(model.visual.merger.mlp[0], QuantLinear) if hasattr(model.model, "language_model"): - self.assertTrue(isinstance(model.model.language_model.layers[0].self_attn.v_proj, QuantLinear)) + assert isinstance(model.model.language_model.layers[0].self_attn.v_proj, QuantLinear) else: - self.assertTrue(isinstance(model.model.layers[0].self_attn.v_proj, QuantLinear)) + assert isinstance(model.model.layers[0].self_attn.v_proj, QuantLinear) processor = AutoProcessor.from_pretrained(model_name, size=None) image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" @@ -507,8 +496,8 @@ def test_not_convert_modules(self): ) print(output_text) - def test_fallback_layers_regex_awq(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_fallback_layers_regex_awq(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -543,8 +532,8 @@ def test_fallback_layers_regex_awq(self, dataloader): print(res) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_fallback_layers_regex_gptq(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_fallback_layers_regex_gptq(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -579,8 +568,8 @@ def test_fallback_layers_regex_gptq(self, dataloader): print(res) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_fallback_layers_regex_round(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_fallback_layers_regex_round(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", 
trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -615,13 +604,13 @@ def test_fallback_layers_regex_round(self, dataloader): print(res) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_fallback_layers_regex_exception(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_fallback_layers_regex_exception(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) layer_config = {"model.decoder.layers.12.self_attn.k_proj": {"bits": 16}} - with self.assertRaises(ValueError): + with pytest.raises(ValueError): autoround = AutoRound( model, tokenizer=tokenizer, @@ -664,8 +653,8 @@ def test_dequant_fp8_weight(self): assert dequant_weight.shape[0] == 32 assert dequant_weight.shape.numel() == 32 * 5760 * 1440 - def test_mixed_bit_setting(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_mixed_bit_setting(self, tiny_opt_model_path): + model_name = tiny_opt_model_path layer_config = {"model.decoder.layers.7.fc1": {"bits": 8, "act_bits": 8}} ar = AutoRound(model_name, data_type="mx_fp4", act_bits=4, iters=0, layer_config=layer_config) ar.quantize() @@ -676,21 +665,21 @@ def test_mixed_bit_setting(self): ): raise ValueError("mixed bits is not correct") - def test_invalid_layer_config(self): - with self.assertRaises(ValueError): + def test_invalid_layer_config(self, tiny_opt_model_path): + with pytest.raises(ValueError): layer_config = {"model.decoder.layers.2.self_attnx": {"bits": 2}} ar = AutoRound( - "/tf_dataset/auto_round/models/facebook/opt-125m", + tiny_opt_model_path, scheme="W3A16", nsamples=1, iters=1, layer_config=layer_config, ) ar.quantize() - with self.assertRaises(ValueError): + with pytest.raises(ValueError): layer_config = {"model.decoder.layers.2.self_attn": {"bit": 2}} # should be bits ar = AutoRound( - "/tf_dataset/auto_round/models/facebook/opt-125m", + tiny_opt_model_path, scheme="W3A16", nsamples=1, iters=1, @@ -699,7 +688,7 @@ def test_invalid_layer_config(self): ar.quantize() def test_quant_lm_head(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-8B" + model_name = get_model_path("Qwen/Qwen3-8B") ar = AutoRound(model_name, quant_lm_head=True, iters=0, seqlen=8, nsamples=1, disable_opt_rtn=True) ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu") @@ -722,7 +711,7 @@ def test_quant_lm_head(self): assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 def test_quant_lm_head_layer_config(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-8B" + model_name = get_model_path("Qwen/Qwen3-8B") layer_config = {"lm_head": {"bits": 4}} ar = AutoRound( model_name, @@ -739,21 +728,21 @@ def test_quant_lm_head_layer_config(self): assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 def test_compressor(self): - model_name = "Qwen/Qwen2-VL-2B-Instruct" + model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") ar = AutoRound(model_name, enable_adam=True) assert ar.optimizer == torch.optim.AdamW - self.assertTrue(ar.mllm) + assert ar.mllm # test old api from auto_round import AutoRoundMLLM ar = AutoRoundMLLM(model_name) - self.assertTrue(ar.mllm) + assert 
ar.mllm def test_attention_mask_in_dataset(self): from transformers import AutoTokenizer - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = qwen_name_or_path # model_name = "/models/Qwen3-0.6B" tokenizer = AutoTokenizer.from_pretrained(model_name) text = ["haha", "hello world"] @@ -771,7 +760,7 @@ def test_attention_mask_in_dataset(self): def test_attention_mask_via_tokenize_in_dataset(self): from transformers import AutoTokenizer - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = qwen_name_or_path # model_name = "/models/Qwen3-0.6B" tokenizer = AutoTokenizer.from_pretrained(model_name) text = ["haha", "hello world"] @@ -788,9 +777,9 @@ def test_attention_mask_via_tokenize_in_dataset(self): ar = AutoRound(model_name, iters=1, dataset=data, seqlen=8) ar.quantize() - def test_low_cpu_mem_usage(self, dataloader): + def test_low_cpu_mem_usage(self, tiny_opt_model_path, dataloader): bits, group_size = 4, 32 - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) quantized_model_path = self.save_folder @@ -809,7 +798,7 @@ def test_low_cpu_mem_usage(self, dataloader): shutil.rmtree(quantized_model_path, ignore_errors=True) def test_create_adam(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = qwen_name_or_path from auto_round import AutoRound ar = AutoRound(model=model_name, enable_adam=True) diff --git a/test/test_cpu/test_autoround_acc.py b/test/test_cpu/test_autoround_acc.py index 721a5c8ed..876d4a452 100644 --- a/test/test_cpu/test_autoround_acc.py +++ b/test/test_cpu/test_autoround_acc.py @@ -59,7 +59,7 @@ def test_default_acc(self, dataloader): out1 = model_tmp(inp) assert out0[0].equal(out1[0]) - self.assertTrue(isclose(float(out0[0][0][0][0]), -0.021002087742090225, rel_tol=5e-04)) + assert isclose(float(out0[0][0][0][0]), -0.021002087742090225, rel_tol=5e-04) def test_3bits_asym_autoround(self, tiny_opt_model_path): model_name = tiny_opt_model_path diff --git a/test/test_cpu/test_autoround_export_to_itrex.py b/test/test_cpu/test_autoround_export_to_itrex.py index d4cc2a73c..19f196270 100644 --- a/test/test_cpu/test_autoround_export_to_itrex.py +++ b/test/test_cpu/test_autoround_export_to_itrex.py @@ -8,7 +8,7 @@ from auto_round import AutoRound -from ..helper import gptj_name_or_path +from ..helpers import get_model_path, gptj_name_or_path class SimpleDataLoader: @@ -52,11 +52,11 @@ def test_autoround_int_quant(self): out2 = model(self.lm_input) out3 = q_model(self.lm_input) out4 = compressed_model(self.lm_input) - self.assertTrue(torch.all(torch.isclose(out1[0], out2[0], atol=1e-1))) - self.assertFalse(torch.all(out1[0] == out2[0])) - self.assertTrue(torch.all(out2[0] == out3[0])) - self.assertTrue(torch.all(torch.isclose(out3[0], out4[0], atol=1e-3))) - self.assertTrue("transformer.h.0.attn.k_proj.qzeros" in compressed_model.state_dict().keys()) + assert torch.all(torch.isclose(out1[0], out2[0], atol=1e-1)) + assert not torch.all(out1[0] == out2[0]) + assert torch.all(out2[0] == out3[0]) + assert torch.all(torch.isclose(out3[0], out4[0], atol=1e-3)) + assert "transformer.h.0.attn.k_proj.qzeros" in compressed_model.state_dict().keys() model = copy.deepcopy(self.gptj) out6 = model(self.lm_input) @@ -66,13 +66,13 @@ def test_autoround_int_quant(self): compressed_model = 
compressed_model.to(torch.float32) out4 = q_model(self.lm_input) out5 = compressed_model(self.lm_input) - self.assertTrue(torch.all(out1[0] == out6[0])) - self.assertTrue(torch.all(torch.isclose(out4[0], out5[0], atol=5e-3))) + assert torch.all(out1[0] == out6[0]) + assert torch.all(torch.isclose(out4[0], out5[0], atol=5e-3)) def test_config(self): from auto_round.export.export_to_itrex import QuantConfig - config = QuantConfig.from_pretrained("/tf_dataset/auto_round/models/TheBloke/Llama-2-7B-Chat-GPTQ") + config = QuantConfig.from_pretrained(get_model_path("TheBloke/Llama-2-7B-Chat-GPTQ")) config.save_pretrained("quantization_config_dir") loaded_config = QuantConfig.from_pretrained("quantization_config_dir") assert config.group_size == loaded_config.group_size @@ -94,8 +94,8 @@ def test_xpu_export(self): out3 = q_model(self.lm_input) out4 = compressed_model_xpu(self.lm_input) out5 = compressed_model_cpu(self.lm_input) - self.assertTrue(torch.all(torch.isclose(out1[0], out2[0], atol=1e-1))) - self.assertFalse(torch.all(out1[0] == out2[0])) - self.assertTrue(torch.all(out2[0] == out3[0])) - self.assertTrue(torch.all(torch.isclose(out3[0], out4[0], atol=1e-3))) - self.assertTrue(torch.all(torch.isclose(out4[0], out5[0], atol=1e-5))) + assert torch.all(torch.isclose(out1[0], out2[0], atol=1e-1)) + assert not torch.all(out1[0] == out2[0]) + assert torch.all(out2[0] == out3[0]) + assert torch.all(torch.isclose(out3[0], out4[0], atol=1e-3)) + assert torch.all(torch.isclose(out4[0], out5[0], atol=1e-5)) diff --git a/test/test_cpu/test_block_names.py b/test/test_cpu/test_block_names.py index 8d5f935d9..5d0423fa2 100644 --- a/test/test_cpu/test_block_names.py +++ b/test/test_cpu/test_block_names.py @@ -2,15 +2,13 @@ import shutil import pytest - -sys.path.insert(0, ".") import torch import torch.nn as nn from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -from ..helper import lamini_name_or_path +from ..helpers import get_model_path, lamini_name_or_path # ================= simple multimodal model ================= @@ -118,7 +116,7 @@ def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_moe_quant(self, dataloader): + def test_moe_quant(self): input_size = 10 hidden_size = 10 num_groups = 2 @@ -185,24 +183,24 @@ def test_mm_block_name(self): from auto_round.utils import get_block_names - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct" + model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True, device_map="auto") block_name = get_block_names(model, quant_vision=True) - self.assertTrue(len(block_name) == 2) - self.assertTrue(all(["visual.merger.mlp" not in n for n in block_name])) + assert len(block_name) == 2 + assert all(["visual.merger.mlp" not in n for n in block_name]) block_name = get_block_names(model, quant_vision=False) - self.assertTrue(len(block_name) == 1) - self.assertTrue(block_name == get_block_names(model)) + assert len(block_name) == 1 + assert block_name == get_block_names(model) def test_moe(self): from auto_round.utils import get_block_names - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen1.5-MoE-A2.7B" + model_name = get_model_path("Qwen/Qwen1.5-MoE-A2.7B") # config = AutoConfig.from_pretrained(model_name) model = AutoModelForCausalLM.from_pretrained(model_name) block_name = get_block_names(model) block_name_2 = get_block_names(model, 
quant_vision=True) - self.assertTrue(block_name == block_name_2) - self.assertTrue(len(block_name_2) == 1) - self.assertTrue("model.layers.23" == block_name_2[0][-1]) + assert block_name == block_name_2 + assert len(block_name_2) == 1 + assert "model.layers.23" == block_name_2[0][-1] diff --git a/test/test_cpu/test_calib_dataset.py b/test/test_cpu/test_calib_dataset.py index fc95966b6..cb276147e 100644 --- a/test/test_cpu/test_calib_dataset.py +++ b/test/test_cpu/test_calib_dataset.py @@ -8,6 +8,8 @@ from auto_round import AutoRound +from ..helpers import get_model_path, opt_name_or_path + class TestLocalCalibDataset: @classmethod @@ -26,7 +28,7 @@ def setup_class(self): json.dump(item, jsonl_file, ensure_ascii=False) jsonl_file.write("\n") - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -59,7 +61,7 @@ def test_jsonl(self): autoround.quantize() def test_apply_chat_template(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) dataset = "NeelNanda/pile-10k:apply_chat_template:system_prompt=''" diff --git a/test/test_cpu/test_cli_usage.py b/test/test_cpu/test_cli_usage.py index ffc04d8f1..e71b2854a 100644 --- a/test/test_cpu/test_cli_usage.py +++ b/test/test_cpu/test_cli_usage.py @@ -17,7 +17,7 @@ def teardown_class(self): shutil.rmtree("../../saved", ignore_errors=True) shutil.rmtree("../../tmp_autoround", ignore_errors=True) - def test_auto_round_cmd(self): + def test_auto_round_cmd(self, tiny_opt_model_path): python_path = sys.executable # Test llm script @@ -26,13 +26,13 @@ def test_auto_round_cmd(self): assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --model '/tf_dataset/auto_round/models/facebook/opt-125m' --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa" + f"cd ../.. && {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --model '/tf_dataset/auto_round/models/facebook/opt-125m' --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" + f"cd ../.. 
&& {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index 866a7d396..36ddee546 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -3,11 +3,12 @@ import pytest import torch -from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound +from ..helpers import get_model_path, opt_name_or_path + def _get_folder_size(path: str) -> float: """Return folder size in GB.""" @@ -23,7 +24,7 @@ def _get_folder_size(path: str) -> float: class TestAutoRound: @classmethod def setup_class(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + self.model_name = opt_name_or_path self.save_dir = "./saved" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -198,7 +199,7 @@ def test_autoround_3bit_sym_format(self, dataloader): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - @parameterized.expand([(None,), ("fp8",), ("float16")]) + @pytest.mark.parametrize("static_kv_dtype", [None, "fp8", "float16"]) def test_static_afp8_export(self, static_kv_dtype): import os @@ -224,8 +225,8 @@ def test_static_afp8_export(self, static_kv_dtype): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") - self.assertIn("model.decoder.layers.8.self_attn.k_proj.input_scale", f.keys()) - self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) + assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() + assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape == torch.Size([1]) assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.float8_e4m3fn if static_kv_dtype is None: @@ -257,8 +258,8 @@ def test_static_afp8_export(self, static_kv_dtype): assert output is not None, "Output should not be None" if static_kv_dtype == "fp8": - self.assertIn("model.decoder.layers.8.self_attn.k_scale", f.keys()) - self.assertIn("model.decoder.layers.8.self_attn.v_scale", f.keys()) + assert "model.decoder.layers.8.self_attn.k_scale" in f.keys() + assert "model.decoder.layers.8.self_attn.v_scale" in f.keys() assert f.get_tensor("model.decoder.layers.5.self_attn.v_scale").shape == torch.Size([1]) assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").shape == torch.Size([1]) assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype == torch.float32 @@ -283,8 +284,8 @@ def test_static_afp8_export(self, static_kv_dtype): autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") - self.assertIn("model.decoder.layers.8.self_attn.k_proj.input_scale", f.keys()) - self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) + assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() +
assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape == torch.Size([1]) assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.float8_e4m3fn shutil.rmtree(quantized_model_path, ignore_errors=True) @@ -308,14 +309,14 @@ def test_static_fp8_attn(self): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") - self.assertIn("model.decoder.layers.8.self_attn.k_proj.input_scale", f.keys()) - self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) + assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() + assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape == torch.Size([1]) assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.float8_e4m3fn check_attrs = ["k_scale", "v_scale", "q_scale"] for attr in check_attrs: weight_name = f"model.decoder.layers.8.self_attn.{attr}" - self.assertIn(weight_name, f.keys()) + assert weight_name in f.keys() assert f.get_tensor(weight_name).shape == torch.Size([1]) assert f.get_tensor(weight_name).dtype == torch.float32 @@ -323,7 +324,7 @@ def test_static_fp8_attn(self): def test_awq_lmhead_export(self, dataloader): bits, sym, group_size = 4, False, 128 - model_name = "/tf_dataset/auto_round/models/microsoft/phi-2" + model_name = get_model_path("microsoft/phi-2") layer_config = { "lm_head": {"bits": 4}, # set lm_head quant "layer": {"bits": 16}, @@ -358,7 +359,7 @@ def test_awq_lmhead_export(self, dataloader): def test_gptq_lmhead_export(self, dataloader): bits, sym, group_size = 4, True, 128 # Note that, to save UT tuning time, the local model is intentionally kept lightweight, using only 2 hidden layers. 
- model_name = "/tf_dataset/auto_round/models/microsoft/phi-2" + model_name = get_model_path("microsoft/phi-2") layer_config = { "lm_head": {"bits": 4}, # set lm_head quant "layer": {"bits": 16}, diff --git a/test/test_cpu/test_generation.py b/test/test_cpu/test_generation.py index 4c72db93c..e1e9dc3f1 100644 --- a/test/test_cpu/test_generation.py +++ b/test/test_cpu/test_generation.py @@ -7,11 +7,13 @@ from auto_round import AutoRound +from ..helpers import opt_name_or_path + class TestAutoRoundFormatGeneration: @classmethod def setup_class(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + self.model_name = opt_name_or_path self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) self.save_folder = "./saved" diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index 393e11dba..c34c4f096 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -8,12 +8,14 @@ from auto_round import AutoRound +from ..helpers import get_model_path + class TestGGUF: @classmethod def setup_class(self): - self.model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" + self.model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @classmethod @@ -63,7 +65,7 @@ def test_q4_0(self): # from auto_round.eval.evaluation import simple_evaluate_user_model # result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="openbookqa", eval_model_dtype="bf16") # # 0.246 - # self.assertGreater(result['results']['openbookqa']['acc,none'], 0.23) + # assert result['results']['openbookqa']['acc,none'] > 0.23 shutil.rmtree("./saved", ignore_errors=True) # def test_q4_1(self): @@ -83,7 +85,7 @@ def test_q4_0(self): # # from auto_round.eval.evaluation import simple_evaluate_user_model # # result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="openbookqa", eval_model_dtype="bf16") # # # 0.23 - # # self.assertGreater(result['results']['openbookqa']['acc,none'], 0.22) + # # assert result['results']['openbookqa']['acc,none'] > 0.22 # shutil.rmtree("./saved", ignore_errors=True) def test_func(self): @@ -100,8 +102,8 @@ def test_func(self): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_1") - self.assertTrue(autoround.group_size == 32) - self.assertFalse(autoround.sym) + assert autoround.group_size == 32 + assert not autoround.sym gguf_file = os.listdir("saved")[0] model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") text = "There is a girl who likes adventure," @@ -129,7 +131,7 @@ def test_func(self): # gguf_file = os.listdir("saved")[0] # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") # result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="lambada_openai", eval_model_dtype="bf16") - # self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.5) + # assert result['results']['lambada_openai']['acc,none'] > 0.5 shutil.rmtree("./saved", ignore_errors=True) # @@ -182,7 +184,7 @@ def test_func(self): # shutil.rmtree("./saved", ignore_errors=True) def test_gguf_baseline(self): - model_name = 
"/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) autoround = AutoRound( model, @@ -229,7 +231,7 @@ def test_gguf_baseline(self): # shutil.rmtree("./saved", ignore_errors=True) def test_q4_k_m(self, dataloader): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) layer_config = { @@ -276,7 +278,7 @@ def test_q4_k_m(self, dataloader): shutil.rmtree("./saved", ignore_errors=True) def test_all_format(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") python_path = sys.executable # for gguf_format in ["gguf:q4_0", "gguf:q4_1", "gguf:q4_k_m", "gguf:q6_k"]: for gguf_format in ["gguf:q4_k_m"]: @@ -306,7 +308,7 @@ def test_all_format(self): shutil.rmtree("../../tmp_autoround", ignore_errors=True) def test_vlm_gguf(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct" + model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") from auto_round import AutoRoundMLLM from auto_round.utils import mllm_load_model @@ -321,13 +323,13 @@ def test_vlm_gguf(self): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - self.assertTrue("mmproj-model.gguf" in os.listdir("./saved")) + assert "mmproj-model.gguf" in os.listdir("./saved") for file_name in os.listdir(quantized_model_path): file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2 if file_name == "mmproj-model.gguf": - self.assertAlmostEqual(file_size, 2537, delta=5.0) + assert abs(file_size - 2537) < 5.0 else: - self.assertAlmostEqual(file_size, 892, delta=5.0) + assert abs(file_size - 892) < 5.0 shutil.rmtree("./saved", ignore_errors=True) def test_qtype_setting(self): @@ -338,7 +340,7 @@ def test_qtype_setting(self): from auto_round.compressors.utils import set_layer_config from auto_round.export.export_to_gguf.config import ModelType - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) ar.formats = ["gguf:q4_0"] ar.layer_config, _, _ = set_layer_config( @@ -354,8 +356,8 @@ def test_qtype_setting(self): enable_gguf_official_mixed=True, is_mllm=ar.mllm, ) - self.assertTrue(ar.layer_config["model.embed_tokens"]["bits"] == 8) - self.assertTrue("lm_head" not in ar.layer_config) + assert ar.layer_config["model.embed_tokens"]["bits"] == 8 + assert "lm_head" not in ar.layer_config model_name = "Qwen/Qwen3-0.6B" ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) @@ -373,8 +375,8 @@ def test_qtype_setting(self): enable_gguf_official_mixed=True, is_mllm=ar.mllm, ) - self.assertTrue(ar.layer_config["model.embed_tokens"]["bits"] == 4) - self.assertTrue(ar.layer_config["lm_head"]["bits"] == 6 and ar.layer_config["lm_head"]["super_bits"] == 8) + assert ar.layer_config["model.embed_tokens"]["bits"] == 4 + assert ar.layer_config["lm_head"]["bits"] == 6 and ar.layer_config["lm_head"]["super_bits"] == 8 layer_config = { "model.embed_tokens": {"bits": 6, "super_bits": 8}, @@ -395,8 +397,8 @@ 
def test_qtype_setting(self): enable_gguf_official_mixed=True, is_mllm=ar.mllm, ) - self.assertTrue(ar.layer_config["lm_head"]["bits"] == 4) - self.assertTrue( - ar.layer_config["model.embed_tokens"]["bits"] == 6 + assert ( + ar.layer_config["lm_head"]["bits"] == 4 + and ar.layer_config["model.embed_tokens"]["bits"] == 6 and ar.layer_config["model.embed_tokens"]["super_bits"] == 8 ) diff --git a/test/test_cpu/test_gpt_oss.py b/test/test_cpu/test_gpt_oss.py index ccc997eba..b82c04c31 100644 --- a/test/test_cpu/test_gpt_oss.py +++ b/test/test_cpu/test_gpt_oss.py @@ -4,11 +4,13 @@ from auto_round import AutoRound +from ..helpers import get_model_path + @pytest.fixture def setup_gpt_oss(): """Fixture to set up the GPT-OSS model and tokenizer.""" - model_name = "/tf_dataset/auto_round/models/unsloth/gpt-oss-20b-BF16" + model_name = get_model_path("unsloth/gpt-oss-20b-BF16") tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) config.num_hidden_layers = 1 # Reduce layers for testing diff --git a/test/test_cpu/test_llmc_integration.py b/test/test_cpu/test_llmc_integration.py index 6dba09cfa..cea412327 100644 --- a/test/test_cpu/test_llmc_integration.py +++ b/test/test_cpu/test_llmc_integration.py @@ -85,7 +85,7 @@ def test_oneshot_application(recipe, tmp_path): assert weight_args.num_bits == 4 # Check a specific layer is quantized - targeted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj + targeted_linear_layer = model_loaded.model.layers[1].self_attn.q_proj assert hasattr(targeted_linear_layer, "quantization_scheme") # Check lm-head is not quantized diff --git a/test/test_cpu/test_llmcompressor.py b/test/test_cpu/test_llmcompressor.py index ebe531f75..614701943 100644 --- a/test/test_cpu/test_llmcompressor.py +++ b/test/test_cpu/test_llmcompressor.py @@ -7,11 +7,13 @@ from auto_round import AutoRound +from ..helpers import get_model_path, opt_name_or_path + class TestLLMC: @classmethod def setup_class(self): - self.model_name = "/tf_dataset/auto_round/models/stas/tiny-random-llama-2" + self.model_name = get_model_path("stas/tiny-random-llama-2") self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -39,7 +41,7 @@ def test_llmcompressor_w8a8(self): def test_llmcompressor_fp8(self): ## quantize the model - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path autoround = AutoRound( model_name, scheme="FP8_STATIC", @@ -56,14 +58,14 @@ def test_llmcompressor_fp8(self): import json config = json.load(open("./saved/config.json")) - self.assertIn("group_0", config["quantization_config"]["config_groups"]) + assert "group_0" in config["quantization_config"]["config_groups"] assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8 assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "channel" assert config["quantization_config"]["quant_method"] == "compressed-tensors" def test_autoround_llmcompressor_fp8(self): ## quantize the model - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path autoround = AutoRound( model_name, scheme="FP8_STATIC", @@ -77,10 +79,8 @@ def test_autoround_llmcompressor_fp8(self): import json config = json.load(open("./saved/config.json")) - 
self.assertIn("group_0", config["quantization_config"]["config_groups"]) + assert "group_0" in config["quantization_config"]["config_groups"] assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8 assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "tensor" - self.assertEqual( - config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["strategy"], "tensor" - ) + assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["strategy"] == "tensor" assert config["quantization_config"]["quant_method"] == "compressed-tensors" diff --git a/test/test_cpu/test_load_awq_gptq.py b/test/test_cpu/test_load_awq_gptq.py index e78266182..6dc295b4e 100644 --- a/test/test_cpu/test_load_awq_gptq.py +++ b/test/test_cpu/test_load_awq_gptq.py @@ -3,7 +3,7 @@ import pytest from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer -from ..helpers import model_infer +from ..helpers import get_model_path, model_infer class TestAutoRound: @@ -13,9 +13,9 @@ def teardown_class(self): shutil.rmtree("runs", ignore_errors=True) def test_load_gptq_no_dummy_gidx_model(self): - model_name = "/tf_dataset/auto_round/models/ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1" + model_name = get_model_path("ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1") quantization_config = AutoRoundConfig() - with self.assertRaises(NotImplementedError) as cm: + with pytest.raises(NotImplementedError): model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="auto", @@ -25,7 +25,7 @@ def test_load_gptq_no_dummy_gidx_model(self): ) def test_load_awq(self): - model_name = "/tf_dataset/auto_round/models/casperhansen/opt-125m-awq" + model_name = get_model_path("casperhansen/opt-125m-awq") quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( model_name, diff --git a/test/test_cpu/test_mix_bits.py b/test/test_cpu/test_mix_bits.py index 71354feb9..6cc390637 100644 --- a/test/test_cpu/test_mix_bits.py +++ b/test/test_cpu/test_mix_bits.py @@ -5,12 +5,13 @@ import pytest import torch -from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound from auto_round.testing_utils import require_gptqmodel +from ..helpers import opt_name_or_path + def _get_folder_size(path: str) -> float: """Return folder size in GB.""" @@ -26,7 +27,7 @@ def _get_folder_size(path: str) -> float: class TestAutoRound: @classmethod def setup_class(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + self.model_name = opt_name_or_path self.save_dir = ".saved/" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -233,5 +234,5 @@ def test_mixed_MXFP_autoround_format_loading(self, dataloader): result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.14) + assert result["results"]["lambada_openai"]["acc,none"] > 0.14 shutil.rmtree(quantized_model_path, ignore_errors=True) diff --git a/test/test_cpu/test_mllm.py b/test/test_cpu/test_mllm.py index 25f2a209a..5e4842d4c 100644 --- a/test/test_cpu/test_mllm.py +++ b/test/test_cpu/test_mllm.py 
@@ -5,6 +5,8 @@ from auto_round import AutoRoundMLLM +from ..helpers import get_model_path, opt_name_or_path + class FakeDataLoader: def __init__(self): @@ -26,7 +28,7 @@ def __iter__(self): class TestAutoRoundMLLM: @classmethod def setup_class(self): - self.model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct" + self.model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") self.dataset = FakeDataLoader() @classmethod @@ -137,11 +139,9 @@ def test_pure_text_model_check(self): model = Qwen2VLForConditionalGeneration.from_pretrained( self.model_name, trust_remote_code=True, device_map="auto" ) - self.assertFalse(is_pure_text_model(model)) - model = AutoModelForCausalLM.from_pretrained( - "/tf_dataset/auto_round/models/facebook/opt-125m", trust_remote_code=True - ) - self.assertTrue(is_pure_text_model(model)) + assert not is_pure_text_model(model) + model = AutoModelForCausalLM.from_pretrained(opt_name_or_path, trust_remote_code=True) + assert is_pure_text_model(model) def test_str_input(self): tokenizer = AutoTokenizer.from_pretrained(self.model_name) @@ -210,7 +210,7 @@ def test_str_input(self): def test_qwen2_5(self): from auto_round.utils import mllm_load_model - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-VL-3B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-VL-3B-Instruct") model, processor, tokenizer, image_processor = mllm_load_model(model_name) autoround = AutoRoundMLLM( model, diff --git a/test/test_cpu/test_model_scope.py b/test/test_cpu/test_model_scope.py index 0097b3584..cf48eeaab 100644 --- a/test/test_cpu/test_model_scope.py +++ b/test/test_cpu/test_model_scope.py @@ -7,6 +7,8 @@ from auto_round import AutoRound +from ..helpers import get_model_path + class TestModelScope: @classmethod @@ -29,14 +31,14 @@ def teardown_class(self): return super().teardown_class() def test_llm(self): - model_name = "Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") autoround = AutoRound( model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset ) autoround.quantize_and_save() def test_mllm(self): - model_name = "Qwen/Qwen2-VL-2B-Instruct" + model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") autoround = AutoRound( model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset, batch_size=2 ) diff --git a/test/test_cpu/test_moe_model.py b/test/test_cpu/test_moe_model.py index c88571346..62bac4efc 100644 --- a/test/test_cpu/test_moe_model.py +++ b/test/test_cpu/test_moe_model.py @@ -6,11 +6,13 @@ from auto_round import AutoRound +from ..helpers import get_model_path + @pytest.fixture def setup_gpt_oss(): """Fixture to set up the GPT-OSS model and tokenizer.""" - model_name = "/tf_dataset/auto_round/models/unsloth/gpt-oss-20b-BF16" + model_name = get_model_path("unsloth/gpt-oss-20b-BF16") tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) config.num_hidden_layers = 1 # Reduce layers for testing @@ -22,7 +24,7 @@ def setup_gpt_oss(): @pytest.fixture def setup_llama4(): """Fixture to set up the llama4 model and tokenizer.""" - model_name = "/tf_dataset/auto_round/models/meta-llama/Llama-4-Scout-17B-16E-Instruct" + model_name = get_model_path("meta-llama/Llama-4-Scout-17B-16E-Instruct") tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) 
config.vision_config.num_hidden_layers = 2 # Reduce layers for testing diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index 1144d00d6..695371061 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -3,11 +3,12 @@ import pytest import torch -from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound +from ..helpers import get_model_path, opt_name_or_path + def _get_folder_size(path: str) -> float: """Return folder size in GB.""" @@ -23,7 +24,7 @@ def _get_folder_size(path: str) -> float: class TestAutoRoundFP: @classmethod def setup_class(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + self.model_name = opt_name_or_path self.save_dir = "./saved" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto") self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -34,7 +35,7 @@ def teardown_class(self): shutil.rmtree("runs", ignore_errors=True) def test_nvfp4_moe_actmax_rtn(self, dataloader): - model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" + model_name = get_model_path("deepseek-ai/DeepSeek-V2-Lite") layer_config = { "self_attn": {"bits": 16, "act_bits": 16}, "mlp.shared_experts": {"bits": 16, "act_bits": 16}, @@ -61,7 +62,7 @@ def test_nvfp4_moe_actmax_rtn(self, dataloader): shutil.rmtree(self.save_dir, ignore_errors=True) def test_nvfp4_moe_actmax_ar(self, dataloader): - model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" + model_name = get_model_path("deepseek-ai/DeepSeek-V2-Lite") layer_config = { "q_proj": {"bits": 16, "act_bits": 16}, "mlp.shared_experts": {"bits": 16, "act_bits": 16}, @@ -95,11 +96,11 @@ def test_nvfp4_moe_actmax_ar(self, dataloader): result = simple_evaluate_user_model(model, tokenizer, batch_size=4, tasks="piqa", limit=4) print(result["results"]["piqa"]["acc,none"]) - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.7) + assert result["results"]["piqa"]["acc,none"] > 0.7 shutil.rmtree(self.save_dir, ignore_errors=True) def test_mxfp4_moe_ar(self, dataloader): - model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" + model_name = get_model_path("deepseek-ai/DeepSeek-V2-Lite") layer_config = { "q_proj": {"bits": 16, "act_bits": 16, "data_type": "float"}, "mlp.shared_experts": {"bits": 16, "act_bits": 16, "data_type": "float"}, @@ -332,7 +333,7 @@ def test_nvfp4_autoround_save_quantized(self, dataloader): shutil.rmtree(quantized_model_path, ignore_errors=True) def test_qwen_moe_quant_infer(self, dataloader): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen1.5-MoE-A2.7B" + model_name = get_model_path("Qwen/Qwen1.5-MoE-A2.7B") layer_config = { "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, } @@ -354,10 +355,11 @@ def test_qwen_moe_quant_infer(self, dataloader): result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa", limit=10) print(result["results"]["piqa"]["acc,none"]) - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.60) + assert result["results"]["piqa"]["acc,none"] > 0.60 shutil.rmtree(quantized_model_path, ignore_errors=True) - @parameterized.expand( + @pytest.mark.parametrize( + "scheme, static_kv_dtype, static_attention_dtype", [ # scheme, static_kv_dtype, static_attention_dtype ("MXFP4", None, "fp8"), @@ -366,7 +368,7 @@ def test_qwen_moe_quant_infer(self, dataloader): ("MXFP8", 
"fp8", None), ("NVFP4", None, "fp8"), ("NVFP4", "fp8", None), - ] + ], ) def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype, dataloader): model_name = self.model_name diff --git a/test/test_cpu/test_mxfp_save_load.py b/test/test_cpu/test_mxfp_save_load.py index aca5c7592..bf3e9853b 100644 --- a/test/test_cpu/test_mxfp_save_load.py +++ b/test/test_cpu/test_mxfp_save_load.py @@ -14,6 +14,8 @@ from auto_round.inference.backend import MX_TENSOR_DATA_TYPES from auto_round.testing_utils import has_module +from ..helpers import get_model_path + testing_scheme_name_lst = [ AutoRoundFormat.MXFP8.value, AutoRoundFormat.MXFP4.value, @@ -35,7 +37,7 @@ def test_e2e_quant_and_load(scheme_name, weight_data_type, act_data_type): # Use a temporary directory for saving the quantized model with tempfile.TemporaryDirectory() as temp_dir: - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") config = AutoConfig.from_pretrained(model_name) config.num_hidden_layers = 2 # Use a smaller model for testing # Fix configuration validation issues diff --git a/test/test_cpu/test_scheme.py b/test/test_cpu/test_scheme.py index 71f02dc96..9bd236765 100644 --- a/test/test_cpu/test_scheme.py +++ b/test/test_cpu/test_scheme.py @@ -6,11 +6,13 @@ from auto_round import AutoRound from auto_round.schemes import QuantizationScheme +from ..helpers import get_model_path, opt_name_or_path, qwen_name_or_path + class TestAutoRound: @classmethod def setup_class(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + self.model_name = opt_name_or_path self.save_folder = "./saved" @classmethod @@ -20,7 +22,7 @@ def teardown_class(self): def test_gguf(self, dataloader): ar = AutoRound( - "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B", + qwen_name_or_path, scheme="W2A16", nsamples=1, iters=1, @@ -52,9 +54,7 @@ def test_mxfp4(self, dataloader): def test_vllm(self): from auto_round import AutoRoundMLLM - ar = AutoRoundMLLM( - "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct", scheme="W2A16", nsamples=1, iters=1, seqlen=2 - ) + ar = AutoRoundMLLM(get_model_path("Qwen/Qwen2-VL-2B-Instruct", scheme="W2A16"), nsamples=1, iters=1, seqlen=2) assert ar.bits == 2 assert ar.act_bits == 16 @@ -73,7 +73,7 @@ def test_all_scheme(self, dataloader): for scheme in preset_schemes: model_name = self.model_name if "gguf" in scheme.lower(): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") print(f"scheme={scheme}") ar = AutoRound(model_name, scheme=scheme, nsamples=1, iters=1, seqlen=2, dataset=dataloader) ar.quantize_and_save(self.save_folder) @@ -86,7 +86,7 @@ def test_scheme_in_layer_config(self, dataloader): "model.decoder.layers.4.self_attn.k_proj": QuantizationScheme.from_dict({"group_size": 64}), } ar = AutoRound( - "/tf_dataset/auto_round/models/facebook/opt-125m", + opt_name_or_path, scheme="W3A16", nsamples=1, iters=1, @@ -110,9 +110,9 @@ def test_parse_available_devices(self): from auto_round.utils.device import parse_available_devices device_list = parse_available_devices("auto") - self.assertTrue(len(device_list) == 1 and "cpu" in device_list) + assert len(device_list) == 1 and "cpu" in device_list device_list = parse_available_devices("a:cuda:0,b:cuda:1,c:cpu") - self.assertTrue(len(device_list) == 3) + assert len(device_list) == 3 assert device_list == ["cuda:0", "cuda:1", "cpu"] device_list = parse_available_devices("0,1") - 
self.assertTrue(len(device_list) == 1 and "cpu" in device_list) + assert len(device_list) == 1 and "cpu" in device_list diff --git a/test/test_cpu/test_torch_backend.py b/test/test_cpu/test_torch_backend.py index e27914d9b..81e009c06 100644 --- a/test/test_cpu/test_torch_backend.py +++ b/test/test_cpu/test_torch_backend.py @@ -49,7 +49,7 @@ def test_torch_4bits_asym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -60,7 +60,7 @@ def test_torch_4bits_asym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ -90,6 +90,6 @@ def test_torch_4bits_sym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28) + assert result["results"]["lambada_openai"]["acc,none"] > 0.28 torch.cuda.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) diff --git a/test/test_cuda/requirements.txt b/test/test_cuda/requirements.txt index e7dd4e0d8..071eb233e 100644 --- a/test/test_cuda/requirements.txt +++ b/test/test_cuda/requirements.txt @@ -6,7 +6,6 @@ intel-extension-for-pytorch lm-eval>=0.4.9.1 optimum pandas -parameterized pillow torchvision numba diff --git a/test/test_cuda/test_auto_round_format.py b/test/test_cuda/test_auto_round_format.py index cbc6868f1..7299eae0c 100644 --- a/test/test_cuda/test_auto_round_format.py +++ b/test/test_cuda/test_auto_round_format.py @@ -88,7 +88,7 @@ def test_mixed_precision(self): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.32) + assert result["results"]["lambada_openai"]["acc,none"] > 0.32 @require_awq @require_package_version_ut("transformers", "<4.57.0") @@ -117,7 +117,7 @@ def test_awq_backend(self): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.18) + assert result["results"]["lambada_openai"]["acc,none"] > 0.18 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( diff --git a/test/test_cuda/test_auto_scheme.py b/test/test_cuda/test_auto_scheme.py index b6f5d8066..1a53c2425 100644 --- a/test/test_cuda/test_auto_scheme.py +++ b/test/test_cuda/test_auto_scheme.py @@ -199,11 +199,11 @@ def test_layer_config(self): ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() assert 
layer_config["model.decoder.layers.10.fc1"]["bits"] == 8 - assert layer_config["model.decoder.layers.10.fc1"]["sym"] == False + assert not layer_config["model.decoder.layers.10.fc1"]["sym"] assert layer_config["model.decoder.layers.10.fc1"]["group_size"] == 32 layer = get_module(model, "model.decoder.layers.10.fc1") assert layer.bits == 8 - assert layer.sym == False + assert not layer.sym assert layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) @@ -216,11 +216,11 @@ def test_layer_config(self): ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() assert layer_config["model.decoder.layers.10.fc1"]["bits"] == 8 - assert layer_config["model.decoder.layers.10.fc1"]["sym"] == False + assert not layer_config["model.decoder.layers.10.fc1"]["sym"] assert layer_config["model.decoder.layers.10.fc1"]["group_size"] == 32 layer = get_module(model, "model.decoder.layers.10.fc1") assert layer.orig_layer.bits == 8 - assert layer.orig_layer.sym == False + assert not layer.orig_layer.sym assert layer.orig_layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) @@ -232,7 +232,7 @@ def test_lm_head_and_mix_dtype(self): scheme = AutoScheme(avg_bits=target_bits, options=("MXFP4", "MXFP8")) ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, quant_lm_head=True) model, layer_config = ar.quantize() - self.assertLessEqual(layer_config["lm_head"]["bits"], 8) + assert layer_config["lm_head"]["bits"] <= 8 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 @@ -245,7 +245,7 @@ def test_auto_scheme_export(self): model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.25) + assert result["results"]["lambada_openai"]["acc,none"] > 0.25 shutil.rmtree(self.save_dir, ignore_errors=True) model_name = "/models/Qwen3-0.6B" @@ -262,5 +262,5 @@ def test_enable_torch_compile(self): model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.10) + assert result["results"]["lambada_openai"]["acc,none"] > 0.10 shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cuda/test_diffusion.py b/test/test_cuda/test_diffusion.py index 147a34d47..27d72908d 100644 --- a/test/test_cuda/test_diffusion.py +++ b/test/test_cuda/test_diffusion.py @@ -69,7 +69,7 @@ def test_diffusion_rtn(self): def test_diffusion_model_checker(self): from auto_round.utils import is_diffusion_model - self.assertTrue(is_diffusion_model("/dataset/FLUX.1-dev")) - self.assertTrue(is_diffusion_model("/models/stable-diffusion-2-1")) - self.assertTrue(is_diffusion_model("/models/stable-diffusion-xl-base-1.0")) - self.assertFalse(is_diffusion_model("/models/Qwen3-8B")) + assert is_diffusion_model("/dataset/FLUX.1-dev") + assert is_diffusion_model("/models/stable-diffusion-2-1") + assert is_diffusion_model("/models/stable-diffusion-xl-base-1.0") + assert not is_diffusion_model("/models/Qwen3-8B") diff --git a/test/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py index 
e6f78ba90..b7f271f20 100644 --- a/test/test_cuda/test_exllamav2_backend.py +++ b/test/test_cuda/test_exllamav2_backend.py @@ -43,7 +43,7 @@ def test_gptqmodel_exllmav2_4bits_asym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -54,7 +54,7 @@ def test_gptqmodel_exllmav2_4bits_asym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ -86,7 +86,7 @@ def test_gptq_exllamav2_4bits_sym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) + assert result["results"]["lambada_openai"]["acc,none"] > 0.27 torch.cuda.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) @@ -121,6 +121,6 @@ def test_gptq_exllamav2_4bits_sym_group_size(self): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=64, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.15) + assert result["results"]["lambada_openai"]["acc,none"] > 0.15 torch.cuda.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) diff --git a/test/test_cuda/test_fp8_input.py b/test/test_cuda/test_fp8_input.py index 90a177ef3..777abaf04 100644 --- a/test/test_cuda/test_fp8_input.py +++ b/test/test_cuda/test_fp8_input.py @@ -55,7 +55,7 @@ def test_small_model_rtn(self): model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.25) + assert result["results"]["lambada_openai"]["acc,none"] > 0.25 shutil.rmtree(self.save_dir, ignore_errors=True) @@ -66,7 +66,7 @@ def test_small_model_iters1(self): model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.25) + assert result["results"]["lambada_openai"]["acc,none"] > 0.25 shutil.rmtree(self.save_dir, ignore_errors=True) @@ -77,7 +77,7 @@ def test_medium_model_rtn(self): model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.55) + assert result["results"]["lambada_openai"]["acc,none"] > 0.55 shutil.rmtree(self.save_dir, ignore_errors=True) @@ -89,7 +89,7 @@ def 
test_medium_model_rtn_with_lm_head(self): model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.55) + assert result["results"]["lambada_openai"]["acc,none"] > 0.55 shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cuda/test_get_block_name.py b/test/test_cuda/test_get_block_name.py index 52f251cb7..829ac1e46 100644 --- a/test/test_cuda/test_get_block_name.py +++ b/test/test_cuda/test_get_block_name.py @@ -193,7 +193,7 @@ def test_flux(self): block_names = get_block_names(model) self.check_block_names(block_names, ["transformer_blocks", "single_transformer_blocks"], [19, 38]) - self.assertTrue(any(["context_embedder" not in n for n in block_names])) + assert any(["context_embedder" not in n for n in block_names]) block_names = get_block_names(model, quant_vision=True) self.check_block_names(block_names, ["transformer_blocks", "single_transformer_blocks"], [19, 38]) diff --git a/test/test_cuda/test_gguf.py b/test/test_cuda/test_gguf.py index b8ee88d0b..a7076667c 100644 --- a/test/test_cuda/test_gguf.py +++ b/test/test_cuda/test_gguf.py @@ -50,7 +50,7 @@ def test_gguf_format(self, dataloader): f"--output_dir {save_dir} --nsample 2 --format gguf:q4_0 --device 0" ) print(save_dir) - self.assertFalse(res > 0 or res == -1, msg="qwen2 tuning fail") + assert not (res > 0 or res == -1), "qwen2 tuning fail" from llama_cpp import Llama @@ -88,7 +88,7 @@ def test_q2_k_export(self, dataloader): from auto_round.eval.evaluation import simple_evaluate_user_model result = simple_evaluate_user_model(model, autoround.tokenizer, batch_size=16, tasks="piqa") - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.45) + assert result["results"]["piqa"]["acc,none"] > 0.45 shutil.rmtree(quantized_model_path, ignore_errors=True) @@ -122,7 +122,7 @@ def test_q4_0(self): from auto_round.eval.evaluation import simple_evaluate_user_model result = simple_evaluate_user_model(model, autoround.tokenizer, batch_size=16, tasks="piqa") - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.54) + assert result["results"]["piqa"]["acc,none"] > 0.54 shutil.rmtree(quantized_model_path, ignore_errors=True) @require_gguf @@ -143,7 +143,7 @@ def test_q4_1(self): from auto_round.eval.evaluation import simple_evaluate_user_model result = simple_evaluate_user_model(model, autoround.tokenizer, batch_size=16, tasks="piqa") - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.54) + assert result["results"]["piqa"]["acc,none"] > 0.54 shutil.rmtree("./saved", ignore_errors=True) @require_gguf @@ -187,11 +187,11 @@ def test_vlm_gguf(self): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - self.assertTrue("mmproj-model.gguf" in os.listdir("./saved")) + assert "mmproj-model.gguf" in os.listdir("./saved") file_size = os.path.getsize("./saved/Qwen2.5-VL-7B-Instruct-Q4_0.gguf") / 1024**2 - self.assertAlmostEqual(file_size, 4242, delta=5.0) + assert abs(file_size - 4242) < 5.0 file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2 - self.assertAlmostEqual(file_size, 2580, delta=5.0) + assert abs(file_size - 2580) < 5.0 shutil.rmtree("./saved", ignore_errors=True) model_name = "/models/gemma-3-12b-it" @@ -208,11 +208,11 @@ def test_vlm_gguf(self): ) quantized_model_path = "./saved" 
autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m") - self.assertTrue("mmproj-model.gguf" in os.listdir("./saved")) + assert "mmproj-model.gguf" in os.listdir("./saved") file_size = os.path.getsize("./saved/gemma-3-12B-it-Q4_K_M.gguf") / 1024**2 - self.assertAlmostEqual(file_size, 6568, delta=5.0) + assert abs(file_size - 6568) < 5.0 file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2 - self.assertAlmostEqual(file_size, 1599, delta=5.0) + assert abs(file_size - 1599) < 5.0 shutil.rmtree(quantized_model_path, ignore_errors=True) # @require_gguf @@ -233,12 +233,12 @@ def test_vlm_gguf(self): # quantized_model_path = "/dataset/Llam-4-test" # shutil.rmtree(quantized_model_path, ignore_errors=True) # autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - # self.assertTrue("mmproj-model.gguf" in os.listdir(quantized_model_path)) + # assert "mmproj-model.gguf" in os.listdir(quantized_model_path) # file_size = ( # os.path.getsize(os.path.join(quantized_model_path, "Llama-4-Scout-17B-16E-Instruct-16x17B-Q4_0.gguf")) # / 1024**2 # ) - # self.assertAlmostEqual(file_size, 58093.62, delta=1.0) + # assert abs(file_size - 58093.62) < 1.0 # file_size = os.path.getsize(os.path.join(quantized_model_path, "mmproj-model.gguf")) / 1024**2 - # self.assertAlmostEqual(file_size, 3326.18, delta=5.0) + # assert abs(file_size - 3326.18) < 5.0 # shutil.rmtree(quantized_model_path, ignore_errors=True) diff --git a/test/test_cuda/test_main_func.py b/test/test_cuda/test_main_func.py index 20dc7bdc8..3243963fe 100644 --- a/test/test_cuda/test_main_func.py +++ b/test/test_cuda/test_main_func.py @@ -81,7 +81,7 @@ def test_backend_awq(self): assert accuracy > 0.35 shutil.rmtree("./saved", ignore_errors=True) - @unittest.skipIf(torch.cuda.is_available() is False, "Skipping because no cuda") + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @require_gptqmodel def test_fp_layers(self): model_name = "/models/opt-125m" @@ -105,7 +105,7 @@ def test_fp_layers(self): assert accuracy > 0.35 shutil.rmtree("./saved", ignore_errors=True) - @unittest.skipIf(torch.cuda.is_available() is False, "Skipping because no cuda") + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_fp_layers_awq(self): @@ -130,7 +130,7 @@ def test_fp_layers_awq(self): assert accuracy > 0.35 shutil.rmtree("./saved", ignore_errors=True) - @unittest.skipIf(torch.cuda.is_available() is False, "Skipping because no cuda") + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") def test_undivided_group_size_tuning(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") diff --git a/test/test_cuda/test_marlin_backend.py b/test/test_cuda/test_marlin_backend.py index b920d9478..334cb2697 100644 --- a/test/test_cuda/test_marlin_backend.py +++ b/test/test_cuda/test_marlin_backend.py @@ -40,7 +40,7 @@ def test_marlin_group_size(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.14) + assert result["results"]["lambada_openai"]["acc,none"] > 0.14 for group_size in [32, 128]: print(f"{group_size}!!!!!!!!!!!!!!!!!") @@ -69,7 +69,7 @@ def 
test_marlin_group_size(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.14) + assert result["results"]["lambada_openai"]["acc,none"] > 0.14 @classmethod def setup_class(self): @@ -107,7 +107,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) + assert result["results"]["lambada_openai"]["acc,none"] > 0.27 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -118,7 +118,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) + assert result["results"]["lambada_openai"]["acc,none"] > 0.27 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ -151,7 +151,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): # model_infer(model, tokenizer) # result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - # self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.27) + # assert result['results']['lambada_openai']['acc,none'] > 0.27 # torch.cuda.empty_cache() # # model = AutoModelForCausalLM.from_pretrained( @@ -165,6 +165,6 @@ def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): # model_infer(model, tokenizer) # result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - # self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.27) + # assert result['results']['lambada_openai']['acc,none'] > 0.27 # torch.cuda.empty_cache() # shutil.rmtree("./saved", ignore_errors=True) diff --git a/test/test_cuda/test_mix_bits.py b/test/test_cuda/test_mix_bits.py index b9b7dde5c..958b8ba8e 100644 --- a/test/test_cuda/test_mix_bits.py +++ b/test/test_cuda/test_mix_bits.py @@ -5,7 +5,6 @@ import pytest import torch -from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound @@ -228,7 +227,7 @@ def test_mixed_MXFP_autoround_format_loading(self, dataloader): result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.32) + assert result["results"]["lambada_openai"]["acc,none"] > 0.32 shutil.rmtree(quantized_model_path, ignore_errors=True) def test_mixed_autoround_format_vllm(self, dataloader): diff --git a/test/test_cuda/test_mxfp_nvfp.py b/test/test_cuda/test_mxfp_nvfp.py index 357afb0f3..64436f9b6 100644 --- a/test/test_cuda/test_mxfp_nvfp.py +++ b/test/test_cuda/test_mxfp_nvfp.py @@ -151,5 +151,5 @@ def test_qwen_moe_quant_infer(self, dataloader): result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa") print(result["results"]["piqa"]["acc,none"]) - 
self.assertGreater(result["results"]["piqa"]["acc,none"], 0.7) + assert result["results"]["piqa"]["acc,none"] > 0.7 shutil.rmtree(quantized_model_path, ignore_errors=True) diff --git a/test/test_cuda/test_scheme.py b/test/test_cuda/test_scheme.py index 06c5b27e0..f2ec15bfa 100644 --- a/test/test_cuda/test_scheme.py +++ b/test/test_cuda/test_scheme.py @@ -49,7 +49,7 @@ def test_fp8_static(self): assert ar.data_type == "fp" assert ar.act_data_type == "fp" assert ar.group_size == -1 - assert ar.act_dynamic == False + assert not ar.act_dynamic ar.quantize() ## RTN tests @@ -73,7 +73,7 @@ def test_fp8_static_rtn(self): assert ar.data_type == "fp" assert ar.act_data_type == "fp" assert ar.group_size == -1 - assert ar.act_dynamic == False + assert not ar.act_dynamic ar.quantize() def test_scheme_in_layer_config(self): diff --git a/test/test_cuda/test_support_vlms.py b/test/test_cuda/test_support_vlms.py index 15c86363b..9efd53564 100644 --- a/test/test_cuda/test_support_vlms.py +++ b/test/test_cuda/test_support_vlms.py @@ -29,7 +29,7 @@ def test_qwen2(self): f"cd ../.. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}" ) - self.assertFalse(res > 0 or res == -1, msg="qwen2 tuning fail") + assert not (res > 0 or res == -1), "qwen2 tuning fail" # test infer quantized_model_path = os.path.join(self.save_dir, "Qwen2-VL-2B-Instruct-w4g128") @@ -84,7 +84,7 @@ def test_phi3(self): f"cd ../.. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}" ) - self.assertFalse(res > 0 or res == -1, msg="Phi-3.5 tuning fail") + assert not (res > 0 or res == -1), "Phi-3.5 tuning fail" ## test infer from transformers import AutoModelForCausalLM, AutoProcessor @@ -134,7 +134,7 @@ def test_phi3_vision_awq(self): f"--nsample 64 --seqlen 32 " f"--format auto_awq --output_dir {self.save_dir} --device {self.device}" ) - self.assertFalse(res > 0 or res == -1, msg="Phi-3.5 tuning fail") + assert not (res > 0 or res == -1), "Phi-3.5 tuning fail" ## test infer from transformers import AutoModelForCausalLM, AutoProcessor @@ -180,7 +180,7 @@ def test_glm(self): f"cd ../.. && {self.python_path} -m auto_round " f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}" ) - self.assertFalse(res > 0 or res == -1, msg="glm-4v-9b tuning fail") + assert not (res > 0 or res == -1), "glm-4v-9b tuning fail" def test_granite_vision(self): model_path = "/models/granite-vision-3.2-2b" @@ -189,4 +189,4 @@ def test_granite_vision(self): f"cd ../.. 
&& {self.python_path} -m auto_round " f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}" ) - self.assertFalse(res > 0 or res == -1, msg="granite-vision-3.2-2b tuning fail") + assert not (res > 0 or res == -1), "granite-vision-3.2-2b tuning fail" diff --git a/test/test_cuda/test_torch_backend.py b/test/test_cuda/test_torch_backend.py index 495da24e3..5244725e8 100644 --- a/test/test_cuda/test_torch_backend.py +++ b/test/test_cuda/test_torch_backend.py @@ -49,7 +49,7 @@ def test_torch_4bits_asym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -60,7 +60,7 @@ def test_torch_4bits_asym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ -90,6 +90,6 @@ def test_torch_4bits_sym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28) + assert result["results"]["lambada_openai"]["acc,none"] > 0.28 torch.cuda.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) diff --git a/test/test_cuda/test_transformers.py b/test/test_cuda/test_transformers.py index 0e43a7e70..f6e5b4497 100644 --- a/test/test_cuda/test_transformers.py +++ b/test/test_cuda/test_transformers.py @@ -74,12 +74,12 @@ def test_quantized_model(self): """ input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) output = self.quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) - self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + assert self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS def test_raise_if_non_quantized(self): model_id = "facebook/opt-125m" quantization_config = AutoRoundConfig(bits=4) - with self.assertRaises(ValueError): + with pytest.raises(ValueError): _ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config) def test_quantized_model_bf16(self): @@ -96,7 +96,7 @@ def test_quantized_model_bf16(self): ) output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) - self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + assert self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS @require_intel_extension_for_pytorch def test_quantized_model_on_cpu(self): @@ -108,7 +108,7 @@ def test_quantized_model_on_cpu(self): quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto") output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) - self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + assert 
self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS def test_save_pretrained(self): """ @@ -131,7 +131,7 @@ def test_save_pretrained(self): input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) output = model.generate(**input_ids, max_new_tokens=40, do_sample=False) - self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + assert self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS @require_torch_multi_gpu def test_quantized_model_multi_gpu(self): @@ -144,7 +144,7 @@ def test_quantized_model_multi_gpu(self): ) input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(quantized_model.device) output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) - self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + assert self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS def test_convert_from_gptq(self): """ diff --git a/test/test_cuda/test_triton_backend.py b/test/test_cuda/test_triton_backend.py index 38958014b..ac5436f47 100644 --- a/test/test_cuda/test_triton_backend.py +++ b/test/test_cuda/test_triton_backend.py @@ -49,7 +49,7 @@ def test_tritonv2_4bits_asym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.34) + assert result["results"]["lambada_openai"]["acc,none"] > 0.34 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -60,7 +60,7 @@ def test_tritonv2_4bits_asym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.34) + assert result["results"]["lambada_openai"]["acc,none"] > 0.34 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ -82,7 +82,7 @@ def test_tritonv2_2bits_asym(self): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.19) + assert result["results"]["lambada_openai"]["acc,none"] > 0.19 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -93,7 +93,7 @@ def test_tritonv2_2bits_asym(self): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.19) + assert result["results"]["lambada_openai"]["acc,none"] > 0.19 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ -124,7 +124,7 @@ def test_tritonv2_4bits_sym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.26) + assert result["results"]["lambada_openai"]["acc,none"] > 0.26 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -135,7 +135,7 @@ def 
test_tritonv2_4bits_sym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.26) + assert result["results"]["lambada_openai"]["acc,none"] > 0.26 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ -158,7 +158,7 @@ def test_tritonv2_8bits_sym(self): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) + assert result["results"]["lambada_openai"]["acc,none"] > 0.27 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -169,7 +169,7 @@ def test_tritonv2_8bits_sym(self): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) + assert result["results"]["lambada_openai"]["acc,none"] > 0.27 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ -197,7 +197,7 @@ def test_tritonv2_2bits_sym(self): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.18) + assert result["results"]["lambada_openai"]["acc,none"] > 0.18 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -208,6 +208,6 @@ def test_tritonv2_2bits_sym(self): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.18) + assert result["results"]["lambada_openai"]["acc,none"] > 0.18 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) diff --git a/test/test_cuda/test_vlms.py b/test/test_cuda/test_vlms.py index bfc7cf52c..c8a4adb53 100644 --- a/test/test_cuda/test_vlms.py +++ b/test/test_cuda/test_vlms.py @@ -119,12 +119,12 @@ def test_mm_block_name(self): model = MllamaForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True, device_map="auto") block_name = get_block_names(model, quant_vision=True) - self.assertTrue(len(block_name) == 3) - self.assertTrue(any(["vision_model.global_transformer.layers.0" not in n for n in block_name])) - self.assertTrue(any(["vision_model.transformer.layers.0" not in n for n in block_name])) + assert len(block_name) == 3 + assert any(["vision_model.global_transformer.layers.0" not in n for n in block_name]) + assert any(["vision_model.transformer.layers.0" not in n for n in block_name]) block_name = get_block_names(model, quant_vision=False) - self.assertTrue(len(block_name) == 1) - self.assertTrue(get_block_names(model) == block_name) + assert len(block_name) == 1 + assert get_block_names(model) == block_name def test_mllm_detect(self): from auto_round.utils import is_mllm_model, llm_load_model, mllm_load_model @@ -140,14 +140,14 @@ def test_mllm_detect(self): "/models/InternVL3-1B", "/models/pixtral-12b", ]: - self.assertTrue(is_mllm_model(model_name)) + assert is_mllm_model(model_name) try: model, 
_, _, _ = mllm_load_model(model_name) except: continue - self.assertTrue(is_mllm_model(model)) + assert is_mllm_model(model) for model_name in ["/models/glm-4-9b-chat", "/models/Qwen2.5-1.5B-Instruct/"]: - self.assertFalse(is_mllm_model(model_name)) + assert not is_mllm_model(model_name) model, _ = llm_load_model(model_name) - self.assertFalse(is_mllm_model(model)) + assert not is_mllm_model(model) diff --git a/test/test_hpu/test_auto_round.py b/test/test_hpu/test_auto_round.py index eb6066982..d2e33dd03 100644 --- a/test/test_hpu/test_auto_round.py +++ b/test/test_hpu/test_auto_round.py @@ -3,7 +3,7 @@ from auto_round.utils import is_hpex_available -from ..helpers import is_pytest_mode_compile, is_pytest_mode_lazy +from ..helpers import get_model_path, is_pytest_mode_compile, is_pytest_mode_lazy def run_opt_125m_on_hpu(): @@ -11,7 +11,7 @@ def run_opt_125m_on_hpu(): from auto_round import AutoRound - model_name = "facebook/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -56,7 +56,7 @@ def test_w4a8(data_type): from auto_round import AutoRound - model_name = "facebook/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="auto", diff --git a/test/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py index b9894cecf..d857e3bdc 100644 --- a/test/test_xpu/test_autoround.py +++ b/test/test_xpu/test_autoround.py @@ -8,11 +8,13 @@ from auto_round import AutoRound, AutoRoundConfig +from ..helpers import get_model_path + class TestAutoRoundXPU: @classmethod def setup_class(self): - + pass @classmethod def teardown_class(self): @@ -20,8 +22,8 @@ def teardown_class(self): shutil.rmtree("runs", ignore_errors=True) pass - def test_gptq_format(self): - model_name = "facebook/opt-125m" + def test_gptq_format(self, dataloader): + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="auto", trust_remote_code=True, device_map="auto" ) @@ -53,8 +55,8 @@ def test_gptq_format(self): print(res) assert "!!!" 
not in res

-    def test_awq_format(self):
-        model_name = "facebook/opt-125m"
+    def test_awq_format(self, dataloader):
+        model_name = get_model_path("facebook/opt-125m")
         model = AutoModelForCausalLM.from_pretrained(
             model_name, torch_dtype="auto", trust_remote_code=True, device_map="xpu"
         )

From 9d26d04ec8d27be5275ebe8649b79c3ef05c4da2 Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Fri, 19 Dec 2025 03:07:20 -0500
Subject: [PATCH 10/24] update cuda ut

Signed-off-by: n1ck-guo
---
 test/helpers.py | 6 +-
 test/test_cuda/test_alg_ext.py | 4 +-
 test/test_cuda/test_asym.py | 130 +++++++++++------------
 test/test_cuda/test_auto_round_format.py | 90 +++++++---------
 test/test_cuda/test_auto_scheme.py | 116 ++++++++++----------
 test/test_cuda/test_calib_dataset.py | 24 +----
 test/test_cuda/test_conv1d.py | 26 +++--
 test/test_cuda/test_diffusion.py | 25 +++--
 test/test_cuda/test_exllamav2_backend.py | 63 ++++++-----
 test/test_cuda/test_export.py | 88 ++++++++-------
 test/test_cuda/test_fp8_input.py | 63 ++++++-----
 test/test_cuda/test_scheme.py | 4 +-
 12 files changed, 318 insertions(+), 321 deletions(-)

diff --git a/test/helpers.py b/test/helpers.py
index d67f85599..a46a9a58b 100644
--- a/test/helpers.py
+++ b/test/helpers.py
@@ -25,8 +25,10 @@ def get_model_path(model_name: str) -> str:
 # Slice model into tiny model for speedup
-def get_tiny_model(model_name_or_path, num_layers=3):
-    model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="auto", trust_remote_code=True)
+def get_tiny_model(model_name_or_path, num_layers=3, **kwargs):
+    kwargs["dtype"] = "auto" if "dtype" not in kwargs else kwargs["dtype"]
+    kwargs["trust_remote_code"] = True if "trust_remote_code" not in kwargs else kwargs["trust_remote_code"]
+    model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, **kwargs)
     if hasattr(model.config, "num_hidden_layers"):
         model.config.num_hidden_layers = num_layers
diff --git a/test/test_cuda/test_alg_ext.py b/test/test_cuda/test_alg_ext.py
index e13bfac4a..6b04847ed 100644
--- a/test/test_cuda/test_alg_ext.py
+++ b/test/test_cuda/test_alg_ext.py
@@ -49,13 +49,13 @@ def test_cli(self, tiny_opt_model_path):
         python_path = sys.executable
         res = os.system(
-            f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits --nsampes 1 --seqlen 32"
+            f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits --nsamples 1 --seqlen 32"
         )
         if res > 0 or res == -1:
             assert False, "cmd line test fail, please have a check"
         res = os.system(
-            f"cd ../.. 
&& CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile --nsamples 1 --seqlen 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" diff --git a/test/test_cuda/test_asym.py b/test/test_cuda/test_asym.py index c41c0d5d8..1eda6f146 100644 --- a/test/test_cuda/test_asym.py +++ b/test/test_cuda/test_asym.py @@ -3,16 +3,16 @@ import sys import unittest -sys.path.insert(0, "../..") - +import pytest import torch -from _test_helpers import model_infer from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.utils import get_module +from ..helpers import model_infer + class LLMDataLoader: def __init__(self): @@ -23,140 +23,138 @@ def __iter__(self): yield torch.ones([1, 10], dtype=torch.long) -class TestAutoRoundAsym(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "/models/opt-125m" - # self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - self.save_folder = "./saved" +class TestAutoRoundAsym: + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield - @classmethod - def tearDownClass(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_asym_group_size(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_group_size(self, tiny_opt_model_path): for group_size in [32, 64, 128]: bits, sym = 4, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1) - ar.quantize_and_save(format="auto_round", output_dir=self.save_folder) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1 + ) + ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) # TODO when ark is ready, uncomment the following lines to do inference test # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) - def test_asym_bits(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_bits(self, tiny_opt_model_path): for bits in [2, 3, 8]: group_size, sym = 128, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1) - ar.quantize_and_save(format="auto_round", output_dir=self.save_folder) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1 + ) + ar.quantize_and_save(format="auto_round", 
output_dir=self.save_dir) # TODO when ark is ready, uncomment the following lines to do inference test # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) # use parameters later - def test_asym_format(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_format(self, tiny_opt_model_path): for format in ["auto_round", "auto_round:auto_gptq", "auto_round:gptqmodel"]: bits, group_size, sym = 4, 128, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1 + ) # TODO when ark is ready, uncomment the following lines to do inference test - ar.quantize_and_save(format=format, output_dir=self.save_folder) + ar.quantize_and_save(format=format, output_dir=self.save_dir) # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) - def test_asym_group_size_with_tuning(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_group_size_with_tuning(self, tiny_opt_model_path): for group_size in [32, 64, 128]: bits, sym = 4, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1) - ar.quantize_and_save(format="auto_round", output_dir=self.save_folder) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1 + ) + ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) # TODO when ark is ready, uncomment the following lines to do inference test # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) - def test_asym_bits_with_tuning(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_bits_with_tuning(self, tiny_opt_model_path): for bits in [2, 3, 8]: group_size, sym = 128, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1) - ar.quantize_and_save(format="auto_round", output_dir=self.save_folder) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1 + ) + ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) # TODO when ark is ready, uncomment the following lines to do inference test # model = 
AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) # use parameters later - def test_asym_format_with_tuning(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_format_with_tuning(self, tiny_opt_model_path): for format in ["auto_round", "auto_round:auto_gptq", "auto_round:gptqmodel"]: bits, group_size, sym = 4, 128, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1 + ) # TODO when ark is ready, uncomment the following lines to do inference test - ar.quantize_and_save(format=format, output_dir=self.save_folder) + ar.quantize_and_save(format=format, output_dir=self.save_dir) # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) diff --git a/test/test_cuda/test_auto_round_format.py b/test/test_cuda/test_auto_round_format.py index cbc6868f1..6ec5edff9 100644 --- a/test/test_cuda/test_auto_round_format.py +++ b/test/test_cuda/test_auto_round_format.py @@ -16,31 +16,33 @@ require_package_version_ut, ) -from ..helpers import model_infer +from ..helpers import get_model_path, get_tiny_model, model_infer class TestAutoRound: - @classmethod - def setup_class(self): - self.model_name = "facebook/opt-125m" + save_dir = "./saved" - self.save_folder = "./saved" + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - @classmethod - def teardown_class(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_greater_than_050 @require_package_version_ut("transformers", "<4.57.0") - def test_autoround_asym(self, dataloader): + def test_autoround_asym(self, tiny_opt_model_path, dataloader): for bits in [2, 3, 4, 8]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + # model_name = get_model_path("facebook/opt-125m") bits, group_size, sym = bits, 128, False autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, @@ -48,7 +50,7 @@ def test_autoround_asym(self, dataloader): seqlen=2, dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") @@ -61,12 +63,11 @@ def test_autoround_asym(self, dataloader): res = 
tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) print(res) assert "!!!" not in res - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_autogptq def test_mixed_precision(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_name = get_model_path("facebook/opt-125m") layer_config = {} layer_config["model.decoder.layers.0.self_attn.k_proj"] = {"bits": 8} @@ -76,15 +77,15 @@ def test_mixed_precision(self): } ## 3bits when using asym will have some issue layer_config["model.decoder.layers.6.self_attn.out_proj"] = {"bits": 2, "group_size": 32} bits, group_size, sym = 4, 128, True - autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config) - quantized_model_path = self.save_folder + autoround = AutoRound(model_name, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config) + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") quantization_config = AutoRoundConfig(backend="auto") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -93,27 +94,25 @@ def test_mixed_precision(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_awq_backend(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_name = get_model_path("facebook/opt-125m") bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + model_name, bits=bits, group_size=group_size, iters=1, nsamples=1, sym=sym, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") quantization_config = AutoRoundConfig(backend="auto") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -121,18 +120,18 @@ def test_awq_backend(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = 
AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_greater_than_050 def test_tritonv2_bf16(self): - model_name = "OPEA/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" + model_name = get_model_path("OPEA/Meta-Llama-3.1-8B-Instruct-int4-sym-inc") quantization_config = AutoRoundConfig(backend="tritonv2") - model = AutoModelForCausalLM.from_pretrained( + model = get_tiny_model( model_name, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) @@ -142,13 +141,10 @@ def test_tritonv2_bf16(self): torch.cuda.empty_cache() @require_ipex - def test_autoround_gptq_sym_format(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoround_gptq_sym_format(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, @@ -197,13 +193,10 @@ def test_autoround_gptq_sym_format(self, dataloader): @require_awq @require_ipex @require_package_version_ut("transformers", "<4.57.0") - def test_autoround_awq_sym_format(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoround_awq_sym_format(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, @@ -236,14 +229,11 @@ def test_autoround_awq_sym_format(self, dataloader): shutil.rmtree("./saved", ignore_errors=True) @require_greater_than_050 - def test_autoround_sym(self, dataloader): + def test_autoround_sym(self, tiny_opt_model_path, dataloader): for bits in [2, 3, 4, 8]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = bits, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, @@ -264,11 +254,11 @@ def test_autoround_sym(self, dataloader): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) print(res) assert "!!!" 
not in res - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_greater_than_050 def test_load_gptq_model_3bits(self): - model_name = "LucasSantiago257/gemma-2b-2bits-gptq" + model_name = get_model_path("LucasSantiago257/gemma-2b-2bits-gptq") quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( model_name, diff --git a/test/test_cuda/test_auto_scheme.py b/test/test_cuda/test_auto_scheme.py index b6f5d8066..b8c40c470 100644 --- a/test/test_cuda/test_auto_scheme.py +++ b/test/test_cuda/test_auto_scheme.py @@ -3,6 +3,7 @@ import shutil import pytest +import transformers from auto_round import AutoRound, AutoRoundConfig, AutoScheme from auto_round.auto_scheme.utils import compute_avg_bits_for_model @@ -10,63 +11,68 @@ from auto_round.testing_utils import multi_card from auto_round.utils import get_module +from ..helpers import get_model_path, get_tiny_model + class TestAutoScheme: - @classmethod - def setup_class(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - @classmethod - def teardown_class(self): + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_gguf_k_0(self): - model_name = "/models/Qwen3-0.6B" + def test_gguf_k_0(self, tiny_qwen_model_path): target_bits = 5.5 scheme = AutoScheme(avg_bits=target_bits, options=("GGUF:Q4_K_M", "GGUF:Q8_0")) - ar = AutoRound(model=model_name, scheme=scheme, iters=1, enable_alg_ext=True) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=1, enable_alg_ext=True) ar.quantize_and_save(self.save_dir, format="gguf:q2_k_s") shutil.rmtree(self.save_dir, ignore_errors=True) - def test_gguf_k_1(self): - model_name = "/models/Qwen3-0.6B" + def test_gguf_k_1(self, tiny_qwen_model_path): target_bits = 3.5 scheme = AutoScheme(avg_bits=target_bits, options=("GGUF:Q2_K_S", "GGUF:Q4_1")) - ar = AutoRound(model=model_name, scheme=scheme, iters=1, enable_alg_ext=True) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=1, enable_alg_ext=True) ar.quantize_and_save(self.save_dir, format="gguf:q2_k_s") shutil.rmtree(self.save_dir, ignore_errors=True) # - def test_embedding_fallback(self): - model_name = "/models/Qwen3-0.6B" + def test_embedding_fallback(self, tiny_qwen_model_path): target_bits = 5.0 scheme = AutoScheme(avg_bits=target_bits, options=("GGUF:Q4_K_M", "GGUF:Q8_0")) - ar = AutoRound(model=model_name, scheme=scheme, iters=1, enable_alg_ext=True) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=1, enable_alg_ext=True) ar.quantize_and_save(self.save_dir, format="gguf:q2_k_s") shutil.rmtree(self.save_dir, ignore_errors=True) - def test_gguf_export(self): - model_name = "/models/Qwen3-0.6B" + def test_gguf_export(self, tiny_qwen_model_path): target_bits = 3 scheme = AutoScheme(avg_bits=target_bits, options=("GGUF:Q2_K_S", "GGUF:Q4_K_M"), ignore_scale_zp_bits=True) - ar = AutoRound(model=model_name, scheme=scheme, iters=0) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0) ar.quantize_and_save(self.save_dir, format="gguf:q2_k_s") shutil.rmtree(self.save_dir, ignore_errors=True) def test_gguf(self): - 
model_name = "/models/Qwen3-8B" + model_name = get_model_path("qwen/Qwen3-8B") + model = get_tiny_model(model_name) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) target_bits = 3 scheme = AutoScheme(avg_bits=target_bits, options=("GGUF:Q2_K_S", "GGUF:Q4_K_M"), ignore_scale_zp_bits=True) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, disable_opt_rtn=True) + ar = AutoRound(model=model, tokenizer=tokenizer, scheme=scheme, iters=0, nsamples=1, disable_opt_rtn=True) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model, ignore_scale_zp_bits=True) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 def test_shared_layers(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") from transformers import AutoModelForCausalLM, AutoTokenizer model = AutoModelForCausalLM.from_pretrained(model_name) @@ -106,62 +112,55 @@ def test_shared_layers(self): # @multi_card - def test_multi_card(self): - model_name = "/models/Qwen3-0.6B" + def test_multi_card(self, tiny_qwen_model_path): target_bits = 4.5 for device_map in ["auto", "0,1", "0", None]: scheme = AutoScheme(avg_bits=target_bits, options=("NVFP4")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, device_map=device_map) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1, device_map=device_map) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 @multi_card - def test_multi_card_1(self): - model_name = "/models/Qwen3-0.6B" + def test_multi_card_1(self, tiny_qwen_model_path): target_bits = 4.5 from transformers import AutoModelForCausalLM, AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(model_name) - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto") scheme = AutoScheme(avg_bits=target_bits, options=("NVFP4")) - ar = AutoRound(model=model, tokenizer=tokenizer, scheme=scheme, iters=0, nsamples=1) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - def test_non_low_gpu_mem_usage(self): - model_name = "/models/Qwen3-0.6B" + def test_non_low_gpu_mem_usage(self, tiny_qwen_model_path): target_bits = 4.5 # for device_map in ["auto", "0,1", "0", None]: scheme = AutoScheme(avg_bits=target_bits, options=("NVFP4"), low_gpu_mem_usage=False, device_map="auto") - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 @multi_card - def test_dict_device_map(self): - model_name = "/models/Qwen3-8B" + def test_dict_device_map(self, tiny_qwen_model_path): target_bits = 8.25 device_map = {"up_proj": 0, "down_proj": 1} scheme = AutoScheme(avg_bits=target_bits, options=("MXFP8")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, device_map=device_map) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1, device_map=device_map) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert 
target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - def test_min_target_bits(self): - model_name = "/models/opt-125m" + def test_min_target_bits(self, tiny_opt_model_path): target_bits = 4.644 scheme = AutoScheme(avg_bits=target_bits, options=("MXFP4", "W8A16")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) + ar = AutoRound(model=tiny_opt_model_path, scheme=scheme, iters=0, nsamples=1) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) @@ -169,20 +168,19 @@ def test_min_target_bits(self): # def test_max_target_bits(self): - model_name = "/models/opt-125m" target_bits = 8.025 + model_path = get_model_path("facebook/opt-125m") scheme = AutoScheme(avg_bits=target_bits, options=("MXFP4", "W8A16")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) + ar = AutoRound(model=model_path, scheme=scheme, iters=0, nsamples=1) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - def test_patch_scheme(self): - model_name = "/models/opt-125m" + def test_patch_scheme(self, tiny_opt_model_path): target_bits = 5 scheme = AutoScheme(avg_bits=target_bits, options=("MXFP4", "W8A16")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, group_size=32) + ar = AutoRound(model=tiny_opt_model_path, scheme=scheme, iters=0, nsamples=1, group_size=32) model, layer_config = ar.quantize() for n, m in model.named_modules(): if hasattr(m, "group_size"): @@ -193,74 +191,74 @@ def test_patch_scheme(self): def test_layer_config(self): target_bits = 3.0 - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "BF16")) user_layer_config = {"model.decoder.layers.10.fc1": {"bits": 8, "group_size": 32, "sym": False}} ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() assert layer_config["model.decoder.layers.10.fc1"]["bits"] == 8 - assert layer_config["model.decoder.layers.10.fc1"]["sym"] == False + assert layer_config["model.decoder.layers.10.fc1"]["sym"] is False assert layer_config["model.decoder.layers.10.fc1"]["group_size"] == 32 layer = get_module(model, "model.decoder.layers.10.fc1") assert layer.bits == 8 - assert layer.sym == False + assert layer.sym is False assert layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 target_bits = 5.5 - model_name = "/models/opt-125m" scheme = AutoScheme(avg_bits=target_bits, options=("mxfp4", "mxfp8")) user_layer_config = {"model.decoder.layers.10.fc1": {"bits": 8, "group_size": 32, "sym": False}} ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() assert layer_config["model.decoder.layers.10.fc1"]["bits"] == 8 - assert layer_config["model.decoder.layers.10.fc1"]["sym"] == False + assert layer_config["model.decoder.layers.10.fc1"]["sym"] is False assert layer_config["model.decoder.layers.10.fc1"]["group_size"] == 32 layer = get_module(model, "model.decoder.layers.10.fc1") assert layer.orig_layer.bits == 8 - assert layer.orig_layer.sym == False + assert layer.orig_layer.sym is False assert layer.orig_layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= 
target_bits + 1e-3 def test_lm_head_and_mix_dtype(self): - model_name = "/models/Qwen3-8B" + model_name = get_model_path("qwen/Qwen3-8B") + model = get_tiny_model(model_name) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) target_bits = 6 scheme = AutoScheme(avg_bits=target_bits, options=("MXFP4", "MXFP8")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, quant_lm_head=True) + ar = AutoRound(model=model, tokenizer=tokenizer, scheme=scheme, iters=0, nsamples=1, quant_lm_head=True) model, layer_config = ar.quantize() - self.assertLessEqual(layer_config["lm_head"]["bits"], 8) + assert layer_config["lm_head"]["bits"] <= 8 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - def test_auto_scheme_export(self): - model_name = "/models/opt-125m" + def test_auto_scheme_export(self, tiny_qwen_model_path): + model_name = get_model_path("facebook/opt-125m") scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "W8A16", "BF16")) ar = AutoRound(model=model_name, scheme=scheme) ar.quantize_and_save(self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.25) + assert result["results"]["lambada_openai"]["acc,none"] > 0.25 shutil.rmtree(self.save_dir, ignore_errors=True) - model_name = "/models/Qwen3-0.6B" scheme = AutoScheme(avg_bits=3, options=("gguf:q2_k_s,gguf:q4_k_s"), nsamples=1, ignore_scale_zp_bits=True) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1) ar.quantize_and_save(self.save_dir) shutil.rmtree(self.save_dir, ignore_errors=True) def test_enable_torch_compile(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") scheme = AutoScheme(avg_bits=2, options=("W2A16"), ignore_scale_zp_bits=True) ar = AutoRound(model=model_name, scheme=scheme, enable_torch_compile=True) ar.quantize_and_save(self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.10) + assert result["results"]["lambada_openai"]["acc,none"] > 0.10 shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cuda/test_calib_dataset.py b/test/test_cuda/test_calib_dataset.py index 6a36c21b1..bdee2ebeb 100644 --- a/test/test_cuda/test_calib_dataset.py +++ b/test/test_cuda/test_calib_dataset.py @@ -10,30 +10,10 @@ class TestLocalCalibDataset: - @classmethod - def setup_class(self): - json_data = [{"text": "awefdsfsddfd"}, {"text": "fdfdfsdfdfdfd"}, {"text": "dfdsfsdfdfdfdf"}] - os.makedirs("./saved", exist_ok=True) - self.json_file = "./saved/tmp.json" - with open(self.json_file, "w") as json_file: - json.dump(json_data, json_file, indent=4) - - jsonl_data = [{"text": "哈哈,開心點"}, {"text": "hello world"}] - os.makedirs("./saved", exist_ok=True) - self.jsonl_file = "./saved/tmp.jsonl" - with open(self.jsonl_file, "w") as jsonl_file: - for item in jsonl_data: - json.dump(item, jsonl_file, ensure_ascii=False) - jsonl_file.write("\n") - - model_name = "facebook/opt-125m" - self.model = 
AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - - def test_combine_dataset(self): + def test_combine_dataset(self, tiny_opt_model_path): dataset = "NeelNanda/pile-10k" + ",BAAI/CCI3-HQ" + ",madao33/new-title-chinese" bits, group_size, sym = 4, 128, True autoround = AutoRound( - self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset ) autoround.quantize() diff --git a/test/test_cuda/test_conv1d.py b/test/test_cuda/test_conv1d.py index c5384a384..11f80a1b2 100644 --- a/test/test_cuda/test_conv1d.py +++ b/test/test_cuda/test_conv1d.py @@ -8,14 +8,22 @@ from auto_round import AutoRound from auto_round.testing_utils import require_gptqmodel -from ..helpers import model_infer +from ..helpers import get_model_path, get_tiny_model, model_infer class TestQuantizationConv1d: - @classmethod - def setup_class(self): - self.model_name = "MBZUAI/LaMini-GPT-124M" - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) @classmethod def teardown_class(self): @@ -24,13 +32,15 @@ def teardown_class(self): @require_gptqmodel def test_quant(self, dataloader): - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model_name = get_model_path("MBZUAI/LaMini-GPT-124M") + model = get_tiny_model(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True from auto_round import AutoRoundConfig autoround = AutoRound( - self.model, - self.tokenizer, + model, + tokenizer, bits=bits, group_size=group_size, sym=sym, diff --git a/test/test_cuda/test_diffusion.py b/test/test_cuda/test_diffusion.py index 147a34d47..a3a90d14e 100644 --- a/test/test_cuda/test_diffusion.py +++ b/test/test_cuda/test_diffusion.py @@ -13,12 +13,19 @@ class TestAutoRound: - @classmethod - def setup_class(self): - self.model_name = "/dataset/FLUX.1-dev" + model_name = "/dataset/FLUX.1-dev" - @classmethod - def teardown_class(self): + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_optimum @@ -69,7 +76,7 @@ def test_diffusion_rtn(self): def test_diffusion_model_checker(self): from auto_round.utils import is_diffusion_model - self.assertTrue(is_diffusion_model("/dataset/FLUX.1-dev")) - self.assertTrue(is_diffusion_model("/models/stable-diffusion-2-1")) - self.assertTrue(is_diffusion_model("/models/stable-diffusion-xl-base-1.0")) - self.assertFalse(is_diffusion_model("/models/Qwen3-8B")) + assert 
is_diffusion_model("/dataset/FLUX.1-dev") + assert is_diffusion_model("/models/stable-diffusion-2-1") + assert is_diffusion_model("/models/stable-diffusion-xl-base-1.0") + assert is_diffusion_model("/models/Qwen3-8B") is False diff --git a/test/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py index e6f78ba90..fb08acb29 100644 --- a/test/test_cuda/test_exllamav2_backend.py +++ b/test/test_cuda/test_exllamav2_backend.py @@ -8,38 +8,41 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel, require_package_version_ut -from ..helpers import model_infer +from ..helpers import get_model_path, model_infer class TestAutoRoundexllamaBackend: + save_dir = "./saved" - @classmethod - def setup_class(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - @classmethod - def teardown_class(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel def test_gptqmodel_exllmav2_4bits_asym(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_path = get_model_path("facebook/opt-125m") bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, dataset=dataloader + model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, dataset=dataloader ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") quantization_config = AutoRoundConfig(backend="gptqmodel:exllamav2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -47,10 +50,10 @@ def test_gptqmodel_exllmav2_4bits_asym(self, dataloader): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -61,12 +64,10 @@ def test_gptqmodel_exllmav2_4bits_asym(self, dataloader): @require_autogptq 
@require_package_version_ut("torch", "<2.6.0") def test_gptq_exllamav2_4bits_sym(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_path = get_model_path("facebook/opt-125m") bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, group_size=group_size, sym=sym, @@ -74,53 +75,51 @@ def test_gptq_exllamav2_4bits_sym(self, dataloader): seqlen=2, dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model quantization_config = AutoRoundConfig(backend="gptq:exllamav2") ## or exllamav2 model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) torch.cuda.empty_cache() - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_autogptq @require_package_version_ut("torch", "<2.6.0") def test_gptq_exllamav2_4bits_sym_group_size(self): + model_path = get_model_path("facebook/opt-125m") for group_size in [-1, 32, 64, 128, 256, 1024]: ## 384, 768 has accuracy issue print(f"!!!!!!!!!!!!!!!!!{group_size}!!!!!!!!!!!!!!!!!") - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, group_size, True autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, iters=1, nsamples=1, group_size=group_size, sym=sym, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_round" ) ##will convert to gptq model quantization_config = AutoRoundConfig(backend="gptq:exllamav2") ## or exllamav2 model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=64, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.15) torch.cuda.empty_cache() - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cuda/test_export.py b/test/test_cuda/test_export.py index 3e1171162..114760ac3 100644 --- a/test/test_cuda/test_export.py +++ b/test/test_cuda/test_export.py @@ -9,26 +9,31 @@ from auto_round import AutoRound 
from auto_round.testing_utils import require_awq, require_optimum, require_package_version_ut +from ..helpers import get_model_path, get_tiny_model + class TestAutoRound: - @classmethod - def setup_class(self): - self.model_name = "facebook/opt-125m" - self.save_dir = "./saved" + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield - @classmethod - def teardown_class(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_optimum def test_autogptq_format(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_path = get_model_path("facebook/opt-125m") bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, group_size=group_size, sym=sym, @@ -53,10 +58,10 @@ def test_autogptq_format(self, dataloader): shutil.rmtree("./saved", ignore_errors=True) @require_optimum - def test_autogptq_format_fp_layers(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autogptq_format_fp_layers(self, tiny_opt_model_path, dataloader): layer_config = {} + model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path) + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path) for n, m in model.named_modules(): if "q_proj" in n: layer_config[n] = {"bits": 16} @@ -91,8 +96,9 @@ def test_autogptq_format_fp_layers(self, dataloader): shutil.rmtree("./saved", ignore_errors=True) def test_autogptq_format_qsave_fp_layers(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_path = get_model_path("facebook/opt-125m") + model = AutoModelForCausalLM.from_pretrained(model_path) + layer_config = {} for n, m in model.named_modules(): if "q_proj" in n: @@ -100,8 +106,7 @@ def test_autogptq_format_qsave_fp_layers(self, dataloader): bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, group_size=group_size, sym=sym, @@ -141,13 +146,10 @@ def test_autogptq_format_qsave_fp_layers(self, dataloader): ##print(res) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_format(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoround_format(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, @@ -175,12 +177,10 @@ def test_autoround_format(self, dataloader): @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_autoawq_format(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - 
tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_path = get_model_path("facebook/opt-125m") bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, group_size=group_size, sym=sym, @@ -209,16 +209,14 @@ def test_autoawq_format(self, dataloader): @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_autoawq_format_fp_qsave_layers(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model_path = get_model_path("facebook/opt-125m") layer_config = { "model.decoder.layers.0.self_attn.k_proj": {"bits": 16}, "model.decoder.layers.9.self_attn.v_proj": {"bits": 16}, } - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, group_size=group_size, sym=sym, @@ -249,13 +247,10 @@ def test_autoawq_format_fp_qsave_layers(self, dataloader): shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_3bit_asym_torch_format(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoround_3bit_asym_torch_format(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 3, 128, False autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, @@ -278,13 +273,10 @@ def test_autoround_3bit_asym_torch_format(self, dataloader): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_3bit_sym_torch_format(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoround_3bit_sym_torch_format(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 3, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, @@ -312,12 +304,15 @@ def test_autoround_3bit_sym_torch_format(self, dataloader): def test_awq_lmhead_export(self, dataloader): bits, sym, group_size = 4, False, 128 - model_name = "/models/phi-2" + model_name = get_model_path("microsoft/phi-2") + tiny_model = get_tiny_model(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) layer_config = { "lm_head": {"bits": 4}, # set lm_head quant } autoround = AutoRound( - model=model_name, + model=tiny_model, + tokenizer=tokenizer, bits=bits, group_size=group_size, sym=sym, @@ -342,14 +337,17 @@ def test_awq_lmhead_export(self, dataloader): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_gptq_lmhead_export(self, dataloader): + def test_gptq_lmhead_export(self, tiny_qwen_model_path, dataloader): bits, sym, group_size = 4, True, 128 - model_name = "/models/phi-2" + model_name = get_model_path("microsoft/phi-2") + tiny_model = get_tiny_model(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) layer_config = { "lm_head": {"bits": 4}, # set lm_head quant } autoround = AutoRound( - model=model_name, + model=tiny_model, + tokenizer=tokenizer, bits=bits, group_size=group_size, sym=sym, diff --git 
a/test/test_cuda/test_fp8_input.py b/test/test_cuda/test_fp8_input.py index 90a177ef3..4b597f378 100644 --- a/test/test_cuda/test_fp8_input.py +++ b/test/test_cuda/test_fp8_input.py @@ -8,21 +8,36 @@ from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate +from auto_round.utils import llm_load_model + +from ..helpers import get_model_path, get_tiny_model class TestAutoRound: - @classmethod - def setup_class(self): - self.save_dir = "./saved" + save_dir = "./saved" - @classmethod - def teardown_class(self): - shutil.rmtree(self.save_dir, ignore_errors=True) + def tiny_fp8_model(self): + model_name = get_model_path("qwen/Qwen3-0.6B-FP8") + model, tokenizer = llm_load_model(model_name) + model.model.layers = model.model.layers[:3] + return model, tokenizer + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) def test_small_model_rtn_generation(self): - model_name = "/models/Qwen3-0.6B-FP8" - ar = AutoRound(model=model_name, iters=0) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) ar.quantize_and_save(output_dir=self.save_dir) model = AutoModelForCausalLM.from_pretrained(self.save_dir, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.save_dir) @@ -32,8 +47,8 @@ def test_small_model_rtn_generation(self): shutil.rmtree(self.save_dir, ignore_errors=True) def test_gguf_imatrix(self): - model_name = "/models/Qwen3-0.6B-FP8" - ar = AutoRound(model=model_name, iters=0) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) ar.quantize_and_save(format="gguf:q2_k_s", output_dir=self.save_dir) # from llama_cpp import Llama # @@ -49,8 +64,8 @@ def test_gguf_imatrix(self): # print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) def test_small_model_rtn(self): - model_name = "/models/Qwen3-0.6B-FP8" - ar = AutoRound(model=model_name, iters=0) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) _, folder = ar.quantize_and_save(output_dir=self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") @@ -60,8 +75,8 @@ def test_small_model_rtn(self): shutil.rmtree(self.save_dir, ignore_errors=True) def test_small_model_iters1(self): - model_name = "/models/Qwen3-0.6B-FP8" - ar = AutoRound(model=model_name, iters=1) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=1) _, folder = ar.quantize_and_save(output_dir=self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") @@ -71,8 +86,8 @@ def test_small_model_iters1(self): shutil.rmtree(self.save_dir, ignore_errors=True) def test_medium_model_rtn(self): - model_name = "/models/Qwen3-8B-FP8" - ar = AutoRound(model=model_name, iters=0) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) _, folder = ar.quantize_and_save(output_dir=self.save_dir) model_args = 
f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") @@ -82,9 +97,9 @@ def test_medium_model_rtn(self): shutil.rmtree(self.save_dir, ignore_errors=True) def test_medium_model_rtn_with_lm_head(self): - model_name = "/models/Qwen3-8B-FP8" + model, tokenizer = self.tiny_fp8_model() layer_config = {"lm_head": {"bits": 4}} - ar = AutoRound(model=model_name, iters=0, layer_config=layer_config) + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0, layer_config=layer_config) _, folder = ar.quantize_and_save(output_dir=self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") @@ -96,9 +111,8 @@ def test_medium_model_rtn_with_lm_head(self): def test_fp8_model_gguf(self): from llama_cpp import Llama - model_name = "Qwen/Qwen3-0.6B-FP8" - - ar = AutoRound(model=model_name, iters=0) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q4_0") for file in os.listdir(self.save_dir): if file.endswith(".gguf"): @@ -108,7 +122,8 @@ def test_fp8_model_gguf(self): print(output) shutil.rmtree(self.save_dir, ignore_errors=True) - ar = AutoRound(model=model_name, iters=1) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=1) ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q3_k_s") for file in os.listdir(self.save_dir): if file.endswith(".gguf"): @@ -119,10 +134,10 @@ def test_fp8_model_gguf(self): shutil.rmtree(self.save_dir, ignore_errors=True) def test_diff_datatype(self): - model_name = "/models/Qwen3-0.6B-FP8" for scheme in ["NVFP4", "MXFP4"]: + model, tokenizer = self.tiny_fp8_model() for iters in [0, 1]: print(f"Testing scheme: {scheme}, iters: {iters}") - ar = AutoRound(model=model_name, iters=iters, scheme=scheme) + ar = AutoRound(model=model, tokenizer=tokenizer, iters=iters, scheme=scheme) ar.quantize_and_save(output_dir=self.save_dir) shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cuda/test_scheme.py b/test/test_cuda/test_scheme.py index 06c5b27e0..d6fe43374 100644 --- a/test/test_cuda/test_scheme.py +++ b/test/test_cuda/test_scheme.py @@ -49,7 +49,7 @@ def test_fp8_static(self): assert ar.data_type == "fp" assert ar.act_data_type == "fp" assert ar.group_size == -1 - assert ar.act_dynamic == False + assert ar.act_dynamic is False ar.quantize() ## RTN tests @@ -73,7 +73,7 @@ def test_fp8_static_rtn(self): assert ar.data_type == "fp" assert ar.act_data_type == "fp" assert ar.group_size == -1 - assert ar.act_dynamic == False + assert ar.act_dynamic is False ar.quantize() def test_scheme_in_layer_config(self): From 390f997a6e33ba90eaa7b71f239eae3c69be7993 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 19 Dec 2025 03:09:14 -0500 Subject: [PATCH 11/24] update cuda ut Signed-off-by: n1ck-guo --- test/test_cpu/test_auto_scheme.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_cpu/test_auto_scheme.py b/test/test_cpu/test_auto_scheme.py index b6c20826e..56996dd64 100644 --- a/test/test_cpu/test_auto_scheme.py +++ b/test/test_cpu/test_auto_scheme.py @@ -44,11 +44,11 @@ def test_layer_config(self, tiny_opt_model_path): ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() assert 
layer_config["model.decoder.layers.1.fc1"]["bits"] == 8 - assert layer_config["model.decoder.layers.1.fc1"]["sym"] == False + assert layer_config["model.decoder.layers.1.fc1"]["sym"] is False assert layer_config["model.decoder.layers.1.fc1"]["group_size"] == 32 layer = get_module(model, "model.decoder.layers.1.fc1") assert layer.bits == 8 - assert layer.sym == False + assert layer.sym is False assert layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) From 65ac22d52bdc4f8d73366a5953323f2296a9b919 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Sun, 21 Dec 2025 09:18:50 -0500 Subject: [PATCH 12/24] replace model with tiny model and fix bug Signed-off-by: He, Xin3 --- test/fixtures.py | 58 +++++++++----- test/helpers.py | 102 ++++++++++++++++++++++- test/test_cpu/test_asym.py | 5 +- test/test_cpu/test_auto_scheme.py | 4 +- test/test_cpu/test_autoround.py | 20 ++--- test/test_cpu/test_block_names.py | 4 +- test/test_cpu/test_cli_usage.py | 6 +- test/test_cpu/test_gguf_format.py | 129 +----------------------------- test/test_cpu/test_mllm.py | 26 +++--- test/test_cpu/test_model_scope.py | 12 +-- test/test_cpu/test_mxfp_nvfp.py | 99 ++++++++++------------- test/test_cpu/test_scheme.py | 2 +- 12 files changed, 216 insertions(+), 251 deletions(-) diff --git a/test/fixtures.py b/test/fixtures.py index 87d0a5f75..c4e2ea198 100644 --- a/test/fixtures.py +++ b/test/fixtures.py @@ -6,25 +6,20 @@ import transformers from .helpers import ( + DataLoader, + deepseek_v2_name_or_path, get_tiny_model, gptj_name_or_path, lamini_name_or_path, opt_name_or_path, phi2_name_or_path, + qwen_moe_name_or_path, qwen_name_or_path, + qwen_vl_name_or_path, save_tiny_model, ) -class DataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - # Create tiny model path fixtures for testing @pytest.fixture(scope="session") def tiny_opt_model_path(): @@ -35,15 +30,6 @@ def tiny_opt_model_path(): shutil.rmtree(tiny_model_path) -@pytest.fixture(scope="session") -def tiny_qwen_model_path(): - model_name_or_path = qwen_name_or_path - tiny_model_path = "./tmp_tiny_qwen_model_path" - tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) - yield tiny_model_path - shutil.rmtree(tiny_model_path) - - @pytest.fixture(scope="session") def tiny_lamini_model_path(): model_name_or_path = lamini_name_or_path @@ -71,6 +57,42 @@ def tiny_phi2_model_path(): shutil.rmtree(tiny_model_path) +@pytest.fixture(scope="session") +def tiny_deepseek_v2_model_path(): + model_name_or_path = deepseek_v2_name_or_path + tiny_model_path = "./tmp_tiny_deepseek_v2_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_qwen_model_path(): + model_name_or_path = qwen_name_or_path + tiny_model_path = "./tmp_tiny_qwen_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_qwen_moe_model_path(): + model_name_or_path = qwen_moe_name_or_path + tiny_model_path = "./tmp_tiny_qwen_moe_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_qwen_vl_model_path(): + model_name_or_path = qwen_vl_name_or_path + 
tiny_model_path = "./tmp_tiny_qwen_vl_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + # Create objective fixtures for testing @pytest.fixture(scope="function") def tiny_opt_model(): diff --git a/test/helpers.py b/test/helpers.py index 5f6c9c360..be086497c 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -4,6 +4,8 @@ import torch import transformers +from auto_round.utils import llm_load_model + # Automatic choose local path or model name. def get_model_path(model_name: str) -> str: @@ -23,6 +25,9 @@ def get_model_path(model_name: str) -> str: lamini_name_or_path = get_model_path("MBZUAI/LaMini-GPT-124M") gptj_name_or_path = get_model_path("hf-internal-testing/tiny-random-GPTJForCausalLM") phi2_name_or_path = get_model_path("microsoft/phi-2") +deepseek_v2_name_or_path = get_model_path("deepseek-ai/DeepSeek-V2-Lite") +qwen_moe_name_or_path = get_model_path("Qwen/Qwen1.5-MoE-A2.7B") +qwen_vl_name_or_path = get_model_path("Qwen/Qwen2-VL-2B-Instruct") # Slice model into tiny model for speedup @@ -44,7 +49,7 @@ def slice_layers(module): return True return False - model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="auto", trust_remote_code=True) + model, tokenizer = llm_load_model(model_name_or_path) slice_layers(model) if hasattr(model.config, "num_hidden_layers"): @@ -56,11 +61,11 @@ def slice_layers(module): # for fixture usage only -def save_tiny_model(model_name_or_path, tiny_model_path): - model = get_tiny_model(model_name_or_path, num_layers=2) +def save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2): + model = get_tiny_model(model_name_or_path, num_layers=num_layers) tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) test_path = os.path.dirname(__file__) - tiny_model_path = os.path.join(test_path, tiny_model_path) + tiny_model_path = os.path.join(test_path, tiny_model_path.removeprefix("./")) model.save_pretrained(tiny_model_path) tokenizer.save_pretrained(tiny_model_path) print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") @@ -109,3 +114,92 @@ def model_infer(model, tokenizer, apply_chat_template=False): print(f"Generated: {decoded_outputs[i]}") print("-" * 50) return decoded_outputs[0] + + +# Dummy dataloader for testing +class DataLoader: + def __init__(self): + self.batch_size = 1 + + def __iter__(self): + for i in range(2): + yield torch.ones([1, 10], dtype=torch.long) + + +fixed_input = torch.tensor([[10, 20, 30, 40, 50]], dtype=torch.long) + + +def get_output(model_name_or_path): + """Get model output for fixed input.""" + model, tokenizer = llm_load_model(model_name_or_path) + outputs = model(fixed_input)[0] + return outputs.detach().cpu() + + +def is_model_outputs_similar(model_path_1, model_path_2, metric="cosine_similarity", threshold=0.98, k=5, verbose=True): + """ + Compare outputs from two models using specified metric and return pass/fail. 
+ + Args: + model_path_1: Path to first model + model_path_2: Path to second model + metric: Metric to use - "mse", "cosine_similarity"/"cos_sim", or "topk" + threshold: Threshold value for pass/fail + k: K value for top-k metric (only used when metric="topk") + verbose: Whether to print detailed results + + Returns: + bool: True if metric passes threshold, False otherwise + """ + if verbose: + print(f"\n{'='*70}") + print("Comparing Model Outputs") + print(f"{'='*70}") + print(f"Model 1: {model_path_1}") + print(f"Model 2: {model_path_2}") + print(f"Metric: {metric} | Threshold: {threshold}" + (f" | K: {k}" if "top" in metric.lower() else "")) + print(f"{'='*70}\n") + + output_1 = get_output(model_path_1) + output_2 = get_output(model_path_2) + metric = metric.lower().replace("-", "_") + + # Calculate metric and check threshold + if metric == "mse": + value = torch.mean((output_1.float() - output_2.float()) ** 2).item() + passed = value <= threshold + if verbose: + print(f"MSE: {value:.6f} | Threshold: <= {threshold} | {'✓ PASS' if passed else '✗ FAIL'}\n") + + elif metric in ["cosine_similarity", "cos_sim", "cosine"]: + out1 = output_1.float().flatten() + out2 = output_2.float().flatten() + value = torch.nn.functional.cosine_similarity(out1.unsqueeze(0), out2.unsqueeze(0)).item() + passed = value >= threshold + if verbose: + print(f"Cosine Similarity: {value:.6f} | Threshold: >= {threshold} | {'✓ PASS' if passed else '✗ FAIL'}\n") + + elif metric in ["topk", "top_k"]: + _, topk_1 = torch.topk(output_1, k=min(k, output_1.size(-1)), dim=-1) + _, topk_2 = torch.topk(output_2, k=min(k, output_2.size(-1)), dim=-1) + + total_agreement = 0 + total_positions = topk_1.numel() // topk_1.size(-1) + + for i in range(topk_1.size(0)): + for j in range(topk_1.size(1)): + set1 = set(topk_1[i, j].tolist()) + set2 = set(topk_2[i, j].tolist()) + total_agreement += len(set1 & set2) / k + + value = total_agreement / total_positions + passed = value >= threshold + if verbose: + print( + f"Top-{k} Agreement: {value:.4%} | Threshold: >= {threshold:.4%} | {'✓ PASS' if passed else '✗ FAIL'}\n" + ) + + else: + raise ValueError(f"Unknown metric: {metric}. 
Choose from: 'mse', 'cosine_similarity', 'topk'") + + return passed diff --git a/test/test_cpu/test_asym.py b/test/test_cpu/test_asym.py index 842b208ed..32a0151b3 100644 --- a/test/test_cpu/test_asym.py +++ b/test/test_cpu/test_asym.py @@ -6,13 +6,14 @@ sys.path.insert(0, "../..") import torch -from _test_helpers import model_infer from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.utils import get_module +from ..helpers import get_model_path, model_infer + class LLMDataLoader: def __init__(self): @@ -27,7 +28,7 @@ class TestAutoRoundAsym(unittest.TestCase): @classmethod def setUpClass(self): # self.model_name = "/models/opt-125m" - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + self.model_name = get_model_path("facebook/opt-125m") self.save_folder = "./saved" @classmethod diff --git a/test/test_cpu/test_auto_scheme.py b/test/test_cpu/test_auto_scheme.py index 56996dd64..9d549076f 100644 --- a/test/test_cpu/test_auto_scheme.py +++ b/test/test_cpu/test_auto_scheme.py @@ -37,9 +37,9 @@ def test_layer_config(self, tiny_opt_model_path): from auto_round.auto_scheme.utils import compute_avg_bits_for_model from auto_round.utils import get_module - target_bits = 3.0 + target_bits = 3.5 model_name = tiny_opt_model_path - scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "BF16")) + scheme = AutoScheme(avg_bits=target_bits, options=("W2A16", "W4A16", "BF16")) user_layer_config = {"model.decoder.layers.1.fc1": {"bits": 8, "group_size": 32, "sym": False}} ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() diff --git a/test/test_cpu/test_autoround.py b/test/test_cpu/test_autoround.py index d0049765e..3d5d60c24 100644 --- a/test/test_cpu/test_autoround.py +++ b/test/test_cpu/test_autoround.py @@ -624,16 +624,6 @@ def test_fallback_layers_regex_exception(self, tiny_opt_model_path, dataloader): ) autoround.quantize() - # def test_fp8_model_input_rtn_generation(self): - # model_name = "Qwen/Qwen3-0.6B-FP8" - # ar = AutoRound(model=model_name, iters=0) - # ar.quantize_and_save(output_dir=self.save_folder) - # model = AutoModelForCausalLM.from_pretrained(self.save_folder, torch_dtype="auto", trust_remote_code=True) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - # text = "There is a girl who likes adventure," - # inputs = tokenizer(text, return_tensors="pt").to(model.device) - # print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) - def test_dequant_fp8_weight(self): from auto_round.utils import dequant_block_fp8_weight @@ -655,13 +645,13 @@ def test_dequant_fp8_weight(self): def test_mixed_bit_setting(self, tiny_opt_model_path): model_name = tiny_opt_model_path - layer_config = {"model.decoder.layers.7.fc1": {"bits": 8, "act_bits": 8}} + layer_config = {"model.decoder.layers.1.fc1": {"bits": 8, "act_bits": 8}} ar = AutoRound(model_name, data_type="mx_fp4", act_bits=4, iters=0, layer_config=layer_config) ar.quantize() layer_config = ar.layer_config if ( - layer_config["model.decoder.layers.7.fc1"]["bits"] != 8 - or layer_config["model.decoder.layers.7.fc1"]["act_bits"] != 8 + layer_config["model.decoder.layers.1.fc1"]["bits"] != 8 + or layer_config["model.decoder.layers.1.fc1"]["act_bits"] != 8 ): raise ValueError("mixed bits is not correct") @@ -727,8 +717,8 @@ def test_quant_lm_head_layer_config(self): assert 
"lm_head" in model.config.quantization_config.extra_config assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 - def test_compressor(self): - model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") + def test_compressor(self, tiny_qwen_vl_model_path): + model_name = tiny_qwen_vl_model_path ar = AutoRound(model_name, enable_adam=True) assert ar.optimizer == torch.optim.AdamW assert ar.mllm diff --git a/test/test_cpu/test_block_names.py b/test/test_cpu/test_block_names.py index 5d0423fa2..47c554317 100644 --- a/test/test_cpu/test_block_names.py +++ b/test/test_cpu/test_block_names.py @@ -178,12 +178,12 @@ def test_block_name_quant(self, dataloader): assert quant_config.block_name_to_quantize is not None shutil.rmtree("./saved", ignore_errors=True) - def test_mm_block_name(self): + def test_mm_block_name(self, tiny_qwen_vl_model_path): from transformers import Qwen2VLForConditionalGeneration from auto_round.utils import get_block_names - model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") + model_name = tiny_qwen_vl_model_path model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True, device_map="auto") block_name = get_block_names(model, quant_vision=True) assert len(block_name) == 2 diff --git a/test/test_cpu/test_cli_usage.py b/test/test_cpu/test_cli_usage.py index e71b2854a..b848c22df 100644 --- a/test/test_cpu/test_cli_usage.py +++ b/test/test_cpu/test_cli_usage.py @@ -2,7 +2,7 @@ import shutil import sys -import pytest +from ..helpers import get_model_path class TestAutoRoundCmd: @@ -56,13 +56,13 @@ def test_auto_round_cmd(self, tiny_opt_model_path): assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --mllm --model /tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" + f"cd ../.. && {python_path} -m auto_round --mllm --model {get_model_path('Qwen/Qwen2-VL-2B-Instruct')} --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model /tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct --seqlen 32 --format auto_round" + f"cd ../.. && {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model {get_model_path('Qwen/Qwen2-VL-2B-Instruct')} --seqlen 32 --format auto_round" " --quant_nontext_module --output_dir ./saved " ) if res > 0 or res == -1: diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index c34c4f096..169b07825 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -8,7 +8,7 @@ from auto_round import AutoRound -from ..helpers import get_model_path +from ..helpers import get_model_path, get_tiny_model class TestGGUF: @@ -26,7 +26,7 @@ def teardown_class(self): def test_basic_usage(self): python_path = sys.executable res = os.system( - f"cd ../.. && {python_path} -m auto_round --model /tf_dataset/auto_round/models/benzart/gemma-2b-it-fine-tuning-for-code-test " + f"cd ../.. 
&& {python_path} -m auto_round --model {get_model_path('benzart/gemma-2b-it-fine-tuning-for-code-test')} " f" --bs 16 --iters 0 --nsamples 1 --format gguf:q4_k_m" ) if res > 0 or res == -1: @@ -62,39 +62,12 @@ def test_q4_0(self): inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - # from auto_round.eval.evaluation import simple_evaluate_user_model - # result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="openbookqa", eval_model_dtype="bf16") - # # 0.246 - # assert result['results']['openbookqa']['acc,none'] > 0.23 shutil.rmtree("./saved", ignore_errors=True) - # def test_q4_1(self): - # bits, group_size, sym = 4, 32, False - # autoround = AutoRound( - # self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, data_type="int", nsamples=1 - # ) - # quantized_model_path = "./saved" - # - # autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q4_1") - # gguf_file = os.listdir(quantized_model_path)[0] - # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - # text = "There is a girl who likes adventure," - # inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - # print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - # - # # from auto_round.eval.evaluation import simple_evaluate_user_model - # # result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="openbookqa", eval_model_dtype="bf16") - # # # 0.23 - # # assert result['results']['openbookqa']['acc,none'] > 0.22 - # shutil.rmtree("./saved", ignore_errors=True) - def test_func(self): bits, group_size, sym = 4, 128, True autoround = AutoRound( self.model_name, - # bits=bits, - # group_size=group_size, - # sym=sym, iters=1, nsamples=1, seqlen=10, @@ -111,78 +84,6 @@ def test_func(self): print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) shutil.rmtree("./saved", ignore_errors=True) - # model_name = "Qwen/Qwen2.5-1.5B-Instruct" - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - # autoround = AutoRound( - # model, - # self.tokenizer, - # bits=3, - # group_size=16, - # sym=True, - # iters=1, - # nsamples=1, - # data_type="int_sym_dq", - # super_group_size=16, - # super_bits=6, - # ) - quantized_model_path = "./saved" - # autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_k_s") - # from auto_round.eval.evaluation import simple_evaluate_user_model - # gguf_file = os.listdir("saved")[0] - # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - # result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="lambada_openai", eval_model_dtype="bf16") - # assert result['results']['lambada_openai']['acc,none'] > 0.5 - shutil.rmtree("./saved", ignore_errors=True) - - # - # def test_q5_k(self): - # model_name = "Qwen/Qwen2.5-1.5B-Instruct" - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - # autoround = AutoRound( - # model, - # self.tokenizer, - # bits=5, - # group_size=32, - # sym=False, - # iters=1, - # nsamples=1, - # data_type="int_asym_dq", - # super_group_size=8, - # super_bits=6, - # ) - # quantized_model_path = "./saved" - # autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, 
format="gguf:q*_k_s") - # gguf_file = os.listdir("saved")[0] - # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - # text = "There is a girl who likes adventure," - # inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - # print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - # shutil.rmtree("./saved", ignore_errors=True) - - # def test_q6_k(self): - # model_name = "Qwen/Qwen2.5-1.5B-Instruct" - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - # autoround = AutoRound( - # model, - # self.tokenizer, - # bits=6, - # group_size=16, - # sym=True, - # iters=1, - # nsamples=1, - # data_type="int_sym_dq", - # super_group_size=16, - # super_bits=8, - # ) - # quantized_model_path = "./saved" - # autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_k") - # gguf_file = os.listdir("saved")[0] - # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - # text = "There is a girl who likes adventure," - # inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - # print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - # shutil.rmtree("./saved", ignore_errors=True) - def test_gguf_baseline(self): model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -207,28 +108,6 @@ def test_gguf_baseline(self): inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) shutil.rmtree("./saved", ignore_errors=True) - # - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - # autoround = AutoRound( - # model, - # self.tokenizer, - # bits=5, - # group_size=32, - # sym=True, - # iters=0, - # nsamples=8, - # data_type="int_asym_dq", - # super_group_size=8, - # super_bits=6, - # disable_opt_rtn=True, - # ) - # quantized_model_path = "./saved" - # autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q5_k_s,fake") - # model = AutoModelForCausalLM.from_pretrained(quantized_model_path + "/fake", device_map="auto") - # text = "There is a girl who likes adventure," - # inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - # print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - # shutil.rmtree("./saved", ignore_errors=True) def test_q4_k_m(self, dataloader): model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") @@ -307,8 +186,8 @@ def test_all_format(self): assert False, "cmd line test fail, please have a check" shutil.rmtree("../../tmp_autoround", ignore_errors=True) - def test_vlm_gguf(self): - model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") + def test_vlm_gguf(self, tiny_qwen_vl_model_path): + model_name = tiny_qwen_vl_model_path from auto_round import AutoRoundMLLM from auto_round.utils import mllm_load_model diff --git a/test/test_cpu/test_mllm.py b/test/test_cpu/test_mllm.py index 5e4842d4c..ec5c1487e 100644 --- a/test/test_cpu/test_mllm.py +++ b/test/test_cpu/test_mllm.py @@ -36,12 +36,10 @@ def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - return super().teardown_class() - - def test_tune(self): + def test_tune(self, tiny_qwen_vl_model_path): bits, 
group_size = 4, 128 autoround = AutoRoundMLLM( - model=self.model_name, + model=tiny_qwen_vl_model_path, bits=bits, group_size=group_size, nsamples=1, @@ -54,11 +52,11 @@ def test_tune(self): autoround.save_quantized("./saved/", format="auto_gptq", inplace=False) autoround.save_quantized("./saved/", format="auto_round", inplace=False) - def test_quant_vision(self): ## bug need to fix - tokenizer = AutoTokenizer.from_pretrained(self.model_name) - processor = AutoProcessor.from_pretrained(self.model_name, trust_remote_code=True) + def test_quant_vision(self, tiny_qwen_vl_model_path): ## bug need to fix + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_vl_model_path) + processor = AutoProcessor.from_pretrained(tiny_qwen_vl_model_path, trust_remote_code=True) model = Qwen2VLForConditionalGeneration.from_pretrained( - self.model_name, trust_remote_code=True, device_map="auto" + tiny_qwen_vl_model_path, trust_remote_code=True, device_map="auto" ) bits, group_size = 4, 128 autoround = AutoRoundMLLM( @@ -109,11 +107,11 @@ class Myclass: ) assert len(dataset.questions) == 512 - def test_diff_dataset(self): - tokenizer = AutoTokenizer.from_pretrained(self.model_name) - processor = AutoProcessor.from_pretrained(self.model_name, trust_remote_code=True) + def test_diff_dataset(self, tiny_qwen_vl_model_path): + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_vl_model_path) + processor = AutoProcessor.from_pretrained(tiny_qwen_vl_model_path, trust_remote_code=True) model = Qwen2VLForConditionalGeneration.from_pretrained( - self.model_name, trust_remote_code=True, device_map="auto" + tiny_qwen_vl_model_path, trust_remote_code=True, device_map="auto" ) bits, group_size = 4, 128 dataset = ["dataset test", "list test"] @@ -131,13 +129,13 @@ def test_diff_dataset(self): ) autoround.quantize() - def test_pure_text_model_check(self): + def test_pure_text_model_check(self, tiny_qwen_vl_model_path): from transformers import AutoModelForCausalLM from auto_round.utils import is_pure_text_model model = Qwen2VLForConditionalGeneration.from_pretrained( - self.model_name, trust_remote_code=True, device_map="auto" + tiny_qwen_vl_model_path, trust_remote_code=True, device_map="auto" ) assert not is_pure_text_model(model) model = AutoModelForCausalLM.from_pretrained(opt_name_or_path, trust_remote_code=True) diff --git a/test/test_cpu/test_model_scope.py b/test/test_cpu/test_model_scope.py index cf48eeaab..7edcab156 100644 --- a/test/test_cpu/test_model_scope.py +++ b/test/test_cpu/test_model_scope.py @@ -28,18 +28,14 @@ def teardown_class(self): if os.path.exists(self.cache_path): shutil.rmtree(self.cache_path, ignore_errors=True) - return super().teardown_class() - - def test_llm(self): + def test_llm(self, dataloader): model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") - autoround = AutoRound( - model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset - ) + autoround = AutoRound(model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=dataloader) autoround.quantize_and_save() - def test_mllm(self): + def test_mllm(self, dataloader): model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") autoround = AutoRound( - model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset, batch_size=2 + model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=dataloader, batch_size=2 ) autoround.quantize_and_save(self.saved_path) diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py 
index 695371061..c06fea969 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -7,7 +7,7 @@ from auto_round import AutoRound -from ..helpers import get_model_path, opt_name_or_path +from ..helpers import is_model_outputs_similar def _get_folder_size(path: str) -> float: @@ -24,18 +24,15 @@ def _get_folder_size(path: str) -> float: class TestAutoRoundFP: @classmethod def setup_class(self): - self.model_name = opt_name_or_path self.save_dir = "./saved" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto") - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @classmethod def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_nvfp4_moe_actmax_rtn(self, dataloader): - model_name = get_model_path("deepseek-ai/DeepSeek-V2-Lite") + def test_nvfp4_moe_actmax_rtn(self, tiny_deepseek_v2_model_path, dataloader): + model_name = tiny_deepseek_v2_model_path layer_config = { "self_attn": {"bits": 16, "act_bits": 16}, "mlp.shared_experts": {"bits": 16, "act_bits": 16}, @@ -61,8 +58,8 @@ def test_nvfp4_moe_actmax_rtn(self, dataloader): ), "Illegal NVFP4 quantization for lm_head layer" shutil.rmtree(self.save_dir, ignore_errors=True) - def test_nvfp4_moe_actmax_ar(self, dataloader): - model_name = get_model_path("deepseek-ai/DeepSeek-V2-Lite") + def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader): + model_name = tiny_deepseek_v2_model_path layer_config = { "q_proj": {"bits": 16, "act_bits": 16}, "mlp.shared_experts": {"bits": 16, "act_bits": 16}, @@ -90,17 +87,11 @@ def test_nvfp4_moe_actmax_ar(self, dataloader): and lm_head.weight_scale.dtype is torch.float8_e4m3fn ), "Illegal NVFP4 packing for lm_head layer" quantized_model_path = self.save_dir - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) - from auto_round.eval.evaluation import simple_evaluate_user_model - - result = simple_evaluate_user_model(model, tokenizer, batch_size=4, tasks="piqa", limit=4) - print(result["results"]["piqa"]["acc,none"]) - assert result["results"]["piqa"]["acc,none"] > 0.7 + assert is_model_outputs_similar(model_name, quantized_model_path) shutil.rmtree(self.save_dir, ignore_errors=True) - def test_mxfp4_moe_ar(self, dataloader): - model_name = get_model_path("deepseek-ai/DeepSeek-V2-Lite") + def test_mxfp4_moe_ar(self, tiny_deepseek_v2_model_path, dataloader): + model_name = tiny_deepseek_v2_model_path layer_config = { "q_proj": {"bits": 16, "act_bits": 16, "data_type": "float"}, "mlp.shared_experts": {"bits": 16, "act_bits": 16, "data_type": "float"}, @@ -127,8 +118,8 @@ def test_mxfp4_moe_ar(self, dataloader): ), "Illegal MXFP4 packing for lm_head layer" shutil.rmtree(self.save_dir, ignore_errors=True) - def test_mxfp4_llmcompressor_format(self, dataloader): - model_name = self.model_name + def test_mxfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "MXFP4" @@ -146,8 +137,8 @@ def test_mxfp4_llmcompressor_format(self, dataloader): compressed_model = autoround.save_quantized( output_dir=quantized_model_path, inplace=True, format="llm_compressor" ) - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj + tmp_layer = 
compressed_model.model.decoder.layers[1].self_attn.q_proj + skip_layer = compressed_model.model.decoder.layers[1].self_attn.k_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_packed") @@ -167,8 +158,8 @@ def test_mxfp4_llmcompressor_format(self, dataloader): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_rtn_mxfp4_llmcompressor_format(self, dataloader): - model_name = self.model_name + def test_rtn_mxfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "MXFP4" @@ -186,8 +177,8 @@ def test_rtn_mxfp4_llmcompressor_format(self, dataloader): compressed_model = autoround.save_quantized( output_dir=quantized_model_path, inplace=True, format="llm_compressor" ) - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj + skip_layer = compressed_model.model.decoder.layers[1].self_attn.k_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_packed") @@ -207,8 +198,8 @@ def test_rtn_mxfp4_llmcompressor_format(self, dataloader): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mxfp8_llmcompressor_format(self, dataloader): - model_name = self.model_name + def test_mxfp8_llmcompressor_format(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "MXFP8" @@ -221,7 +212,7 @@ def test_mxfp8_llmcompressor_format(self, dataloader): ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight") @@ -238,14 +229,14 @@ def test_mxfp8_llmcompressor_format(self, dataloader): and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 8 ), f"Invalid MXFP8 quantization configuration: {quantization_config}" folder_size_gb = _get_folder_size(quantized_model_path) - # Original opt-125m is < 0.5GB -> quantized mxfp8 model should be smaller but not empty + # Original tiny_opt_model_path-125m is < 0.1GB -> quantized mxfp8 model should be smaller but not empty assert ( - 0.15 < folder_size_gb < 0.2 - ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)" + 0.05 < folder_size_gb < 0.1 + ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.05~0.1 GB)" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_nvfp4_llmcompressor_format(self, dataloader): - model_name = self.model_name + def test_nvfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "NVFP4" @@ -258,7 +249,7 @@ def test_nvfp4_llmcompressor_format(self, dataloader): ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + tmp_layer = 
compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_global_scale") @@ -275,14 +266,14 @@ def test_nvfp4_llmcompressor_format(self, dataloader): and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4 ), f"Invalid NVFP4 quantization configuration: {quantization_config}" folder_size_gb = _get_folder_size(quantized_model_path) - # Original opt-125m is < 0.5GB -> quantized nvfp4 model should be smaller but not empty + # Original opt-125m is < 0.1GB -> quantized nvfp4 model should be smaller but not empty assert ( - 0.1 < folder_size_gb < 0.15 - ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)" + 0.05 < folder_size_gb < 0.1 + ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.05~0.1 GB)" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_nvfp4_autoround_format(self, dataloader): - model_name = self.model_name + def test_nvfp4_autoround_format(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "NVFP4" @@ -295,7 +286,7 @@ def test_nvfp4_autoround_format(self, dataloader): ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_global_scale") @@ -306,8 +297,8 @@ def test_nvfp4_autoround_format(self, dataloader): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_nvfp4_autoround_save_quantized(self, dataloader): - model_name = self.model_name + def test_nvfp4_autoround_save_quantized(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "NVFP4" @@ -321,7 +312,7 @@ def test_nvfp4_autoround_save_quantized(self, dataloader): quantized_model_path = self.save_dir autoround.quantize() compressed_model = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_global_scale") @@ -332,10 +323,10 @@ def test_nvfp4_autoround_save_quantized(self, dataloader): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_qwen_moe_quant_infer(self, dataloader): - model_name = get_model_path("Qwen/Qwen1.5-MoE-A2.7B") + def test_qwen_moe_quant_infer(self, tiny_qwen_moe_model_path, dataloader): + model_name = tiny_qwen_moe_model_path layer_config = { - "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, + "layers.0": {"bits": 16, "act_bits": 16}, } scheme = "nvfp4" autoround = AutoRound( @@ -349,14 +340,8 @@ def test_qwen_moe_quant_infer(self, dataloader): ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="cpu") - tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) - from 
auto_round.eval.evaluation import simple_evaluate_user_model - - result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa", limit=10) - print(result["results"]["piqa"]["acc,none"]) - assert result["results"]["piqa"]["acc,none"] > 0.60 - shutil.rmtree(quantized_model_path, ignore_errors=True) + assert is_model_outputs_similar(model_name, quantized_model_path) + shutil.rmtree(self.save_dir, ignore_errors=True) @pytest.mark.parametrize( "scheme, static_kv_dtype, static_attention_dtype", @@ -370,9 +355,9 @@ def test_qwen_moe_quant_infer(self, dataloader): ("NVFP4", "fp8", None), ], ) - def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype, dataloader): - model_name = self.model_name - from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path + from transformers import AutoConfig from transformers.models.opt.modeling_opt import OPTForCausalLM config = AutoConfig.from_pretrained(model_name) diff --git a/test/test_cpu/test_scheme.py b/test/test_cpu/test_scheme.py index 9bd236765..890b4bee4 100644 --- a/test/test_cpu/test_scheme.py +++ b/test/test_cpu/test_scheme.py @@ -54,7 +54,7 @@ def test_mxfp4(self, dataloader): def test_vllm(self): from auto_round import AutoRoundMLLM - ar = AutoRoundMLLM(get_model_path("Qwen/Qwen2-VL-2B-Instruct", scheme="W2A16"), nsamples=1, iters=1, seqlen=2) + ar = AutoRoundMLLM(get_model_path("Qwen/Qwen2-VL-2B-Instruct"), scheme="W2A16", nsamples=1, iters=1, seqlen=2) assert ar.bits == 2 assert ar.act_bits == 16 From 3764e88515d73a35c7e78b529edc5485cbb9f355 Mon Sep 17 00:00:00 2001 From: sys-lpot-val Date: Sun, 21 Dec 2025 22:57:16 -0800 Subject: [PATCH 13/24] support mllm and untied tiny model Signed-off-by: sys-lpot-val Signed-off-by: He, Xin3 --- auto_round/utils/model.py | 5 +++ test/fixtures.py | 21 +++++++++- test/helpers.py | 43 ++++++++++++++------ test/test_cpu/test_autoround.py | 8 ++-- test/test_cpu/test_cli_usage.py | 6 +-- test/test_cpu/test_gguf_format.py | 63 +++++++++++++++--------------- test/test_cpu/test_mllm.py | 5 +-- test/test_cpu/test_scheme.py | 41 +++++++++---------- test/test_cpu/test_script.py | 15 ------- test/test_cuda/test_auto_scheme.py | 4 +- 10 files changed, 119 insertions(+), 92 deletions(-) delete mode 100644 test/test_cpu/test_script.py diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index f4bb15575..38f984663 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -1047,6 +1047,11 @@ def set_module(model, key, new_module): setattr(module, name_list[-1], new_module) +# For getting and setting attribution, such as 'lm_head.weight' +get_attr = get_module +set_attr = set_module + + def get_layer_features(layer): """Extracts input and output feature dimensions for supported layers.""" from auto_round.utils import deepspeed_exists diff --git a/test/fixtures.py b/test/fixtures.py index c4e2ea198..e64f9d25b 100644 --- a/test/fixtures.py +++ b/test/fixtures.py @@ -8,6 +8,7 @@ from .helpers import ( DataLoader, deepseek_v2_name_or_path, + gemma_name_or_path, get_tiny_model, gptj_name_or_path, lamini_name_or_path, @@ -66,6 +67,15 @@ def tiny_deepseek_v2_model_path(): shutil.rmtree(tiny_model_path) +@pytest.fixture(scope="session") +def tiny_gemma_model_path(): + model_name_or_path = gemma_name_or_path + tiny_model_path = "./tmp_tiny_gemma_model_path" + tiny_model_path = 
save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + @pytest.fixture(scope="session") def tiny_qwen_model_path(): model_name_or_path = qwen_name_or_path @@ -75,6 +85,15 @@ def tiny_qwen_model_path(): shutil.rmtree(tiny_model_path) +@pytest.fixture(scope="session") +def tiny_untied_qwen_model_path(): + model_name_or_path = qwen_name_or_path + tiny_model_path = "./tmp_tiny_untied_qwen_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, force_untie=True) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + @pytest.fixture(scope="session") def tiny_qwen_moe_model_path(): model_name_or_path = qwen_moe_name_or_path @@ -88,7 +107,7 @@ def tiny_qwen_moe_model_path(): def tiny_qwen_vl_model_path(): model_name_or_path = qwen_vl_name_or_path tiny_model_path = "./tmp_tiny_qwen_vl_model_path" - tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2, is_mllm=True) yield tiny_model_path shutil.rmtree(tiny_model_path) diff --git a/test/helpers.py b/test/helpers.py index be086497c..9f29c9b1d 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -1,10 +1,11 @@ +import copy import os import pytest import torch import transformers -from auto_round.utils import llm_load_model +from auto_round.utils import get_attr, llm_load_model, mllm_load_model, set_attr # Automatic choose local path or model name. @@ -28,18 +29,15 @@ def get_model_path(model_name: str) -> str: deepseek_v2_name_or_path = get_model_path("deepseek-ai/DeepSeek-V2-Lite") qwen_moe_name_or_path = get_model_path("Qwen/Qwen1.5-MoE-A2.7B") qwen_vl_name_or_path = get_model_path("Qwen/Qwen2-VL-2B-Instruct") +gemma_name_or_path = get_model_path("benzart/gemma-2b-it-fine-tuning-for-code-test") # Slice model into tiny model for speedup -def get_tiny_model(model_name_or_path, num_layers=3, **kwargs): - kwargs["dtype"] = "auto" if "auto" not in kwargs else kwargs["dtype"] - kwargs["trust_remote_code"] = True if "trust_remote_code" not in kwargs else kwargs["trust_remote_code"] - model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, **kwargs) - - if hasattr(model.config, "num_hidden_layers"): - model.config.num_hidden_layers = num_layers +def get_tiny_model(model_name_or_path, num_layers=2, is_mllm=False, **kwargs): + """Generate a tiny model by slicing layers from the original model.""" def slice_layers(module): + """slice layers in the model.""" for name, child in module.named_children(): if isinstance(child, torch.nn.ModuleList) and len(child) > num_layers: new_layers = torch.nn.ModuleList(child[:num_layers]) @@ -49,7 +47,12 @@ def slice_layers(module): return True return False - model, tokenizer = llm_load_model(model_name_or_path) + kwargs["dtype"] = "auto" if "auto" not in kwargs else kwargs["dtype"] + kwargs["trust_remote_code"] = True if "trust_remote_code" not in kwargs else kwargs["trust_remote_code"] + if is_mllm: + model, processor, tokenizer, image_processor = mllm_load_model(model_name_or_path, **kwargs) + else: + model, tokenizer = llm_load_model(model_name_or_path, **kwargs) slice_layers(model) if hasattr(model.config, "num_hidden_layers"): @@ -61,13 +64,25 @@ def slice_layers(module): # for fixture usage only -def save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2): - model = get_tiny_model(model_name_or_path, num_layers=num_layers) +def save_tiny_model(model_name_or_path, 
tiny_model_path, num_layers=2, is_mllm=False, force_untie=False, **kwargs): + """Generate a tiny model and save to the specified path.""" + model = get_tiny_model(model_name_or_path, num_layers=num_layers, is_mllm=is_mllm, **kwargs) + if force_untie: + if getattr(getattr(model, "config", None), "tie_word_embeddings", False): + model.config.tie_word_embeddings = False + for key in model._tied_weights_keys: + weight = get_attr(model, key) + set_attr(model, key, copy.deepcopy(weight)) tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) test_path = os.path.dirname(__file__) tiny_model_path = os.path.join(test_path, tiny_model_path.removeprefix("./")) model.save_pretrained(tiny_model_path) tokenizer.save_pretrained(tiny_model_path) + if is_mllm: + processor = transformers.AutoProcessor.from_pretrained(model_name_or_path, trust_remote_code=True) + processor.save_pretrained(tiny_model_path) + image_processor = transformers.AutoImageProcessor.from_pretrained(model_name_or_path, trust_remote_code=True) + image_processor.save_pretrained(tiny_model_path) print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") return tiny_model_path @@ -83,6 +98,7 @@ def is_pytest_mode_lazy(): # General model inference code def model_infer(model, tokenizer, apply_chat_template=False): + """Run model inference and print generated outputs.""" prompts = [ "Hello,my name is", # "The president of the United States is", @@ -131,7 +147,10 @@ def __iter__(self): def get_output(model_name_or_path): """Get model output for fixed input.""" - model, tokenizer = llm_load_model(model_name_or_path) + try: + model, tokenizer = llm_load_model(model_name_or_path) + except: + model, processor, tokenizer, image_processor = mllm_load_model(model_name_or_path) outputs = model(fixed_input)[0] return outputs.detach().cpu() diff --git a/test/test_cpu/test_autoround.py b/test/test_cpu/test_autoround.py index 3d5d60c24..aa7aeca5e 100644 --- a/test/test_cpu/test_autoround.py +++ b/test/test_cpu/test_autoround.py @@ -677,8 +677,8 @@ def test_invalid_layer_config(self, tiny_opt_model_path): ) ar.quantize() - def test_quant_lm_head(self): - model_name = get_model_path("Qwen/Qwen3-8B") + def test_quant_lm_head(self, tiny_untied_qwen_model_path): + model_name = tiny_untied_qwen_model_path ar = AutoRound(model_name, quant_lm_head=True, iters=0, seqlen=8, nsamples=1, disable_opt_rtn=True) ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu") @@ -700,8 +700,8 @@ def test_quant_lm_head(self): assert "lm_head" in model.config.quantization_config.extra_config assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 - def test_quant_lm_head_layer_config(self): - model_name = get_model_path("Qwen/Qwen3-8B") + def test_quant_lm_head_layer_config(self, tiny_untied_qwen_model_path): + model_name = tiny_untied_qwen_model_path layer_config = {"lm_head": {"bits": 4}} ar = AutoRound( model_name, diff --git a/test/test_cpu/test_cli_usage.py b/test/test_cpu/test_cli_usage.py index b848c22df..82466dc82 100644 --- a/test/test_cpu/test_cli_usage.py +++ b/test/test_cpu/test_cli_usage.py @@ -17,7 +17,7 @@ def teardown_class(self): shutil.rmtree("../../saved", ignore_errors=True) shutil.rmtree("../../tmp_autoround", ignore_errors=True) - def test_auto_round_cmd(self, tiny_opt_model_path): + def test_auto_round_cmd(self, tiny_opt_model_path, tiny_qwen_vl_model_path): python_path = 
sys.executable # Test llm script @@ -56,13 +56,13 @@ def test_auto_round_cmd(self, tiny_opt_model_path): assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --mllm --model {get_model_path('Qwen/Qwen2-VL-2B-Instruct')} --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" + f"cd ../.. && {python_path} -m auto_round --mllm --model {tiny_qwen_vl_model_path} --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model {get_model_path('Qwen/Qwen2-VL-2B-Instruct')} --seqlen 32 --format auto_round" + f"cd ../.. && {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model {tiny_qwen_vl_model_path} --seqlen 32 --format auto_round" " --quant_nontext_module --output_dir ./saved " ) if res > 0 or res == -1: diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index 169b07825..81bb0667c 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -4,29 +4,29 @@ import pytest import torch +import transformers from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -from ..helpers import get_model_path, get_tiny_model +from ..helpers import get_tiny_model, qwen_name_or_path class TestGGUF: @classmethod def setup_class(self): - self.model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained(qwen_name_or_path, trust_remote_code=True) @classmethod def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_basic_usage(self): + def test_basic_usage(self, tiny_gemma_model_path, tiny_qwen_model_path): python_path = sys.executable res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {get_model_path('benzart/gemma-2b-it-fine-tuning-for-code-test')} " + f"cd ../.. && {python_path} -m auto_round --model {tiny_gemma_model_path} " f" --bs 16 --iters 0 --nsamples 1 --format gguf:q4_k_m" ) if res > 0 or res == -1: @@ -34,17 +34,17 @@ def test_basic_usage(self): shutil.rmtree("./saved", ignore_errors=True) res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {self.model_name}" + f"cd ../.. 
&& {python_path} -m auto_round --model {tiny_qwen_model_path}" f" --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" shutil.rmtree("./saved", ignore_errors=True) - def test_q4_0(self): + def test_q4_0(self, tiny_qwen_model_path): bits, group_size, sym = 4, 32, True autoround = AutoRound( - self.model_name, + tiny_qwen_model_path, bits=bits, group_size=group_size, sym=sym, @@ -61,13 +61,12 @@ def test_q4_0(self): text = "There is a girl who likes adventure," inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - shutil.rmtree("./saved", ignore_errors=True) - def test_func(self): + def test_func(self, tiny_qwen_model_path): bits, group_size, sym = 4, 128, True autoround = AutoRound( - self.model_name, + tiny_qwen_model_path, iters=1, nsamples=1, seqlen=10, @@ -84,8 +83,8 @@ def test_func(self): print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_gguf_baseline(self): - model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") + def test_gguf_baseline(self, tiny_qwen_model_path): + model_name = tiny_qwen_model_path model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) autoround = AutoRound( model, @@ -103,16 +102,16 @@ def test_gguf_baseline(self): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="fake") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto") text = "There is a girl who likes adventure," inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_q4_k_m(self, dataloader): - model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + def test_q4_k_m(self, tiny_qwen_model_path, dataloader): + model = get_tiny_model(qwen_name_or_path, num_layers=4) + tokenizer = transformers.AutoTokenizer.from_pretrained(qwen_name_or_path, trust_remote_code=True) layer_config = { "lm_head": { "bits": 4, @@ -123,8 +122,8 @@ def test_q4_k_m(self, dataloader): "super_group_size": 8, }, "model.embed_tokens": {"bits": 6, "group_size": 32, "super_bits": 6, "super_group_size": 8}, - "model.layers.12.mlp.gate_proj": {"bits": 3}, - "model.layers.10.mlp.gate_proj": {"bits": 8}, + "model.layers.3.mlp.gate_proj": {"bits": 3}, + "model.layers.1.mlp.gate_proj": {"bits": 8}, } autoround = AutoRound( model, @@ -138,26 +137,26 @@ def test_q4_k_m(self, dataloader): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") - assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["super_group_size"] == 16 - assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["data_type"] == "int_sym_dq" - assert autoround.layer_config["model.layers.7.self_attn.v_proj"]["data_type"] == "int_asym_dq" + assert autoround.layer_config["model.layers.2.self_attn.v_proj"]["super_group_size"] == 16 + assert autoround.layer_config["model.layers.2.self_attn.v_proj"]["data_type"] == "int_sym_dq" + assert 
autoround.layer_config["model.layers.0.self_attn.v_proj"]["data_type"] == "int_asym_dq" assert autoround.model.model.layers[0].self_attn.v_proj.bits == 6 - assert autoround.model.model.layers[12].self_attn.v_proj.bits == 4 + assert autoround.model.model.layers[3].self_attn.v_proj.bits == 4 assert autoround.model.model.embed_tokens.bits == 6 assert autoround.model.model.embed_tokens.group_size == 16 - assert autoround.model.model.layers[12].mlp.gate_proj.bits == 3 - assert autoround.model.model.layers[10].mlp.gate_proj.bits == 8 - assert autoround.layer_config["model.layers.10.mlp.gate_proj"]["mostly"] == "gguf:q8_0" + assert autoround.model.model.layers[3].mlp.gate_proj.bits == 3 + assert autoround.model.model.layers[1].mlp.gate_proj.bits == 8 + assert autoround.layer_config["model.layers.1.mlp.gate_proj"]["mostly"] == "gguf:q8_0" shutil.rmtree("./saved", ignore_errors=True) - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(tiny_qwen_model_path, torch_dtype="auto", trust_remote_code=True) autoround = AutoRound(model, tokenizer, iters=0, nsamples=1, seqlen=128, disable_opt_rtn=False) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") shutil.rmtree("./saved", ignore_errors=True) - def test_all_format(self): - model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") + def test_all_format(self, tiny_qwen_model_path): + model_name = tiny_qwen_model_path python_path = sys.executable # for gguf_format in ["gguf:q4_0", "gguf:q4_1", "gguf:q4_k_m", "gguf:q6_k"]: for gguf_format in ["gguf:q4_k_m"]: @@ -211,7 +210,7 @@ def test_vlm_gguf(self, tiny_qwen_vl_model_path): assert abs(file_size - 892) < 5.0 shutil.rmtree("./saved", ignore_errors=True) - def test_qtype_setting(self): + def test_qtype_setting(self, tiny_qwen_model_path): # Qwen2.5-0.5B-Instruct no output, token_embed q6_k fallbakc to q8_0 336M # Qwen3-0.6B output q6_k, token_embed q4_0 448M # Qwen3-8B output q6_k, token_embed q4_0 4.5G @@ -219,7 +218,7 @@ def test_qtype_setting(self): from auto_round.compressors.utils import set_layer_config from auto_round.export.export_to_gguf.config import ModelType - model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") + model_name = tiny_qwen_model_path ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) ar.formats = ["gguf:q4_0"] ar.layer_config, _, _ = set_layer_config( @@ -238,7 +237,7 @@ def test_qtype_setting(self): assert ar.layer_config["model.embed_tokens"]["bits"] == 8 assert "lm_head" not in ar.layer_config - model_name = "Qwen/Qwen3-0.6B" + model_name = tiny_qwen_model_path ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) ar.formats = ["gguf:q4_0"] ar.layer_config, _, _ = set_layer_config( diff --git a/test/test_cpu/test_mllm.py b/test/test_cpu/test_mllm.py index ec5c1487e..0f8f7219c 100644 --- a/test/test_cpu/test_mllm.py +++ b/test/test_cpu/test_mllm.py @@ -205,10 +205,10 @@ def test_str_input(self): ) print(output_text[0]) - def test_qwen2_5(self): + def test_qwen2_5(self, tiny_qwen_vl_model_path): from auto_round.utils import mllm_load_model - model_name = get_model_path("Qwen/Qwen2.5-VL-3B-Instruct") + model_name = tiny_qwen_vl_model_path model, processor, tokenizer, image_processor = mllm_load_model(model_name) autoround = AutoRoundMLLM( model, @@ -258,4 +258,3 @@ def test_qwen2_5(self): output_text = processor.batch_decode( generated_ids_trimmed, skip_special_tokens=True, 
clean_up_tokenization_spaces=False ) - print(output_text) diff --git a/test/test_cpu/test_scheme.py b/test/test_cpu/test_scheme.py index 890b4bee4..7a60a9ccd 100644 --- a/test/test_cpu/test_scheme.py +++ b/test/test_cpu/test_scheme.py @@ -1,18 +1,16 @@ import shutil -import pytest -import torch +import transformers from auto_round import AutoRound from auto_round.schemes import QuantizationScheme -from ..helpers import get_model_path, opt_name_or_path, qwen_name_or_path +from ..helpers import get_model_path, get_tiny_model, opt_name_or_path, qwen_name_or_path class TestAutoRound: @classmethod def setup_class(self): - self.model_name = opt_name_or_path self.save_folder = "./saved" @classmethod @@ -20,9 +18,9 @@ def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_gguf(self, dataloader): + def test_gguf(self, tiny_qwen_model_path, dataloader): ar = AutoRound( - qwen_name_or_path, + tiny_qwen_model_path, scheme="W2A16", nsamples=1, iters=1, @@ -33,60 +31,63 @@ def test_gguf(self, dataloader): assert ar.bits == 4 shutil.rmtree(self.save_folder, ignore_errors=True) - def test_w4a16(self, dataloader): - ar = AutoRound(self.model_name, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + def test_w4a16(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=dataloader) assert ar.bits == 4 ar.quantize() - def test_w2a16_rtn(self, dataloader): - ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=dataloader) + def test_w2a16_rtn(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=dataloader) assert ar.bits == 2 ar.quantize() - def test_mxfp4(self, dataloader): - ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + def test_mxfp4(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) assert ar.bits == 4 assert ar.act_bits == 4 assert ar.data_type == "mx_fp" assert ar.act_data_type == "mx_fp_rceil" ar.quantize() - def test_vllm(self): + def test_vllm(self, tiny_qwen_vl_model_path): from auto_round import AutoRoundMLLM - ar = AutoRoundMLLM(get_model_path("Qwen/Qwen2-VL-2B-Instruct"), scheme="W2A16", nsamples=1, iters=1, seqlen=2) + ar = AutoRoundMLLM(tiny_qwen_vl_model_path, scheme="W2A16", nsamples=1, iters=1, seqlen=2) assert ar.bits == 2 assert ar.act_bits == 16 - def test_nvfp4(self, dataloader): - ar = AutoRound(self.model_name, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + def test_nvfp4(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) assert ar.bits == 4 assert ar.act_bits == 4 assert ar.data_type == "nv_fp" assert ar.act_data_type == "nv_fp4_with_static_gs" ar.quantize() - def test_all_scheme(self, dataloader): + def test_all_scheme(self, tiny_opt_model_path, tiny_qwen_model_path, dataloader): import copy preset_schemes = ["W8A16", "MXFP8", "FPW8A16", "FP8_STATIC", "GGUF:Q2_K_S", "GGUF:Q4_K_M"] for scheme in preset_schemes: - model_name = self.model_name + model_name = tiny_opt_model_path if "gguf" in scheme.lower(): - model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") + model_name = tiny_qwen_model_path print(f"scheme={scheme}") ar = 
AutoRound(model_name, scheme=scheme, nsamples=1, iters=1, seqlen=2, dataset=dataloader) ar.quantize_and_save(self.save_folder) shutil.rmtree(self.save_folder, ignore_errors=True) def test_scheme_in_layer_config(self, dataloader): + model = get_tiny_model(opt_name_or_path, num_layers=5) + tokenizer = transformers.AutoTokenizer.from_pretrained(opt_name_or_path, trust_remote_code=True) layer_config = { "model.decoder.layers.2.self_attn": {"bits": 2}, "model.decoder.layers.3.self_attn.v_proj": "W8A16", "model.decoder.layers.4.self_attn.k_proj": QuantizationScheme.from_dict({"group_size": 64}), } ar = AutoRound( - opt_name_or_path, + model, + tokenizer, scheme="W3A16", nsamples=1, iters=1, diff --git a/test/test_cpu/test_script.py b/test/test_cpu/test_script.py deleted file mode 100644 index aa25d7f61..000000000 --- a/test/test_cpu/test_script.py +++ /dev/null @@ -1,15 +0,0 @@ -import os - -import pytest - - -class TestScript: - def test_default(self): - os.system( - """ - cd ../.. && - python -m auto_round - --iters 2 - --deployment_device fake - --output_dir ./tmp_script_test""" - ) diff --git a/test/test_cuda/test_auto_scheme.py b/test/test_cuda/test_auto_scheme.py index b8c40c470..259bc4450 100644 --- a/test/test_cuda/test_auto_scheme.py +++ b/test/test_cuda/test_auto_scheme.py @@ -223,8 +223,8 @@ def test_layer_config(self): print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - def test_lm_head_and_mix_dtype(self): - model_name = get_model_path("qwen/Qwen3-8B") + def test_lm_head_and_mix_dtype(self, tiny_untied_qwen_model_path): + model_name = tiny_untied_qwen_model_path model = get_tiny_model(model_name) tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) target_bits = 6 From 418022fa5ae87f03e1f3cae7643403d49fbf4740 Mon Sep 17 00:00:00 2001 From: "Sun, Xuehao" Date: Mon, 22 Dec 2025 15:47:24 +0800 Subject: [PATCH 14/24] fix ut path Signed-off-by: Sun, Xuehao --- .azure-pipelines/scripts/ut/run_ut.sh | 5 ++--- .azure-pipelines/scripts/ut/run_ut_cuda.sh | 24 ++++++++++------------ .azure-pipelines/scripts/ut/run_ut_hpu.sh | 7 +++---- 3 files changed, 16 insertions(+), 20 deletions(-) diff --git a/.azure-pipelines/scripts/ut/run_ut.sh b/.azure-pipelines/scripts/ut/run_ut.sh index dcf1a7170..e7d3d9e00 100644 --- a/.azure-pipelines/scripts/ut/run_ut.sh +++ b/.azure-pipelines/scripts/ut/run_ut.sh @@ -19,8 +19,7 @@ cd /auto-round && uv pip install . echo "##[endgroup]" uv pip list -cd /auto-round/test/test_cpu || exit 1 -find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + +cd /auto-round/test || exit 1 export LD_LIBRARY_PATH=${HOME}/.venv/lib/:$LD_LIBRARY_PATH export FORCE_BF16=1 @@ -32,7 +31,7 @@ mkdir -p ${LOG_DIR} ut_log_name=${LOG_DIR}/ut.log # Split test files into 5 parts -find . 
-name "test*.py" | sort > all_tests.txt +find ./test_cpu -name "test*.py" | sort > all_tests.txt total_lines=$(wc -l < all_tests.txt) NUM_CHUNKS=5 q=$(( total_lines / NUM_CHUNKS )) diff --git a/.azure-pipelines/scripts/ut/run_ut_cuda.sh b/.azure-pipelines/scripts/ut/run_ut_cuda.sh index 18a9bb00d..0f111d3fa 100644 --- a/.azure-pipelines/scripts/ut/run_ut_cuda.sh +++ b/.azure-pipelines/scripts/ut/run_ut_cuda.sh @@ -27,16 +27,14 @@ function create_conda_env() { # install AutoRound cd ${REPO_PATH} - pip uninstall auto-round -y + uv pip install torch==2.8.0 torchvision uv pip install -r requirements.txt - sed -i '/^torch==/d;/^transformers==/d;/^lm-eval==/d' requirements.txt if [ -d "/proc/driver/nvidia" ]; then export PATH=/usr/local/cuda/bin${PATH:+:${PATH}} export LD_LIBRARY_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/nvidia/nvjitlink/lib:$LD_LIBRARY_PATH fi uv pip install --no-build-isolation . uv pip install pytest-cov pytest-html cmake==4.0.2 - uv pip install torch==2.8.0 torchvision } function print_test_results_table() { @@ -92,7 +90,7 @@ function run_unit_test() { # install unit test dependencies create_conda_env - cd ${REPO_PATH}/test/test_cuda + cd ${REPO_PATH}/test rm -rf .coverage* *.xml *.html uv pip install -v git+https://github.com/casper-hansen/AutoAWQ.git --no-build-isolation @@ -100,15 +98,15 @@ function run_unit_test() { uv pip install -r https://raw.githubusercontent.com/ModelCloud/GPTQModel/refs/heads/main/requirements.txt CMAKE_ARGS="-DGGML_CUDA=on -DLLAVA_BUILD=off" uv pip install llama-cpp-python uv pip install 'git+https://github.com/ggml-org/llama.cpp.git#subdirectory=gguf-py' - uv pip install -r requirements.txt - uv pip install -r requirements_diffusion.txt + uv pip install -r test_cuda/requirements.txt + uv pip install -r test_cuda/requirements_diffusion.txt pip list > ${LOG_DIR}/ut_pip_list.txt export COVERAGE_RCFILE=${REPO_PATH}/.azure-pipelines/scripts/ut/.coverage local auto_round_path=$(python -c 'import auto_round; print(auto_round.__path__[0])') # run unit tests individually with separate logs - for test_file in $(find . -name "test_*.py" ! -name "test_*vlms.py" ! -name "test_llmc*.py" | sort); do + for test_file in $(find ./test_cuda -name "test_*.py" ! -name "test_*vlms.py" ! -name "test_llmc*.py" | sort); do local test_basename=$(basename ${test_file} .py) local ut_log_name=${LOG_DIR}/unittest_cuda_${test_basename}.log echo "Running ${test_file}..." @@ -128,7 +126,7 @@ function run_unit_test() { function run_unit_test_vlm() { # install unit test dependencies create_conda_env - cd ${REPO_PATH}/test/test_cuda + cd ${REPO_PATH}/test rm -rf .coverage* *.xml *.html uv pip install git+https://github.com/haotian-liu/LLaVA.git@v1.2.2 --no-deps @@ -138,14 +136,14 @@ function run_unit_test_vlm() { uv pip install git+https://github.com/deepseek-ai/DeepSeek-VL2.git timm attrdict --no-deps uv pip install -v git+https://github.com/casper-hansen/AutoAWQ.git@v0.2.0 --no-build-isolation uv pip install flash-attn==2.7.4.post1 --no-build-isolation - uv pip install -r requirements_vlm.txt + uv pip install -r test_cuda/requirements_vlm.txt pip list > ${LOG_DIR}/vlm_ut_pip_list.txt export COVERAGE_RCFILE=${REPO_PATH}/.azure-pipelines/scripts/ut/.coverage local auto_round_path=$(python -c 'import auto_round; print(auto_round.__path__[0])') # run VLM unit tests individually with separate logs - for test_file in $(find . 
-name "test*vlms.py"); do + for test_file in $(find ./test_cuda -name "test*vlms.py"); do local test_basename=$(basename ${test_file} .py) local ut_log_name=${LOG_DIR}/unittest_cuda_vlm_${test_basename}.log echo "Running ${test_file}..." @@ -166,17 +164,17 @@ function run_unit_test_llmc() { # install unit test dependencies create_conda_env - cd ${REPO_PATH}/test/test_cuda + cd ${REPO_PATH}/test rm -rf .coverage* *.xml *.html - uv pip install -r requirements_llmc.txt + uv pip install -r test_cuda/requirements_llmc.txt pip list > ${LOG_DIR}/llmc_ut_pip_list.txt export COVERAGE_RCFILE=${REPO_PATH}/.azure-pipelines/scripts/ut/.coverage local auto_round_path=$(python -c 'import auto_round; print(auto_round.__path__[0])') # run unit tests individually with separate logs - for test_file in $(find . -name "test_llmc*.py" | sort); do + for test_file in $(find ./test_cuda -name "test_llmc*.py" | sort); do local test_basename=$(basename ${test_file} .py) local ut_log_name=${LOG_DIR}/unittest_cuda_llmc_${test_basename}.log echo "Running ${test_file}..." diff --git a/.azure-pipelines/scripts/ut/run_ut_hpu.sh b/.azure-pipelines/scripts/ut/run_ut_hpu.sh index 3c3bb6991..b370edfb5 100644 --- a/.azure-pipelines/scripts/ut/run_ut_hpu.sh +++ b/.azure-pipelines/scripts/ut/run_ut_hpu.sh @@ -7,8 +7,7 @@ export TQDM_MININTERVAL=60 pip install pytest-cov pytest-html pip list -cd /auto-round/test/test_hpu || exit 1 -find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + +cd /auto-round/test || exit 1 export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH export FORCE_BF16=1 @@ -19,8 +18,8 @@ LOG_DIR=/auto-round/log_dir mkdir -p ${LOG_DIR} ut_log_name=${LOG_DIR}/ut.log -find . -name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_lazy.sh -find . 
-name "test*.py" | sed "s,\.\/,python -m pytest --mode compile --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_compile.sh +find ./test_hpu -name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_lazy.sh +find ./test_hpu -name "test*.py" | sed "s,\.\/,python -m pytest --mode compile --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_compile.sh cat run_lazy.sh bash run_lazy.sh 2>&1 | tee ${ut_log_name} From 71ec4c220842d751cc72257e5d7dd6bd84458673 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Mon, 22 Dec 2025 11:15:25 -0500 Subject: [PATCH 15/24] fix UT failures Signed-off-by: He, Xin3 --- auto_round/compressors/mllm/compressor.py | 3 +++ test/fixtures.py | 10 ++++++++++ test/helpers.py | 8 +++++++- test/test_cpu/test_gguf_format.py | 8 +++++--- test/test_cpu/test_mllm.py | 4 ++-- 5 files changed, 27 insertions(+), 6 deletions(-) diff --git a/auto_round/compressors/mllm/compressor.py b/auto_round/compressors/mllm/compressor.py index c6808eeb0..e690bc7a9 100644 --- a/auto_round/compressors/mllm/compressor.py +++ b/auto_round/compressors/mllm/compressor.py @@ -205,6 +205,9 @@ def __init__( if hasattr(model, "name_or_path") and any([name in model.name_or_path for name in MISTRAL_3_2_MODELS]): template = "mistral3_2" if iters > 0: + # TODO: Remove after fixing https://github.com/huggingface/transformers/issues/43005 + model.config.model_type = model.config.to_dict()["model_type"] + self.template = template if template is not None else model.config.model_type if not isinstance(dataset, torch.utils.data.DataLoader): self.template = get_template( diff --git a/test/fixtures.py b/test/fixtures.py index e64f9d25b..86bc36e48 100644 --- a/test/fixtures.py +++ b/test/fixtures.py @@ -14,6 +14,7 @@ lamini_name_or_path, opt_name_or_path, phi2_name_or_path, + qwen_2_5_vl_name_or_path, qwen_moe_name_or_path, qwen_name_or_path, qwen_vl_name_or_path, @@ -112,6 +113,15 @@ def tiny_qwen_vl_model_path(): shutil.rmtree(tiny_model_path) +@pytest.fixture(scope="session") +def tiny_qwen_2_5_vl_model_path(): + model_name_or_path = qwen_2_5_vl_name_or_path + tiny_model_path = "./tmp_tiny_qwen_2_5_vl_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2, is_mllm=True) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + # Create objective fixtures for testing @pytest.fixture(scope="function") def tiny_opt_model(): diff --git a/test/helpers.py b/test/helpers.py index 9f29c9b1d..b239bb451 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -29,6 +29,7 @@ def get_model_path(model_name: str) -> str: deepseek_v2_name_or_path = get_model_path("deepseek-ai/DeepSeek-V2-Lite") qwen_moe_name_or_path = get_model_path("Qwen/Qwen1.5-MoE-A2.7B") qwen_vl_name_or_path = get_model_path("Qwen/Qwen2-VL-2B-Instruct") +qwen_2_5_vl_name_or_path = get_model_path("Qwen/Qwen2.5-VL-3B-Instruct") gemma_name_or_path = get_model_path("benzart/gemma-2b-it-fine-tuning-for-code-test") @@ -51,6 +52,11 @@ def slice_layers(module): kwargs["trust_remote_code"] = True if "trust_remote_code" not in kwargs else kwargs["trust_remote_code"] if is_mllm: model, processor, tokenizer, image_processor = 
mllm_load_model(model_name_or_path, **kwargs) + if hasattr(model.config, "vision_config"): + if hasattr(model.config.vision_config, "num_hidden_layers"): # mistral, etc. + model.config.num_hidden_layers = num_layers + elif hasattr(model.config.vision_config, "depth"): # qwen vl + model.config.vision_config.depth = num_layers else: model, tokenizer = llm_load_model(model_name_or_path, **kwargs) slice_layers(model) @@ -80,8 +86,8 @@ def save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2, is_mllm=F tokenizer.save_pretrained(tiny_model_path) if is_mllm: processor = transformers.AutoProcessor.from_pretrained(model_name_or_path, trust_remote_code=True) - processor.save_pretrained(tiny_model_path) image_processor = transformers.AutoImageProcessor.from_pretrained(model_name_or_path, trust_remote_code=True) + processor.save_pretrained(tiny_model_path) image_processor.save_pretrained(tiny_model_path) print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") return tiny_model_path diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index 81bb0667c..b7f25541c 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -9,7 +9,7 @@ from auto_round import AutoRound -from ..helpers import get_tiny_model, qwen_name_or_path +from ..helpers import get_tiny_model, qwen_name_or_path, qwen_vl_name_or_path class TestGGUF: @@ -185,8 +185,10 @@ def test_all_format(self, tiny_qwen_model_path): assert False, "cmd line test fail, please have a check" shutil.rmtree("../../tmp_autoround", ignore_errors=True) - def test_vlm_gguf(self, tiny_qwen_vl_model_path): - model_name = tiny_qwen_vl_model_path + def test_vlm_gguf(self): + # TODO: Using two-layers tiny model will return ValueError: + # Can not map tensor 'model.layers.10.input_layernorm.weight' + model_name = qwen_vl_name_or_path from auto_round import AutoRoundMLLM from auto_round.utils import mllm_load_model diff --git a/test/test_cpu/test_mllm.py b/test/test_cpu/test_mllm.py index 0f8f7219c..2eb1d3e2f 100644 --- a/test/test_cpu/test_mllm.py +++ b/test/test_cpu/test_mllm.py @@ -205,10 +205,10 @@ def test_str_input(self): ) print(output_text[0]) - def test_qwen2_5(self, tiny_qwen_vl_model_path): + def test_qwen2_5(self, tiny_qwen_2_5_vl_model_path): from auto_round.utils import mllm_load_model - model_name = tiny_qwen_vl_model_path + model_name = tiny_qwen_2_5_vl_model_path model, processor, tokenizer, image_processor = mllm_load_model(model_name) autoround = AutoRoundMLLM( model, From f1700bd901797ead09cd8f802ad88075a1067e6d Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 23 Dec 2025 00:25:35 -0500 Subject: [PATCH 16/24] update cuda ut Signed-off-by: n1ck-guo --- test/helpers.py | 4 + test/test_cuda/test_gguf.py | 114 ++++++++------------- test/test_cuda/test_main_func.py | 54 +++++----- test/test_cuda/test_marlin_backend.py | 56 +++++----- test/test_cuda/test_mix_bits.py | 53 +++++----- test/test_cuda/test_multiple_card.py | 72 ++++++------- test/test_cuda/test_multiple_card_calib.py | 18 ++-- test/test_cuda/test_mxfp_and_nvfp_quant.py | 9 +- test/test_cuda/test_mxfp_nvfp.py | 40 +++++--- test/test_cuda/test_qbits.py | 43 +++++--- test/test_cuda/test_scheme.py | 62 ++++++----- test/test_cuda/test_torch_backend.py | 45 ++++---- 12 files changed, 299 insertions(+), 271 deletions(-) diff --git a/test/helpers.py b/test/helpers.py index b239bb451..f30e632f7 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -13,6 +13,9 @@ def get_model_path(model_name: str) -> 
str: ut_path = f"/tf_dataset/auto_round/models/{model_name}" local_path = f"/models/{model_name.split('/')[-1]}" + if "DeepSeek-V2-Lite" in model_name and os.path.exists("/data0/deepseek-ai/DeepSeek-V2-Lite"): + return "/data0/deepseek-ai/DeepSeek-V2-Lite" + if os.path.exists(ut_path): return ut_path elif os.path.exists(local_path): @@ -36,6 +39,7 @@ def get_model_path(model_name: str) -> str: # Slice model into tiny model for speedup def get_tiny_model(model_name_or_path, num_layers=2, is_mllm=False, **kwargs): """Generate a tiny model by slicing layers from the original model.""" + model_name_or_path = get_model_path(model_name_or_path) def slice_layers(module): """slice layers in the model.""" diff --git a/test/test_cuda/test_gguf.py b/test/test_cuda/test_gguf.py index a7076667c..7a3a0cd89 100644 --- a/test/test_cuda/test_gguf.py +++ b/test/test_cuda/test_gguf.py @@ -10,19 +10,30 @@ from auto_round import AutoRound from auto_round.testing_utils import require_gguf +from ..helpers import get_model_path, get_tiny_model, save_tiny_model + class TestAutoRound: - @classmethod - def teardown_class(self): + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gguf - def test_gguf_format(self, dataloader): - model_name = "Qwen/Qwen2.5-0.5B-Instruct" + def test_gguf_format(self, tiny_qwen_model_path, dataloader): bits, group_size, sym = 4, 32, False autoround = AutoRound( - model_name, + tiny_qwen_model_path, bits=bits, group_size=group_size, sym=sym, @@ -44,9 +55,8 @@ def test_gguf_format(self, dataloader): shutil.rmtree("./saved", ignore_errors=True) save_dir = os.path.join(os.path.dirname(__file__), "saved") - model_path = "Qwen/Qwen2.5-0.5B-Instruct" res = os.system( - f"cd ../.. && {sys.executable} -m auto_round --model {model_path} --iter 2 " + f"cd ../.. 
&& {sys.executable} -m auto_round --model {tiny_qwen_model_path} --iter 2 " f"--output_dir {save_dir} --nsample 2 --format gguf:q4_0 --device 0" ) print(save_dir) @@ -54,8 +64,8 @@ def test_gguf_format(self, dataloader): from llama_cpp import Llama - gguf_file = os.listdir("saved/Qwen2.5-0.5B-Instruct-gguf")[0] - llm = Llama(f"saved/Qwen2.5-0.5B-Instruct-gguf/{gguf_file}", n_gpu_layers=-1) + gguf_file = os.listdir("saved/tmp_tiny_qwen_model_path-gguf")[0] + llm = Llama(f"saved/tmp_tiny_qwen_model_path-gguf/{gguf_file}", n_gpu_layers=-1) output = llm("There is a girl who likes adventure,", max_tokens=32) print(output) shutil.rmtree("./saved", ignore_errors=True) @@ -63,9 +73,12 @@ def test_gguf_format(self, dataloader): @require_gguf def test_q2_k_export(self, dataloader): bits, group_size, sym = 2, 16, False - model_name = "Qwen/Qwen2.5-1.5B-Instruct" + model_path = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") + model = get_tiny_model(model_path) + tokenizer = AutoTokenizer.from_pretrained(model_path) autoround = AutoRound( - model_name, + model, + tokenizer, bits=bits, group_size=group_size, sym=sym, @@ -84,20 +97,13 @@ def test_q2_k_export(self, dataloader): inputs = autoround.tokenizer(text, return_tensors="pt").to(model.device) result = autoround.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0]) print(result) - - from auto_round.eval.evaluation import simple_evaluate_user_model - - result = simple_evaluate_user_model(model, autoround.tokenizer, batch_size=16, tasks="piqa") - assert result["results"]["piqa"]["acc,none"] > 0.45 - shutil.rmtree(quantized_model_path, ignore_errors=True) @require_gguf - def test_basic_usage(self): - model_name = "Qwen/Qwen2.5-0.5B-Instruct" + def test_basic_usage(self, tiny_qwen_model_path): python_path = sys.executable res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {model_name} --eval_task_by_task" + f"cd ../.. 
&& {python_path} -m auto_round --model {tiny_qwen_model_path} --eval_task_by_task" f" --tasks piqa,openbookqa --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0 --eval_model_dtype bf16" ) if res > 0 or res == -1: @@ -106,7 +112,7 @@ def test_basic_usage(self): @require_gguf def test_q4_0(self): - model_name = "Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") bits, group_size, sym = 4, 32, True autoround = AutoRound(model_name, bits=bits, group_size=group_size, sym=sym, iters=1, data_type="int") autoround.quantize() @@ -127,7 +133,7 @@ def test_q4_0(self): @require_gguf def test_q4_1(self): - model_name = "Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") bits, group_size, sym = 4, 32, False autoround = AutoRound(model=model_name, bits=bits, group_size=group_size, sym=sym, iters=1, data_type="int") autoround.quantize() @@ -148,31 +154,23 @@ def test_q4_1(self): @require_gguf def test_all_format(self): - from auto_round.export.export_to_gguf.config import GGUF_CONFIG + for model_name in ["qwen/Qwen3-8B", "meta-llama/Llama-3.1-8B-Instruct", "meta-llama/Llama-3.2-3B"]: + for gguf_format in ["gguf:q5_0", "gguf:q5_1", "gguf:q3_k_m", "q5_k_m", "q6_k", "q8_0"]: + model_path = get_model_path(model_name) + tiny_model_path = "tmp_tiny_model" + tiny_model_path = save_tiny_model(model_path, tiny_model_path, num_layers=2) + ar = AutoRound(tiny_model_path, scheme=gguf_format, iters=0, nsampels=1, seqlen=16) + ar.quantize_and_save(output_dir=self.save_dir, format=gguf_format) - python_path = sys.executable - for model_name in ["/models/Qwen3-8B/", "/models/Llama-3.2-3B/", "/models/Meta-Llama-3.1-8B-Instruct"]: - for gguf_format in GGUF_CONFIG.keys(): - print(model_name, gguf_format) - res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {model_name} " - f" --bs 16 --iters 1 --nsamples 1 --format fake,{gguf_format}" - ) - if res > 0 or res == -1: - assert False, "cmd line test fail, please have a check" - shutil.rmtree("../../tmp_autoround", ignore_errors=True) - - res = os.system( - f"cd ../.. 
&& {python_path} -m auto_round --model {model_name} " - f" --bs 16 --iters 0 --nsamples 1 --format {gguf_format}" - ) - if res > 0 or res == -1: - assert False, "cmd line test fail, please have a check" - shutil.rmtree("../../tmp_autoround", ignore_errors=True) + ar = AutoRound(tiny_model_path, scheme=gguf_format, iters=1, nsampels=1, seqlen=16) + ar.quantize_and_save(output_dir=self.save_dir, format=gguf_format) + + shutil.rmtree(tiny_model_path, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_gguf def test_vlm_gguf(self): - model_name = "/models/Qwen2.5-VL-7B-Instruct" + model_name = "/models/Qwen2-VL-2B-Instruct" from auto_round import AutoRoundMLLM from auto_round.utils import mllm_load_model @@ -188,7 +186,7 @@ def test_vlm_gguf(self): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") assert "mmproj-model.gguf" in os.listdir("./saved") - file_size = os.path.getsize("./saved/Qwen2.5-VL-7B-Instruct-Q4_0.gguf") / 1024**2 + file_size = os.path.getsize("./saved/Qwen2-VL-2B-Instruct-Q4_0.gguf") / 1024**2 assert abs(file_size - 4242) < 5.0 file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2 assert abs(file_size - 2580) < 5.0 @@ -214,31 +212,3 @@ def test_vlm_gguf(self): file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2 assert abs(file_size - 1599) < 5.0 shutil.rmtree(quantized_model_path, ignore_errors=True) - - # @require_gguf - # def test_llama_4(self): - # model_name = "/dataset/Llama-4-Scout-17B-16E-Instruct/" - # from auto_round import AutoRoundMLLM - # from auto_round.utils import mllm_load_model - - # model, processor, tokenizer, image_processor = mllm_load_model(model_name, use_auto_mapping=False) - # autoround = AutoRoundMLLM( - # model, - # tokenizer=tokenizer, - # processor=processor, - # image_processor=image_processor, - # device="auto", - # iters=0, - # ) - # quantized_model_path = "/dataset/Llam-4-test" - # shutil.rmtree(quantized_model_path, ignore_errors=True) - # autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - # assert "mmproj-model.gguf" in os.listdir(quantized_model_path) - # file_size = ( - # os.path.getsize(os.path.join(quantized_model_path, "Llama-4-Scout-17B-16E-Instruct-16x17B-Q4_0.gguf")) - # / 1024**2 - # ) - # assert abs(file_size - 58093.62) < 1.0 - # file_size = os.path.getsize(os.path.join(quantized_model_path, "mmproj-model.gguf")) / 1024**2 - # assert abs(file_size - 3326.18) < 5.0 - # shutil.rmtree(quantized_model_path, ignore_errors=True) diff --git a/test/test_cuda/test_main_func.py b/test/test_cuda/test_main_func.py index 3243963fe..ac8b8b91e 100644 --- a/test/test_cuda/test_main_func.py +++ b/test/test_cuda/test_main_func.py @@ -13,6 +13,8 @@ from auto_round.eval.evaluation import simple_evaluate from auto_round.testing_utils import require_awq, require_gptqmodel, require_optimum, require_package_version_ut +from ..helpers import get_model_path + def get_accuracy(data): match = re.search(r"\|acc\s+\|[↑↓]\s+\|\s+([\d.]+)\|", data) @@ -25,20 +27,26 @@ def get_accuracy(data): class TestMainFunc: - @classmethod - def setup_class(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" + save_dir = "./saved" + tasks = "lambada_openai" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield - @classmethod - def 
teardown_class(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel @require_optimum def test_backend(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=4, group_size=128) @@ -66,7 +74,7 @@ def test_backend(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_backend_awq(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=4, group_size=128) @@ -84,7 +92,7 @@ def test_backend_awq(self): @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @require_gptqmodel def test_fp_layers(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) from auto_round.compressors.utils import get_fp_layer_names @@ -109,7 +117,7 @@ def test_fp_layers(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_fp_layers_awq(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) from auto_round.compressors.utils import get_fp_layer_names @@ -131,17 +139,16 @@ def test_fp_layers_awq(self): shutil.rmtree("./saved", ignore_errors=True) @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") - def test_undivided_group_size_tuning(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_undivided_group_size_tuning(self, tiny_opt_model_path): + model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path, torch_dtype=torch.float16, device_map="auto") + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path) autoround = AutoRound(model, tokenizer, bits=4, group_size=127, nsamples=2, iters=2) autoround.quantize() @require_gptqmodel def test_adam(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRoundAdam(model, tokenizer, bits=4, group_size=128) @@ -162,7 +169,7 @@ def test_autoround_asym(self): ##need to install false except ImportError as e: print("skip autoround asym test, as autoround is not installed from source") return - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=4, group_size=128, 
sym=False) @@ -177,12 +184,12 @@ def test_autoround_asym(self): ##need to install false assert accuracy > 0.35 shutil.rmtree("./saved", ignore_errors=True) - def test_attention_mask_lm_head(self): + def test_attention_mask_lm_head(self, tiny_qwen_moe_model_path): from transformers import AutoTokenizer - model_name = "/models/Qwen3-8B" + # model_name = "/models/Qwen3-8B" # model_name = "/models/Qwen3-0.6B" - tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_moe_model_path) text = ["haha", "hello world"] res = tokenizer(text, return_tensors="pt", max_length=8, padding="max_length", truncation=True) res.data.pop("attention_mask") @@ -194,14 +201,13 @@ def test_attention_mask_lm_head(self): data.append(res.data) from auto_round import AutoRound - ar = AutoRound(model_name, iters=1, dataset=data, seqlen=8, quant_lm_head=True) + ar = AutoRound(tiny_qwen_moe_model_path, iters=1, dataset=data, seqlen=8, quant_lm_head=True) ar.quantize() - def test_low_cpu_mem_usage(self): + def test_low_cpu_mem_usage(self, tiny_opt_model_path): bits, group_size = 4, 32 - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path, torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path, trust_remote_code=True) quantized_model_path = "./saved" autoround = AutoRound( model, diff --git a/test/test_cuda/test_marlin_backend.py b/test/test_cuda/test_marlin_backend.py index 334cb2697..8d7594086 100644 --- a/test/test_cuda/test_marlin_backend.py +++ b/test/test_cuda/test_marlin_backend.py @@ -11,6 +11,20 @@ class TestAutoRoundMarlinBackend: + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) def test_marlin_group_size(self, dataloader): for group_size in [-1, 64]: @@ -28,15 +42,15 @@ def test_marlin_group_size(self, dataloader): seqlen=2, dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") quantization_config = AutoRoundConfig(backend="marlin") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -57,30 +71,20 @@ def test_marlin_group_size(self, dataloader): seqlen=2, dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") quantization_config = 
AutoRoundConfig(backend="marlin") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) assert result["results"]["lambada_openai"]["acc,none"] > 0.14 - @classmethod - def setup_class(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" - - @classmethod - def teardown_class(self): - shutil.rmtree("./saved", ignore_errors=True) - shutil.rmtree("runs", ignore_errors=True) - def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -95,15 +99,15 @@ def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): seqlen=2, dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") quantization_config = AutoRoundConfig(backend="marlin") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -111,10 +115,10 @@ def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -136,18 +140,18 @@ def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): # seqlen=2, # dataset=dataloader, # ) - # quantized_model_path = self.save_folder + # quantized_model_path = self.save_dir # autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") # # quantization_config = AutoRoundConfig(backend="marlin") # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype=torch.float16, # device_map="auto", # quantization_config=quantization_config # ) # - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) # result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) @@ -155,13 +159,13 @@ 
def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): # torch.cuda.empty_cache() # # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype=torch.bfloat16, # device_map="auto", # quantization_config=quantization_config # ) # - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) # result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) diff --git a/test/test_cuda/test_mix_bits.py b/test/test_cuda/test_mix_bits.py index 958b8ba8e..6988709d5 100644 --- a/test/test_cuda/test_mix_bits.py +++ b/test/test_cuda/test_mix_bits.py @@ -14,22 +14,27 @@ require_package_version_ut, ) +from ..helpers import get_model_path + class TestAutoRound: - @classmethod - def setup_class(self): - self.model_name = "/models/opt-125m" - self.save_dir = "./saved" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield - @classmethod - def teardown_class(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel - def test_mixed_gptqmodel(self, dataloader): + def test_mixed_gptqmodel(self, tiny_opt_model_path, dataloader): scheme = "W4A16" layer_config = { "k_proj": {"bits": 8}, # part name @@ -39,7 +44,7 @@ def test_mixed_gptqmodel(self, dataloader): "model.decoder.layers.0.self_attn.q_proj": {"bits": 8}, # full name } autoround = AutoRound( - model=self.model_name, + model=tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, @@ -58,7 +63,7 @@ def test_mixed_gptqmodel(self, dataloader): print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_gptqmodel_convert_to_ar(self, dataloader): + def test_mixed_gptqmodel_convert_to_ar(self, tiny_opt_model_path, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 4}, # set lm_head quant @@ -66,7 +71,7 @@ def test_mixed_gptqmodel_convert_to_ar(self, dataloader): "model.decoder.layers.0.self_attn.q_proj": {"bits": 8}, # full name } autoround = AutoRound( - model=self.model_name, + model=tiny_opt_model_path, scheme="W4A16", iters=2, seqlen=2, @@ -86,7 +91,7 @@ def test_mixed_gptqmodel_convert_to_ar(self, dataloader): print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_autoround_format(self, dataloader): + def test_mixed_autoround_format(self, tiny_opt_model_path, dataloader): layer_config = { "k_proj": {"bits": 8}, "q_proj": {"bits": 3}, @@ -94,7 +99,7 @@ def test_mixed_autoround_format(self, dataloader): "fc1": {"bits": 16}, } autoround = AutoRound( - model=self.model_name, + model=tiny_opt_model_path, scheme="W4A16", iters=2, seqlen=2, @@ -114,14 +119,13 @@ def test_mixed_autoround_format(self, dataloader): @require_awq @require_package_version_ut("transformers", "<4.57.0") - def test_fallback_regex_for_awq_format(self, dataloader): - model_name = "facebook/opt-125m" + def 
test_fallback_regex_for_awq_format(self, tiny_opt_model_path, dataloader): layer_config = { "lm_head": {"bits": 16}, "fc1": {"bits": 16}, } autoround = AutoRound( - model=model_name, + model=tiny_opt_model_path, scheme="W4A16", iters=2, seqlen=2, @@ -140,14 +144,14 @@ def test_fallback_regex_for_awq_format(self, dataloader): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_ar_format_part_name_hf_loading(self, dataloader): + def test_mixed_ar_format_part_name_hf_loading(self, tiny_opt_model_path, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 16}, # full name ".*fc1.*": {"bits": 16}, # standard regex } autoround = AutoRound( - model=self.model_name, + model=tiny_opt_model_path, scheme="W4A16", iters=2, seqlen=2, @@ -207,8 +211,9 @@ def test_mixed_MXFP_autoround_format_loading(self, dataloader): "lm_head": {"bits": 16, "act_bits": 16}, "fc1": {"bits": 8, "act_bits": 8}, } + model_path = get_model_path("facebook/opt-125m") autoround = AutoRound( - self.model_name, + model_path, scheme="MXFP4", iters=2, seqlen=2, @@ -230,13 +235,13 @@ def test_mixed_MXFP_autoround_format_loading(self, dataloader): assert result["results"]["lambada_openai"]["acc,none"] > 0.32 shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_autoround_format_vllm(self, dataloader): + def test_mixed_autoround_format_vllm(self, tiny_opt_model_path, dataloader): layer_config = { "self_attn": {"bits": 8}, "lm_head": {"bits": 16}, } autoround = AutoRound( - self.model, + tiny_opt_model_path, self.tokenizer, scheme="W4A16", iters=2, @@ -270,14 +275,14 @@ def test_mixed_autoround_format_vllm(self, dataloader): print(f"{prompt}: {generated_text}") shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_llmcompressor_format_vllm(self, dataloader): + def test_mixed_llmcompressor_format_vllm(self, tiny_opt_model_path, dataloader): layer_config = { "self_attn": {"bits": 16, "act_bits": 16}, "lm_head": {"bits": 16, "act_bits": 16}, "fc1": {"bits": 16, "act_bits": 16}, } autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme="NVFP4", iters=2, seqlen=2, diff --git a/test/test_cuda/test_multiple_card.py b/test/test_cuda/test_multiple_card.py index 2f29f7a37..e09975a19 100644 --- a/test/test_cuda/test_multiple_card.py +++ b/test/test_cuda/test_multiple_card.py @@ -10,6 +10,8 @@ from auto_round.eval.evaluation import simple_evaluate from auto_round.testing_utils import multi_card, require_gptqmodel, require_greater_than_050 +from ..helpers import get_model_path, get_tiny_model + def get_accuracy(data): match = re.search(r"\|acc\s+\|[↑↓]\s+\|\s+([\d.]+)\|", data) @@ -24,14 +26,20 @@ def get_accuracy(data): # import os # os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" class TestAutoRound: - @classmethod - def setup_class(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" - - @classmethod - def teardown_class(self): - shutil.rmtree(self.save_dir, ignore_errors=True) + save_dir = "./saved" + tasks = "lambada_openai" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @multi_card @@ -53,10 
+61,9 @@ def test_device_map_str(self): shutil.rmtree("./saved", ignore_errors=True) @multi_card - def test_layer_norm(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_layer_norm(self, tiny_opt_model_path): + model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path, torch_dtype=torch.float16) + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path) device_map = {"norm": "cuda:1"} autoround = AutoRound( model, tokenizer, iters=2, device_map=device_map, nsamples=7, seqlen=32, enable_norm_bias_tuning=True @@ -64,10 +71,9 @@ def test_layer_norm(self): autoround.quantize() @multi_card - def test_rms_norm(self): - model_name = "/models/Qwen2-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_rms_norm(self, tiny_qwen_model_path): + model = AutoModelForCausalLM.from_pretrained(tiny_qwen_model_path, torch_dtype=torch.float16) + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_model_path) device_map = {"norm": "cuda:1"} autoround = AutoRound( model, tokenizer, iters=2, device_map=device_map, nsamples=7, seqlen=32, enable_norm_bias_tuning=True @@ -75,10 +81,9 @@ def test_rms_norm(self): autoround.quantize() @multi_card - def test_act_quantization(self): - model_name = "/models/Qwen2-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_act_quantization(self, tiny_qwen_model_path): + model = AutoModelForCausalLM.from_pretrained(tiny_qwen_model_path, torch_dtype=torch.float16) + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_model_path) device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "1"} autoround = AutoRound( model, tokenizer, iters=2, device_map=device_map, nsamples=7, seqlen=32, act_bits=4, act_dynamic=False @@ -87,9 +92,9 @@ def test_act_quantization(self): @multi_card def test_lm_head(self): - model_name = "/models/Qwen2.5-7B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) - tokenizer = AutoTokenizer.from_pretrained(model_name) + model_path = get_model_path("qwen/Qwen2.5-7B-Instruct") + model = get_tiny_model(model_path) + tokenizer = AutoTokenizer.from_pretrained(model_path) device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "1", "lm_head": 1} layer_config = {"lm_head": {"bits": 4}} autoround = AutoRound( @@ -105,10 +110,9 @@ def test_lm_head(self): autoround.quantize() @multi_card - def test_device_map(self): - model_name = "/models/Qwen2-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_device_map(self, tiny_qwen_model_path): + model = AutoModelForCausalLM.from_pretrained(tiny_qwen_model_path, torch_dtype=torch.float16) + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_model_path) device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "cpu"} autoround = AutoRound(model, tokenizer, iters=2, device_map=device_map, nsamples=7, seqlen=32) autoround.quantize() @@ -206,12 +210,11 @@ def test_device_map(self): torch.cuda.empty_cache() @multi_card - def test_device_map_dict(self): + def test_device_map_dict(self, tiny_opt_model_path): device_map = {".*q_proj": 
"0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "1"} bits, group_size, sym = 4, 128, False - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path, torch_dtype=torch.float16, device_map="auto") + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path) autoround = AutoRound( model, tokenizer, @@ -225,9 +228,8 @@ def test_device_map_dict(self): autoround.quantize() # test model_name - model_name = "/models/opt-125m" autoround = AutoRound( - model_name, + tiny_opt_model_path, tokenizer, bits=bits, group_size=group_size, @@ -240,7 +242,7 @@ def test_device_map_dict(self): # test rtn autoround = AutoRound( - model_name, + tiny_opt_model_path, tokenizer, bits=bits, group_size=group_size, @@ -352,7 +354,7 @@ def test_device_map_for_triton(self): @multi_card def test_mllm_device_map(self): - model_name = "/models/Qwen2-VL-2B-Instruct/" + model_name = get_model_path("qwen/Qwen2-VL-2B-Instruct/") from auto_round import AutoRoundMLLM device_map = "0,1" diff --git a/test/test_cuda/test_multiple_card_calib.py b/test/test_cuda/test_multiple_card_calib.py index 410855c33..e82d0b330 100644 --- a/test/test_cuda/test_multiple_card_calib.py +++ b/test/test_cuda/test_multiple_card_calib.py @@ -19,13 +19,19 @@ def get_accuracy(data): class TestAutoRound: - @classmethod - def setup_class(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" + save_dir = "./saved" + tasks = "lambada_openai" - @classmethod - def teardown_class(self): + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) diff --git a/test/test_cuda/test_mxfp_and_nvfp_quant.py b/test/test_cuda/test_mxfp_and_nvfp_quant.py index 0dc43b093..808fa4a28 100644 --- a/test/test_cuda/test_mxfp_and_nvfp_quant.py +++ b/test/test_cuda/test_mxfp_and_nvfp_quant.py @@ -12,6 +12,8 @@ from auto_round.export.export_to_autoround import qlinear_fp as ar_qlinear_fp from auto_round.testing_utils import has_module +from ..helpers import get_model_path + testing_schemes = [AutoRoundFormat.MXFP8.value, AutoRoundFormat.MXFP4.value, AutoRoundFormat.NVFP4.value] QMODULE_MAPPING = { AutoRoundFormat.MXFP8.value: ar_qmodules.MXFP8QuantLinear, @@ -22,15 +24,14 @@ @pytest.mark.parametrize("scheme", testing_schemes) @torch.inference_mode() -def test_e2e_quant_and_infer(scheme): +def test_e2e_quant_and_infer(scheme, tiny_qwen_model_path): # Use a temporary directory for saving the quantized model with tempfile.TemporaryDirectory() as temp_dir: - model_name = "Qwen/Qwen2.5-0.5B-Instruct" # Load the tokenizer and model - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_model_path, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained( - model_name, + tiny_qwen_model_path, device_map="cpu", torch_dtype="auto", trust_remote_code=True, diff --git a/test/test_cuda/test_mxfp_nvfp.py b/test/test_cuda/test_mxfp_nvfp.py index 38116f3be..41c996b95 100644 --- a/test/test_cuda/test_mxfp_nvfp.py +++ 
b/test/test_cuda/test_mxfp_nvfp.py @@ -9,20 +9,27 @@ from auto_round import AutoRound from auto_round.testing_utils import require_awq, require_optimum +from ..helpers import get_model_path, get_tiny_model + class TestAutoRound: - @classmethod - def setup_class(self): - self.model_name = "facebook/opt-125m" - self.save_dir = "./saved" + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield - @classmethod - def teardown_class(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) def test_fp8input_mxfp4_llmcompressor_format(self, dataloader): - model_name = "/models/Qwen3-0.6B-FP8" + model_name = get_model_path("qwen/Qwen3-0.6B-FP8") scheme = "mxfp4" ar = AutoRound( model=model_name, @@ -47,10 +54,10 @@ def test_fp8input_mxfp4_llmcompressor_format(self, dataloader): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree(self.save_dir, ignore_errors=True) - def test_nvfp4_llmcompressor_format(self, dataloader): + def test_nvfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): scheme = "nvfp4" autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, @@ -58,7 +65,7 @@ def test_nvfp4_llmcompressor_format(self, dataloader): ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_global_scale") @@ -98,11 +105,11 @@ def test_nvfp4_llmcompressor_format(self, dataloader): # if "France" in prompt: # assert "Paris" in generated_text - def test_nvfp4_moe_actmax_rtn(self, dataloader): - model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" + def test_nvfp4_moe_actmax_rtn(self, tiny_deepseek_v2_model_path, dataloader): + # model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" scheme = "nvfp4" autoround = AutoRound( - model_name, + tiny_deepseek_v2_model_path, scheme=scheme, iters=0, seqlen=2, @@ -113,11 +120,10 @@ def test_nvfp4_moe_actmax_rtn(self, dataloader): quantized_model_path = self.save_dir autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") - def test_nvfp4_moe_actmax_ar(self, dataloader): - model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" + def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader): scheme = "nvfp4" autoround = AutoRound( - model_name, + tiny_deepseek_v2_model_path, scheme=scheme, iters=1, seqlen=2, @@ -129,7 +135,7 @@ def test_nvfp4_moe_actmax_ar(self, dataloader): autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") def test_qwen_moe_quant_infer(self, dataloader): - model_name = "/models/Qwen1.5-MoE-A2.7B" + model_name = get_model_path("qwen/Qwen1.5-MoE-A2.7B") layer_config = { "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, } diff --git a/test/test_cuda/test_qbits.py b/test/test_cuda/test_qbits.py index 0ce3597db..37e119b2c 100644 --- a/test/test_cuda/test_qbits.py +++ b/test/test_cuda/test_qbits.py @@ -6,17 +6,23 @@ from auto_round import AutoRound, 
AutoRoundConfig from auto_round.testing_utils import require_gptqmodel, require_itrex -from ..helpers import model_infer +from ..helpers import get_model_path, model_infer class TestAutoRound: - @classmethod - def setup_class(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" + save_dir = "./saved" - @classmethod - def teardown_class(self): + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) ## require torch 2.6 @@ -50,8 +56,9 @@ def test_load_gptq_model_2bits(self): @require_itrex def test_mixed_precision(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_path = get_model_path("facebook/opt-125m") + model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) layer_config = {} layer_config["model.decoder.layers.0.self_attn.k_proj"] = {"bits": 8} @@ -64,27 +71,29 @@ def test_mixed_precision(self): autoround = AutoRound( model, tokenizer, bits=bits, group_size=group_size, iters=1, nsamples=1, sym=sym, layer_config=layer_config ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, + self.save_dir, torch_dtype=torch.float16, device_map="cpu", ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) print(res) assert "!!!" not in res - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_gptqmodel - def test_autoround_sym(self): + def test_autoround_sym(self, tiny_opt_model_path): for bits in [4]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained( + tiny_opt_model_path, torch_dtype="auto", trust_remote_code=True + ) + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path, trust_remote_code=True) bits, group_size, sym = bits, 128, True autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2) quantized_model_path = "./saved" @@ -100,4 +109,4 @@ def test_autoround_sym(self): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) print(res) assert "!!!" 
not in res - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cuda/test_scheme.py b/test/test_cuda/test_scheme.py index d6fe43374..2ed5527bd 100644 --- a/test/test_cuda/test_scheme.py +++ b/test/test_cuda/test_scheme.py @@ -5,45 +5,52 @@ from auto_round import AutoRound from auto_round.schemes import QuantizationScheme +from ..helpers import get_model_path + class TestAutoRound: - @classmethod - def setup_class(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" - - @classmethod - def teardown_class(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) # Tuning tests - def test_gguf(self): - ar = AutoRound("/models/Qwen3-0.6B", scheme="W2A16", nsamples=1, iters=1) - ar.quantize_and_save(self.save_folder, format="gguf:q4_k_m") + def test_gguf(self, tiny_qwen_model_path): + ar = AutoRound(tiny_qwen_model_path, scheme="W2A16", nsamples=1, iters=1) + ar.quantize_and_save(self.save_dir, format="gguf:q4_k_m") assert ar.bits == 4 - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) - def test_w4a16(self): - ar = AutoRound(self.model_name, scheme="W4A16", nsamples=1, iters=1) + def test_w4a16(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1) assert ar.bits == 4 ar.quantize() - def test_w2a16(self): - ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=1) + def test_w2a16(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=1) assert ar.bits == 2 ar.quantize() - def test_mxfp4(self): - ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=1) + def test_mxfp4(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1) assert ar.bits == 4 assert ar.act_bits == 4 assert ar.data_type == "mx_fp" assert ar.act_data_type == "mx_fp_rceil" ar.quantize() - def test_fp8_static(self): - ar = AutoRound(self.model_name, scheme="FP8_STATIC", nsamples=1, iters=1) + def test_fp8_static(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="FP8_STATIC", nsamples=1, iters=1) assert ar.bits == 8 assert ar.act_bits == 8 assert ar.data_type == "fp" @@ -53,21 +60,21 @@ def test_fp8_static(self): ar.quantize() ## RTN tests - def test_w2a16_rtn(self): - ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=0) + def test_w2a16_rtn(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0) assert ar.bits == 2 ar.quantize() - def test_mxfp4_rtn(self): - ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=0) + def test_mxfp4_rtn(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=0) assert ar.bits == 4 assert ar.act_bits == 4 assert ar.data_type == "mx_fp" assert ar.act_data_type == "mx_fp_rceil" ar.quantize() - def test_fp8_static_rtn(self): - ar = AutoRound(self.model_name, scheme="FP8_STATIC", nsamples=1, iters=0) + def 
test_fp8_static_rtn(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="FP8_STATIC", nsamples=1, iters=0) assert ar.bits == 8 assert ar.act_bits == 8 assert ar.data_type == "fp" @@ -77,12 +84,13 @@ def test_fp8_static_rtn(self): ar.quantize() def test_scheme_in_layer_config(self): + model_path = get_model_path("facebook/opt-125m") layer_config = { "model.decoder.layers.2.self_attn": {"bits": 2}, "model.decoder.layers.3.self_attn.v_proj": "W8A16", "model.decoder.layers.4.self_attn.k_proj": QuantizationScheme.from_dict({"group_size": 64}), } - ar = AutoRound(self.model_name, scheme="W3A16", nsamples=1, iters=1, layer_config=layer_config) + ar = AutoRound(model_path, scheme="W3A16", nsamples=1, iters=1, layer_config=layer_config) ar.quantize() for n, m in ar.model.named_modules(): diff --git a/test/test_cuda/test_torch_backend.py b/test/test_cuda/test_torch_backend.py index 5244725e8..a7eb30552 100644 --- a/test/test_cuda/test_torch_backend.py +++ b/test/test_cuda/test_torch_backend.py @@ -8,24 +8,30 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel -from ..helpers import model_infer +from ..helpers import get_model_path, model_infer class TestAutoRoundTorchBackend: - @classmethod - def setup_class(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" + save_dir = "./saved" - @classmethod - def teardown_class(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) def test_torch_4bits_asym(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_path = get_model_path("facebook/opt-125m") + model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( model, @@ -37,7 +43,7 @@ def test_torch_4bits_asym(self, dataloader): seqlen=2, dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") quantization_config = AutoRoundConfig(backend="torch") @@ -45,7 +51,7 @@ def test_torch_4bits_asym(self, dataloader): quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -53,10 +59,10 @@ def test_torch_4bits_asym(self, dataloader): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_dir, 
torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -65,8 +71,9 @@ def test_torch_4bits_asym(self, dataloader): shutil.rmtree("./saved", ignore_errors=True) def test_torch_4bits_sym(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_path = get_model_path("facebook/opt-125m") + model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( model, @@ -78,7 +85,7 @@ def test_torch_4bits_sym(self, dataloader): seqlen=2, dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model quantization_config = AutoRoundConfig(backend="torch") @@ -86,10 +93,10 @@ def test_torch_4bits_sym(self, dataloader): quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) assert result["results"]["lambada_openai"]["acc,none"] > 0.28 torch.cuda.empty_cache() - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) From cb5acf61fed19c1be9aa40063773b21f2bb7aef7 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Tue, 23 Dec 2025 00:48:05 -0500 Subject: [PATCH 17/24] add ./tmp as workspace and remove duplicate UTs Signed-off-by: He, Xin3 --- test/fixtures.py | 29 +++++++----- test/helpers.py | 10 +++-- test/test_cpu/test_gguf_format.py | 67 ++++++++++++++-------------- test/test_cpu/test_gpt_oss.py | 74 ------------------------------- test/test_cpu/test_moe_model.py | 59 +++++++++++++++++++----- 5 files changed, 104 insertions(+), 135 deletions(-) delete mode 100644 test/test_cpu/test_gpt_oss.py diff --git a/test/fixtures.py b/test/fixtures.py index 86bc36e48..c76040322 100644 --- a/test/fixtures.py +++ b/test/fixtures.py @@ -26,7 +26,7 @@ @pytest.fixture(scope="session") def tiny_opt_model_path(): model_name_or_path = opt_name_or_path - tiny_model_path = "./tmp_tiny_opt_model_path" + tiny_model_path = "./tmp/tiny_opt_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -35,7 +35,7 @@ def tiny_opt_model_path(): @pytest.fixture(scope="session") def tiny_lamini_model_path(): model_name_or_path = lamini_name_or_path - tiny_model_path = "./tmp_tiny_lamini_model_path" + tiny_model_path = "./tmp/tiny_lamini_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -44,7 +44,7 @@ def tiny_lamini_model_path(): @pytest.fixture(scope="session") def 
tiny_gptj_model_path(): model_name_or_path = gptj_name_or_path - tiny_model_path = "./tmp_tiny_gptj_model_path" + tiny_model_path = "./tmp/tiny_gptj_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -53,7 +53,7 @@ def tiny_gptj_model_path(): @pytest.fixture(scope="session") def tiny_phi2_model_path(): model_name_or_path = phi2_name_or_path - tiny_model_path = "./tmp_tiny_phi2_model_path" + tiny_model_path = "./tmp/tiny_phi2_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -62,7 +62,7 @@ def tiny_phi2_model_path(): @pytest.fixture(scope="session") def tiny_deepseek_v2_model_path(): model_name_or_path = deepseek_v2_name_or_path - tiny_model_path = "./tmp_tiny_deepseek_v2_model_path" + tiny_model_path = "./tmp/tiny_deepseek_v2_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -71,7 +71,7 @@ def tiny_deepseek_v2_model_path(): @pytest.fixture(scope="session") def tiny_gemma_model_path(): model_name_or_path = gemma_name_or_path - tiny_model_path = "./tmp_tiny_gemma_model_path" + tiny_model_path = "./tmp/tiny_gemma_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -80,7 +80,7 @@ def tiny_gemma_model_path(): @pytest.fixture(scope="session") def tiny_qwen_model_path(): model_name_or_path = qwen_name_or_path - tiny_model_path = "./tmp_tiny_qwen_model_path" + tiny_model_path = "./tmp/tiny_qwen_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -89,7 +89,7 @@ def tiny_qwen_model_path(): @pytest.fixture(scope="session") def tiny_untied_qwen_model_path(): model_name_or_path = qwen_name_or_path - tiny_model_path = "./tmp_tiny_untied_qwen_model_path" + tiny_model_path = "./tmp/tiny_untied_qwen_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, force_untie=True) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -98,7 +98,7 @@ def tiny_untied_qwen_model_path(): @pytest.fixture(scope="session") def tiny_qwen_moe_model_path(): model_name_or_path = qwen_moe_name_or_path - tiny_model_path = "./tmp_tiny_qwen_moe_model_path" + tiny_model_path = "./tmp/tiny_qwen_moe_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -107,7 +107,7 @@ def tiny_qwen_moe_model_path(): @pytest.fixture(scope="session") def tiny_qwen_vl_model_path(): model_name_or_path = qwen_vl_name_or_path - tiny_model_path = "./tmp_tiny_qwen_vl_model_path" + tiny_model_path = "./tmp/tiny_qwen_vl_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2, is_mllm=True) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -116,12 +116,19 @@ def tiny_qwen_vl_model_path(): @pytest.fixture(scope="session") def tiny_qwen_2_5_vl_model_path(): model_name_or_path = qwen_2_5_vl_name_or_path - tiny_model_path = "./tmp_tiny_qwen_2_5_vl_model_path" + tiny_model_path = "./tmp/tiny_qwen_2_5_vl_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2, is_mllm=True) yield tiny_model_path shutil.rmtree(tiny_model_path) +@pytest.fixture(autouse=True, scope="session") +def 
clean_tmp_model_folder(): + yield + shutil.rmtree("./tmp", ignore_errors=True) # unittest default workspace + shutil.rmtree("./tmp_autoround", ignore_errors=True) # autoround default workspace + + # Create objective fixtures for testing @pytest.fixture(scope="function") def tiny_opt_model(): diff --git a/test/helpers.py b/test/helpers.py index f30e632f7..89b832c6d 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -43,14 +43,15 @@ def get_tiny_model(model_name_or_path, num_layers=2, is_mllm=False, **kwargs): def slice_layers(module): """slice layers in the model.""" + sliced = False for name, child in module.named_children(): if isinstance(child, torch.nn.ModuleList) and len(child) > num_layers: new_layers = torch.nn.ModuleList(child[:num_layers]) setattr(module, name, new_layers) - return True - if slice_layers(child): - return True - return False + sliced = True + elif slice_layers(child): + sliced = True + return sliced kwargs["dtype"] = "auto" if "auto" not in kwargs else kwargs["dtype"] kwargs["trust_remote_code"] = True if "trust_remote_code" not in kwargs else kwargs["trust_remote_code"] @@ -63,6 +64,7 @@ def slice_layers(module): model.config.vision_config.depth = num_layers else: model, tokenizer = llm_load_model(model_name_or_path, **kwargs) + slice_layers(model) if hasattr(model.config, "num_hidden_layers"): diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index b7f25541c..366819234 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -4,29 +4,29 @@ import pytest import torch -import transformers from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -from ..helpers import get_tiny_model, qwen_name_or_path, qwen_vl_name_or_path +from ..helpers import get_model_path, get_tiny_model class TestGGUF: @classmethod def setup_class(self): - self.tokenizer = AutoTokenizer.from_pretrained(qwen_name_or_path, trust_remote_code=True) + self.model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @classmethod def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_basic_usage(self, tiny_gemma_model_path, tiny_qwen_model_path): + def test_basic_usage(self): python_path = sys.executable res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {tiny_gemma_model_path} " + f"cd ../.. && {python_path} -m auto_round --model {get_model_path('benzart/gemma-2b-it-fine-tuning-for-code-test')} " f" --bs 16 --iters 0 --nsamples 1 --format gguf:q4_k_m" ) if res > 0 or res == -1: @@ -34,17 +34,17 @@ def test_basic_usage(self, tiny_gemma_model_path, tiny_qwen_model_path): shutil.rmtree("./saved", ignore_errors=True) res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {tiny_qwen_model_path}" + f"cd ../.. 
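The slice_layers change in test/helpers.py above turns the helper from one that stopped after the first oversized torch.nn.ModuleList into one that trims every stack it finds before get_tiny_model calls it on the loaded model. A small self-contained sketch of that behaviour on a toy module (num_layers is passed explicitly here, unlike the enclosing-scope variable used in helpers.py):

    import torch

    def slice_layers(module, num_layers=2):
        # Trim every ModuleList longer than num_layers, recursing into children.
        sliced = False
        for name, child in module.named_children():
            if isinstance(child, torch.nn.ModuleList) and len(child) > num_layers:
                setattr(module, name, torch.nn.ModuleList(child[:num_layers]))
                sliced = True
            elif slice_layers(child, num_layers):
                sliced = True
        return sliced

    class Toy(torch.nn.Module):
        def __init__(self):
            super().__init__()
            # Two separate stacks, e.g. a text tower and a vision tower.
            self.text = torch.nn.ModuleList(torch.nn.Linear(4, 4) for _ in range(6))
            self.vision = torch.nn.ModuleList(torch.nn.Linear(4, 4) for _ in range(5))

    toy = Toy()
    slice_layers(toy)
    assert len(toy.text) == 2 and len(toy.vision) == 2  # both stacks trimmed, not just the first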
&& {python_path} -m auto_round --model {self.model_name}" f" --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" shutil.rmtree("./saved", ignore_errors=True) - def test_q4_0(self, tiny_qwen_model_path): + def test_q4_0(self): bits, group_size, sym = 4, 32, True autoround = AutoRound( - tiny_qwen_model_path, + self.model_name, bits=bits, group_size=group_size, sym=sym, @@ -61,12 +61,13 @@ def test_q4_0(self, tiny_qwen_model_path): text = "There is a girl who likes adventure," inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) + shutil.rmtree("./saved", ignore_errors=True) - def test_func(self, tiny_qwen_model_path): + def test_func(self): bits, group_size, sym = 4, 128, True autoround = AutoRound( - tiny_qwen_model_path, + self.model_name, iters=1, nsamples=1, seqlen=10, @@ -83,8 +84,8 @@ def test_func(self, tiny_qwen_model_path): print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_gguf_baseline(self, tiny_qwen_model_path): - model_name = tiny_qwen_model_path + def test_gguf_baseline(self): + model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) autoround = AutoRound( model, @@ -102,16 +103,16 @@ def test_gguf_baseline(self, tiny_qwen_model_path): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="fake") - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto") text = "There is a girl who likes adventure," inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_q4_k_m(self, tiny_qwen_model_path, dataloader): - model = get_tiny_model(qwen_name_or_path, num_layers=4) - tokenizer = transformers.AutoTokenizer.from_pretrained(qwen_name_or_path, trust_remote_code=True) + def test_q4_k_m(self, dataloader): + model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") + model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) layer_config = { "lm_head": { "bits": 4, @@ -122,8 +123,8 @@ def test_q4_k_m(self, tiny_qwen_model_path, dataloader): "super_group_size": 8, }, "model.embed_tokens": {"bits": 6, "group_size": 32, "super_bits": 6, "super_group_size": 8}, - "model.layers.3.mlp.gate_proj": {"bits": 3}, - "model.layers.1.mlp.gate_proj": {"bits": 8}, + "model.layers.12.mlp.gate_proj": {"bits": 3}, + "model.layers.10.mlp.gate_proj": {"bits": 8}, } autoround = AutoRound( model, @@ -137,26 +138,26 @@ def test_q4_k_m(self, tiny_qwen_model_path, dataloader): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") - assert autoround.layer_config["model.layers.2.self_attn.v_proj"]["super_group_size"] == 16 - assert autoround.layer_config["model.layers.2.self_attn.v_proj"]["data_type"] == "int_sym_dq" - assert autoround.layer_config["model.layers.0.self_attn.v_proj"]["data_type"] == "int_asym_dq" + assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["super_group_size"] == 16 + assert 
autoround.layer_config["model.layers.11.self_attn.v_proj"]["data_type"] == "int_sym_dq" + assert autoround.layer_config["model.layers.7.self_attn.v_proj"]["data_type"] == "int_asym_dq" assert autoround.model.model.layers[0].self_attn.v_proj.bits == 6 - assert autoround.model.model.layers[3].self_attn.v_proj.bits == 4 + assert autoround.model.model.layers[12].self_attn.v_proj.bits == 4 assert autoround.model.model.embed_tokens.bits == 6 assert autoround.model.model.embed_tokens.group_size == 16 - assert autoround.model.model.layers[3].mlp.gate_proj.bits == 3 - assert autoround.model.model.layers[1].mlp.gate_proj.bits == 8 - assert autoround.layer_config["model.layers.1.mlp.gate_proj"]["mostly"] == "gguf:q8_0" + assert autoround.model.model.layers[12].mlp.gate_proj.bits == 3 + assert autoround.model.model.layers[10].mlp.gate_proj.bits == 8 + assert autoround.layer_config["model.layers.10.mlp.gate_proj"]["mostly"] == "gguf:q8_0" shutil.rmtree("./saved", ignore_errors=True) - model = AutoModelForCausalLM.from_pretrained(tiny_qwen_model_path, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) autoround = AutoRound(model, tokenizer, iters=0, nsamples=1, seqlen=128, disable_opt_rtn=False) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") shutil.rmtree("./saved", ignore_errors=True) - def test_all_format(self, tiny_qwen_model_path): - model_name = tiny_qwen_model_path + def test_all_format(self): + model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") python_path = sys.executable # for gguf_format in ["gguf:q4_0", "gguf:q4_1", "gguf:q4_k_m", "gguf:q6_k"]: for gguf_format in ["gguf:q4_k_m"]: @@ -186,9 +187,7 @@ def test_all_format(self, tiny_qwen_model_path): shutil.rmtree("../../tmp_autoround", ignore_errors=True) def test_vlm_gguf(self): - # TODO: Using two-layers tiny model will return ValueError: - # Can not map tensor 'model.layers.10.input_layernorm.weight' - model_name = qwen_vl_name_or_path + model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") from auto_round import AutoRoundMLLM from auto_round.utils import mllm_load_model @@ -212,7 +211,7 @@ def test_vlm_gguf(self): assert abs(file_size - 892) < 5.0 shutil.rmtree("./saved", ignore_errors=True) - def test_qtype_setting(self, tiny_qwen_model_path): + def test_qtype_setting(self): # Qwen2.5-0.5B-Instruct no output, token_embed q6_k fallbakc to q8_0 336M # Qwen3-0.6B output q6_k, token_embed q4_0 448M # Qwen3-8B output q6_k, token_embed q4_0 4.5G @@ -220,7 +219,7 @@ def test_qtype_setting(self, tiny_qwen_model_path): from auto_round.compressors.utils import set_layer_config from auto_round.export.export_to_gguf.config import ModelType - model_name = tiny_qwen_model_path + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) ar.formats = ["gguf:q4_0"] ar.layer_config, _, _ = set_layer_config( @@ -239,7 +238,7 @@ def test_qtype_setting(self, tiny_qwen_model_path): assert ar.layer_config["model.embed_tokens"]["bits"] == 8 assert "lm_head" not in ar.layer_config - model_name = tiny_qwen_model_path + model_name = "Qwen/Qwen3-0.6B" ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) ar.formats = ["gguf:q4_0"] ar.layer_config, _, _ = set_layer_config( diff --git a/test/test_cpu/test_gpt_oss.py b/test/test_cpu/test_gpt_oss.py deleted file mode 100644 index b82c04c31..000000000 --- 
a/test/test_cpu/test_gpt_oss.py +++ /dev/null @@ -1,74 +0,0 @@ -import pytest -from transformers import AutoConfig, AutoTokenizer -from transformers.models.gpt_oss.modeling_gpt_oss import GptOssForCausalLM - -from auto_round import AutoRound - -from ..helpers import get_model_path - - -@pytest.fixture -def setup_gpt_oss(): - """Fixture to set up the GPT-OSS model and tokenizer.""" - model_name = get_model_path("unsloth/gpt-oss-20b-BF16") - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) - config.num_hidden_layers = 1 # Reduce layers for testing - model = GptOssForCausalLM(config) - output_dir = "/tmp/test_quantized_gpt_oss" - return model, tokenizer, output_dir, config - - -def quantize_model(model, tokenizer, output_dir, scheme, iters=0): - """Helper function to quantize the model with the given scheme.""" - autoround = AutoRound( - model, - tokenizer, - scheme=scheme, - nsamples=2, - iters=iters, - fp_layers="self_attn,router,lm_head,mlp.gate", - ) - quantized_model, save_folder = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) - return quantized_model - - -def count_modules_by_type(model, target_module_name_or_class): - """Helper function to count modules of a specific type in the model.""" - cnt = 0 - for name, module in model.named_modules(): - if isinstance(target_module_name_or_class, str): - if target_module_name_or_class == module.__class__.__name__: - cnt += 1 - else: - if isinstance(module, target_module_name_or_class): - cnt += 1 - return cnt - - -@pytest.mark.parametrize("scheme", ["MXFP4", "MXFP8"]) -def test_quantization(setup_gpt_oss, scheme): - """Test quantization with the scheme.""" - model, tokenizer, output_dir, config = setup_gpt_oss - quantized_model = quantize_model(model, tokenizer, output_dir, scheme) - - # Ensure the quantized model is not None - assert quantized_model is not None, "Quantized model should not be None." - from auto_round.export.export_to_autoround.qlinear_fp import QuantLinear - from auto_round.modelling.gpt_oss import GPTOssSingleExpert - - single_expert_cnt = count_modules_by_type(quantized_model, GPTOssSingleExpert) - quant_linear_cnt = count_modules_by_type(quantized_model, QuantLinear) - assert ( - single_expert_cnt == config.num_local_experts - ), f"Expected {config.num_local_experts} GPTOssSingleExpert modules, found {single_expert_cnt}." - assert ( - quant_linear_cnt == config.num_hidden_layers * 3 * config.num_local_experts - ), f"Expected {config.num_hidden_layers * 3 * config.num_local_experts} QuantLinear modules, found {quant_linear_cnt}." 
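For reference, the module-count check removed here (and re-added in test_moe_model.py below) is plain arithmetic: with the single remaining decoder layer, the converted model should expose one GPTOssSingleExpert per local expert and three QuantLinear modules per expert per layer. A worked example, where num_local_experts = 32 is only an illustrative value (the fixture pins num_hidden_layers = 1 but leaves the expert count to the checkpoint config):

    num_hidden_layers = 1     # forced by the setup_gpt_oss fixture
    num_local_experts = 32    # illustrative; the real test reads it from config
    expected_single_experts = num_local_experts                          # 32
    expected_quant_linears = num_hidden_layers * 3 * num_local_experts   # 96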
- - print(f"[{scheme}] Total {GPTOssSingleExpert.__name__} modules: {single_expert_cnt}") - print(f"[{scheme}] Total {QuantLinear.__name__} modules: {quant_linear_cnt}") - # clean the output directory after test - import shutil - - shutil.rmtree(output_dir, ignore_errors=True) diff --git a/test/test_cpu/test_moe_model.py b/test/test_cpu/test_moe_model.py index 62bac4efc..c30ab0e39 100644 --- a/test/test_cpu/test_moe_model.py +++ b/test/test_cpu/test_moe_model.py @@ -8,29 +8,35 @@ from ..helpers import get_model_path +gpt_oss_name_or_path = get_model_path("unsloth/gpt-oss-20b-BF16") +llama4_name_or_path = get_model_path("meta-llama/Llama-4-Scout-17B-16E-Instruct") + +# local path for debug +# llama4_name_or_path = get_model_path("/dataset/Llama-4-Scout-17B-16E-Instruct") + @pytest.fixture def setup_gpt_oss(): """Fixture to set up the GPT-OSS model and tokenizer.""" - model_name = get_model_path("unsloth/gpt-oss-20b-BF16") + model_name = gpt_oss_name_or_path tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) config.num_hidden_layers = 1 # Reduce layers for testing model = GptOssForCausalLM(config) - output_dir = "/tmp/test_quantized_gpt_oss" + output_dir = "./tmp/test_quantized_gpt_oss" return model, tokenizer, output_dir, config @pytest.fixture def setup_llama4(): """Fixture to set up the llama4 model and tokenizer.""" - model_name = get_model_path("meta-llama/Llama-4-Scout-17B-16E-Instruct") + model_name = llama4_name_or_path tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) config.vision_config.num_hidden_layers = 2 # Reduce layers for testing config.text_config.num_hidden_layers = 2 model = Llama4ForConditionalGeneration(config) - output_dir = "/tmp/test_quantized_llama4" + output_dir = "./tmp/test_quantized_llama4" return model, tokenizer, output_dir, config @@ -48,23 +54,52 @@ def quantize_model(model, tokenizer, output_dir, scheme, iters=0): return quantized_model -def test_gptoss(setup_gpt_oss): +def count_modules_by_type(model, target_module_name_or_class): + """Helper function to count modules of a specific type in the model.""" + cnt = 0 + for name, module in model.named_modules(): + if isinstance(target_module_name_or_class, str): + if target_module_name_or_class == module.__class__.__name__: + cnt += 1 + else: + if isinstance(module, target_module_name_or_class): + cnt += 1 + return cnt + + +@pytest.mark.parametrize("scheme", ["MXFP4", "MXFP8"]) +def test_gptoss(setup_gpt_oss, scheme): model, tokenizer, output_dir, config = setup_gpt_oss # Below parameter is set to be same as the full model # Remove it to avoid mismatch during quantized model loading delattr(model.config, "layer_types") - quantized_model = quantize_model(model, tokenizer, output_dir, "MXFP4") + quantized_model = quantize_model(model, tokenizer, output_dir, scheme) # Ensure the quantized model is not None assert quantized_model is not None, "Quantized model should not be None." 
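The module-level paths above rely on helpers.get_model_path, which this patch uses throughout to replace hard-coded locations such as "/models/opt-125m" with hub ids like "facebook/opt-125m"; its implementation is not part of these hunks. A minimal sketch of what such a resolver could look like, assuming a "/models" mirror root and this exact signature (both assumptions, not taken from the repository):

    import os

    def get_model_path(name_or_path: str, mirror_root: str = "/models") -> str:
        # Prefer a local mirror directory when it exists; otherwise return the
        # original id so transformers falls back to downloading from the hub.
        local_candidate = os.path.join(mirror_root, os.path.basename(name_or_path.rstrip("/")))
        return local_candidate if os.path.isdir(local_candidate) else name_or_path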
- - loaded_model = GptOssForCausalLM.from_pretrained(output_dir) - for n, m in quantized_model.named_modules(): - if m.__class__.__name__ == "QuantLinear": - loaded_m = loaded_model.get_submodule(n) - assert (loaded_m.weight_packed.to("cpu") == m.weight_packed.to("cpu")).all() + from auto_round.export.export_to_autoround.qlinear_fp import QuantLinear + from auto_round.modelling.gpt_oss import GPTOssSingleExpert + + single_expert_cnt = count_modules_by_type(quantized_model, GPTOssSingleExpert) + quant_linear_cnt = count_modules_by_type(quantized_model, QuantLinear) + assert ( + single_expert_cnt == config.num_local_experts + ), f"Expected {config.num_local_experts} GPTOssSingleExpert modules, found {single_expert_cnt}." + assert ( + quant_linear_cnt == config.num_hidden_layers * 3 * config.num_local_experts + ), f"Expected {config.num_hidden_layers * 3 * config.num_local_experts} QuantLinear modules, found {quant_linear_cnt}." + + print(f"[{scheme}] Total {GPTOssSingleExpert.__name__} modules: {single_expert_cnt}") + print(f"[{scheme}] Total {QuantLinear.__name__} modules: {quant_linear_cnt}") + + if scheme == "MXFP4": + loaded_model = GptOssForCausalLM.from_pretrained(output_dir) + for n, m in quantized_model.named_modules(): + if m.__class__.__name__ == "QuantLinear": + loaded_m = loaded_model.get_submodule(n) + assert (loaded_m.weight_packed.to("cpu") == m.weight_packed.to("cpu")).all() # clean the output directory after test shutil.rmtree(output_dir, ignore_errors=True) From cd26af94aa0787aedd6082f9298dda6d5e951382 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Tue, 23 Dec 2025 01:32:28 -0500 Subject: [PATCH 18/24] revert ark change and add some gguf tiny model back Signed-off-by: He, Xin3 --- test/test_ark/test_model.py | 145 +++++++++++++++++++----------- test/test_cpu/test_gguf_format.py | 10 +-- 2 files changed, 97 insertions(+), 58 deletions(-) diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py index b8dfdca5c..09d8bf25a 100644 --- a/test/test_ark/test_model.py +++ b/test/test_ark/test_model.py @@ -1,6 +1,10 @@ import shutil +import sys import pytest + +sys.path.insert(0, "../..") + import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -8,76 +12,111 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel -from ..helpers import model_infer +class LLMDataLoader: + def __init__(self): + self.batch_size = 1 + + def __iter__(self): + for i in range(2): + yield torch.ones([1, 10], dtype=torch.long) -class TestAutoRoundTorchBackend: - @pytest.fixture(autouse=True, scope="class") - def setup_and_teardown_class(self): - # ===== SETUP (setup_class) ===== - print("[Setup] Running before any test in class") +class TestAutoRoundARKBackend: - # Yield to hand control to the test methods - yield + @classmethod + def setup_class(self): + self.model_name = "facebook/opt-125m" + self.save_folder = "./saved" + self.llm_dataloader = LLMDataLoader() - # ===== TEARDOWN (teardown_class) ===== - print("[Teardown] Running after all tests in class") - shutil.rmtree("./saved", ignore_errors=True) + @classmethod + def teardown_class(self): + shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_torch_4bits_sym_cpu(self, opt_model, opt_tokenizer, dataloader): - bits, group_size, sym = 4, 32, True - autoround = AutoRound( - opt_model, - opt_tokenizer, - bits=bits, - group_size=group_size, - sym=sym, - iters=0, - seqlen=2, - 
dataset=dataloader, - ) - quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") + def model_infer(self, model, tokenizer): + prompts = [ + "Hello,my name is", + # "The president of the United States is", + # "The capital of France is", + # "The future of AI is", + ] - quantization_config = AutoRoundConfig(backend="ark") - model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, dtype=torch.float16, device_map="cpu", quantization_config=quantization_config - ) + inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - model_infer(model, tokenizer) - result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000) - print(result["results"]["lambada_openai"]["acc,none"]) - assert result["results"]["lambada_openai"]["acc,none"] > 0.28 - - shutil.rmtree("./saved", ignore_errors=True) - - def test_torch_4bits_sym_xpu(self, opt_model, opt_tokenizer, dataloader): - bits, group_size, sym = 4, 32, True - autoround = AutoRound( - opt_model, - opt_tokenizer, - bits=bits, - group_size=group_size, - sym=sym, - iters=0, - seqlen=2, - dataset=dataloader, + outputs = model.generate( + input_ids=inputs["input_ids"].to(model.device), + attention_mask=inputs["attention_mask"].to(model.device), + do_sample=False, ## change this to follow official usage + max_new_tokens=5, ) + generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] + + decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + + for i, prompt in enumerate(prompts): + print(f"Prompt: {prompt}") + print(f"Generated: {decoded_outputs[i]}") + print("-" * 50) + return decoded_outputs[0] + + def main_op(self, format, bits, group_size, sym, dtype, device, fast_cfg=True, tar_acc=0.28): + limit = 100 + if device == "xpu": + limit = 1000 + if not torch.xpu.is_available(): + pytest.skip("No XPU device") + if sym is False: + pytest.skip("No asym support for XPU") + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto") + tokenizer = AutoTokenizer.from_pretrained(self.model_name) + if fast_cfg: + autoround = AutoRound( + model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, nsamples=1, disable_opt_rtn=True + ) + else: + autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model + autoround.quantize_and_save(output_dir=quantized_model_path, format=format) ##will convert to gptq model quantization_config = AutoRoundConfig(backend="ark") model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, dtype=torch.float16, device_map="xpu", quantization_config=quantization_config + quantized_model_path, dtype=dtype, device_map=device, quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - model_infer(model, tokenizer) - result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000) + self.model_infer(model, tokenizer) + result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=limit) print(result["results"]["lambada_openai"]["acc,none"]) - assert result["results"]["lambada_openai"]["acc,none"] > 0.28 + assert 
result["results"]["lambada_openai"]["acc,none"] > tar_acc torch.xpu.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) + + @pytest.mark.parametrize("format", ["auto_round", "auto_round:gptqmodel"]) + @pytest.mark.parametrize("bits, group_size, sym", [(4, 128, True), (8, 128, True)]) + @pytest.mark.parametrize("dtype", [torch.bfloat16]) + @pytest.mark.parametrize("device", ["cpu", "xpu"]) + def test_formats(self, format, bits, group_size, sym, dtype, device): + self.main_op(format, bits, group_size, sym, dtype, device) + + @pytest.mark.parametrize("format", ["auto_round:auto_awq"]) + @pytest.mark.parametrize("bits, group_size, sym", [(4, 32, True)]) + @pytest.mark.parametrize("dtype", [torch.float16]) + @pytest.mark.parametrize("device", ["cpu", "xpu"]) + def test_awq_fp16(self, format, bits, group_size, sym, dtype, device): + self.main_op(format, bits, group_size, sym, dtype, device) + + @pytest.mark.parametrize("format", ["auto_round"]) + @pytest.mark.parametrize("bits, group_size, sym", [(2, 32, False)]) + @pytest.mark.parametrize("dtype", [torch.bfloat16]) + @pytest.mark.parametrize("device", ["cpu"]) + def test_other_bits(self, format, bits, group_size, sym, dtype, device): + self.main_op(format, bits, group_size, sym, dtype, device, False, 0.2) + + +if __name__ == "__main__": + p = TestAutoRoundARKBackend() + p.setup_class() + p.test_formats("auto_round:auto_awq", 4, 32, True, torch.bfloat16, "xpu") + p.teardown_class() diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index 366819234..73491eb7f 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -23,10 +23,10 @@ def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_basic_usage(self): + def test_basic_usage(self, tiny_gemma_model_path, tiny_qwen_model_path): python_path = sys.executable res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {get_model_path('benzart/gemma-2b-it-fine-tuning-for-code-test')} " + f"cd ../.. && {python_path} -m auto_round --model {tiny_gemma_model_path} " f" --bs 16 --iters 0 --nsamples 1 --format gguf:q4_k_m" ) if res > 0 or res == -1: @@ -34,7 +34,7 @@ def test_basic_usage(self): shutil.rmtree("./saved", ignore_errors=True) res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {self.model_name}" + f"cd ../.. 
&& {python_path} -m auto_round --model {tiny_qwen_model_path}" f" --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" ) if res > 0 or res == -1: @@ -156,8 +156,8 @@ def test_q4_k_m(self, dataloader): autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") shutil.rmtree("./saved", ignore_errors=True) - def test_all_format(self): - model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") + def test_all_format(self, tiny_qwen_model_path): + model_name = tiny_qwen_model_path python_path = sys.executable # for gguf_format in ["gguf:q4_0", "gguf:q4_1", "gguf:q4_k_m", "gguf:q6_k"]: for gguf_format in ["gguf:q4_k_m"]: From 95bd71e1e41537302a0acc8f7f17ed4dd0d857e3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Dec 2025 09:01:58 +0000 Subject: [PATCH 19/24] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- test/test_cuda/test_multiple_card_calib.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_cuda/test_multiple_card_calib.py b/test/test_cuda/test_multiple_card_calib.py index 5243b1c07..03c59bba6 100644 --- a/test/test_cuda/test_multiple_card_calib.py +++ b/test/test_cuda/test_multiple_card_calib.py @@ -56,4 +56,3 @@ def test_multiple_card_nvfp4(self): ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" - From f3369d6156a03396e742eba8110d69c15e0ba89b Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Tue, 23 Dec 2025 04:09:50 -0500 Subject: [PATCH 20/24] add test_ark change and minor fix Signed-off-by: He, Xin3 --- test/test_ark/test_model.py | 42 ++--------------------------- test/test_cpu/test_init.py | 6 ++--- test/test_cpu/test_torch_backend.py | 4 +-- test/test_cuda/test_transformers.py | 8 +++--- 4 files changed, 12 insertions(+), 48 deletions(-) diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py index 09d8bf25a..de4e9238e 100644 --- a/test/test_ark/test_model.py +++ b/test/test_ark/test_model.py @@ -2,65 +2,27 @@ import sys import pytest - -sys.path.insert(0, "../..") - import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate_user_model -from auto_round.testing_utils import require_autogptq, require_gptqmodel - -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +from ..helpers import get_model_path, model_infer class TestAutoRoundARKBackend: @classmethod def setup_class(self): - self.model_name = "facebook/opt-125m" + self.model_name = get_model_path("facebook/opt-125m") self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() @classmethod def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in 
zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] - def main_op(self, format, bits, group_size, sym, dtype, device, fast_cfg=True, tar_acc=0.28): limit = 100 if device == "xpu": diff --git a/test/test_cpu/test_init.py b/test/test_cpu/test_init.py index 6ebee954d..01785d679 100644 --- a/test/test_cpu/test_init.py +++ b/test/test_cpu/test_init.py @@ -1,8 +1,8 @@ from auto_round import AutoRound -def test_torch_compile(): - ar = AutoRound(model="facebook/opt-125m", scheme="NVFP4", enable_torch_compile=True) +def test_torch_compile(tiny_opt_model_path): + ar = AutoRound(model=tiny_opt_model_path, scheme="NVFP4", enable_torch_compile=True) assert not ar.enable_torch_compile, "NVFP4 cannot work with torch.compile." - ar = AutoRound(model="facebook/opt-125m", scheme="FP8_STATIC", enable_torch_compile=True) + ar = AutoRound(model=tiny_opt_model_path, scheme="FP8_STATIC", enable_torch_compile=True) assert not ar.enable_torch_compile, "FP8_STATIC cannot work with torch.compile." diff --git a/test/test_cpu/test_torch_backend.py b/test/test_cpu/test_torch_backend.py index 81e009c06..0be8f76e6 100644 --- a/test/test_cpu/test_torch_backend.py +++ b/test/test_cpu/test_torch_backend.py @@ -8,14 +8,14 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel -from ..helpers import model_infer +from ..helpers import get_model_path, model_infer class TestAutoRoundTorchBackend: @classmethod def setup_class(self): - self.model_name = "facebook/opt-125m" + self.model_name = get_model_path("facebook/opt-125m") self.save_folder = "./saved" @classmethod diff --git a/test/test_cuda/test_transformers.py b/test/test_cuda/test_transformers.py index f6e5b4497..f37fe94ff 100644 --- a/test/test_cuda/test_transformers.py +++ b/test/test_cuda/test_transformers.py @@ -27,6 +27,8 @@ ) from transformers.utils import is_torch_available +from ..helpers import get_model_path + if is_torch_available(): import torch @@ -76,8 +78,8 @@ def test_quantized_model(self): output = self.quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) assert self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS - def test_raise_if_non_quantized(self): - model_id = "facebook/opt-125m" + def test_raise_if_non_quantized(self, tiny_opt_model_path): + model_id = tiny_opt_model_path quantization_config = AutoRoundConfig(bits=4) with pytest.raises(ValueError): _ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config) @@ -185,7 +187,7 @@ def test_mixed_bits(self): """ Simple test that checks if auto-round work properly with mixed bits """ - model_name = "facebook/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) layer_config = { From e7a238bd911f3996376b129b1633148f7f191ed1 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 24 Dec 2025 00:29:32 -0500 Subject: [PATCH 21/24] update path for testing Signed-off-by: He, Xin3 --- test/test_cpu/test_cli_usage.py | 16 ++++++++-------- test/test_cpu/test_gguf_format.py | 10 +++++----- test/test_cuda/test_alg_ext.py | 4 ++-- test/test_cuda/test_gguf.py | 4 ++-- 
test/test_cuda/test_multiple_card_calib.py | 4 ++-- test/test_cuda/test_support_vlms.py | 10 +++++----- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/test/test_cpu/test_cli_usage.py b/test/test_cpu/test_cli_usage.py index 82466dc82..b3aecf2f1 100644 --- a/test/test_cpu/test_cli_usage.py +++ b/test/test_cpu/test_cli_usage.py @@ -21,24 +21,24 @@ def test_auto_round_cmd(self, tiny_opt_model_path, tiny_qwen_vl_model_path): python_path = sys.executable # Test llm script - res = os.system(f"cd ../.. && {python_path} -m auto_round -h") + res = os.system(f"cd .. && {python_path} -m auto_round -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa" + f"cd .. && {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" + f"cd .. && {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai" + f"cd .. && {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" @@ -46,23 +46,23 @@ def test_auto_round_cmd(self, tiny_opt_model_path, tiny_qwen_vl_model_path): # test mllm script # test auto_round_mllm --eval help - res = os.system(f"cd ../.. && {python_path} -m auto_round --eval -h") + res = os.system(f"cd .. && {python_path} -m auto_round --eval -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" # test auto_round_mllm --lmms help - res = os.system(f"cd ../.. && {python_path} -m auto_round --eval --lmms -h") + res = os.system(f"cd .. && {python_path} -m auto_round --eval --lmms -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --mllm --model {tiny_qwen_vl_model_path} --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" + f"cd .. && {python_path} -m auto_round --mllm --model {tiny_qwen_vl_model_path} --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model {tiny_qwen_vl_model_path} --seqlen 32 --format auto_round" + f"cd .. 
&& {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model {tiny_qwen_vl_model_path} --seqlen 32 --format auto_round" " --quant_nontext_module --output_dir ./saved " ) if res > 0 or res == -1: diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index 73491eb7f..92e9d620e 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -26,7 +26,7 @@ def teardown_class(self): def test_basic_usage(self, tiny_gemma_model_path, tiny_qwen_model_path): python_path = sys.executable res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {tiny_gemma_model_path} " + f"cd .. && {python_path} -m auto_round --model {tiny_gemma_model_path} " f" --bs 16 --iters 0 --nsamples 1 --format gguf:q4_k_m" ) if res > 0 or res == -1: @@ -34,7 +34,7 @@ def test_basic_usage(self, tiny_gemma_model_path, tiny_qwen_model_path): shutil.rmtree("./saved", ignore_errors=True) res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {tiny_qwen_model_path}" + f"cd .. && {python_path} -m auto_round --model {tiny_qwen_model_path}" f" --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" ) if res > 0 or res == -1: @@ -162,7 +162,7 @@ def test_all_format(self, tiny_qwen_model_path): # for gguf_format in ["gguf:q4_0", "gguf:q4_1", "gguf:q4_k_m", "gguf:q6_k"]: for gguf_format in ["gguf:q4_k_m"]: res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {model_name} " + f"cd .. && {python_path} -m auto_round --model {model_name} " f" --bs 16 --iters 1 --nsamples 1 --seqlen 16 --format {gguf_format}" ) if res > 0 or res == -1: @@ -170,7 +170,7 @@ def test_all_format(self, tiny_qwen_model_path): shutil.rmtree("../../tmp_autoround", ignore_errors=True) res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {model_name}" + f"cd .. && {python_path} -m auto_round --model {model_name}" f" --bs 16 --iters 0 --nsamples 1 --seqlen 16 --format fake,{gguf_format}" ) if res > 0 or res == -1: @@ -179,7 +179,7 @@ def test_all_format(self, tiny_qwen_model_path): # test mixed q2_k_s res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {model_name}" + f"cd .. && {python_path} -m auto_round --model {model_name}" f" --bs 16 --iters 0 --nsamples 1 --seqlen 16 --scheme GGUF:Q2_K_MIXED" ) if res > 0 or res == -1: diff --git a/test/test_cuda/test_alg_ext.py b/test/test_cuda/test_alg_ext.py index 6b04847ed..6cdbc82ab 100644 --- a/test/test_cuda/test_alg_ext.py +++ b/test/test_cuda/test_alg_ext.py @@ -49,13 +49,13 @@ def test_cli(self, tiny_opt_model_path): python_path = sys.executable res = os.system( - f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits --nsamples 1 --seqlen 32" + f"cd .. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits --nsamples 1 --seqlen 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile --nsamples 1 --seqlen 32" + f"cd .. 
&& CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile --nsamples 1 --seqlen 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" diff --git a/test/test_cuda/test_gguf.py b/test/test_cuda/test_gguf.py index 7a3a0cd89..174deab2f 100644 --- a/test/test_cuda/test_gguf.py +++ b/test/test_cuda/test_gguf.py @@ -56,7 +56,7 @@ def test_gguf_format(self, tiny_qwen_model_path, dataloader): save_dir = os.path.join(os.path.dirname(__file__), "saved") res = os.system( - f"cd ../.. && {sys.executable} -m auto_round --model {tiny_qwen_model_path} --iter 2 " + f"cd .. && {sys.executable} -m auto_round --model {tiny_qwen_model_path} --iter 2 " f"--output_dir {save_dir} --nsample 2 --format gguf:q4_0 --device 0" ) print(save_dir) @@ -103,7 +103,7 @@ def test_q2_k_export(self, dataloader): def test_basic_usage(self, tiny_qwen_model_path): python_path = sys.executable res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {tiny_qwen_model_path} --eval_task_by_task" + f"cd .. && {python_path} -m auto_round --model {tiny_qwen_model_path} --eval_task_by_task" f" --tasks piqa,openbookqa --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0 --eval_model_dtype bf16" ) if res > 0 or res == -1: diff --git a/test/test_cuda/test_multiple_card_calib.py b/test/test_cuda/test_multiple_card_calib.py index 03c59bba6..fedb3f328 100644 --- a/test/test_cuda/test_multiple_card_calib.py +++ b/test/test_cuda/test_multiple_card_calib.py @@ -41,7 +41,7 @@ def test_multiple_card_calib(self): ##test llm script res = os.system( - f"cd ../.. && {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --iters 1 --nsamples 1 --output_dir None" + f"cd .. && {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --iters 1 --nsamples 1 --output_dir None" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" @@ -52,7 +52,7 @@ def test_multiple_card_nvfp4(self): ##test llm script res = os.system( - f"cd ../.. && {python_path} -m auto_round --model facebook/opt-125m --scheme NVFP4 --devices '0,1' --iters 1 --nsamples 1 --enable_torch_compile --low_gpu_mem_usage" + f"cd .. && {python_path} -m auto_round --model facebook/opt-125m --scheme NVFP4 --devices '0,1' --iters 1 --nsamples 1 --enable_torch_compile --low_gpu_mem_usage" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" diff --git a/test/test_cuda/test_support_vlms.py b/test/test_cuda/test_support_vlms.py index 9efd53564..3358c8226 100644 --- a/test/test_cuda/test_support_vlms.py +++ b/test/test_cuda/test_support_vlms.py @@ -26,7 +26,7 @@ def test_qwen2(self): model_path = "/models/Qwen2-VL-2B-Instruct/" # test tune res = os.system( - f"cd ../.. && {self.python_path} -m auto_round --mllm " + f"cd .. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}" ) assert not (res > 0 or res == -1), "qwen2 tuning fail" @@ -81,7 +81,7 @@ def test_phi3(self): model_path = "/models/Phi-3.5-vision-instruct/" ## test tune res = os.system( - f"cd ../.. && {self.python_path} -m auto_round --mllm " + f"cd .. 
&& {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}" ) assert not (res > 0 or res == -1), "Phi-3.5 tuning fail" @@ -129,7 +129,7 @@ def test_phi3_vision_awq(self): model_path = "/models/Phi-3.5-vision-instruct/" ## test tune res = os.system( - f"cd ../.. && {self.python_path} -m auto_round --mllm " + f"cd .. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --quant_nontext_module " f"--nsample 64 --seqlen 32 " f"--format auto_awq --output_dir {self.save_dir} --device {self.device}" @@ -177,7 +177,7 @@ def test_glm(self): model_path = "/models/glm-4v-9b/" ## test tune res = os.system( - f"cd ../.. && {self.python_path} -m auto_round " + f"cd .. && {self.python_path} -m auto_round " f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}" ) assert not (res > 0 or res == -1), "glm-4v-9b tuning fail" @@ -186,7 +186,7 @@ def test_granite_vision(self): model_path = "/models/granite-vision-3.2-2b" ## test tune res = os.system( - f"cd ../.. && {self.python_path} -m auto_round " + f"cd .. && {self.python_path} -m auto_round " f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}" ) assert not (res > 0 or res == -1), "granite-vision-3.2-2b tuning fail" From 7d1453b2826b9b0da9138d6f622e1539a299ed63 Mon Sep 17 00:00:00 2001 From: "Sun, Xuehao" Date: Wed, 24 Dec 2025 13:31:07 +0800 Subject: [PATCH 22/24] fix xpu ut path Signed-off-by: Sun, Xuehao --- .azure-pipelines/scripts/ut/run_ut_xpu.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.azure-pipelines/scripts/ut/run_ut_xpu.sh b/.azure-pipelines/scripts/ut/run_ut_xpu.sh index 2ab0aef64..740937d18 100644 --- a/.azure-pipelines/scripts/ut/run_ut_xpu.sh +++ b/.azure-pipelines/scripts/ut/run_ut_xpu.sh @@ -12,8 +12,7 @@ echo "##[endgroup]" uv pip list # test ark cpu part only before external xpu available -cd /auto-round/test/test_ark || exit 1 -find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + +cd /auto-round/test || exit 1 export LD_LIBRARY_PATH=${HOME}/.venv/lib/:$LD_LIBRARY_PATH export COVERAGE_RCFILE=/auto-round/.azure-pipelines/scripts/ut/.coverage @@ -23,7 +22,7 @@ LOG_DIR=/auto-round/log_dir mkdir -p ${LOG_DIR} ut_log_name=${LOG_DIR}/ut.log -find . 
-name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run.sh +find ./test_ark -name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run.sh cat run.sh bash run.sh 2>&1 | tee "${ut_log_name}" From 562689fe7b1349fc76f7c7de2961c270cf5c440d Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 24 Dec 2025 01:27:38 -0500 Subject: [PATCH 23/24] fix bug Signed-off-by: He, Xin3 --- test/test_ark/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 test/test_ark/__init__.py diff --git a/test/test_ark/__init__.py b/test/test_ark/__init__.py new file mode 100644 index 000000000..e69de29bb From 4ee57dadf8208395c0eae53d2597ebd0ee3bef6c Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 24 Dec 2025 03:34:33 -0500 Subject: [PATCH 24/24] fix bug Signed-off-by: He, Xin3 --- test/test_ark/test_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py index de4e9238e..bd4734609 100644 --- a/test/test_ark/test_model.py +++ b/test/test_ark/test_model.py @@ -48,7 +48,7 @@ def main_op(self, format, bits, group_size, sym, dtype, device, fast_cfg=True, t ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=limit) print(result["results"]["lambada_openai"]["acc,none"]) assert result["results"]["lambada_openai"]["acc,none"] > tar_acc