diff --git a/.azure-pipelines/scripts/ut/run_ut.sh b/.azure-pipelines/scripts/ut/run_ut.sh index dcf1a7170..e7d3d9e00 100644 --- a/.azure-pipelines/scripts/ut/run_ut.sh +++ b/.azure-pipelines/scripts/ut/run_ut.sh @@ -19,8 +19,7 @@ cd /auto-round && uv pip install . echo "##[endgroup]" uv pip list -cd /auto-round/test/test_cpu || exit 1 -find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + +cd /auto-round/test || exit 1 export LD_LIBRARY_PATH=${HOME}/.venv/lib/:$LD_LIBRARY_PATH export FORCE_BF16=1 @@ -32,7 +31,7 @@ mkdir -p ${LOG_DIR} ut_log_name=${LOG_DIR}/ut.log # Split test files into 5 parts -find . -name "test*.py" | sort > all_tests.txt +find ./test_cpu -name "test*.py" | sort > all_tests.txt total_lines=$(wc -l < all_tests.txt) NUM_CHUNKS=5 q=$(( total_lines / NUM_CHUNKS )) diff --git a/.azure-pipelines/scripts/ut/run_ut_cuda.sh b/.azure-pipelines/scripts/ut/run_ut_cuda.sh index 18a9bb00d..0f111d3fa 100644 --- a/.azure-pipelines/scripts/ut/run_ut_cuda.sh +++ b/.azure-pipelines/scripts/ut/run_ut_cuda.sh @@ -27,16 +27,14 @@ function create_conda_env() { # install AutoRound cd ${REPO_PATH} - pip uninstall auto-round -y + uv pip install torch==2.8.0 torchvision uv pip install -r requirements.txt - sed -i '/^torch==/d;/^transformers==/d;/^lm-eval==/d' requirements.txt if [ -d "/proc/driver/nvidia" ]; then export PATH=/usr/local/cuda/bin${PATH:+:${PATH}} export LD_LIBRARY_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/nvidia/nvjitlink/lib:$LD_LIBRARY_PATH fi uv pip install --no-build-isolation . uv pip install pytest-cov pytest-html cmake==4.0.2 - uv pip install torch==2.8.0 torchvision } function print_test_results_table() { @@ -92,7 +90,7 @@ function run_unit_test() { # install unit test dependencies create_conda_env - cd ${REPO_PATH}/test/test_cuda + cd ${REPO_PATH}/test rm -rf .coverage* *.xml *.html uv pip install -v git+https://github.com/casper-hansen/AutoAWQ.git --no-build-isolation @@ -100,15 +98,15 @@ function run_unit_test() { uv pip install -r https://raw.githubusercontent.com/ModelCloud/GPTQModel/refs/heads/main/requirements.txt CMAKE_ARGS="-DGGML_CUDA=on -DLLAVA_BUILD=off" uv pip install llama-cpp-python uv pip install 'git+https://github.com/ggml-org/llama.cpp.git#subdirectory=gguf-py' - uv pip install -r requirements.txt - uv pip install -r requirements_diffusion.txt + uv pip install -r test_cuda/requirements.txt + uv pip install -r test_cuda/requirements_diffusion.txt pip list > ${LOG_DIR}/ut_pip_list.txt export COVERAGE_RCFILE=${REPO_PATH}/.azure-pipelines/scripts/ut/.coverage local auto_round_path=$(python -c 'import auto_round; print(auto_round.__path__[0])') # run unit tests individually with separate logs - for test_file in $(find . -name "test_*.py" ! -name "test_*vlms.py" ! -name "test_llmc*.py" | sort); do + for test_file in $(find ./test_cuda -name "test_*.py" ! -name "test_*vlms.py" ! -name "test_llmc*.py" | sort); do local test_basename=$(basename ${test_file} .py) local ut_log_name=${LOG_DIR}/unittest_cuda_${test_basename}.log echo "Running ${test_file}..." 
@@ -128,7 +126,7 @@ function run_unit_test() { function run_unit_test_vlm() { # install unit test dependencies create_conda_env - cd ${REPO_PATH}/test/test_cuda + cd ${REPO_PATH}/test rm -rf .coverage* *.xml *.html uv pip install git+https://github.com/haotian-liu/LLaVA.git@v1.2.2 --no-deps @@ -138,14 +136,14 @@ function run_unit_test_vlm() { uv pip install git+https://github.com/deepseek-ai/DeepSeek-VL2.git timm attrdict --no-deps uv pip install -v git+https://github.com/casper-hansen/AutoAWQ.git@v0.2.0 --no-build-isolation uv pip install flash-attn==2.7.4.post1 --no-build-isolation - uv pip install -r requirements_vlm.txt + uv pip install -r test_cuda/requirements_vlm.txt pip list > ${LOG_DIR}/vlm_ut_pip_list.txt export COVERAGE_RCFILE=${REPO_PATH}/.azure-pipelines/scripts/ut/.coverage local auto_round_path=$(python -c 'import auto_round; print(auto_round.__path__[0])') # run VLM unit tests individually with separate logs - for test_file in $(find . -name "test*vlms.py"); do + for test_file in $(find ./test_cuda -name "test*vlms.py"); do local test_basename=$(basename ${test_file} .py) local ut_log_name=${LOG_DIR}/unittest_cuda_vlm_${test_basename}.log echo "Running ${test_file}..." @@ -166,17 +164,17 @@ function run_unit_test_llmc() { # install unit test dependencies create_conda_env - cd ${REPO_PATH}/test/test_cuda + cd ${REPO_PATH}/test rm -rf .coverage* *.xml *.html - uv pip install -r requirements_llmc.txt + uv pip install -r test_cuda/requirements_llmc.txt pip list > ${LOG_DIR}/llmc_ut_pip_list.txt export COVERAGE_RCFILE=${REPO_PATH}/.azure-pipelines/scripts/ut/.coverage local auto_round_path=$(python -c 'import auto_round; print(auto_round.__path__[0])') # run unit tests individually with separate logs - for test_file in $(find . -name "test_llmc*.py" | sort); do + for test_file in $(find ./test_cuda -name "test_llmc*.py" | sort); do local test_basename=$(basename ${test_file} .py) local ut_log_name=${LOG_DIR}/unittest_cuda_llmc_${test_basename}.log echo "Running ${test_file}..." diff --git a/.azure-pipelines/scripts/ut/run_ut_hpu.sh b/.azure-pipelines/scripts/ut/run_ut_hpu.sh index 3c3bb6991..b370edfb5 100644 --- a/.azure-pipelines/scripts/ut/run_ut_hpu.sh +++ b/.azure-pipelines/scripts/ut/run_ut_hpu.sh @@ -7,8 +7,7 @@ export TQDM_MININTERVAL=60 pip install pytest-cov pytest-html pip list -cd /auto-round/test/test_hpu || exit 1 -find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + +cd /auto-round/test || exit 1 export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH export FORCE_BF16=1 @@ -19,8 +18,8 @@ LOG_DIR=/auto-round/log_dir mkdir -p ${LOG_DIR} ut_log_name=${LOG_DIR}/ut.log -find . -name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_lazy.sh -find . 
-name "test*.py" | sed "s,\.\/,python -m pytest --mode compile --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_compile.sh +find ./test_hpu -name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_lazy.sh +find ./test_hpu -name "test*.py" | sed "s,\.\/,python -m pytest --mode compile --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_compile.sh cat run_lazy.sh bash run_lazy.sh 2>&1 | tee ${ut_log_name} diff --git a/.azure-pipelines/scripts/ut/run_ut_xpu.sh b/.azure-pipelines/scripts/ut/run_ut_xpu.sh index 2ab0aef64..740937d18 100644 --- a/.azure-pipelines/scripts/ut/run_ut_xpu.sh +++ b/.azure-pipelines/scripts/ut/run_ut_xpu.sh @@ -12,8 +12,7 @@ echo "##[endgroup]" uv pip list # test ark cpu part only before external xpu available -cd /auto-round/test/test_ark || exit 1 -find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + +cd /auto-round/test || exit 1 export LD_LIBRARY_PATH=${HOME}/.venv/lib/:$LD_LIBRARY_PATH export COVERAGE_RCFILE=/auto-round/.azure-pipelines/scripts/ut/.coverage @@ -23,7 +22,7 @@ LOG_DIR=/auto-round/log_dir mkdir -p ${LOG_DIR} ut_log_name=${LOG_DIR}/ut.log -find . -name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run.sh +find ./test_ark -name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run.sh cat run.sh bash run.sh 2>&1 | tee "${ut_log_name}" diff --git a/auto_round/compressors/mllm/compressor.py b/auto_round/compressors/mllm/compressor.py index 6ea6d2cdf..31f97cbe3 100644 --- a/auto_round/compressors/mllm/compressor.py +++ b/auto_round/compressors/mllm/compressor.py @@ -206,6 +206,9 @@ def __init__( if hasattr(model, "name_or_path") and any([name in model.name_or_path for name in MISTRAL_3_2_MODELS]): template = "mistral3_2" if iters > 0: + # TODO: Remove after fixing https://github.com/huggingface/transformers/issues/43005 + model.config.model_type = model.config.to_dict()["model_type"] + if template is None and model.config.model_type not in TEMPLATES: self.template = None else: diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index f4bb15575..38f984663 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -1047,6 +1047,11 @@ def set_module(model, key, new_module): setattr(module, name_list[-1], new_module) +# For getting and setting attribution, such as 'lm_head.weight' +get_attr = get_module +set_attr = set_module + + def get_layer_features(layer): """Extracts input and output feature dimensions for supported layers.""" from auto_round.utils import deepspeed_exists diff --git a/test/README.md b/test/README.md new file mode 100644 index 000000000..9ccca0017 --- /dev/null +++ b/test/README.md @@ -0,0 +1,46 @@ +# Unit Test (UT) Guide + +This project uses `pytest` for unit testing. All test cases are under the `test/` directory. Below is a simple guide for new users to write and run UTs: + +## 1. 
Environment Setup
+- Python 3.8 or above is recommended.
+- Install dependencies:
+  ```sh
+  pip install -r ../requirements.txt
+  pip install pytest
+  ```
+
+## 2. Test Structure
+- Place your test files in the matching backend subdirectory under `test/` (e.g. `test_cpu/`, `test_cuda/`), and name them starting with `test_`.
+- Refer to the existing `test_*.py` files as examples.
+- Common fixtures (such as `tiny_opt_model`, `opt_model`, `opt_tokenizer`, `dataloader`) are defined in `conftest.py`/`fixtures.py` and are injected automatically; helper functions (such as `model_infer`) live in `helpers.py` and can be imported directly.
+- Example:
+  ```python
+  # test_example.py
+  from ..helpers import model_infer
+
+  def test_model_infer(tiny_opt_model, opt_tokenizer):
+      result = model_infer(tiny_opt_model, opt_tokenizer)
+      assert result is not None
+  ```
+
+## 3. Running Tests
+- In the `test/` directory, run:
+  ```sh
+  pytest
+  ```
+- You can specify a single file or test case:
+  ```sh
+  pytest test_xxx.py
+  pytest -k "test_func_name"
+  ```
+
+## 4. Debugging Tips
+- `conftest.py` adds the parent directory to `sys.path`, so you can debug without installing the local `auto-round` package.
+- You can directly import project source code in your test cases.
+
+## 5. Reference
+- Fixtures are defined in `conftest.py` and `fixtures.py`
+- Helper functions are in `helpers.py`
+
+If you have any questions, feel free to open an issue.
diff --git a/test/test_hpu/conftest.py b/test/conftest.py
similarity index 81%
rename from test/test_hpu/conftest.py
rename to test/conftest.py
index f4e9675bf..d21100824 100644
--- a/test/test_hpu/conftest.py
+++ b/test/conftest.py
@@ -1,9 +1,16 @@
 import os
+import sys
 from typing import Mapping
 
 import pytest
 
+from .fixtures import *
+# Easy debugging without installing auto-round.
+sys.path.insert(0, "..")
+
+
+### HPU-related configuration, usage: `pytest --mode=compile/lazy`
 
 def pytest_addoption(parser):
     parser.addoption(
         "--mode",
diff --git a/test/fixtures.py b/test/fixtures.py
new file mode 100644
index 000000000..c76040322
--- /dev/null
+++ b/test/fixtures.py
@@ -0,0 +1,169 @@
+import os
+import shutil
+
+import pytest
+import torch
+import transformers
+
+from .helpers import (
+    DataLoader,
+    deepseek_v2_name_or_path,
+    gemma_name_or_path,
+    get_tiny_model,
+    gptj_name_or_path,
+    lamini_name_or_path,
+    opt_name_or_path,
+    phi2_name_or_path,
+    qwen_2_5_vl_name_or_path,
+    qwen_moe_name_or_path,
+    qwen_name_or_path,
+    qwen_vl_name_or_path,
+    save_tiny_model,
+)
+
+
+# Create tiny model path fixtures for testing
+@pytest.fixture(scope="session")
+def tiny_opt_model_path():
+    model_name_or_path = opt_name_or_path
+    tiny_model_path = "./tmp/tiny_opt_model_path"
+    tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path)
+    yield tiny_model_path
+    shutil.rmtree(tiny_model_path)
+
+
+@pytest.fixture(scope="session")
+def tiny_lamini_model_path():
+    model_name_or_path = lamini_name_or_path
+    tiny_model_path = "./tmp/tiny_lamini_model_path"
+    tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path)
+    yield tiny_model_path
+    shutil.rmtree(tiny_model_path)
+
+
+@pytest.fixture(scope="session")
+def tiny_gptj_model_path():
+    model_name_or_path = gptj_name_or_path
+    tiny_model_path = "./tmp/tiny_gptj_model_path"
+    tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path)
+    yield tiny_model_path
+    shutil.rmtree(tiny_model_path)
+
+
+@pytest.fixture(scope="session")
+def tiny_phi2_model_path():
+    model_name_or_path = phi2_name_or_path
+    tiny_model_path = "./tmp/tiny_phi2_model_path"
+    tiny_model_path = save_tiny_model(model_name_or_path, 
tiny_model_path) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_deepseek_v2_model_path(): + model_name_or_path = deepseek_v2_name_or_path + tiny_model_path = "./tmp/tiny_deepseek_v2_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_gemma_model_path(): + model_name_or_path = gemma_name_or_path + tiny_model_path = "./tmp/tiny_gemma_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_qwen_model_path(): + model_name_or_path = qwen_name_or_path + tiny_model_path = "./tmp/tiny_qwen_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_untied_qwen_model_path(): + model_name_or_path = qwen_name_or_path + tiny_model_path = "./tmp/tiny_untied_qwen_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, force_untie=True) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_qwen_moe_model_path(): + model_name_or_path = qwen_moe_name_or_path + tiny_model_path = "./tmp/tiny_qwen_moe_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_qwen_vl_model_path(): + model_name_or_path = qwen_vl_name_or_path + tiny_model_path = "./tmp/tiny_qwen_vl_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2, is_mllm=True) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_qwen_2_5_vl_model_path(): + model_name_or_path = qwen_2_5_vl_name_or_path + tiny_model_path = "./tmp/tiny_qwen_2_5_vl_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2, is_mllm=True) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(autouse=True, scope="session") +def clean_tmp_model_folder(): + yield + shutil.rmtree("./tmp", ignore_errors=True) # unittest default workspace + shutil.rmtree("./tmp_autoround", ignore_errors=True) # autoround default workspace + + +# Create objective fixtures for testing +@pytest.fixture(scope="function") +def tiny_opt_model(): + model_name_or_path = opt_name_or_path + return get_tiny_model(model_name_or_path, num_layers=2) + + +@pytest.fixture(scope="function") +def opt_model(): + model_name_or_path = opt_name_or_path + model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="auto", trust_remote_code=True) + return model + + +@pytest.fixture(scope="session") +def opt_tokenizer(): + model_name_or_path = opt_name_or_path + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) + return tokenizer + + +@pytest.fixture(scope="function") +def model(): + model_name_or_path = opt_name_or_path + model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="auto", trust_remote_code=True) + return model + + +@pytest.fixture(scope="session") +def tokenizer(): + model_name_or_path = opt_name_or_path + tokenizer = 
transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
+    return tokenizer
+
+
+@pytest.fixture(scope="session")
+def dataloader():
+    return DataLoader()
diff --git a/test/helpers.py b/test/helpers.py
new file mode 100644
index 000000000..89b832c6d
--- /dev/null
+++ b/test/helpers.py
@@ -0,0 +1,236 @@
+import copy
+import os
+
+import pytest
+import torch
+import transformers
+
+from auto_round.utils import get_attr, llm_load_model, mllm_load_model, set_attr
+
+
+# Automatically choose a local path if it exists, otherwise fall back to the model name.
+def get_model_path(model_name: str) -> str:
+    ut_path = f"/tf_dataset/auto_round/models/{model_name}"
+    local_path = f"/models/{model_name.split('/')[-1]}"
+
+    if "DeepSeek-V2-Lite" in model_name and os.path.exists("/data0/deepseek-ai/DeepSeek-V2-Lite"):
+        return "/data0/deepseek-ai/DeepSeek-V2-Lite"
+
+    if os.path.exists(ut_path):
+        return ut_path
+    elif os.path.exists(local_path):
+        return local_path
+    else:
+        return model_name
+
+
+opt_name_or_path = get_model_path("facebook/opt-125m")
+qwen_name_or_path = get_model_path("Qwen/Qwen3-0.6B")
+lamini_name_or_path = get_model_path("MBZUAI/LaMini-GPT-124M")
+gptj_name_or_path = get_model_path("hf-internal-testing/tiny-random-GPTJForCausalLM")
+phi2_name_or_path = get_model_path("microsoft/phi-2")
+deepseek_v2_name_or_path = get_model_path("deepseek-ai/DeepSeek-V2-Lite")
+qwen_moe_name_or_path = get_model_path("Qwen/Qwen1.5-MoE-A2.7B")
+qwen_vl_name_or_path = get_model_path("Qwen/Qwen2-VL-2B-Instruct")
+qwen_2_5_vl_name_or_path = get_model_path("Qwen/Qwen2.5-VL-3B-Instruct")
+gemma_name_or_path = get_model_path("benzart/gemma-2b-it-fine-tuning-for-code-test")
+
+
+# Slice the model into a tiny model for speedup
+def get_tiny_model(model_name_or_path, num_layers=2, is_mllm=False, **kwargs):
+    """Generate a tiny model by slicing layers from the original model."""
+    model_name_or_path = get_model_path(model_name_or_path)
+
+    def slice_layers(module):
+        """Slice layers in the model."""
+        sliced = False
+        for name, child in module.named_children():
+            if isinstance(child, torch.nn.ModuleList) and len(child) > num_layers:
+                new_layers = torch.nn.ModuleList(child[:num_layers])
+                setattr(module, name, new_layers)
+                sliced = True
+            elif slice_layers(child):
+                sliced = True
+        return sliced
+
+    kwargs["dtype"] = "auto" if "dtype" not in kwargs else kwargs["dtype"]
+    kwargs["trust_remote_code"] = True if "trust_remote_code" not in kwargs else kwargs["trust_remote_code"]
+    if is_mllm:
+        model, processor, tokenizer, image_processor = mllm_load_model(model_name_or_path, **kwargs)
+        if hasattr(model.config, "vision_config"):
+            if hasattr(model.config.vision_config, "num_hidden_layers"):  # mistral, etc.
+                model.config.vision_config.num_hidden_layers = num_layers
+            elif hasattr(model.config.vision_config, "depth"):  # qwen vl
+                model.config.vision_config.depth = num_layers
+    else:
+        model, tokenizer = llm_load_model(model_name_or_path, **kwargs)
+
+    slice_layers(model)
+
+    if hasattr(model.config, "num_hidden_layers"):
+        model.config.num_hidden_layers = num_layers
+    if hasattr(model.config, "layer_types"):
+        model.config.layer_types = model.config.layer_types[:num_layers]
+
+    return model
+
+
+# For fixture usage only
+def save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2, is_mllm=False, force_untie=False, **kwargs):
+    """Generate a tiny model and save it to the specified path."""
+    model = get_tiny_model(model_name_or_path, num_layers=num_layers, is_mllm=is_mllm, **kwargs)
+    if force_untie:
+        if getattr(getattr(model, "config", None), "tie_word_embeddings", False):
+            model.config.tie_word_embeddings = False
+        for key in model._tied_weights_keys:
+            weight = get_attr(model, key)
+            set_attr(model, key, copy.deepcopy(weight))
+    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
+    test_path = os.path.dirname(__file__)
+    tiny_model_path = os.path.join(test_path, tiny_model_path.removeprefix("./"))
+    model.save_pretrained(tiny_model_path)
+    tokenizer.save_pretrained(tiny_model_path)
+    if is_mllm:
+        processor = transformers.AutoProcessor.from_pretrained(model_name_or_path, trust_remote_code=True)
+        image_processor = transformers.AutoImageProcessor.from_pretrained(model_name_or_path, trust_remote_code=True)
+        processor.save_pretrained(tiny_model_path)
+        image_processor.save_pretrained(tiny_model_path)
+    print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session")
+    return tiny_model_path
+
+
+# HPU mode checking
+def is_pytest_mode_compile():
+    return pytest.mode == "compile"
+
+
+def is_pytest_mode_lazy():
+    return pytest.mode == "lazy"
+
+
+# General model inference code
+def model_infer(model, tokenizer, apply_chat_template=False):
+    """Run model inference and print generated outputs."""
+    prompts = [
+        "Hello,my name is",
+        # "The president of the United States is",
+        # "The capital of France is",
+        # "The future of AI is",
+    ]
+    if apply_chat_template:
+        texts = []
+        for prompt in prompts:
+            messages = [{"role": "user", "content": prompt}]
+            text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            texts.append(text)
+        prompts = texts
+
+    inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True)
+
+    outputs = model.generate(
+        input_ids=inputs["input_ids"].to(model.device),
+        attention_mask=inputs["attention_mask"].to(model.device),
+        do_sample=False,  ## change this to follow official usage
+        max_new_tokens=5,
+    )
+    generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)]
+
+    decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+
+    for i, prompt in enumerate(prompts):
+        print(f"Prompt: {prompt}")
+        print(f"Generated: {decoded_outputs[i]}")
+        print("-" * 50)
+    return decoded_outputs[0]
+
+
+# Dummy dataloader for testing
+class DataLoader:
+    def __init__(self):
+        self.batch_size = 1
+
+    def __iter__(self):
+        for i in range(2):
+            yield torch.ones([1, 10], dtype=torch.long)
+
+
+fixed_input = torch.tensor([[10, 20, 30, 40, 50]], dtype=torch.long)
+
+
+def get_output(model_name_or_path):
+    """Get model output for fixed input."""
+    try:
+        model, tokenizer = llm_load_model(model_name_or_path)
+ except: + model, processor, tokenizer, image_processor = mllm_load_model(model_name_or_path) + outputs = model(fixed_input)[0] + return outputs.detach().cpu() + + +def is_model_outputs_similar(model_path_1, model_path_2, metric="cosine_similarity", threshold=0.98, k=5, verbose=True): + """ + Compare outputs from two models using specified metric and return pass/fail. + + Args: + model_path_1: Path to first model + model_path_2: Path to second model + metric: Metric to use - "mse", "cosine_similarity"/"cos_sim", or "topk" + threshold: Threshold value for pass/fail + k: K value for top-k metric (only used when metric="topk") + verbose: Whether to print detailed results + + Returns: + bool: True if metric passes threshold, False otherwise + """ + if verbose: + print(f"\n{'='*70}") + print("Comparing Model Outputs") + print(f"{'='*70}") + print(f"Model 1: {model_path_1}") + print(f"Model 2: {model_path_2}") + print(f"Metric: {metric} | Threshold: {threshold}" + (f" | K: {k}" if "top" in metric.lower() else "")) + print(f"{'='*70}\n") + + output_1 = get_output(model_path_1) + output_2 = get_output(model_path_2) + metric = metric.lower().replace("-", "_") + + # Calculate metric and check threshold + if metric == "mse": + value = torch.mean((output_1.float() - output_2.float()) ** 2).item() + passed = value <= threshold + if verbose: + print(f"MSE: {value:.6f} | Threshold: <= {threshold} | {'✓ PASS' if passed else '✗ FAIL'}\n") + + elif metric in ["cosine_similarity", "cos_sim", "cosine"]: + out1 = output_1.float().flatten() + out2 = output_2.float().flatten() + value = torch.nn.functional.cosine_similarity(out1.unsqueeze(0), out2.unsqueeze(0)).item() + passed = value >= threshold + if verbose: + print(f"Cosine Similarity: {value:.6f} | Threshold: >= {threshold} | {'✓ PASS' if passed else '✗ FAIL'}\n") + + elif metric in ["topk", "top_k"]: + _, topk_1 = torch.topk(output_1, k=min(k, output_1.size(-1)), dim=-1) + _, topk_2 = torch.topk(output_2, k=min(k, output_2.size(-1)), dim=-1) + + total_agreement = 0 + total_positions = topk_1.numel() // topk_1.size(-1) + + for i in range(topk_1.size(0)): + for j in range(topk_1.size(1)): + set1 = set(topk_1[i, j].tolist()) + set2 = set(topk_2[i, j].tolist()) + total_agreement += len(set1 & set2) / k + + value = total_agreement / total_positions + passed = value >= threshold + if verbose: + print( + f"Top-{k} Agreement: {value:.4%} | Threshold: >= {threshold:.4%} | {'✓ PASS' if passed else '✗ FAIL'}\n" + ) + + else: + raise ValueError(f"Unknown metric: {metric}. 
Choose from: 'mse', 'cosine_similarity', 'topk'") + + return passed diff --git a/test/test_ark/__init__.py b/test/test_ark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py index 09d8bf25a..bd4734609 100644 --- a/test/test_ark/test_model.py +++ b/test/test_ark/test_model.py @@ -2,65 +2,27 @@ import sys import pytest - -sys.path.insert(0, "../..") - import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate_user_model -from auto_round.testing_utils import require_autogptq, require_gptqmodel - -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +from ..helpers import get_model_path, model_infer class TestAutoRoundARKBackend: @classmethod def setup_class(self): - self.model_name = "facebook/opt-125m" + self.model_name = get_model_path("facebook/opt-125m") self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() @classmethod def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] - def main_op(self, format, bits, group_size, sym, dtype, device, fast_cfg=True, tar_acc=0.28): limit = 100 if device == "xpu": @@ -86,7 +48,7 @@ def main_op(self, format, bits, group_size, sym, dtype, device, fast_cfg=True, t ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=limit) print(result["results"]["lambada_openai"]["acc,none"]) assert result["results"]["lambada_openai"]["acc,none"] > tar_acc diff --git a/test/test_cpu/__init__.py b/test/test_cpu/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_cpu/_test_helpers.py b/test/test_cpu/_test_helpers.py deleted file mode 100644 index b4b8a5955..000000000 --- a/test/test_cpu/_test_helpers.py +++ /dev/null @@ -1,32 +0,0 @@ -def model_infer(model, tokenizer, apply_chat_template=False): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - if apply_chat_template: - texts = [] - for prompt in prompts: - messages = [{"role": "user", "content": prompt}] - text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - texts.append(text) - prompts = texts - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, 
truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] diff --git a/test/test_cpu/requirements.txt b/test/test_cpu/requirements.txt index 219189829..a54cc4e4e 100644 --- a/test/test_cpu/requirements.txt +++ b/test/test_cpu/requirements.txt @@ -3,7 +3,6 @@ modelscope gguf sentencepiece torchvision -parameterized pillow numba llmcompressor @ git+https://github.com/vllm-project/llm-compressor.git@main diff --git a/test/test_cpu/test_act_quantization.py b/test/test_cpu/test_act_quantization.py index 31ba51f1b..cd41c0985 100644 --- a/test/test_cpu/test_act_quantization.py +++ b/test/test_cpu/test_act_quantization.py @@ -1,87 +1,72 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 +class TestAutoRoundAct: + save_dir = "./saved" - def __iter__(self): - for i in range(3): - yield torch.ones([1, 10], dtype=torch.long) + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + # Yield to hand control to the test methods + yield -class TestAutoRoundAct(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - self.save_dir = "./saved" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() - - @classmethod - def tearDownClass(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_mx_fp4(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + def test_mx_fp4(self, tiny_opt_model, opt_tokenizer, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model, + opt_tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, act_bits=4, data_type="mx_fp", ) autoround.quantize() - def test_wint4fp8_dynamic(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + def test_wint4fp8_dynamic(self, tiny_opt_model, opt_tokenizer, dataloader): bits, group_size = 4, 128 autoround = AutoRound( - model, - tokenizer, + tiny_opt_model, + 
opt_tokenizer, bits=bits, group_size=group_size, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, act_bits=8, data_type="fp8", act_data_type="fp8", ) autoround.quantize() - def test_wint4fp8_static(self): + def test_wint4fp8_static(self, tiny_opt_model, opt_tokenizer, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( - self.model, - self.tokenizer, + tiny_opt_model, + opt_tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, act_bits=8, data_type="fp8_to_int_sym", act_dynamic=False, @@ -89,66 +74,42 @@ def test_wint4fp8_static(self): ) autoround.quantize() - def test_wfp8afp8_static(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + @pytest.mark.parametrize("act_group_size", [-1, 128]) + def test_wfp8afp8_static(self, act_group_size, tiny_opt_model, opt_tokenizer, dataloader): from auto_round.wrapper import WrapperWALayer - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) autoround = AutoRound( - model, - tokenizer, + tiny_opt_model, + opt_tokenizer, group_size=128, - act_group_size=-1, + act_group_size=act_group_size, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, data_type="fp8", act_dynamic=False, act_data_type="fp8", ) autoround.quantize() - self.assertTrue(isinstance(autoround.model.model.decoder.layers[2].self_attn.k_proj, WrapperWALayer)) - self.assertEqual(autoround.model.model.decoder.layers[2].self_attn.k_proj.orig_layer.act_scale.shape[0], 30) - self.assertEqual(autoround.model.model.decoder.layers[2].self_attn.k_proj.orig_layer.act_max.shape[0], 30) + k_proj = autoround.model.model.decoder.layers[1].self_attn.k_proj + assert isinstance(k_proj, WrapperWALayer), "k_proj should be WrapperWALayer" + if act_group_size == -1: + assert k_proj.orig_layer.act_scale.shape[0] == 20, "act_scale shape[0] should be 20" + assert k_proj.orig_layer.act_max.shape[0] == 20, "act_max shape[0] should be 20" + else: + assert k_proj.orig_layer.act_scale.shape[0] == int(2 * 10 * 768 / 128), "act_scale shape[0] is incorrect" + assert k_proj.orig_layer.act_max.shape[0] == int(2 * 10 * 768 / 128), "act_max shape[0] is incorrect" - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - autoround = AutoRound( - model, - tokenizer, - group_size=128, - act_group_size=128, - iters=0, - seqlen=2, - dataset=self.llm_dataloader, - data_type="fp8", - act_dynamic=False, - act_data_type="fp8", - ) - autoround.quantize() - self.assertTrue(isinstance(autoround.model.model.decoder.layers[2].self_attn.k_proj, WrapperWALayer)) - - self.assertEqual( - autoround.model.model.decoder.layers[2].self_attn.k_proj.orig_layer.act_scale.shape[0], - int(3 * 10 * 768 / 128), - ) - self.assertEqual( - autoround.model.model.decoder.layers[2].self_attn.k_proj.orig_layer.act_max.shape[0], - int(3 * 10 * 768 / 128), - ) - - def test_act_config_MXFP4_saving(self): + def test_act_config_MXFP4_saving(self, tiny_opt_model_path, dataloader): scheme = "MXFP4" layer_config = {"lm_head": {"act_bits": 8, "bits": 8}, "k_proj": {"act_bits": 8, "bits": 8}} autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, - 
dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -168,15 +129,15 @@ def test_act_config_MXFP4_saving(self): assert "sym" in kproj_config.keys() and kproj_config["sym"] shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_act_config_NVFP4_saving(self): + def test_act_config_NVFP4_saving(self, tiny_opt_model_path, dataloader): scheme = "NVFP4" layer_config = {"k_proj": {"act_bits": 16, "bits": 16}} autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -193,16 +154,16 @@ def test_act_config_NVFP4_saving(self): assert "sym" in kproj_config.keys() and kproj_config["sym"] shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_WOQ_config_INT_saving(self): + def test_WOQ_config_INT_saving(self, tiny_opt_model_path, dataloader): scheme = "W4A16" layer_config = {"k_proj": {"bits": 8}} autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, sym=False, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -223,7 +184,7 @@ def test_WOQ_config_INT_saving(self): assert "act_dynamic" in kproj_config.keys() and kproj_config["act_dynamic"] shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_act_config_FP8_saving(self): + def test_act_config_FP8_saving(self, tiny_opt_model_path, dataloader): scheme = "FP8_STATIC" layer_config = { "lm_head": {"act_bits": 8, "bits": 8}, @@ -237,11 +198,11 @@ def test_act_config_FP8_saving(self): }, } autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -262,7 +223,3 @@ def test_act_config_FP8_saving(self): assert "group_size" in kproj_config.keys() and kproj_config["group_size"] == 0 assert "sym" in kproj_config.keys() and kproj_config["sym"] shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_alg_ext.py b/test/test_cpu/test_alg_ext.py index b0c909bd3..0bfdfba47 100644 --- a/test/test_cpu/test_alg_ext.py +++ b/test/test_cpu/test_alg_ext.py @@ -1,37 +1,30 @@ -import copy -import shutil -import sys -import unittest - -from parameterized import parameterized - -sys.path.insert(0, "../..") - from auto_round import AutoRound +from ..helpers import qwen_name_or_path + -class TestAlgExt(unittest.TestCase): - def test_alg_ext(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" +class TestAlgExt: + def test_alg_ext(self, tiny_opt_model_path, tiny_qwen_model_path): + model_name = tiny_opt_model_path ar = AutoRound(model_name, scheme="W2A16", iters=1, nsamples=1, enable_alg_ext=True) ar.quantize() - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = tiny_qwen_model_path ar = AutoRound(model_name, scheme="gguf:q4_k_s", iters=1, nsamples=1, enable_alg_ext=True) ar.quantize() from auto_round.auto_scheme import AutoScheme scheme = AutoScheme(options=["mxfp4", "mxfp8"], avg_bits=5.5, ignore_scale_zp_bits=True) - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = tiny_qwen_model_path ar = AutoRound(model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True) 
ar.quantize() def test_alg_ext_import(self): from auto_round.alg_ext import wrapper_autoround - def test_all_support_dtype(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_all_support_dtype(self, tiny_opt_model_path): + model_name = tiny_opt_model_path for scheme in ["MXFP4", "NVFP4", "W2A16G64"]: ar = AutoRound( model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True diff --git a/test/test_cpu/test_asym.py b/test/test_cpu/test_asym.py index 842b208ed..32a0151b3 100644 --- a/test/test_cpu/test_asym.py +++ b/test/test_cpu/test_asym.py @@ -6,13 +6,14 @@ sys.path.insert(0, "../..") import torch -from _test_helpers import model_infer from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.utils import get_module +from ..helpers import get_model_path, model_infer + class LLMDataLoader: def __init__(self): @@ -27,7 +28,7 @@ class TestAutoRoundAsym(unittest.TestCase): @classmethod def setUpClass(self): # self.model_name = "/models/opt-125m" - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + self.model_name = get_model_path("facebook/opt-125m") self.save_folder = "./saved" @classmethod diff --git a/test/test_cpu/test_auto_scheme.py b/test/test_cpu/test_auto_scheme.py index cd38b220d..9d549076f 100644 --- a/test/test_cpu/test_auto_scheme.py +++ b/test/test_cpu/test_auto_scheme.py @@ -1,24 +1,28 @@ import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest + from auto_round import AutoRound, AutoRoundConfig, AutoScheme -class TestAutoScheme(unittest.TestCase): - @classmethod - def setUpClass(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" +class TestAutoScheme: + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield - @classmethod - def tearDownClass(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_auto_scheme_export(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_auto_scheme_export(self, tiny_opt_model_path): + model_name = tiny_opt_model_path scheme = AutoScheme(avg_bits=2, options=("W2A16"), nsamples=1, ignore_scale_zp_bits=True) ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) ar.quantize_and_save(self.save_dir) @@ -29,27 +33,23 @@ def test_auto_scheme_export(self): ar.quantize_and_save(self.save_dir) shutil.rmtree(self.save_dir, ignore_errors=True) - def test_layer_config(self): + def test_layer_config(self, tiny_opt_model_path): from auto_round.auto_scheme.utils import compute_avg_bits_for_model from auto_round.utils import get_module - target_bits = 3.0 - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "BF16")) - user_layer_config = {"model.decoder.layers.10.fc1": {"bits": 8, "group_size": 32, "sym": False}} + target_bits = 3.5 + model_name = tiny_opt_model_path + scheme = AutoScheme(avg_bits=target_bits, options=("W2A16", "W4A16", "BF16")) + user_layer_config = {"model.decoder.layers.1.fc1": {"bits": 8, "group_size": 32, "sym": False}} ar = 
AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["bits"], 8) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["sym"], False) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["group_size"], 32) - layer = get_module(model, "model.decoder.layers.10.fc1") - self.assertEqual(layer.bits, 8) - self.assertEqual(layer.sym, False) - self.assertEqual(layer.group_size, 32) + assert layer_config["model.decoder.layers.1.fc1"]["bits"] == 8 + assert layer_config["model.decoder.layers.1.fc1"]["sym"] is False + assert layer_config["model.decoder.layers.1.fc1"]["group_size"] == 32 + layer = get_module(model, "model.decoder.layers.1.fc1") + assert layer.bits == 8 + assert layer.sym is False + assert layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_autoopt.py b/test/test_cpu/test_autoopt.py index f9801217e..c14e04c0e 100644 --- a/test/test_cpu/test_autoopt.py +++ b/test/test_cpu/test_autoopt.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer @@ -11,48 +9,37 @@ from auto_round import AutoRoundAdam -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 +class TestAutoRound: - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + # Yield to hand control to the test methods + yield -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() - - @classmethod - def tearDownClass(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_Adam(self): + def test_Adam(self, tiny_opt_model, opt_tokenizer, dataloader): bits, group_size, sym = 4, 128, False from auto_round.utils import get_block_names - llm_block_names = get_block_names(self.model, quant_vision=True) + llm_block_names = get_block_names(tiny_opt_model, quant_vision=True) bits, group_size, sym, batch_size = 4, 128, False, 20 adamround = AutoRoundAdam( - self.model, - self.tokenizer, + tiny_opt_model, + opt_tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, batch_size=batch_size, - dataset=self.llm_dataloader, + dataset=dataloader, to_quant_block_names=llm_block_names, ) adamround.quantize() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_autoround.py b/test/test_cpu/test_autoround.py index 2790f8817..aa7aeca5e 100644 --- a/test/test_cpu/test_autoround.py +++ b/test/test_cpu/test_autoround.py @@ -1,56 +1,40 @@ import copy import shutil -import sys -import unittest - -from parameterized import parameterized - -sys.path.insert(0, 
"../..") +import pytest import torch -from _test_helpers import model_infer from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.utils import get_module - -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(3): - yield torch.ones([1, 10], dtype=torch.long) +from ..helpers import get_model_path, model_infer, opt_name_or_path, qwen_name_or_path -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def setup_class(self): + model_name = opt_name_or_path self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() self.save_folder = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_bits_setting(self): + def test_bits_setting(self, tiny_opt_model_path): layer_config = {"model.decoder.layers.0.self_attn.k_proj": {"data_type": "mx_fp8", "group_size": 32}} - autoround = AutoRound( - "/tf_dataset/auto_round/models/facebook/opt-125m", iters=2, seqlen=2, nsamples=1, layer_config=layer_config - ) + autoround = AutoRound(tiny_opt_model_path, iters=2, seqlen=2, nsamples=1, layer_config=layer_config) autoround.quantize() module = get_module(autoround.model, "model.decoder.layers.0.self_attn.k_proj") if module.bits != 8: raise ValueError(f"Expected bits to be 8, but got {module.bits}") - def test_layer_config(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_layer_config(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path layer_config = {"self_attn": {"bits": 4, "data_type": "nv_fp", "act_bits": 16, "group_size": 16}} autoround = AutoRound( model_name, @@ -58,15 +42,15 @@ def test_layer_config(self): scheme="NVFP4", iters=0, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, amp=False, ) autoround.quantize_and_save(self.save_folder, inplace=False, format="fake") shutil.rmtree(self.save_folder) - def test_remove_whole_block(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_remove_whole_block(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path layer_config = { "model.decoder.layers.0.self_attn.k_proj": {"bits": 32}, "model.decoder.layers.0.self_attn.v_proj": {"bits": 32}, @@ -83,45 +67,37 @@ def test_remove_whole_block(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() - def test_consecutive_quant(self): + def test_consecutive_quant(self, tiny_opt_model_path, tiny_phi2_model_path, dataloader): bits, group_size, sym = 4, -1, False autoround = AutoRound( - self.model, - self.tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - model = AutoModelForCausalLM.from_pretrained( - "/tf_dataset/auto_round/models/microsoft/phi-2", torch_dtype="auto", trust_remote_code=True - ) - tokenizer = AutoTokenizer.from_pretrained( - 
"/tf_dataset/auto_round/models/microsoft/phi-2", trust_remote_code=True - ) autoround = AutoRound( - model, - tokenizer, + tiny_phi2_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_mx_fp4(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_mx_fp4(self, dataloader): + model_name = opt_name_or_path bits, group_size, sym = 4, 32, False autoround = AutoRound( model_name, @@ -140,10 +116,10 @@ def test_mx_fp4(self): model, self.tokenizer, batch_size="auto:8", tasks="lambada_openai", limit=32 ) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.3) # 0.375 + assert result["results"]["lambada_openai"]["acc,none"] > 0.3 # 0.375 - def test_nv_fp4(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_nv_fp4(self, dataloader): + model_name = opt_name_or_path bits, group_size, sym = 4, 16, False autoround = AutoRound( model_name, @@ -152,7 +128,7 @@ def test_nv_fp4(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, data_type="nv_fp4", ) model, _ = autoround.quantize() @@ -160,10 +136,10 @@ def test_nv_fp4(self): model, self.tokenizer, batch_size="auto:8", tasks="lambada_openai", limit=32 ) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 - def test_w4g1(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_w4g1(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, -1, True autoround = AutoRound( model_name, @@ -172,13 +148,13 @@ def test_w4g1(self): sym=sym, iters=2, seqlen=10, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - @parameterized.expand([(2,), (3,), (4,)]) - def test_g128(self, bits): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + @pytest.mark.parametrize("bits", [2, 3, 4]) + def test_g128(self, bits, dataloader): + model_name = opt_name_or_path group_size, sym = 128, True autoround = AutoRound( model_name, @@ -187,7 +163,7 @@ def test_g128(self, bits): sym=sym, iters=2, seqlen=10, - dataset=self.llm_dataloader, + dataset=dataloader, ) model, _ = autoround.quantize() if bits > 2: @@ -195,9 +171,9 @@ def test_g128(self, bits): model, self.tokenizer, batch_size="auto:8", tasks="lambada_openai", limit=32 ) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.3) + assert result["results"]["lambada_openai"]["acc,none"] > 0.3 - def test_disable_quanted_input(self): + def test_disable_quanted_input(self, dataloader): bits, group_size, sym = 4, -1, True autoround = AutoRound( self.model, @@ -208,13 +184,13 @@ def test_disable_quanted_input(self): iters=2, seqlen=10, enable_quanted_input=False, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_enable_norm_bias_tuning_qwen3(self): + def test_enable_norm_bias_tuning_qwen3(self, tiny_qwen_model_path, dataloader): bits, group_size, sym = 4, 128, True - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = tiny_qwen_model_path model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = 
AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) autoround = AutoRound( @@ -226,11 +202,11 @@ def test_enable_norm_bias_tuning_qwen3(self): iters=2, seqlen=10, enable_norm_bias_tuning=True, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_enable_norm_bias_tuning(self): + def test_enable_norm_bias_tuning(self, dataloader): bits, group_size, sym = 4, -1, True autoround = AutoRound( self.model, @@ -242,11 +218,11 @@ def test_enable_norm_bias_tuning(self): seqlen=10, enable_quanted_input=False, enable_norm_bias_tuning=True, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_disable_minmax_tuning(self): + def test_disable_minmax_tuning(self, dataloader): bits, group_size, sym = 4, -1, True autoround = AutoRound( self.model, @@ -257,13 +233,13 @@ def test_disable_minmax_tuning(self): iters=2, seqlen=10, enable_minmax_tuning=False, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() # - def test_signround(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_signround(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, -1, False autoround = AutoRound( model_name, @@ -274,11 +250,11 @@ def test_signround(self): seqlen=10, enable_minmax_tuning=False, enable_quanted_input=False, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_lm_head_layer_config_way(self): + def test_lm_head_layer_config_way(self, dataloader): bits, group_size, sym = 4, -1, False layer_config = {"lm_head": {"data_type": "int"}} autoround = AutoRound( @@ -291,13 +267,13 @@ def test_lm_head_layer_config_way(self): seqlen=10, enable_minmax_tuning=False, enable_quanted_input=False, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() - def test_wa_quant(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_wa_quant(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym, act_bits = 4, 128, False, 4 autoround = AutoRound( model_name, @@ -306,14 +282,14 @@ def test_wa_quant(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, act_bits=act_bits, ) autoround.quantize() - def test_auto_device_map(self): + def test_auto_device_map(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, False - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="auto", trust_remote_code=True, device_map="auto" ) @@ -325,11 +301,11 @@ def test_auto_device_map(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_device_map_dict(self): + def test_device_map_dict(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, False device_map = {".*": "cpu"} autoround = AutoRound( @@ -340,13 +316,13 @@ def test_device_map_dict(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, device_map=device_map, ) autoround.quantize() # test model_name - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path autoround = AutoRound( model_name, self.tokenizer, @@ -355,14 +331,14 @@ def test_device_map_dict(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, 
device_map=device_map, ) autoround.quantize() - def test_fp32(self): + def test_fp32(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, False - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.float32, trust_remote_code=True, device_map="auto" ) @@ -374,12 +350,12 @@ def test_fp32(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, amp=False, ) autoround.quantize() - def test_tensor_reshape(self): + def test_tensor_reshape(self, dataloader): bits, group_size, sym = 4, 100, False autoround = AutoRound( self.model, @@ -389,12 +365,12 @@ def test_tensor_reshape(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_rtn(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_rtn(self, tiny_opt_model_path): + model_name = tiny_opt_model_path model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -412,9 +388,9 @@ def test_rtn(self): model_infer(model, tokenizer) shutil.rmtree(self.save_folder) - def test_embed_quant(self): + def test_embed_quant(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, True - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path layer_config = { "model.decoder.embed_tokens": {"bits": 4}, } @@ -426,14 +402,14 @@ def test_embed_quant(self): iters=2, seqlen=2, nsamples=3, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() - def test_fallback_layers(self): + def test_fallback_layers(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, True - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.float32, trust_remote_code=True, device_map="auto" ) @@ -450,7 +426,7 @@ def test_fallback_layers(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -475,17 +451,17 @@ def test_not_convert_modules(self): from auto_round_extension.ipex.qlinear_ipex_awq import QuantLinear - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct-AWQ" + model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct-AWQ") quantization_config = AutoRoundConfig() model = Qwen2VLForConditionalGeneration.from_pretrained( model_name, quantization_config=quantization_config, device_map="cpu", torch_dtype=torch.float16 ) - self.assertTrue(isinstance(model.visual.blocks[0].attn.qkv, torch.nn.Linear)) - self.assertFalse(isinstance(model.visual.merger.mlp[0], QuantLinear)) + assert isinstance(model.visual.blocks[0].attn.qkv, torch.nn.Linear) + assert not isinstance(model.visual.merger.mlp[0], QuantLinear) if hasattr(model.model, "language_model"): - self.assertTrue(isinstance(model.model.language_model.layers[0].self_attn.v_proj, QuantLinear)) + assert isinstance(model.model.language_model.layers[0].self_attn.v_proj, QuantLinear) else: - self.assertTrue(isinstance(model.model.layers[0].self_attn.v_proj, QuantLinear)) + assert isinstance(model.model.layers[0].self_attn.v_proj, QuantLinear) processor = AutoProcessor.from_pretrained(model_name, size=None) image_url = 
"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" @@ -520,8 +496,8 @@ def test_not_convert_modules(self): ) print(output_text) - def test_fallback_layers_regex_awq(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_fallback_layers_regex_awq(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -537,7 +513,7 @@ def test_fallback_layers_regex_awq(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -556,8 +532,8 @@ def test_fallback_layers_regex_awq(self): print(res) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_fallback_layers_regex_gptq(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_fallback_layers_regex_gptq(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -573,7 +549,7 @@ def test_fallback_layers_regex_gptq(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -592,8 +568,8 @@ def test_fallback_layers_regex_gptq(self): print(res) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_fallback_layers_regex_round(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_fallback_layers_regex_round(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -609,7 +585,7 @@ def test_fallback_layers_regex_round(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -628,13 +604,13 @@ def test_fallback_layers_regex_round(self): print(res) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_fallback_layers_regex_exception(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_fallback_layers_regex_exception(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) layer_config = {"model.decoder.layers.12.self_attn.k_proj": {"bits": 16}} - with self.assertRaises(ValueError): + with pytest.raises(ValueError): autoround = AutoRound( model, tokenizer=tokenizer, @@ -643,21 +619,11 @@ def test_fallback_layers_regex_exception(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() - # def test_fp8_model_input_rtn_generation(self): - # model_name = "Qwen/Qwen3-0.6B-FP8" - # ar = AutoRound(model=model_name, iters=0) - # ar.quantize_and_save(output_dir=self.save_folder) - # model = AutoModelForCausalLM.from_pretrained(self.save_folder, torch_dtype="auto", 
trust_remote_code=True) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - # text = "There is a girl who likes adventure," - # inputs = tokenizer(text, return_tensors="pt").to(model.device) - # print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) - def test_dequant_fp8_weight(self): from auto_round.utils import dequant_block_fp8_weight @@ -666,44 +632,44 @@ def test_dequant_fp8_weight(self): weight_scale = torch.randn(5, 56) block_size = [128, 128] dequant_weight = dequant_block_fp8_weight(weight, weight_scale, block_size) - self.assertEqual(dequant_weight.shape.numel(), 4207616) + assert dequant_weight.shape.numel() == 4207616 # test experts are stacked. weight = torch.randn([32, 5760, 1440]) weight_scale = torch.randn([32, 5760, 90]) block_size = [1, 16] dequant_weight = dequant_block_fp8_weight(weight, weight_scale, block_size) - self.assertEqual(len(dequant_weight.shape), 3) - self.assertEqual(dequant_weight.shape[0], 32) - self.assertEqual(dequant_weight.shape.numel(), 32 * 5760 * 1440) + assert len(dequant_weight.shape) == 3 + assert dequant_weight.shape[0] == 32 + assert dequant_weight.shape.numel() == 32 * 5760 * 1440 - def test_mixed_bit_setting(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - layer_config = {"model.decoder.layers.7.fc1": {"bits": 8, "act_bits": 8}} + def test_mixed_bit_setting(self, tiny_opt_model_path): + model_name = tiny_opt_model_path + layer_config = {"model.decoder.layers.1.fc1": {"bits": 8, "act_bits": 8}} ar = AutoRound(model_name, data_type="mx_fp4", act_bits=4, iters=0, layer_config=layer_config) ar.quantize() layer_config = ar.layer_config if ( - layer_config["model.decoder.layers.7.fc1"]["bits"] != 8 - or layer_config["model.decoder.layers.7.fc1"]["act_bits"] != 8 + layer_config["model.decoder.layers.1.fc1"]["bits"] != 8 + or layer_config["model.decoder.layers.1.fc1"]["act_bits"] != 8 ): raise ValueError("mixed bits is not correct") - def test_invalid_layer_config(self): - with self.assertRaises(ValueError): + def test_invalid_layer_config(self, tiny_opt_model_path): + with pytest.raises(ValueError): layer_config = {"model.decoder.layers.2.self_attnx": {"bits": 2}} ar = AutoRound( - "/tf_dataset/auto_round/models/facebook/opt-125m", + tiny_opt_model_path, scheme="W3A16", nsamples=1, iters=1, layer_config=layer_config, ) ar.quantize() - with self.assertRaises(ValueError): + with pytest.raises(ValueError): layer_config = {"model.decoder.layers.2.self_attn": {"bit": 2}} # should be bits ar = AutoRound( - "/tf_dataset/auto_round/models/facebook/opt-125m", + tiny_opt_model_path, scheme="W3A16", nsamples=1, iters=1, @@ -711,8 +677,8 @@ def test_invalid_layer_config(self): ) ar.quantize() - def test_quant_lm_head(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-8B" + def test_quant_lm_head(self, tiny_untied_qwen_model_path): + model_name = tiny_untied_qwen_model_path ar = AutoRound(model_name, quant_lm_head=True, iters=0, seqlen=8, nsamples=1, disable_opt_rtn=True) ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu") @@ -734,8 +700,8 @@ def test_quant_lm_head(self): assert "lm_head" in model.config.quantization_config.extra_config assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 - def test_quant_lm_head_layer_config(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-8B" + def test_quant_lm_head_layer_config(self, tiny_untied_qwen_model_path): + 
model_name = tiny_untied_qwen_model_path layer_config = {"lm_head": {"bits": 4}} ar = AutoRound( model_name, @@ -751,22 +717,22 @@ def test_quant_lm_head_layer_config(self): assert "lm_head" in model.config.quantization_config.extra_config assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 - def test_compressor(self): - model_name = "Qwen/Qwen2-VL-2B-Instruct" + def test_compressor(self, tiny_qwen_vl_model_path): + model_name = tiny_qwen_vl_model_path ar = AutoRound(model_name, enable_adam=True) - self.assertEqual(ar.optimizer, torch.optim.AdamW) - self.assertTrue(ar.mllm) + assert ar.optimizer == torch.optim.AdamW + assert ar.mllm # test old api from auto_round import AutoRoundMLLM ar = AutoRoundMLLM(model_name) - self.assertTrue(ar.mllm) + assert ar.mllm def test_attention_mask_in_dataset(self): from transformers import AutoTokenizer - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = qwen_name_or_path # model_name = "/models/Qwen3-0.6B" tokenizer = AutoTokenizer.from_pretrained(model_name) text = ["haha", "hello world"] @@ -784,7 +750,7 @@ def test_attention_mask_in_dataset(self): def test_attention_mask_via_tokenize_in_dataset(self): from transformers import AutoTokenizer - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = qwen_name_or_path # model_name = "/models/Qwen3-0.6B" tokenizer = AutoTokenizer.from_pretrained(model_name) text = ["haha", "hello world"] @@ -801,9 +767,9 @@ def test_attention_mask_via_tokenize_in_dataset(self): ar = AutoRound(model_name, iters=1, dataset=data, seqlen=8) ar.quantize() - def test_low_cpu_mem_usage(self): + def test_low_cpu_mem_usage(self, tiny_opt_model_path, dataloader): bits, group_size = 4, 32 - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) quantized_model_path = self.save_folder @@ -814,7 +780,7 @@ def test_low_cpu_mem_usage(self): group_size=group_size, iters=2, seqlen=10, - dataset=self.llm_dataloader, + dataset=dataloader, low_cpu_mem_usage=True, device_map="cpu", ) @@ -822,11 +788,7 @@ def test_low_cpu_mem_usage(self): shutil.rmtree(quantized_model_path, ignore_errors=True) def test_create_adam(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = qwen_name_or_path from auto_round import AutoRound ar = AutoRound(model=model_name, enable_adam=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_autoround_acc.py b/test/test_cpu/test_autoround_acc.py index 41b28e663..876d4a452 100644 --- a/test/test_cpu/test_autoround_acc.py +++ b/test/test_cpu/test_autoround_acc.py @@ -1,42 +1,29 @@ import copy import shutil -import sys -import unittest - -from auto_round.eval.evaluation import simple_evaluate - -sys.path.insert(0, "../..") from math import isclose +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound # pylint: disable=E0401 - -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +from ..helpers import gptj_name_or_path -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): - self.llm_dataloader = LLMDataLoader() + def setup_class(self): 
self.save_dir = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_dir, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_default_acc(self): - model_name = "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM" + def test_default_acc(self, dataloader): + model_name = gptj_name_or_path model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -50,7 +37,7 @@ def test_default_acc(self): sym=sym, iters=2, seqlen=10, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() out0 = model(inp) @@ -66,28 +53,19 @@ def test_default_acc(self): device="cpu", iters=2, seqlen=10, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround_1.quantize() out1 = model_tmp(inp) assert out0[0].equal(out1[0]) - self.assertTrue(isclose(float(out0[0][0][0][0]), -0.021002087742090225, rel_tol=5e-04)) + assert isclose(float(out0[0][0][0][0]), -0.021002087742090225, rel_tol=5e-04) - def test_3bits_asym_autoround(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_3bits_asym_autoround(self, tiny_opt_model_path): + model_name = tiny_opt_model_path bits, sym = 3, False autoround = AutoRound(model_name, bits=bits, sym=sym, iters=0) autoround.quantize_and_save(self.save_dir, format="auto_round", inplace=False) model_args = f"pretrained={self.save_dir}" - # res = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto", limit=10) - - # accuracy = res["results"]["lambada_openai"]["acc,none"] - # print(f"accuracy = {accuracy}") - # assert accuracy > 0.15 shutil.rmtree(self.save_dir, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_autoround_export_to_itrex.py b/test/test_cpu/test_autoround_export_to_itrex.py index d9b4f42c6..19f196270 100644 --- a/test/test_cpu/test_autoround_export_to_itrex.py +++ b/test/test_cpu/test_autoround_export_to_itrex.py @@ -1,15 +1,15 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound +from ..helpers import get_model_path, gptj_name_or_path + class SimpleDataLoader: def __init__(self): @@ -20,35 +20,23 @@ def __iter__(self): yield torch.randn([1, 30]) -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoroundExport(unittest.TestCase): +class TestAutoroundExport: approach = "weight_only" @classmethod - def setUpClass(self): + def setup_class(self): self.gptj = transformers.AutoModelForCausalLM.from_pretrained( - "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM", + gptj_name_or_path, torchscript=True, ) - self.tokenizer = transformers.AutoTokenizer.from_pretrained( - "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM", trust_remote_code=True - ) + self.tokenizer = transformers.AutoTokenizer.from_pretrained(gptj_name_or_path, trust_remote_code=True) self.gptj_no_jit = transformers.AutoModelForCausalLM.from_pretrained( - "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM", + gptj_name_or_path, ) - self.llm_dataloader 
= LLMDataLoader() self.lm_input = torch.ones([1, 10], dtype=torch.long) @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -64,11 +52,11 @@ def test_autoround_int_quant(self): out2 = model(self.lm_input) out3 = q_model(self.lm_input) out4 = compressed_model(self.lm_input) - self.assertTrue(torch.all(torch.isclose(out1[0], out2[0], atol=1e-1))) - self.assertFalse(torch.all(out1[0] == out2[0])) - self.assertTrue(torch.all(out2[0] == out3[0])) - self.assertTrue(torch.all(torch.isclose(out3[0], out4[0], atol=1e-3))) - self.assertTrue("transformer.h.0.attn.k_proj.qzeros" in compressed_model.state_dict().keys()) + assert torch.all(torch.isclose(out1[0], out2[0], atol=1e-1)) + assert not torch.all(out1[0] == out2[0]) + assert torch.all(out2[0] == out3[0]) + assert torch.all(torch.isclose(out3[0], out4[0], atol=1e-3)) + assert "transformer.h.0.attn.k_proj.qzeros" in compressed_model.state_dict().keys() model = copy.deepcopy(self.gptj) out6 = model(self.lm_input) @@ -78,19 +66,19 @@ def test_autoround_int_quant(self): compressed_model = compressed_model.to(torch.float32) out4 = q_model(self.lm_input) out5 = compressed_model(self.lm_input) - self.assertTrue(torch.all(out1[0] == out6[0])) - self.assertTrue(torch.all(torch.isclose(out4[0], out5[0], atol=5e-3))) + assert torch.all(out1[0] == out6[0]) + assert torch.all(torch.isclose(out4[0], out5[0], atol=5e-3)) def test_config(self): from auto_round.export.export_to_itrex import QuantConfig - config = QuantConfig.from_pretrained("/tf_dataset/auto_round/models/TheBloke/Llama-2-7B-Chat-GPTQ") + config = QuantConfig.from_pretrained(get_model_path("TheBloke/Llama-2-7B-Chat-GPTQ")) config.save_pretrained("quantization_config_dir") loaded_config = QuantConfig.from_pretrained("quantization_config_dir") - self.assertEqual(config.group_size, loaded_config.group_size) - self.assertEqual(config.desc_act, loaded_config.desc_act) - self.assertEqual(config.bits, loaded_config.bits) - self.assertEqual(config.sym, loaded_config.sym) + assert config.group_size == loaded_config.group_size + assert config.desc_act == loaded_config.desc_act + assert config.bits == loaded_config.bits + assert config.sym == loaded_config.sym def test_xpu_export(self): model = copy.deepcopy(self.gptj) @@ -106,12 +94,8 @@ def test_xpu_export(self): out3 = q_model(self.lm_input) out4 = compressed_model_xpu(self.lm_input) out5 = compressed_model_cpu(self.lm_input) - self.assertTrue(torch.all(torch.isclose(out1[0], out2[0], atol=1e-1))) - self.assertFalse(torch.all(out1[0] == out2[0])) - self.assertTrue(torch.all(out2[0] == out3[0])) - self.assertTrue(torch.all(torch.isclose(out3[0], out4[0], atol=1e-3))) - self.assertTrue(torch.all(torch.isclose(out4[0], out5[0], atol=1e-5))) - - -if __name__ == "__main__": - unittest.main() + assert torch.all(torch.isclose(out1[0], out2[0], atol=1e-1)) + assert not torch.all(out1[0] == out2[0]) + assert torch.all(out2[0] == out3[0]) + assert torch.all(torch.isclose(out3[0], out4[0], atol=1e-3)) + assert torch.all(torch.isclose(out4[0], out5[0], atol=1e-5)) diff --git a/test/test_cpu/test_block_names.py b/test/test_cpu/test_block_names.py index 501caee25..47c554317 100644 --- a/test/test_cpu/test_block_names.py +++ b/test/test_cpu/test_block_names.py @@ -1,25 +1,14 @@ import os import shutil -import sys -import unittest -sys.path.insert(0, ".") -sys.path.insert(0, "../..") +import pytest import torch import torch.nn as nn from transformers import 
AutoConfig, AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound - -class LLMDataLoader: - def __init__(self, input_size=10): - self.batch_size = 1 - self.input_size = input_size - - def __iter__(self): - for i in range(2): - yield torch.ones([1, self.input_size], dtype=torch.long) +from ..helpers import get_model_path, lamini_name_or_path # ================= simple multimodal model ================= @@ -116,15 +105,14 @@ def forward(self, x): return output -class TestQuantizationBlocks(unittest.TestCase): +class TestQuantizationBlocks: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/MBZUAI/LaMini-GPT-124M" + def setup_class(self): + self.model_name = lamini_name_or_path self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -133,7 +121,6 @@ def test_moe_quant(self): hidden_size = 10 num_groups = 2 experts_per_group = 2 - self.llm_dataloader = LLMDataLoader(input_size) self.model = NestedMoEModel(input_size, hidden_size, num_groups, experts_per_group) from auto_round.utils import get_block_names @@ -159,7 +146,7 @@ def test_multimodal_quant(self): assert block_names_wo_vision == llm_block_names assert len(block_names_wo_vision) != (block_names_with_vision) - def test_block_name_quant(self): + def test_block_name_quant(self, dataloader): self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) from auto_round.utils import get_block_names @@ -174,7 +161,7 @@ def test_block_name_quant(self): iters=2, seqlen=2, batch_size=batch_size, - dataset=self.llm_dataloader, + dataset=dataloader, to_quant_block_names=llm_block_names, ) autoround.quantize() @@ -191,33 +178,29 @@ def test_block_name_quant(self): assert quant_config.block_name_to_quantize is not None shutil.rmtree("./saved", ignore_errors=True) - def test_mm_block_name(self): + def test_mm_block_name(self, tiny_qwen_vl_model_path): from transformers import Qwen2VLForConditionalGeneration from auto_round.utils import get_block_names - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct" + model_name = tiny_qwen_vl_model_path model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True, device_map="auto") block_name = get_block_names(model, quant_vision=True) - self.assertTrue(len(block_name) == 2) - self.assertTrue(all(["visual.merger.mlp" not in n for n in block_name])) + assert len(block_name) == 2 + assert all(["visual.merger.mlp" not in n for n in block_name]) block_name = get_block_names(model, quant_vision=False) - self.assertTrue(len(block_name) == 1) - self.assertTrue(block_name == get_block_names(model)) + assert len(block_name) == 1 + assert block_name == get_block_names(model) def test_moe(self): from auto_round.utils import get_block_names - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen1.5-MoE-A2.7B" + model_name = get_model_path("Qwen/Qwen1.5-MoE-A2.7B") # config = AutoConfig.from_pretrained(model_name) model = AutoModelForCausalLM.from_pretrained(model_name) block_name = get_block_names(model) block_name_2 = get_block_names(model, quant_vision=True) - self.assertTrue(block_name == block_name_2) - self.assertTrue(len(block_name_2) == 1) - self.assertTrue("model.layers.23" == block_name_2[0][-1]) - - -if __name__ == "__main__": - 
unittest.main() + assert block_name == block_name_2 + assert len(block_name_2) == 1 + assert "model.layers.23" == block_name_2[0][-1] diff --git a/test/test_cpu/test_calib_dataset.py b/test/test_cpu/test_calib_dataset.py index 689cc705c..cb276147e 100644 --- a/test/test_cpu/test_calib_dataset.py +++ b/test/test_cpu/test_calib_dataset.py @@ -1,29 +1,19 @@ +import json import os import shutil -import sys -import unittest - -sys.path.insert(0, "../..") -import json +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound +from ..helpers import get_model_path, opt_name_or_path -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - -class TestLocalCalibDataset(unittest.TestCase): +class TestLocalCalibDataset: @classmethod - def setUpClass(self): + def setup_class(self): json_data = [{"text": "awefdsfsddfd"}, {"text": "fdfdfsdfdfdfd"}, {"text": "dfdsfsdfdfdfdf"}] os.makedirs("./saved", exist_ok=True) self.json_file = "./saved/tmp.json" @@ -38,7 +28,7 @@ def setUpClass(self): json.dump(item, jsonl_file, ensure_ascii=False) jsonl_file.write("\n") - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -71,7 +61,7 @@ def test_jsonl(self): autoround.quantize() def test_apply_chat_template(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) dataset = "NeelNanda/pile-10k:apply_chat_template:system_prompt=''" @@ -130,10 +120,6 @@ def test_combine_dataset2(self): # autoround.quantize() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_cli_usage.py b/test/test_cpu/test_cli_usage.py index 2b93f5131..b3aecf2f1 100644 --- a/test/test_cpu/test_cli_usage.py +++ b/test/test_cpu/test_cli_usage.py @@ -1,45 +1,44 @@ import os import shutil import sys -import unittest -sys.path.insert(0, "../..") +from ..helpers import get_model_path -class TestAutoRoundCmd(unittest.TestCase): +class TestAutoRoundCmd: @classmethod - def setUpClass(self): + def setup_class(self): pass @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) shutil.rmtree("../../saved", ignore_errors=True) shutil.rmtree("../../tmp_autoround", ignore_errors=True) - def test_auto_round_cmd(self): + def test_auto_round_cmd(self, tiny_opt_model_path, tiny_qwen_vl_model_path): python_path = sys.executable # Test llm script - res = os.system(f"cd ../.. && {python_path} -m auto_round -h") + res = os.system(f"cd .. && {python_path} -m auto_round -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. 
&& {python_path} -m auto_round --model '/tf_dataset/auto_round/models/facebook/opt-125m' --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa" + f"cd .. && {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --model '/tf_dataset/auto_round/models/facebook/opt-125m' --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" + f"cd .. && {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai" + f"cd .. && {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" @@ -47,28 +46,24 @@ def test_auto_round_cmd(self): # test mllm script # test auto_round_mllm --eval help - res = os.system(f"cd ../.. && {python_path} -m auto_round --eval -h") + res = os.system(f"cd .. && {python_path} -m auto_round --eval -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" # test auto_round_mllm --lmms help - res = os.system(f"cd ../.. && {python_path} -m auto_round --eval --lmms -h") + res = os.system(f"cd .. && {python_path} -m auto_round --eval --lmms -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --mllm --model /tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" + f"cd .. && {python_path} -m auto_round --mllm --model {tiny_qwen_vl_model_path} --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model /tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct --seqlen 32 --format auto_round" + f"cd .. 
&& {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model {tiny_qwen_vl_model_path} --seqlen 32 --format auto_round" " --quant_nontext_module --output_dir ./saved " ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_conv1d.py b/test/test_cpu/test_conv1d.py index edd28110f..1997026b3 100644 --- a/test/test_cpu/test_conv1d.py +++ b/test/test_cpu/test_conv1d.py @@ -1,38 +1,27 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch -from _test_helpers import model_infer from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound +from ..helpers import lamini_name_or_path, model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestQuantizationConv1d(unittest.TestCase): +class TestQuantizationConv1d: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/MBZUAI/LaMini-GPT-124M" + def setup_class(self): + self.model_name = lamini_name_or_path self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_quant(self): + def test_quant(self, dataloader): self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -43,7 +32,7 @@ def test_quant(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() @@ -51,7 +40,3 @@ def test_quant(self): model = AutoModelForCausalLM.from_pretrained("./saved", device_map="cpu", trust_remote_code=True) model_infer(model, self.tokenizer) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index 47b18f314..a2c9e1fa6 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -1,16 +1,14 @@ import os import shutil -import sys -import unittest -from parameterized import parameterized - -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound +from ..helpers import get_model_path, opt_name_or_path + def _get_folder_size(path: str) -> float: """Return folder size in GB.""" @@ -23,30 +21,20 @@ def _get_folder_size(path: str) -> float: return total_size / (1024**3) # convert to GB -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def setup_class(self): + self.model_name = opt_name_or_path self.save_dir = "./saved" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", 
ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_autogptq_format(self): + def test_autogptq_format(self, dataloader): for group_size in [-1, 32, 128]: bits, sym = 4, False model_name = self.model_name @@ -57,7 +45,7 @@ def test_autogptq_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" @@ -76,7 +64,7 @@ def test_autogptq_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_format(self): + def test_autoround_format(self, dataloader): for group_size in [-1, 32, 128]: bits, sym = 4, True model_name = self.model_name @@ -87,7 +75,7 @@ def test_autoround_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") @@ -102,7 +90,7 @@ def test_autoround_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_awq_format(self): + def test_autoround_awq_format(self, dataloader): for group_size in [-1, 32, 128]: bits, sym = 4, False model_name = self.model_name @@ -113,7 +101,7 @@ def test_autoround_awq_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" @@ -132,7 +120,7 @@ def test_autoround_awq_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoawq_format(self): + def test_autoawq_format(self, dataloader): for group_size in [-1, 32, 128]: bits, sym = 4, False autoround = AutoRound( @@ -143,7 +131,7 @@ def test_autoawq_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -163,7 +151,7 @@ def test_autoawq_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_3bit_asym_format(self): + def test_autoround_3bit_asym_format(self, dataloader): bits, group_size, sym = 3, 128, False autoround = AutoRound( self.model, @@ -173,7 +161,7 @@ def test_autoround_3bit_asym_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = self.save_dir @@ -187,7 +175,7 @@ def test_autoround_3bit_asym_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_autoround_3bit_sym_format(self): + def test_autoround_3bit_sym_format(self, dataloader): bits, group_size, sym = 3, 128, True autoround = AutoRound( self.model, @@ -197,7 +185,7 @@ def test_autoround_3bit_sym_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = self.save_dir @@ -211,7 +199,7 @@ def test_autoround_3bit_sym_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - @parameterized.expand([(None,), ("fp8",), ("float16")]) + @pytest.mark.parametrize("static_kv_dtype", ["fp8", "float16"]) def test_static_afp8_export(self, static_kv_dtype): import os @@ -237,10 +225,10 @@ def test_static_afp8_export(self, 
static_kv_dtype): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") - self.assertIn("model.decoder.layers.8.self_attn.k_proj.input_scale", f.keys()) - self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape, torch.Size([1])) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, torch.float8_e4m3fn) + assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() + assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape == torch.Size([1]) + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.float8_e4m3fn if static_kv_dtype is None: with torch.no_grad(): import transformers @@ -270,11 +258,11 @@ def test_static_afp8_export(self, static_kv_dtype): assert output is not None, "Output should not be None" if static_kv_dtype == "fp8": - self.assertIn("model.decoder.layers.8.self_attn.k_scale", f.keys()) - self.assertIn("model.decoder.layers.8.self_attn.v_scale", f.keys()) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_scale").shape, torch.Size([1])) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.k_scale").shape, torch.Size([1])) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype, torch.float32) + assert "model.decoder.layers.8.self_attn.k_scale" in f.keys() + assert "model.decoder.layers.8.self_attn.v_scale" in f.keys() + assert f.get_tensor("model.decoder.layers.5.self_attn.v_scale").shape == torch.Size([1]) + assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").shape == torch.Size([1]) + assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype == torch.float32 shutil.rmtree(quantized_model_path, ignore_errors=True) model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -296,10 +284,10 @@ def test_static_afp8_export(self, static_kv_dtype): autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") - self.assertIn("model.decoder.layers.8.self_attn.k_proj.input_scale", f.keys()) - self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape, torch.Size([1])) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, torch.float8_e4m3fn) + assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() + assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape == torch.Size([1]) + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.float8_e4m3fn shutil.rmtree(quantized_model_path, ignore_errors=True) def test_static_fp8_attn(self): @@ -321,22 +309,22 @@ def test_static_fp8_attn(self): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") - 
self.assertIn("model.decoder.layers.8.self_attn.k_proj.input_scale", f.keys()) - self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape, torch.Size([1])) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, torch.float8_e4m3fn) + assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() + assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape == torch.Size([1]) + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.float8_e4m3fn check_attrs = ["k_scale", "v_scale", "q_scale"] for attr in check_attrs: weight_name = f"model.decoder.layers.8.self_attn.{attr}" - self.assertIn(weight_name, f.keys()) - self.assertEqual(f.get_tensor(weight_name).shape, torch.Size([1])) - self.assertEqual(f.get_tensor(weight_name).dtype, torch.float32) + assert weight_name in f.keys() + assert f.get_tensor(weight_name).shape == torch.Size([1]) + assert f.get_tensor(weight_name).dtype == torch.float32 shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_awq_lmhead_export(self): + def test_awq_lmhead_export(self, dataloader): bits, sym, group_size = 4, False, 128 - model_name = "/tf_dataset/auto_round/models/microsoft/phi-2" + model_name = get_model_path("microsoft/phi-2") layer_config = { "lm_head": {"bits": 4}, # set lm_head quant "layer": {"bits": 16}, @@ -350,7 +338,7 @@ def test_awq_lmhead_export(self): nsamples=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") @@ -360,10 +348,10 @@ def test_awq_lmhead_export(self): assert isinstance(lm_head, WQLinear_GEMM), "Illegal AWQ quantization for lm_head layer" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_gptq_lmhead_export(self): + def test_gptq_lmhead_export(self, dataloader): bits, sym, group_size = 4, True, 128 # Note that, to save UT tuning time, the local model is intentionally kept lightweight, using only 2 hidden layers. 
- model_name = "/tf_dataset/auto_round/models/microsoft/phi-2" + model_name = get_model_path("microsoft/phi-2") layer_config = { "lm_head": {"bits": 4}, # set lm_head quant "layer": {"bits": 16}, @@ -377,7 +365,7 @@ def test_gptq_lmhead_export(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -393,7 +381,3 @@ def test_gptq_lmhead_export(self): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0]) print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_generation.py b/test/test_cpu/test_generation.py index 5018d1610..e1e9dc3f1 100644 --- a/test/test_cpu/test_generation.py +++ b/test/test_cpu/test_generation.py @@ -1,39 +1,29 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound +from ..helpers import opt_name_or_path -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRoundFormatGeneration(unittest.TestCase): +class TestAutoRoundFormatGeneration: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def setup_class(self): + self.model_name = opt_name_or_path self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() self.save_folder = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_4bits_sym(self): + def test_4bits_sym(self, dataloader): bits = 4 group_size = 128 sym = True @@ -45,7 +35,7 @@ def test_4bits_sym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder @@ -72,7 +62,7 @@ def test_4bits_sym(self): print(res) assert "!!!" 
not in res - def test_autoround_sym(self): + def test_autoround_sym(self, dataloader): for bits in [4]: model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -85,7 +75,7 @@ def test_autoround_sym(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index 53b199c41..92e9d620e 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -1,43 +1,32 @@ import os import shutil import sys -import unittest - -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound +from ..helpers import get_model_path, get_tiny_model -class LLMDataLoader: - - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - -class TestGGUF(unittest.TestCase): +class TestGGUF: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" + def setup_class(self): + self.model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_basic_usage(self): + def test_basic_usage(self, tiny_gemma_model_path, tiny_qwen_model_path): python_path = sys.executable res = os.system( - f"cd ../.. && {python_path} -m auto_round --model /tf_dataset/auto_round/models/benzart/gemma-2b-it-fine-tuning-for-code-test " + f"cd .. && {python_path} -m auto_round --model {tiny_gemma_model_path} " f" --bs 16 --iters 0 --nsamples 1 --format gguf:q4_k_m" ) if res > 0 or res == -1: @@ -45,7 +34,7 @@ def test_basic_usage(self): shutil.rmtree("./saved", ignore_errors=True) res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {self.model_name}" + f"cd .. 
&& {python_path} -m auto_round --model {tiny_qwen_model_path}" f" --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" ) if res > 0 or res == -1: @@ -73,39 +62,12 @@ def test_q4_0(self): inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - # from auto_round.eval.evaluation import simple_evaluate_user_model - # result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="openbookqa", eval_model_dtype="bf16") - # # 0.246 - # self.assertGreater(result['results']['openbookqa']['acc,none'], 0.23) shutil.rmtree("./saved", ignore_errors=True) - # def test_q4_1(self): - # bits, group_size, sym = 4, 32, False - # autoround = AutoRound( - # self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, data_type="int", nsamples=1 - # ) - # quantized_model_path = "./saved" - # - # autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q4_1") - # gguf_file = os.listdir(quantized_model_path)[0] - # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - # text = "There is a girl who likes adventure," - # inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - # print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - # - # # from auto_round.eval.evaluation import simple_evaluate_user_model - # # result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="openbookqa", eval_model_dtype="bf16") - # # # 0.23 - # # self.assertGreater(result['results']['openbookqa']['acc,none'], 0.22) - # shutil.rmtree("./saved", ignore_errors=True) - def test_func(self): bits, group_size, sym = 4, 128, True autoround = AutoRound( self.model_name, - # bits=bits, - # group_size=group_size, - # sym=sym, iters=1, nsamples=1, seqlen=10, @@ -113,8 +75,8 @@ def test_func(self): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_1") - self.assertTrue(autoround.group_size == 32) - self.assertFalse(autoround.sym) + assert autoround.group_size == 32 + assert not autoround.sym gguf_file = os.listdir("saved")[0] model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") text = "There is a girl who likes adventure," @@ -122,80 +84,8 @@ def test_func(self): print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) shutil.rmtree("./saved", ignore_errors=True) - # model_name = "Qwen/Qwen2.5-1.5B-Instruct" - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - # autoround = AutoRound( - # model, - # self.tokenizer, - # bits=3, - # group_size=16, - # sym=True, - # iters=1, - # nsamples=1, - # data_type="int_sym_dq", - # super_group_size=16, - # super_bits=6, - # ) - quantized_model_path = "./saved" - # autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_k_s") - # from auto_round.eval.evaluation import simple_evaluate_user_model - # gguf_file = os.listdir("saved")[0] - # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - # result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="lambada_openai", eval_model_dtype="bf16") - # self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.5) - shutil.rmtree("./saved", ignore_errors=True) - - # - # def 
test_q5_k(self): - # model_name = "Qwen/Qwen2.5-1.5B-Instruct" - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - # autoround = AutoRound( - # model, - # self.tokenizer, - # bits=5, - # group_size=32, - # sym=False, - # iters=1, - # nsamples=1, - # data_type="int_asym_dq", - # super_group_size=8, - # super_bits=6, - # ) - # quantized_model_path = "./saved" - # autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_k_s") - # gguf_file = os.listdir("saved")[0] - # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - # text = "There is a girl who likes adventure," - # inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - # print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - # shutil.rmtree("./saved", ignore_errors=True) - - # def test_q6_k(self): - # model_name = "Qwen/Qwen2.5-1.5B-Instruct" - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - # autoround = AutoRound( - # model, - # self.tokenizer, - # bits=6, - # group_size=16, - # sym=True, - # iters=1, - # nsamples=1, - # data_type="int_sym_dq", - # super_group_size=16, - # super_bits=8, - # ) - # quantized_model_path = "./saved" - # autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_k") - # gguf_file = os.listdir("saved")[0] - # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - # text = "There is a girl who likes adventure," - # inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - # print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - # shutil.rmtree("./saved", ignore_errors=True) - def test_gguf_baseline(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) autoround = AutoRound( model, @@ -218,31 +108,9 @@ def test_gguf_baseline(self): inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) shutil.rmtree("./saved", ignore_errors=True) - # - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - # autoround = AutoRound( - # model, - # self.tokenizer, - # bits=5, - # group_size=32, - # sym=True, - # iters=0, - # nsamples=8, - # data_type="int_asym_dq", - # super_group_size=8, - # super_bits=6, - # disable_opt_rtn=True, - # ) - # quantized_model_path = "./saved" - # autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q5_k_s,fake") - # model = AutoModelForCausalLM.from_pretrained(quantized_model_path + "/fake", device_map="auto") - # text = "There is a girl who likes adventure," - # inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - # print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - # shutil.rmtree("./saved", ignore_errors=True) - def test_q4_k_m(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" + def test_q4_k_m(self, dataloader): + model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = 
AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) layer_config = { @@ -265,21 +133,21 @@ def test_q4_k_m(self): iters=0, seqlen=1, nsamples=8, - dataset=self.llm_dataloader, + dataset=dataloader, disable_opt_rtn=True, ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") - self.assertEqual(autoround.layer_config["model.layers.11.self_attn.v_proj"]["super_group_size"], 16) - self.assertEqual(autoround.layer_config["model.layers.11.self_attn.v_proj"]["data_type"], "int_sym_dq") - self.assertEqual(autoround.layer_config["model.layers.7.self_attn.v_proj"]["data_type"], "int_asym_dq") - self.assertEqual(autoround.model.model.layers[0].self_attn.v_proj.bits, 6) - self.assertEqual(autoround.model.model.layers[12].self_attn.v_proj.bits, 4) - self.assertEqual(autoround.model.model.embed_tokens.bits, 6) - self.assertEqual(autoround.model.model.embed_tokens.group_size, 16) - self.assertEqual(autoround.model.model.layers[12].mlp.gate_proj.bits, 3) - self.assertEqual(autoround.model.model.layers[10].mlp.gate_proj.bits, 8) - self.assertEqual(autoround.layer_config["model.layers.10.mlp.gate_proj"]["mostly"], "gguf:q8_0") + assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["super_group_size"] == 16 + assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["data_type"] == "int_sym_dq" + assert autoround.layer_config["model.layers.7.self_attn.v_proj"]["data_type"] == "int_asym_dq" + assert autoround.model.model.layers[0].self_attn.v_proj.bits == 6 + assert autoround.model.model.layers[12].self_attn.v_proj.bits == 4 + assert autoround.model.model.embed_tokens.bits == 6 + assert autoround.model.model.embed_tokens.group_size == 16 + assert autoround.model.model.layers[12].mlp.gate_proj.bits == 3 + assert autoround.model.model.layers[10].mlp.gate_proj.bits == 8 + assert autoround.layer_config["model.layers.10.mlp.gate_proj"]["mostly"] == "gguf:q8_0" shutil.rmtree("./saved", ignore_errors=True) model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -288,13 +156,13 @@ def test_q4_k_m(self): autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") shutil.rmtree("./saved", ignore_errors=True) - def test_all_format(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" + def test_all_format(self, tiny_qwen_model_path): + model_name = tiny_qwen_model_path python_path = sys.executable # for gguf_format in ["gguf:q4_0", "gguf:q4_1", "gguf:q4_k_m", "gguf:q6_k"]: for gguf_format in ["gguf:q4_k_m"]: res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {model_name} " + f"cd .. && {python_path} -m auto_round --model {model_name} " f" --bs 16 --iters 1 --nsamples 1 --seqlen 16 --format {gguf_format}" ) if res > 0 or res == -1: @@ -302,7 +170,7 @@ def test_all_format(self): shutil.rmtree("../../tmp_autoround", ignore_errors=True) res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {model_name}" + f"cd .. && {python_path} -m auto_round --model {model_name}" f" --bs 16 --iters 0 --nsamples 1 --seqlen 16 --format fake,{gguf_format}" ) if res > 0 or res == -1: @@ -311,7 +179,7 @@ def test_all_format(self): # test mixed q2_k_s res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {model_name}" + f"cd .. 
&& {python_path} -m auto_round --model {model_name}" f" --bs 16 --iters 0 --nsamples 1 --seqlen 16 --scheme GGUF:Q2_K_MIXED" ) if res > 0 or res == -1: @@ -319,7 +187,7 @@ def test_all_format(self): shutil.rmtree("../../tmp_autoround", ignore_errors=True) def test_vlm_gguf(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct" + model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") from auto_round import AutoRoundMLLM from auto_round.utils import mllm_load_model @@ -334,13 +202,13 @@ def test_vlm_gguf(self): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - self.assertTrue("mmproj-model.gguf" in os.listdir("./saved")) + assert "mmproj-model.gguf" in os.listdir("./saved") for file_name in os.listdir(quantized_model_path): file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2 if file_name == "mmproj-model.gguf": - self.assertAlmostEqual(file_size, 2537, delta=5.0) + assert abs(file_size - 2537) < 5.0 else: - self.assertAlmostEqual(file_size, 892, delta=5.0) + assert abs(file_size - 892) < 5.0 shutil.rmtree("./saved", ignore_errors=True) def test_qtype_setting(self): @@ -351,7 +219,7 @@ def test_qtype_setting(self): from auto_round.compressors.utils import set_layer_config from auto_round.export.export_to_gguf.config import ModelType - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) ar.formats = ["gguf:q4_0"] ar.layer_config, _, _ = set_layer_config( @@ -367,8 +235,8 @@ def test_qtype_setting(self): enable_gguf_official_mixed=True, is_mllm=ar.mllm, ) - self.assertTrue(ar.layer_config["model.embed_tokens"]["bits"] == 8) - self.assertTrue("lm_head" not in ar.layer_config) + assert ar.layer_config["model.embed_tokens"]["bits"] == 8 + assert "lm_head" not in ar.layer_config model_name = "Qwen/Qwen3-0.6B" ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) @@ -386,8 +254,8 @@ def test_qtype_setting(self): enable_gguf_official_mixed=True, is_mllm=ar.mllm, ) - self.assertTrue(ar.layer_config["model.embed_tokens"]["bits"] == 4) - self.assertTrue(ar.layer_config["lm_head"]["bits"] == 6 and ar.layer_config["lm_head"]["super_bits"] == 8) + assert ar.layer_config["model.embed_tokens"]["bits"] == 4 + assert ar.layer_config["lm_head"]["bits"] == 6 and ar.layer_config["lm_head"]["super_bits"] == 8 layer_config = { "model.embed_tokens": {"bits": 6, "super_bits": 8}, @@ -408,12 +276,8 @@ def test_qtype_setting(self): enable_gguf_official_mixed=True, is_mllm=ar.mllm, ) - self.assertTrue(ar.layer_config["lm_head"]["bits"] == 4) - self.assertTrue( - ar.layer_config["model.embed_tokens"]["bits"] == 6 + assert ( + ar.layer_config["lm_head"]["bits"] == 4 + and ar.layer_config["model.embed_tokens"]["bits"] == 6 and ar.layer_config["model.embed_tokens"]["super_bits"] == 8 ) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_gpt_oss.py b/test/test_cpu/test_gpt_oss.py deleted file mode 100644 index ccc997eba..000000000 --- a/test/test_cpu/test_gpt_oss.py +++ /dev/null @@ -1,72 +0,0 @@ -import pytest -from transformers import AutoConfig, AutoTokenizer -from transformers.models.gpt_oss.modeling_gpt_oss import GptOssForCausalLM - -from auto_round import AutoRound - - -@pytest.fixture -def setup_gpt_oss(): - """Fixture to set up the GPT-OSS model and tokenizer.""" - model_name = 
"/tf_dataset/auto_round/models/unsloth/gpt-oss-20b-BF16" - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) - config.num_hidden_layers = 1 # Reduce layers for testing - model = GptOssForCausalLM(config) - output_dir = "/tmp/test_quantized_gpt_oss" - return model, tokenizer, output_dir, config - - -def quantize_model(model, tokenizer, output_dir, scheme, iters=0): - """Helper function to quantize the model with the given scheme.""" - autoround = AutoRound( - model, - tokenizer, - scheme=scheme, - nsamples=2, - iters=iters, - fp_layers="self_attn,router,lm_head,mlp.gate", - ) - quantized_model, save_folder = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) - return quantized_model - - -def count_modules_by_type(model, target_module_name_or_class): - """Helper function to count modules of a specific type in the model.""" - cnt = 0 - for name, module in model.named_modules(): - if isinstance(target_module_name_or_class, str): - if target_module_name_or_class == module.__class__.__name__: - cnt += 1 - else: - if isinstance(module, target_module_name_or_class): - cnt += 1 - return cnt - - -@pytest.mark.parametrize("scheme", ["MXFP4", "MXFP8"]) -def test_quantization(setup_gpt_oss, scheme): - """Test quantization with the scheme.""" - model, tokenizer, output_dir, config = setup_gpt_oss - quantized_model = quantize_model(model, tokenizer, output_dir, scheme) - - # Ensure the quantized model is not None - assert quantized_model is not None, "Quantized model should not be None." - from auto_round.export.export_to_autoround.qlinear_fp import QuantLinear - from auto_round.modelling.gpt_oss import GPTOssSingleExpert - - single_expert_cnt = count_modules_by_type(quantized_model, GPTOssSingleExpert) - quant_linear_cnt = count_modules_by_type(quantized_model, QuantLinear) - assert ( - single_expert_cnt == config.num_local_experts - ), f"Expected {config.num_local_experts} GPTOssSingleExpert modules, found {single_expert_cnt}." - assert ( - quant_linear_cnt == config.num_hidden_layers * 3 * config.num_local_experts - ), f"Expected {config.num_hidden_layers * 3 * config.num_local_experts} QuantLinear modules, found {quant_linear_cnt}." - - print(f"[{scheme}] Total {GPTOssSingleExpert.__name__} modules: {single_expert_cnt}") - print(f"[{scheme}] Total {QuantLinear.__name__} modules: {quant_linear_cnt}") - # clean the output directory after test - import shutil - - shutil.rmtree(output_dir, ignore_errors=True) diff --git a/test/test_cpu/test_init.py b/test/test_cpu/test_init.py index 6ebee954d..01785d679 100644 --- a/test/test_cpu/test_init.py +++ b/test/test_cpu/test_init.py @@ -1,8 +1,8 @@ from auto_round import AutoRound -def test_torch_compile(): - ar = AutoRound(model="facebook/opt-125m", scheme="NVFP4", enable_torch_compile=True) +def test_torch_compile(tiny_opt_model_path): + ar = AutoRound(model=tiny_opt_model_path, scheme="NVFP4", enable_torch_compile=True) assert not ar.enable_torch_compile, "NVFP4 cannot work with torch.compile." - ar = AutoRound(model="facebook/opt-125m", scheme="FP8_STATIC", enable_torch_compile=True) + ar = AutoRound(model=tiny_opt_model_path, scheme="FP8_STATIC", enable_torch_compile=True) assert not ar.enable_torch_compile, "FP8_STATIC cannot work with torch.compile." 
diff --git a/test/test_cpu/test_llmc_integration.py b/test/test_cpu/test_llmc_integration.py index 6dba09cfa..cea412327 100644 --- a/test/test_cpu/test_llmc_integration.py +++ b/test/test_cpu/test_llmc_integration.py @@ -85,7 +85,7 @@ def test_oneshot_application(recipe, tmp_path): assert weight_args.num_bits == 4 # Check a specific layer is quantized - targeted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj + targeted_linear_layer = model_loaded.model.layers[1].self_attn.q_proj assert hasattr(targeted_linear_layer, "quantization_scheme") # Check lm-head is not quantized diff --git a/test/test_cpu/test_llmcompressor.py b/test/test_cpu/test_llmcompressor.py index 051dfb075..614701943 100644 --- a/test/test_cpu/test_llmcompressor.py +++ b/test/test_cpu/test_llmcompressor.py @@ -1,25 +1,24 @@ import os import shutil -import sys -import unittest - -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound +from ..helpers import get_model_path, opt_name_or_path + -class TestLLMC(unittest.TestCase): +class TestLLMC: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/stas/tiny-random-llama-2" + def setup_class(self): + self.model_name = get_model_path("stas/tiny-random-llama-2") self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -42,7 +41,7 @@ def test_llmcompressor_w8a8(self): def test_llmcompressor_fp8(self): ## quantize the model - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path autoround = AutoRound( model_name, scheme="FP8_STATIC", @@ -59,14 +58,14 @@ def test_llmcompressor_fp8(self): import json config = json.load(open("./saved/config.json")) - self.assertIn("group_0", config["quantization_config"]["config_groups"]) - self.assertEqual(config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"], 8) - self.assertEqual(config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"], "channel") - self.assertEqual(config["quantization_config"]["quant_method"], "compressed-tensors") + assert "group_0" in config["quantization_config"]["config_groups"] + assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8 + assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "channel" + assert config["quantization_config"]["quant_method"] == "compressed-tensors" def test_autoround_llmcompressor_fp8(self): ## quantize the model - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path autoround = AutoRound( model_name, scheme="FP8_STATIC", @@ -80,14 +79,8 @@ def test_autoround_llmcompressor_fp8(self): import json config = json.load(open("./saved/config.json")) - self.assertIn("group_0", config["quantization_config"]["config_groups"]) - self.assertEqual(config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"], 8) - self.assertEqual(config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"], "tensor") - self.assertEqual( - 
config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["strategy"], "tensor" - ) - self.assertEqual(config["quantization_config"]["quant_method"], "compressed-tensors") - - -if __name__ == "__main__": - unittest.main() + assert "group_0" in config["quantization_config"]["config_groups"] + assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8 + assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "tensor" + assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["strategy"] == "tensor" + assert config["quantization_config"]["quant_method"] == "compressed-tensors" diff --git a/test/test_cpu/test_load_awq_gptq.py b/test/test_cpu/test_load_awq_gptq.py index 4fb6bb977..6dc295b4e 100644 --- a/test/test_cpu/test_load_awq_gptq.py +++ b/test/test_cpu/test_load_awq_gptq.py @@ -1,46 +1,21 @@ import shutil -import sys -import unittest - -sys.path.insert(0, "../..") +import pytest from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer +from ..helpers import get_model_path, model_infer -class TestAutoRound(unittest.TestCase): - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) +class TestAutoRound: @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("runs", ignore_errors=True) def test_load_gptq_no_dummy_gidx_model(self): - model_name = "/tf_dataset/auto_round/models/ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1" + model_name = get_model_path("ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1") quantization_config = AutoRoundConfig() - with self.assertRaises(NotImplementedError) as cm: + with pytest.raises(NotImplementedError): model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="auto", @@ -50,7 +25,7 @@ def test_load_gptq_no_dummy_gidx_model(self): ) def test_load_awq(self): - model_name = "/tf_dataset/auto_round/models/casperhansen/opt-125m-awq" + model_name = get_model_path("casperhansen/opt-125m-awq") quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( model_name, @@ -60,4 +35,4 @@ def test_load_awq(self): quantization_config=quantization_config, ) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) diff --git a/test/test_cpu/test_mix_bits.py b/test/test_cpu/test_mix_bits.py index 2c73d42cd..6cc390637 100644 --- a/test/test_cpu/test_mix_bits.py +++ b/test/test_cpu/test_mix_bits.py @@ -1,19 +1,17 @@ import json import os import shutil -import sys -import unittest from pathlib import Path -from parameterized import parameterized - -sys.path.insert(0, "../..") +import pytest import torch from 
transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound from auto_round.testing_utils import require_gptqmodel +from ..helpers import opt_name_or_path + def _get_folder_size(path: str) -> float: """Return folder size in GB.""" @@ -26,31 +24,21 @@ def _get_folder_size(path: str) -> float: return total_size / (1024**3) # convert to GB -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def setup_class(self): + self.model_name = opt_name_or_path self.save_dir = ".saved/" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel - def test_mixed_gptqmodel(self): + def test_mixed_gptqmodel(self, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 4}, # set lm_head quant @@ -64,7 +52,7 @@ def test_mixed_gptqmodel(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -79,7 +67,7 @@ def test_mixed_gptqmodel(self): assert "!!!" not in model.tokenizer.decode(result) # string output shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_gptqmodel_convert_to_ar(self): + def test_mixed_gptqmodel_convert_to_ar(self, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 4}, # set lm_head quant @@ -93,7 +81,7 @@ def test_mixed_gptqmodel_convert_to_ar(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -108,7 +96,7 @@ def test_mixed_gptqmodel_convert_to_ar(self): print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_autoround_format(self): + def test_mixed_autoround_format(self, dataloader): layer_config = { "k_proj": {"bits": 8}, "q_proj": {"bits": 3}, @@ -120,7 +108,7 @@ def test_mixed_autoround_format(self): scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "./saved" @@ -134,7 +122,7 @@ def test_mixed_autoround_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_fallback_regex_for_awq_format(self): + def test_fallback_regex_for_awq_format(self, dataloader): layer_config = { "lm_head": {"bits": 16}, "fc1": {"bits": 16}, @@ -144,7 +132,7 @@ def test_fallback_regex_for_awq_format(self): scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "./saved" @@ -159,7 +147,7 @@ def test_fallback_regex_for_awq_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) 
shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_ar_format_part_name_hf_loading(self): + def test_mixed_ar_format_part_name_hf_loading(self, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 16}, # full name @@ -170,7 +158,7 @@ def test_mixed_ar_format_part_name_hf_loading(self): scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "./saved" @@ -220,7 +208,7 @@ def test_mixed_ar_format_part_name_hf_loading(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_MXFP_autoround_format_loading(self): + def test_mixed_MXFP_autoround_format_loading(self, dataloader): layer_config = { "k_proj": {"bits": 8, "act_bits": 8}, "lm_head": {"bits": 16, "act_bits": 16}, @@ -231,7 +219,7 @@ def test_mixed_MXFP_autoround_format_loading(self): scheme="MXFP4", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -246,9 +234,5 @@ def test_mixed_MXFP_autoround_format_loading(self): result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.14) + assert result["results"]["lambada_openai"]["acc,none"] > 0.14 shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_mllm.py b/test/test_cpu/test_mllm.py index 8510adca5..2eb1d3e2f 100644 --- a/test/test_cpu/test_mllm.py +++ b/test/test_cpu/test_mllm.py @@ -1,14 +1,12 @@ -import sys -import unittest - -sys.path.insert(0, "../..") - import shutil +import pytest from transformers import AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration from auto_round import AutoRoundMLLM +from ..helpers import get_model_path, opt_name_or_path + class FakeDataLoader: def __init__(self): @@ -27,23 +25,21 @@ def __iter__(self): yield self.data -class TestAutoRoundMLLM(unittest.TestCase): +class TestAutoRoundMLLM: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct" + def setup_class(self): + self.model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") self.dataset = FakeDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - return super().tearDownClass() - - def test_tune(self): + def test_tune(self, tiny_qwen_vl_model_path): bits, group_size = 4, 128 autoround = AutoRoundMLLM( - model=self.model_name, + model=tiny_qwen_vl_model_path, bits=bits, group_size=group_size, nsamples=1, @@ -56,11 +52,11 @@ def test_tune(self): autoround.save_quantized("./saved/", format="auto_gptq", inplace=False) autoround.save_quantized("./saved/", format="auto_round", inplace=False) - def test_quant_vision(self): ## bug need to fix - tokenizer = AutoTokenizer.from_pretrained(self.model_name) - processor = AutoProcessor.from_pretrained(self.model_name, trust_remote_code=True) + def test_quant_vision(self, tiny_qwen_vl_model_path): ## bug need to fix + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_vl_model_path) + processor = AutoProcessor.from_pretrained(tiny_qwen_vl_model_path, trust_remote_code=True) model = 
Qwen2VLForConditionalGeneration.from_pretrained( - self.model_name, trust_remote_code=True, device_map="auto" + tiny_qwen_vl_model_path, trust_remote_code=True, device_map="auto" ) bits, group_size = 4, 128 autoround = AutoRoundMLLM( @@ -105,17 +101,17 @@ class Myclass: dataset = MLLM_DATASET["liuhaotian/llava"]( template=Myclass(), model=None, tokenzier=None, dataset_path="liuhaotian/llava", seqlen=32, nsamples=32 ) - self.assertEqual(len(dataset.questions), 32) + assert len(dataset.questions) == 32 dataset = MLLM_DATASET["liuhaotian/llava"]( template=Myclass(), model=None, tokenzier=None, dataset_path="liuhaotian/llava", seqlen=2048, nsamples=512 ) - self.assertEqual(len(dataset.questions), 512) + assert len(dataset.questions) == 512 - def test_diff_dataset(self): - tokenizer = AutoTokenizer.from_pretrained(self.model_name) - processor = AutoProcessor.from_pretrained(self.model_name, trust_remote_code=True) + def test_diff_dataset(self, tiny_qwen_vl_model_path): + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_vl_model_path) + processor = AutoProcessor.from_pretrained(tiny_qwen_vl_model_path, trust_remote_code=True) model = Qwen2VLForConditionalGeneration.from_pretrained( - self.model_name, trust_remote_code=True, device_map="auto" + tiny_qwen_vl_model_path, trust_remote_code=True, device_map="auto" ) bits, group_size = 4, 128 dataset = ["dataset test", "list test"] @@ -133,19 +129,17 @@ def test_diff_dataset(self): ) autoround.quantize() - def test_pure_text_model_check(self): + def test_pure_text_model_check(self, tiny_qwen_vl_model_path): from transformers import AutoModelForCausalLM from auto_round.utils import is_pure_text_model model = Qwen2VLForConditionalGeneration.from_pretrained( - self.model_name, trust_remote_code=True, device_map="auto" + tiny_qwen_vl_model_path, trust_remote_code=True, device_map="auto" ) - self.assertFalse(is_pure_text_model(model)) - model = AutoModelForCausalLM.from_pretrained( - "/tf_dataset/auto_round/models/facebook/opt-125m", trust_remote_code=True - ) - self.assertTrue(is_pure_text_model(model)) + assert not is_pure_text_model(model) + model = AutoModelForCausalLM.from_pretrained(opt_name_or_path, trust_remote_code=True) + assert is_pure_text_model(model) def test_str_input(self): tokenizer = AutoTokenizer.from_pretrained(self.model_name) @@ -211,10 +205,10 @@ def test_str_input(self): ) print(output_text[0]) - def test_qwen2_5(self): + def test_qwen2_5(self, tiny_qwen_2_5_vl_model_path): from auto_round.utils import mllm_load_model - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-VL-3B-Instruct" + model_name = tiny_qwen_2_5_vl_model_path model, processor, tokenizer, image_processor = mllm_load_model(model_name) autoround = AutoRoundMLLM( model, @@ -264,8 +258,3 @@ def test_qwen2_5(self): output_text = processor.batch_decode( generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False ) - print(output_text) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_model_scope.py b/test/test_cpu/test_model_scope.py index 6da33cdc3..7edcab156 100644 --- a/test/test_cpu/test_model_scope.py +++ b/test/test_cpu/test_model_scope.py @@ -1,30 +1,19 @@ import copy import os import shutil -import sys -import unittest - -sys.path.insert(0, "../..") +import pytest import torch from auto_round import AutoRound +from ..helpers import get_model_path -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(3): - yield torch.ones([1, 10], 
dtype=torch.long) - -class TestModelScope(unittest.TestCase): +class TestModelScope: @classmethod - def setUpClass(self): + def setup_class(self): self.saved_path = "./saved" - self.dataset = LLMDataLoader() self.source_path, self.cache_path = "/tf_dataset/auto_round/modelscope", "/home/hostuser/.cache/modelscope" if os.path.exists(self.source_path): @@ -33,28 +22,20 @@ def setUpClass(self): shutil.copytree(self.source_path, self.cache_path, dirs_exist_ok=True) @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) if os.path.exists(self.cache_path): shutil.rmtree(self.cache_path, ignore_errors=True) - return super().tearDownClass() - - def test_llm(self): - model_name = "Qwen/Qwen2.5-0.5B-Instruct" - autoround = AutoRound( - model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset - ) + def test_llm(self, dataloader): + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") + autoround = AutoRound(model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=dataloader) autoround.quantize_and_save() - def test_mllm(self): - model_name = "Qwen/Qwen2-VL-2B-Instruct" + def test_mllm(self, dataloader): + model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") autoround = AutoRound( - model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset, batch_size=2 + model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=dataloader, batch_size=2 ) autoround.quantize_and_save(self.saved_path) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_moe_model.py b/test/test_cpu/test_moe_model.py index c88571346..c30ab0e39 100644 --- a/test/test_cpu/test_moe_model.py +++ b/test/test_cpu/test_moe_model.py @@ -6,29 +6,37 @@ from auto_round import AutoRound +from ..helpers import get_model_path + +gpt_oss_name_or_path = get_model_path("unsloth/gpt-oss-20b-BF16") +llama4_name_or_path = get_model_path("meta-llama/Llama-4-Scout-17B-16E-Instruct") + +# local path for debug +# llama4_name_or_path = get_model_path("/dataset/Llama-4-Scout-17B-16E-Instruct") + @pytest.fixture def setup_gpt_oss(): """Fixture to set up the GPT-OSS model and tokenizer.""" - model_name = "/tf_dataset/auto_round/models/unsloth/gpt-oss-20b-BF16" + model_name = gpt_oss_name_or_path tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) config.num_hidden_layers = 1 # Reduce layers for testing model = GptOssForCausalLM(config) - output_dir = "/tmp/test_quantized_gpt_oss" + output_dir = "./tmp/test_quantized_gpt_oss" return model, tokenizer, output_dir, config @pytest.fixture def setup_llama4(): """Fixture to set up the llama4 model and tokenizer.""" - model_name = "/tf_dataset/auto_round/models/meta-llama/Llama-4-Scout-17B-16E-Instruct" + model_name = llama4_name_or_path tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) config.vision_config.num_hidden_layers = 2 # Reduce layers for testing config.text_config.num_hidden_layers = 2 model = Llama4ForConditionalGeneration(config) - output_dir = "/tmp/test_quantized_llama4" + output_dir = "./tmp/test_quantized_llama4" return model, tokenizer, output_dir, config @@ -46,23 +54,52 @@ def quantize_model(model, tokenizer, output_dir, scheme, iters=0): return quantized_model 
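Note: throughout these test files the hard-coded "/tf_dataset/auto_round/models/..." checkpoints are replaced by get_model_path(...) lookups and tiny-model fixtures imported from ..helpers and conftest.py, whose implementations are not part of this diff. A minimal sketch of what get_model_path() presumably does, assuming pre-downloaded checkpoints live under /tf_dataset/auto_round/models (an assumption based on the paths removed above), is:

import os

def get_model_path(name_or_path: str) -> str:
    # Assumed CI cache root; the real helper in test/helpers.py may differ.
    local_root = "/tf_dataset/auto_round/models"
    local_path = os.path.join(local_root, name_or_path)
    # Prefer the pre-downloaded checkpoint when it exists; otherwise return the
    # plain model id so transformers resolves it from the Hugging Face hub.
    return local_path if os.path.isdir(local_path) else name_or_path

With a helper like this, get_model_path("facebook/opt-125m") resolves to the cached copy on CI hosts and falls back to the hub id elsewhere.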
-def test_gptoss(setup_gpt_oss): +def count_modules_by_type(model, target_module_name_or_class): + """Helper function to count modules of a specific type in the model.""" + cnt = 0 + for name, module in model.named_modules(): + if isinstance(target_module_name_or_class, str): + if target_module_name_or_class == module.__class__.__name__: + cnt += 1 + else: + if isinstance(module, target_module_name_or_class): + cnt += 1 + return cnt + + +@pytest.mark.parametrize("scheme", ["MXFP4", "MXFP8"]) +def test_gptoss(setup_gpt_oss, scheme): model, tokenizer, output_dir, config = setup_gpt_oss # Below parameter is set to be same as the full model # Remove it to avoid mismatch during quantized model loading delattr(model.config, "layer_types") - quantized_model = quantize_model(model, tokenizer, output_dir, "MXFP4") + quantized_model = quantize_model(model, tokenizer, output_dir, scheme) # Ensure the quantized model is not None assert quantized_model is not None, "Quantized model should not be None." - - loaded_model = GptOssForCausalLM.from_pretrained(output_dir) - for n, m in quantized_model.named_modules(): - if m.__class__.__name__ == "QuantLinear": - loaded_m = loaded_model.get_submodule(n) - assert (loaded_m.weight_packed.to("cpu") == m.weight_packed.to("cpu")).all() + from auto_round.export.export_to_autoround.qlinear_fp import QuantLinear + from auto_round.modelling.gpt_oss import GPTOssSingleExpert + + single_expert_cnt = count_modules_by_type(quantized_model, GPTOssSingleExpert) + quant_linear_cnt = count_modules_by_type(quantized_model, QuantLinear) + assert ( + single_expert_cnt == config.num_local_experts + ), f"Expected {config.num_local_experts} GPTOssSingleExpert modules, found {single_expert_cnt}." + assert ( + quant_linear_cnt == config.num_hidden_layers * 3 * config.num_local_experts + ), f"Expected {config.num_hidden_layers * 3 * config.num_local_experts} QuantLinear modules, found {quant_linear_cnt}." 
+ + print(f"[{scheme}] Total {GPTOssSingleExpert.__name__} modules: {single_expert_cnt}") + print(f"[{scheme}] Total {QuantLinear.__name__} modules: {quant_linear_cnt}") + + if scheme == "MXFP4": + loaded_model = GptOssForCausalLM.from_pretrained(output_dir) + for n, m in quantized_model.named_modules(): + if m.__class__.__name__ == "QuantLinear": + loaded_m = loaded_model.get_submodule(n) + assert (loaded_m.weight_packed.to("cpu") == m.weight_packed.to("cpu")).all() # clean the output directory after test shutil.rmtree(output_dir, ignore_errors=True) diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index f38fe3eb6..7e0600f05 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -1,16 +1,14 @@ import os import shutil -import sys -import unittest -from parameterized import parameterized - -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound +from ..helpers import is_model_outputs_similar + def _get_folder_size(path: str) -> float: """Return folder size in GB.""" @@ -23,31 +21,18 @@ def _get_folder_size(path: str) -> float: return total_size / (1024**3) # convert to GB -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRoundFP(unittest.TestCase): +class TestAutoRoundFP: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def setup_class(self): self.save_dir = "./saved" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto") - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_nvfp4_moe_actmax_rtn(self): - model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" + def test_nvfp4_moe_actmax_rtn(self, tiny_deepseek_v2_model_path, dataloader): + model_name = tiny_deepseek_v2_model_path layer_config = { "self_attn": {"bits": 16, "act_bits": 16}, "mlp.shared_experts": {"bits": 16, "act_bits": 16}, @@ -62,7 +47,7 @@ def test_nvfp4_moe_actmax_rtn(self): iters=0, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) compressed_model, _ = autoround.quantize() @@ -73,8 +58,8 @@ def test_nvfp4_moe_actmax_rtn(self): ), "Illegal NVFP4 quantization for lm_head layer" shutil.rmtree(self.save_dir, ignore_errors=True) - def test_nvfp4_moe_actmax_ar(self): - model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" + def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader): + model_name = tiny_deepseek_v2_model_path layer_config = { "q_proj": {"bits": 16, "act_bits": 16}, "mlp.shared_experts": {"bits": 16, "act_bits": 16}, @@ -89,7 +74,7 @@ def test_nvfp4_moe_actmax_ar(self): iters=1, seqlen=3, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) compressed_model, _ = autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") @@ -102,17 +87,11 @@ def test_nvfp4_moe_actmax_ar(self): and lm_head.weight_scale.dtype is torch.float8_e4m3fn ), "Illegal NVFP4 packing for lm_head layer" quantized_model_path = self.save_dir - 
model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) - from auto_round.eval.evaluation import simple_evaluate_user_model - - result = simple_evaluate_user_model(model, tokenizer, batch_size=4, tasks="piqa", limit=4) - print(result["results"]["piqa"]["acc,none"]) - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.49) + assert is_model_outputs_similar(model_name, quantized_model_path) shutil.rmtree(self.save_dir, ignore_errors=True) - def test_mxfp4_moe_ar(self): - model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" + def test_mxfp4_moe_ar(self, tiny_deepseek_v2_model_path, dataloader): + model_name = tiny_deepseek_v2_model_path layer_config = { "q_proj": {"bits": 16, "act_bits": 16, "data_type": "float"}, "mlp.shared_experts": {"bits": 16, "act_bits": 16, "data_type": "float"}, @@ -127,7 +106,7 @@ def test_mxfp4_moe_ar(self): iters=1, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) compressed_model, _ = autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") @@ -139,8 +118,8 @@ def test_mxfp4_moe_ar(self): ), "Illegal MXFP4 packing for lm_head layer" shutil.rmtree(self.save_dir, ignore_errors=True) - def test_mxfp4_llmcompressor_format(self): - model_name = self.model_name + def test_mxfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "MXFP4" @@ -151,15 +130,15 @@ def test_mxfp4_llmcompressor_format(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize() compressed_model = autoround.save_quantized( output_dir=quantized_model_path, inplace=True, format="llm_compressor" ) - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj + skip_layer = compressed_model.model.decoder.layers[1].self_attn.k_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_packed") @@ -179,8 +158,8 @@ def test_mxfp4_llmcompressor_format(self): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_rtn_mxfp4_llmcompressor_format(self): - model_name = self.model_name + def test_rtn_mxfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "MXFP4" @@ -191,15 +170,15 @@ def test_rtn_mxfp4_llmcompressor_format(self): iters=0, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize() compressed_model = autoround.save_quantized( output_dir=quantized_model_path, inplace=True, format="llm_compressor" ) - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj + skip_layer = compressed_model.model.decoder.layers[1].self_attn.k_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_packed") @@ -219,8 +198,8 @@ def test_rtn_mxfp4_llmcompressor_format(self): ), f"Invalid 
MXFP4 quantization configuration: {quantization_config}" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mxfp8_llmcompressor_format(self): - model_name = self.model_name + def test_mxfp8_llmcompressor_format(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "MXFP8" @@ -229,11 +208,11 @@ scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight") @@ -250,14 +229,14 @@ and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 8 ), f"Invalid MXFP8 quantization configuration: {quantization_config}" folder_size_gb = _get_folder_size(quantized_model_path) - # Original opt-125m is < 0.5GB -> quantized mxfp8 model should be smaller but not empty + # The tiny opt test model is < 0.1GB -> the quantized mxfp8 model should be smaller but not empty assert ( - 0.15 < folder_size_gb < 0.2 - ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)" + 0.05 < folder_size_gb < 0.1 + ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.05~0.1 GB)" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_nvfp4_llmcompressor_format(self): - model_name = self.model_name + def test_nvfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "NVFP4" @@ -266,11 +245,11 @@ scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_global_scale") @@ -287,14 +266,14 @@ and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4 ), f"Invalid NVFP4 quantization configuration: {quantization_config}" folder_size_gb = _get_folder_size(quantized_model_path) - # Original opt-125m is < 0.5GB -> quantized nvfp4 model should be smaller but not empty + # The tiny opt test model is < 0.1GB -> the quantized nvfp4 model should be smaller but not empty assert ( - 0.1 < folder_size_gb < 0.15 - ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)" + 0.05 < folder_size_gb < 0.1 + ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.05~0.1 GB)" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_nvfp4_autoround_format(self): - model_name = self.model_name + def test_nvfp4_autoround_format(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "NVFP4" @@ -303,11 +282,11 @@ scheme=scheme, 
iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_global_scale") @@ -318,8 +297,8 @@ def test_nvfp4_autoround_format(self): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_nvfp4_autoround_save_quantized(self): - model_name = self.model_name + def test_nvfp4_autoround_save_quantized(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "NVFP4" @@ -328,12 +307,12 @@ def test_nvfp4_autoround_save_quantized(self): scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize() compressed_model = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_global_scale") @@ -344,10 +323,10 @@ def test_nvfp4_autoround_save_quantized(self): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_qwen_moe_quant_infer(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen1.5-MoE-A2.7B" + def test_qwen_moe_quant_infer(self, tiny_qwen_moe_model_path, dataloader): + model_name = tiny_qwen_moe_model_path layer_config = { - "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, + "layers.0": {"bits": 16, "act_bits": 16}, } scheme = "nvfp4" autoround = AutoRound( @@ -356,21 +335,16 @@ def test_qwen_moe_quant_infer(self): iters=1, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="cpu") - tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) - from auto_round.eval.evaluation import simple_evaluate_user_model - - result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa", limit=10) - print(result["results"]["piqa"]["acc,none"]) - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.60) - shutil.rmtree(quantized_model_path, ignore_errors=True) + assert is_model_outputs_similar(model_name, quantized_model_path) + shutil.rmtree(self.save_dir, ignore_errors=True) - @parameterized.expand( + @pytest.mark.parametrize( + "scheme, static_kv_dtype, static_attention_dtype", [ # scheme, static_kv_dtype, static_attention_dtype ("MXFP4", None, "fp8"), @@ -379,11 +353,11 @@ def test_qwen_moe_quant_infer(self): ("MXFP8", "fp8", None), ("NVFP4", None, "fp8"), ("NVFP4", "fp8", None), - ] + ], ) - def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): - model_name = self.model_name - from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype, tiny_opt_model_path, dataloader): + 
model_name = tiny_opt_model_path + from transformers import AutoConfig from transformers.models.opt.modeling_opt import OPTForCausalLM config = AutoConfig.from_pretrained(model_name) @@ -397,7 +371,7 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): scheme=scheme, iters=0, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, static_kv_dtype=static_kv_dtype, static_attention_dtype=static_attention_dtype, ) @@ -433,7 +407,3 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): getattr(attn, "q_scale", None) is not None ), f"Missing q_scale in attention for scheme={scheme}, static_attention_dtype={static_attention_dtype}" shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_mxfp_save_load.py b/test/test_cpu/test_mxfp_save_load.py index aca5c7592..bf3e9853b 100644 --- a/test/test_cpu/test_mxfp_save_load.py +++ b/test/test_cpu/test_mxfp_save_load.py @@ -14,6 +14,8 @@ from auto_round.inference.backend import MX_TENSOR_DATA_TYPES from auto_round.testing_utils import has_module +from ..helpers import get_model_path + testing_scheme_name_lst = [ AutoRoundFormat.MXFP8.value, AutoRoundFormat.MXFP4.value, @@ -35,7 +37,7 @@ def test_e2e_quant_and_load(scheme_name, weight_data_type, act_data_type): # Use a temporary directory for saving the quantized model with tempfile.TemporaryDirectory() as temp_dir: - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") config = AutoConfig.from_pretrained(model_name) config.num_hidden_layers = 2 # Use a smaller model for testing # Fix configuration validation issues diff --git a/test/test_cpu/test_scheme.py b/test/test_cpu/test_scheme.py index c2d165639..7a60a9ccd 100644 --- a/test/test_cpu/test_scheme.py +++ b/test/test_cpu/test_scheme.py @@ -1,134 +1,119 @@ import shutil -import sys -import unittest -import torch +import transformers -sys.path.insert(0, "../..") from auto_round import AutoRound from auto_round.schemes import QuantizationScheme +from ..helpers import get_model_path, get_tiny_model, opt_name_or_path, qwen_name_or_path -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def setup_class(self): self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_gguf(self): + def test_gguf(self, tiny_qwen_model_path, dataloader): ar = AutoRound( - "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B", + tiny_qwen_model_path, scheme="W2A16", nsamples=1, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) ar.quantize_and_save(self.save_folder, format="gguf:q4_k_m") - self.assertEqual(ar.bits, 4) + assert ar.bits == 4 shutil.rmtree(self.save_folder, ignore_errors=True) - def test_w4a16(self): - ar = AutoRound(self.model_name, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader) - self.assertEqual(ar.bits, 4) + def test_w4a16(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1, seqlen=2, 
dataset=dataloader) + assert ar.bits == 4 ar.quantize() - def test_w2a16_rtn(self): - ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=self.llm_dataloader) - self.assertEqual(ar.bits, 2) + def test_w2a16_rtn(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=dataloader) + assert ar.bits == 2 ar.quantize() - def test_mxfp4(self): - ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader) - self.assertEqual(ar.bits, 4) - self.assertEqual(ar.act_bits, 4) - self.assertEqual(ar.data_type, "mx_fp") - self.assertEqual(ar.act_data_type, "mx_fp_rceil") + def test_mxfp4(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + assert ar.bits == 4 + assert ar.act_bits == 4 + assert ar.data_type == "mx_fp" + assert ar.act_data_type == "mx_fp_rceil" ar.quantize() - def test_vllm(self): + def test_vllm(self, tiny_qwen_vl_model_path): from auto_round import AutoRoundMLLM - ar = AutoRoundMLLM( - "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct", scheme="W2A16", nsamples=1, iters=1, seqlen=2 - ) - self.assertEqual(ar.bits, 2) - self.assertEqual(ar.act_bits, 16) - - def test_nvfp4(self): - ar = AutoRound(self.model_name, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader) - self.assertEqual(ar.bits, 4) - self.assertEqual(ar.act_bits, 4) - self.assertEqual(ar.data_type, "nv_fp") - self.assertEqual(ar.act_data_type, "nv_fp4_with_static_gs") + ar = AutoRoundMLLM(tiny_qwen_vl_model_path, scheme="W2A16", nsamples=1, iters=1, seqlen=2) + assert ar.bits == 2 + assert ar.act_bits == 16 + + def test_nvfp4(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + assert ar.bits == 4 + assert ar.act_bits == 4 + assert ar.data_type == "nv_fp" + assert ar.act_data_type == "nv_fp4_with_static_gs" ar.quantize() - def test_all_scheme(self): + def test_all_scheme(self, tiny_opt_model_path, tiny_qwen_model_path, dataloader): import copy preset_schemes = ["W8A16", "MXFP8", "FPW8A16", "FP8_STATIC", "GGUF:Q2_K_S", "GGUF:Q4_K_M"] for scheme in preset_schemes: - model_name = self.model_name + model_name = tiny_opt_model_path if "gguf" in scheme.lower(): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" + model_name = tiny_qwen_model_path print(f"scheme={scheme}") - ar = AutoRound(model_name, scheme=scheme, nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader) + ar = AutoRound(model_name, scheme=scheme, nsamples=1, iters=1, seqlen=2, dataset=dataloader) ar.quantize_and_save(self.save_folder) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_scheme_in_layer_config(self): + def test_scheme_in_layer_config(self, dataloader): + model = get_tiny_model(opt_name_or_path, num_layers=5) + tokenizer = transformers.AutoTokenizer.from_pretrained(opt_name_or_path, trust_remote_code=True) layer_config = { "model.decoder.layers.2.self_attn": {"bits": 2}, "model.decoder.layers.3.self_attn.v_proj": "W8A16", "model.decoder.layers.4.self_attn.k_proj": QuantizationScheme.from_dict({"group_size": 64}), } ar = AutoRound( - "/tf_dataset/auto_round/models/facebook/opt-125m", + model, + tokenizer, scheme="W3A16", nsamples=1, iters=1, layer_config=layer_config, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) ar.quantize() 
for n, m in ar.model.named_modules(): if n == "model.decoder.layers.2.self_attn.q_proj": - self.assertEqual(m.bits, 2) + assert m.bits == 2 if n == "model.decoder.layers.2.self_attn.k_proj": - self.assertEqual(m.bits, 2) + assert m.bits == 2 if n == "model.decoder.layers.3.self_attn.v_proj": - self.assertEqual(m.bits, 8) + assert m.bits == 8 if n == "model.decoder.layers.4.self_attn.k_proj": - self.assertEqual(m.group_size, 64) + assert m.group_size == 64 def test_parse_available_devices(self): from auto_round.utils.device import parse_available_devices device_list = parse_available_devices("auto") - self.assertTrue(len(device_list) == 1 and "cpu" in device_list) + assert len(device_list) == 1 and "cpu" in device_list device_list = parse_available_devices("a:cuda:0,b:cuda:1,c:cpu") - self.assertTrue(len(device_list) == 3) - self.assertEqual(device_list, ["cuda:0", "cuda:1", "cpu"]) + assert len(device_list) == 3 + assert device_list == ["cuda:0", "cuda:1", "cpu"] device_list = parse_available_devices("0,1") - self.assertTrue(len(device_list) == 1 and "cpu" in device_list) - - -if __name__ == "__main__": - unittest.main() + assert len(device_list) == 1 and "cpu" in device_list diff --git a/test/test_cpu/test_script.py b/test/test_cpu/test_script.py deleted file mode 100644 index 01bbba644..000000000 --- a/test/test_cpu/test_script.py +++ /dev/null @@ -1,21 +0,0 @@ -import os -import sys -import unittest - -sys.path.insert(0, "../..") - - -class TestScript(unittest.TestCase): - def test_default(self): - os.system( - """ - cd ../.. && - python -m auto_round - --iters 2 - --deployment_device fake - --output_dir ./tmp_script_test""" - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_torch_backend.py b/test/test_cpu/test_torch_backend.py index d1e9bd293..0be8f76e6 100644 --- a/test/test_cpu/test_torch_backend.py +++ b/test/test_cpu/test_torch_backend.py @@ -1,11 +1,6 @@ import shutil -import sys -import unittest import pytest - -sys.path.insert(0, "../..") - import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -13,56 +8,22 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel +from ..helpers import get_model_path, model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - -class TestAutoRoundTorchBackend(unittest.TestCase): +class TestAutoRoundTorchBackend: @classmethod - def setUpClass(self): - self.model_name = "facebook/opt-125m" + def setup_class(self): + self.model_name = get_model_path("facebook/opt-125m") self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - 
print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_torch_4bits_asym(self): + def test_torch_4bits_asym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False @@ -74,7 +35,7 @@ def test_torch_4bits_asym(self): sym=sym, iters=0, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") @@ -85,10 +46,10 @@ def test_torch_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -96,14 +57,14 @@ def test_torch_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) - def test_torch_4bits_sym(self): + def test_torch_4bits_sym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 32, True @@ -115,7 +76,7 @@ def test_torch_4bits_sym(self): sym=sym, iters=0, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model @@ -126,13 +87,9 @@ def test_torch_4bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28) + assert result["results"]["lambada_openai"]["acc,none"] > 0.28 torch.cuda.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_utils.py b/test/test_cpu/test_utils.py index e70a4b7b4..3dec97010 100644 --- a/test/test_cpu/test_utils.py +++ b/test/test_cpu/test_utils.py @@ -1,7 +1,5 @@ -import sys from unittest.mock import patch -sys.path.insert(0, "../..") import auto_round.utils.device as auto_round_utils diff --git a/test/test_cpu/test_woq_linear.py b/test/test_cpu/test_woq_linear.py index e077c7a21..8f5bedc2c 100644 --- 
a/test/test_cpu/test_woq_linear.py +++ b/test/test_cpu/test_woq_linear.py @@ -1,9 +1,6 @@ -import sys - import pytest import torch -sys.path.insert(0, "../..") from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear diff --git a/test/test_cuda/__init__.py b/test/test_cuda/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_cuda/_test_helpers.py b/test/test_cuda/_test_helpers.py deleted file mode 100644 index b4b8a5955..000000000 --- a/test/test_cuda/_test_helpers.py +++ /dev/null @@ -1,32 +0,0 @@ -def model_infer(model, tokenizer, apply_chat_template=False): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - if apply_chat_template: - texts = [] - for prompt in prompts: - messages = [{"role": "user", "content": prompt}] - text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - texts.append(text) - prompts = texts - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] diff --git a/test/test_cuda/requirements.txt b/test/test_cuda/requirements.txt index e7dd4e0d8..071eb233e 100644 --- a/test/test_cuda/requirements.txt +++ b/test/test_cuda/requirements.txt @@ -6,7 +6,6 @@ intel-extension-for-pytorch lm-eval>=0.4.9.1 optimum pandas -parameterized pillow torchvision numba diff --git a/test/test_cuda/test_2_3bits.py b/test/test_cuda/test_2_3bits.py index 2ea407f20..1b305f494 100644 --- a/test/test_cuda/test_2_3bits.py +++ b/test/test_cuda/test_2_3bits.py @@ -1,10 +1,8 @@ import copy import re import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from lm_eval.utils import make_table # pylint: disable=E0401 @@ -14,6 +12,8 @@ from auto_round.eval.evaluation import simple_evaluate, simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_greater_than_050, require_greater_than_051 +from ..helpers import get_model_path, model_infer + def get_accuracy(data): match = re.search(r"\|acc\s+\|[↑↓]\s+\|\s+([\d.]+)\|", data) @@ -25,49 +25,27 @@ def get_accuracy(data): return 0.0 -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" +class TestAutoRound: + save_dir = "./saved" + tasks = "lambada_openai" - @classmethod - def tearDownClass(self): - shutil.rmtree("./saved", ignore_errors=True) - shutil.rmtree("runs", ignore_errors=True) - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - 
do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + # Yield to hand control to the test methods + yield - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) @require_greater_than_051 def test_3bits_autoround(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) - autoround = AutoRound(model, tokenizer, bits=3) + model_name = get_model_path("facebook/opt-125m") + autoround = AutoRound(model_name, bits=3) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model @@ -77,18 +55,16 @@ def test_3bits_autoround(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_dir) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.3) ## 0.3130 + assert result["results"]["lambada_openai"]["acc,none"] > 0.3 @require_greater_than_051 def test_3bits_asym_autoround(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + model_name = get_model_path("facebook/opt-125m") bits, sym = 3, False - autoround = AutoRound(model, tokenizer, bits=bits, sym=sym) + autoround = AutoRound(model_name, bits=bits, sym=sym) autoround.quantize_and_save(self.save_dir, format="auto_round", inplace=False) model_args = f"pretrained={self.save_dir}" res = simple_evaluate( @@ -106,10 +82,8 @@ def test_3bits_asym_autoround(self): @require_greater_than_050 def test_norm_bias_tuning(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) - autoround = AutoRound(model, tokenizer, bits=2, group_size=64, enable_norm_bias_tuning=True) + model_name = get_model_path("facebook/opt-125m") + autoround = AutoRound(model_name, bits=2, group_size=64, enable_norm_bias_tuning=True) autoround.quantize() ##test auto_round format @@ -123,10 +97,8 @@ def test_norm_bias_tuning(self): @require_greater_than_050 def test_2bits_autoround(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) - autoround = AutoRound(model, tokenizer, bits=2, group_size=64) + model_name = get_model_path("facebook/opt-125m") + autoround = AutoRound(model_name, bits=2, group_size=64) 
autoround.quantize() ##test auto_round format @@ -145,7 +117,3 @@ def test_2bits_autoround(self): accuracy = get_accuracy(res) assert accuracy > 0.17 shutil.rmtree("./saved", ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_alg_ext.py b/test/test_cuda/test_alg_ext.py index c83d6f3b4..6cdbc82ab 100644 --- a/test/test_cuda/test_alg_ext.py +++ b/test/test_cuda/test_alg_ext.py @@ -1,30 +1,34 @@ import shutil import sys -import unittest - -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate_user_model +from ..helpers import get_model_path -class TestAlgExt(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" +class TestAlgExt: + save_folder = "./saved" - @classmethod - def tearDownClass(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) def test_2bits(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") ar = AutoRound(model=model_name, bits=2, group_size=64, enable_alg_ext=True) ar.quantize_and_save(self.save_folder) model = AutoModelForCausalLM.from_pretrained( @@ -36,39 +40,39 @@ def test_2bits(self): result = simple_evaluate_user_model(model, tokenizer, batch_size=64, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) # wo alg ext 0.2078, with 0.2371 - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.22) + assert result["results"]["lambada_openai"]["acc,none"] > 0.22 shutil.rmtree(self.save_folder, ignore_errors=True) - def test_cli(self): + def test_cli(self, tiny_opt_model_path): import os - model_name = "/models/opt-125m" python_path = sys.executable res = os.system( - f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {model_name} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits" + f"cd .. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits --nsamples 1 --seqlen 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {model_name} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile" + f"cd .. 
&& CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile --nsamples 1 --seqlen 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" - def test_all_support_dtype(self): + def test_all_support_dtype(self, tiny_qwen_model_path): from auto_round.auto_scheme import AutoScheme - model_name = "/models/Qwen3-0.6B" for scheme in ["MXFP4", "NVFP4", "W2A16G64", "gguf:q2_k_s,gguf:q4_k_s"]: avg_bits = 2 if scheme == "W2A16G64" else 4 scheme = AutoScheme(options=scheme, avg_bits=avg_bits, ignore_scale_zp_bits=True) ar = AutoRound( - model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True + tiny_qwen_model_path, + scheme=scheme, + iters=1, + nsamples=1, + seqlen=32, + enable_alg_ext=True, + enable_torch_compile=True, ) ar.quantize() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_asym.py b/test/test_cuda/test_asym.py index c41c0d5d8..1eda6f146 100644 --- a/test/test_cuda/test_asym.py +++ b/test/test_cuda/test_asym.py @@ -3,16 +3,16 @@ import sys import unittest -sys.path.insert(0, "../..") - +import pytest import torch -from _test_helpers import model_infer from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.utils import get_module +from ..helpers import model_infer + class LLMDataLoader: def __init__(self): @@ -23,140 +23,138 @@ def __iter__(self): yield torch.ones([1, 10], dtype=torch.long) -class TestAutoRoundAsym(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "/models/opt-125m" - # self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - self.save_folder = "./saved" +class TestAutoRoundAsym: + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield - @classmethod - def tearDownClass(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_asym_group_size(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_group_size(self, tiny_opt_model_path): for group_size in [32, 64, 128]: bits, sym = 4, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1) - ar.quantize_and_save(format="auto_round", output_dir=self.save_folder) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1 + ) + ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) # TODO when ark is ready, uncomment the following lines to do inference test # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - 
shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) - def test_asym_bits(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_bits(self, tiny_opt_model_path): for bits in [2, 3, 8]: group_size, sym = 128, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1) - ar.quantize_and_save(format="auto_round", output_dir=self.save_folder) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1 + ) + ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) # TODO when ark is ready, uncomment the following lines to do inference test # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) # use parameters later - def test_asym_format(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_format(self, tiny_opt_model_path): for format in ["auto_round", "auto_round:auto_gptq", "auto_round:gptqmodel"]: bits, group_size, sym = 4, 128, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1 + ) # TODO when ark is ready, uncomment the following lines to do inference test - ar.quantize_and_save(format=format, output_dir=self.save_folder) + ar.quantize_and_save(format=format, output_dir=self.save_dir) # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) - def test_asym_group_size_with_tuning(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_group_size_with_tuning(self, tiny_opt_model_path): for group_size in [32, 64, 128]: bits, sym = 4, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1) - ar.quantize_and_save(format="auto_round", output_dir=self.save_folder) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1 + ) + ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) # TODO when ark is ready, uncomment the following lines to do inference test # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) - def test_asym_bits_with_tuning(self): - model_name = self.model_name - model = 
AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_bits_with_tuning(self, tiny_opt_model_path): for bits in [2, 3, 8]: group_size, sym = 128, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1) - ar.quantize_and_save(format="auto_round", output_dir=self.save_folder) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1 + ) + ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) # TODO when ark is ready, uncomment the following lines to do inference test # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) # use parameters later - def test_asym_format_with_tuning(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_format_with_tuning(self, tiny_opt_model_path): for format in ["auto_round", "auto_round:auto_gptq", "auto_round:gptqmodel"]: bits, group_size, sym = 4, 128, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1 + ) # TODO when ark is ready, uncomment the following lines to do inference test - ar.quantize_and_save(format=format, output_dir=self.save_folder) + ar.quantize_and_save(format=format, output_dir=self.save_dir) # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) diff --git a/test/test_cuda/test_auto_round_format.py b/test/test_cuda/test_auto_round_format.py index 55fc1690f..9604ffff1 100644 --- a/test/test_cuda/test_auto_round_format.py +++ b/test/test_cuda/test_auto_round_format.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer @@ -18,84 +16,41 @@ require_package_version_ut, ) +from ..helpers import get_model_path, get_tiny_model, model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "facebook/opt-125m" - - self.llm_dataloader = LLMDataLoader() - self.save_folder = "./saved" - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - ##texts = [] - # for prompt in prompts: - # messages = [ - # {"role": "user", "content": prompt} - # ] - # text = tokenizer.apply_chat_template( - # messages, - # tokenize=False, - # add_generation_prompt=True - # ) - # 
texts.append(text) - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) +class TestAutoRound: + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) + # Yield to hand control to the test methods + yield - @classmethod - def tearDownClass(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_greater_than_050 @require_package_version_ut("transformers", "<4.57.0") - def test_autoround_asym(self): + def test_autoround_asym(self, tiny_opt_model_path, dataloader): for bits in [2, 3, 4, 8]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + # model_name = get_model_path("facebook/opt-125m") bits, group_size, sym = bits, 128, False autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") @@ -108,12 +63,11 @@ def test_autoround_asym(self): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) print(res) assert "!!!" 
not in res - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_autogptq def test_mixed_precision(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_name = get_model_path("facebook/opt-125m") layer_config = {} layer_config["model.decoder.layers.0.self_attn.k_proj"] = {"bits": 8} @@ -123,85 +77,80 @@ def test_mixed_precision(self): } ## 3bits when using asym will have some issue layer_config["model.decoder.layers.6.self_attn.out_proj"] = {"bits": 2, "group_size": 32} bits, group_size, sym = 4, 128, True - autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config) - quantized_model_path = self.save_folder + autoround = AutoRound(model_name, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config) + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") quantization_config = AutoRoundConfig(backend="auto") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.32) + assert result["results"]["lambada_openai"]["acc,none"] > 0.32 @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_awq_backend(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_name = get_model_path("facebook/opt-125m") bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + model_name, bits=bits, group_size=group_size, iters=1, nsamples=1, sym=sym, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") quantization_config = AutoRoundConfig(backend="auto") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.18) + assert result["results"]["lambada_openai"]["acc,none"] > 0.18 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_dir, 
torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) - shutil.rmtree(self.save_folder, ignore_errors=True) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_greater_than_050 def test_tritonv2_bf16(self): - model_name = "OPEA/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" + model_name = get_model_path("OPEA/Meta-Llama-3.1-8B-Instruct-int4-sym-inc") quantization_config = AutoRoundConfig(backend="tritonv2") - model = AutoModelForCausalLM.from_pretrained( + model = get_tiny_model( model_name, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(model_name) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) torch.cuda.empty_cache() @require_ipex - def test_autoround_gptq_sym_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoround_gptq_sym_format(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" @@ -244,19 +193,16 @@ def test_autoround_gptq_sym_format(self): @require_awq @require_ipex @require_package_version_ut("transformers", "<4.57.0") - def test_autoround_awq_sym_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoround_awq_sym_format(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" @@ -283,20 +229,17 @@ def test_autoround_awq_sym_format(self): shutil.rmtree("./saved", ignore_errors=True) @require_greater_than_050 - def test_autoround_sym(self): + def test_autoround_sym(self, tiny_opt_model_path, dataloader): for bits in [2, 3, 4, 8]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = bits, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" @@ -311,11 +254,11 @@ def test_autoround_sym(self): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) print(res) assert "!!!" 
not in res - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_greater_than_050 def test_load_gptq_model_3bits(self): - model_name = "LucasSantiago257/gemma-2b-2bits-gptq" + model_name = get_model_path("LucasSantiago257/gemma-2b-2bits-gptq") quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( model_name, @@ -325,8 +268,4 @@ def test_load_gptq_model_3bits(self): quantization_config=quantization_config, ) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.model_infer(model, tokenizer) - - -if __name__ == "__main__": - unittest.main() + model_infer(model, tokenizer) diff --git a/test/test_cuda/test_auto_scheme.py b/test/test_cuda/test_auto_scheme.py index 681e3b29b..259bc4450 100644 --- a/test/test_cuda/test_auto_scheme.py +++ b/test/test_cuda/test_auto_scheme.py @@ -1,73 +1,78 @@ import copy import re import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest +import transformers + from auto_round import AutoRound, AutoRoundConfig, AutoScheme from auto_round.auto_scheme.utils import compute_avg_bits_for_model from auto_round.eval.evaluation import simple_evaluate from auto_round.testing_utils import multi_card from auto_round.utils import get_module +from ..helpers import get_model_path, get_tiny_model + + +class TestAutoScheme: + save_dir = "./saved" -class TestAutoScheme(unittest.TestCase): - @classmethod - def setUpClass(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - @classmethod - def tearDownClass(self): + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_gguf_k_0(self): - model_name = "/models/Qwen3-0.6B" + def test_gguf_k_0(self, tiny_qwen_model_path): target_bits = 5.5 scheme = AutoScheme(avg_bits=target_bits, options=("GGUF:Q4_K_M", "GGUF:Q8_0")) - ar = AutoRound(model=model_name, scheme=scheme, iters=1, enable_alg_ext=True) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=1, enable_alg_ext=True) ar.quantize_and_save(self.save_dir, format="gguf:q2_k_s") shutil.rmtree(self.save_dir, ignore_errors=True) - def test_gguf_k_1(self): - model_name = "/models/Qwen3-0.6B" + def test_gguf_k_1(self, tiny_qwen_model_path): target_bits = 3.5 scheme = AutoScheme(avg_bits=target_bits, options=("GGUF:Q2_K_S", "GGUF:Q4_1")) - ar = AutoRound(model=model_name, scheme=scheme, iters=1, enable_alg_ext=True) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=1, enable_alg_ext=True) ar.quantize_and_save(self.save_dir, format="gguf:q2_k_s") shutil.rmtree(self.save_dir, ignore_errors=True) # - def test_embedding_fallback(self): - model_name = "/models/Qwen3-0.6B" + def test_embedding_fallback(self, tiny_qwen_model_path): target_bits = 5.0 scheme = AutoScheme(avg_bits=target_bits, options=("GGUF:Q4_K_M", "GGUF:Q8_0")) - ar = AutoRound(model=model_name, scheme=scheme, iters=1, enable_alg_ext=True) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=1, enable_alg_ext=True) ar.quantize_and_save(self.save_dir, format="gguf:q2_k_s") shutil.rmtree(self.save_dir, ignore_errors=True) - def test_gguf_export(self): - model_name = 
"/models/Qwen3-0.6B" + def test_gguf_export(self, tiny_qwen_model_path): target_bits = 3 scheme = AutoScheme(avg_bits=target_bits, options=("GGUF:Q2_K_S", "GGUF:Q4_K_M"), ignore_scale_zp_bits=True) - ar = AutoRound(model=model_name, scheme=scheme, iters=0) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0) ar.quantize_and_save(self.save_dir, format="gguf:q2_k_s") shutil.rmtree(self.save_dir, ignore_errors=True) def test_gguf(self): - model_name = "/models/Qwen3-8B" + model_name = get_model_path("qwen/Qwen3-8B") + model = get_tiny_model(model_name) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) target_bits = 3 scheme = AutoScheme(avg_bits=target_bits, options=("GGUF:Q2_K_S", "GGUF:Q4_K_M"), ignore_scale_zp_bits=True) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, disable_opt_rtn=True) + ar = AutoRound(model=model, tokenizer=tokenizer, scheme=scheme, iters=0, nsamples=1, disable_opt_rtn=True) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model, ignore_scale_zp_bits=True) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 def test_shared_layers(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") from transformers import AutoModelForCausalLM, AutoTokenizer model = AutoModelForCausalLM.from_pretrained(model_name) @@ -79,7 +84,7 @@ def test_shared_layers(self): from auto_round.auto_scheme.utils import parse_shared_layers res = parse_shared_layers(model, shared_layers) - self.assertEqual(len(res), 24) + assert len(res) == 24 assert [ "model.decoder.layers.2.self_attn.out_proj", "model.decoder.layers.2.self_attn.q_proj", @@ -101,68 +106,61 @@ def test_shared_layers(self): else: bits.append(module.bits) bits = set(bits) - self.assertEqual(len(bits), 1) + assert len(bits) == 1 print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 # @multi_card - def test_multi_card(self): - model_name = "/models/Qwen3-0.6B" + def test_multi_card(self, tiny_qwen_model_path): target_bits = 4.5 for device_map in ["auto", "0,1", "0", None]: scheme = AutoScheme(avg_bits=target_bits, options=("NVFP4")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, device_map=device_map) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1, device_map=device_map) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 @multi_card - def test_multi_card_1(self): - model_name = "/models/Qwen3-0.6B" + def test_multi_card_1(self, tiny_qwen_model_path): target_bits = 4.5 from transformers import AutoModelForCausalLM, AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(model_name) - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto") scheme = AutoScheme(avg_bits=target_bits, options=("NVFP4")) - ar = AutoRound(model=model, tokenizer=tokenizer, scheme=scheme, iters=0, nsamples=1) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - def test_non_low_gpu_mem_usage(self): - model_name = "/models/Qwen3-0.6B" + def test_non_low_gpu_mem_usage(self, tiny_qwen_model_path): target_bits = 4.5 # for device_map in ["auto", "0,1", "0", None]: scheme = 
AutoScheme(avg_bits=target_bits, options=("NVFP4"), low_gpu_mem_usage=False, device_map="auto") - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 @multi_card - def test_dict_device_map(self): - model_name = "/models/Qwen3-8B" + def test_dict_device_map(self, tiny_qwen_model_path): target_bits = 8.25 device_map = {"up_proj": 0, "down_proj": 1} scheme = AutoScheme(avg_bits=target_bits, options=("MXFP8")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, device_map=device_map) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1, device_map=device_map) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - def test_min_target_bits(self): - model_name = "/models/opt-125m" + def test_min_target_bits(self, tiny_opt_model_path): target_bits = 4.644 scheme = AutoScheme(avg_bits=target_bits, options=("MXFP4", "W8A16")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) + ar = AutoRound(model=tiny_opt_model_path, scheme=scheme, iters=0, nsamples=1) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) @@ -170,102 +168,97 @@ def test_min_target_bits(self): # def test_max_target_bits(self): - model_name = "/models/opt-125m" target_bits = 8.025 + model_path = get_model_path("facebook/opt-125m") scheme = AutoScheme(avg_bits=target_bits, options=("MXFP4", "W8A16")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) + ar = AutoRound(model=model_path, scheme=scheme, iters=0, nsamples=1) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - def test_patch_scheme(self): - model_name = "/models/opt-125m" + def test_patch_scheme(self, tiny_opt_model_path): target_bits = 5 scheme = AutoScheme(avg_bits=target_bits, options=("MXFP4", "W8A16")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, group_size=32) + ar = AutoRound(model=tiny_opt_model_path, scheme=scheme, iters=0, nsamples=1, group_size=32) model, layer_config = ar.quantize() for n, m in model.named_modules(): if hasattr(m, "group_size"): - self.assertEqual(m.group_size, 32) + assert m.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 def test_layer_config(self): target_bits = 3.0 - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "BF16")) user_layer_config = {"model.decoder.layers.10.fc1": {"bits": 8, "group_size": 32, "sym": False}} ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["bits"], 8) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["sym"], False) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["group_size"], 32) + assert layer_config["model.decoder.layers.10.fc1"]["bits"] == 8 + assert layer_config["model.decoder.layers.10.fc1"]["sym"] is False + assert 
layer_config["model.decoder.layers.10.fc1"]["group_size"] == 32 layer = get_module(model, "model.decoder.layers.10.fc1") - self.assertEqual(layer.bits, 8) - self.assertEqual(layer.sym, False) - self.assertEqual(layer.group_size, 32) + assert layer.bits == 8 + assert layer.sym is False + assert layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 target_bits = 5.5 - model_name = "/models/opt-125m" scheme = AutoScheme(avg_bits=target_bits, options=("mxfp4", "mxfp8")) user_layer_config = {"model.decoder.layers.10.fc1": {"bits": 8, "group_size": 32, "sym": False}} ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["bits"], 8) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["sym"], False) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["group_size"], 32) + assert layer_config["model.decoder.layers.10.fc1"]["bits"] == 8 + assert layer_config["model.decoder.layers.10.fc1"]["sym"] is False + assert layer_config["model.decoder.layers.10.fc1"]["group_size"] == 32 layer = get_module(model, "model.decoder.layers.10.fc1") - self.assertEqual(layer.orig_layer.bits, 8) - self.assertEqual(layer.orig_layer.sym, False) - self.assertEqual(layer.orig_layer.group_size, 32) + assert layer.orig_layer.bits == 8 + assert layer.orig_layer.sym is False + assert layer.orig_layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - def test_lm_head_and_mix_dtype(self): - model_name = "/models/Qwen3-8B" + def test_lm_head_and_mix_dtype(self, tiny_untied_qwen_model_path): + model_name = tiny_untied_qwen_model_path + model = get_tiny_model(model_name) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) target_bits = 6 scheme = AutoScheme(avg_bits=target_bits, options=("MXFP4", "MXFP8")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, quant_lm_head=True) + ar = AutoRound(model=model, tokenizer=tokenizer, scheme=scheme, iters=0, nsamples=1, quant_lm_head=True) model, layer_config = ar.quantize() - self.assertLessEqual(layer_config["lm_head"]["bits"], 8) + assert layer_config["lm_head"]["bits"] <= 8 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - def test_auto_scheme_export(self): - model_name = "/models/opt-125m" + def test_auto_scheme_export(self, tiny_qwen_model_path): + model_name = get_model_path("facebook/opt-125m") scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "W8A16", "BF16")) ar = AutoRound(model=model_name, scheme=scheme) ar.quantize_and_save(self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.25) + assert result["results"]["lambada_openai"]["acc,none"] > 0.25 shutil.rmtree(self.save_dir, ignore_errors=True) - model_name = "/models/Qwen3-0.6B" scheme = AutoScheme(avg_bits=3, options=("gguf:q2_k_s,gguf:q4_k_s"), nsamples=1, ignore_scale_zp_bits=True) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, 
iters=0, nsamples=1) ar.quantize_and_save(self.save_dir) shutil.rmtree(self.save_dir, ignore_errors=True) def test_enable_torch_compile(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") scheme = AutoScheme(avg_bits=2, options=("W2A16"), ignore_scale_zp_bits=True) ar = AutoRound(model=model_name, scheme=scheme, enable_torch_compile=True) ar.quantize_and_save(self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.10) + assert result["results"]["lambada_openai"]["acc,none"] > 0.10 shutil.rmtree(self.save_dir, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_calib_dataset.py b/test/test_cuda/test_calib_dataset.py index b66f60127..bdee2ebeb 100644 --- a/test/test_cuda/test_calib_dataset.py +++ b/test/test_cuda/test_calib_dataset.py @@ -1,46 +1,19 @@ +import json import os import shutil -import sys -import unittest - -sys.path.insert(0, "../..") -import json +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -class TestLocalCalibDataset(unittest.TestCase): - @classmethod - def setUpClass(self): - json_data = [{"text": "awefdsfsddfd"}, {"text": "fdfdfsdfdfdfd"}, {"text": "dfdsfsdfdfdfdf"}] - os.makedirs("./saved", exist_ok=True) - self.json_file = "./saved/tmp.json" - with open(self.json_file, "w") as json_file: - json.dump(json_data, json_file, indent=4) - - jsonl_data = [{"text": "哈哈,開心點"}, {"text": "hello world"}] - os.makedirs("./saved", exist_ok=True) - self.jsonl_file = "./saved/tmp.jsonl" - with open(self.jsonl_file, "w") as jsonl_file: - for item in jsonl_data: - json.dump(item, jsonl_file, ensure_ascii=False) - jsonl_file.write("\n") - - model_name = "facebook/opt-125m" - self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - - def test_combine_dataset(self): +class TestLocalCalibDataset: + def test_combine_dataset(self, tiny_opt_model_path): dataset = "NeelNanda/pile-10k" + ",BAAI/CCI3-HQ" + ",madao33/new-title-chinese" bits, group_size, sym = 4, 128, True autoround = AutoRound( - self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset ) autoround.quantize() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_conv1d.py b/test/test_cuda/test_conv1d.py index e617bf55e..11f80a1b2 100644 --- a/test/test_cuda/test_conv1d.py +++ b/test/test_cuda/test_conv1d.py @@ -1,53 +1,52 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch -from _test_helpers import model_infer from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound from auto_round.testing_utils import require_gptqmodel +from ..helpers import get_model_path, get_tiny_model, model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +class TestQuantizationConv1d: + @pytest.fixture(autouse=True, scope="class") + def 
setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + # Yield to hand control to the test methods + yield -class TestQuantizationConv1d(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "MBZUAI/LaMini-GPT-124M" - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel - def test_quant(self): - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + def test_quant(self, dataloader): + model_name = get_model_path("MBZUAI/LaMini-GPT-124M") + model = get_tiny_model(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True from auto_round import AutoRoundConfig autoround = AutoRound( - self.model, - self.tokenizer, + model, + tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() @@ -55,7 +54,3 @@ def test_quant(self): model = AutoModelForCausalLM.from_pretrained("./saved", device_map="cuda", trust_remote_code=True) - model_infer(model, self.tokenizer) + model_infer(model, tokenizer) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_diffusion.py b/test/test_cuda/test_diffusion.py index 9a5a8bfd3..a3a90d14e 100644 --- a/test/test_cuda/test_diffusion.py +++ b/test/test_cuda/test_diffusion.py @@ -2,13 +2,9 @@ import os import re import shutil -import sys -import unittest +import pytest import requests - -sys.path.insert(0, "../..") - from diffusers import AutoPipelineForText2Image from PIL import Image @@ -16,13 +12,20 @@ from auto_round.testing_utils import require_gptqmodel, require_optimum, require_vlm_env -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "/dataset/FLUX.1-dev" +class TestAutoRound: + model_name = "/dataset/FLUX.1-dev" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - @classmethod - def tearDownClass(self): + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_optimum @@ -73,11 +76,7 @@ def test_diffusion_rtn(self): def test_diffusion_model_checker(self): from auto_round.utils import is_diffusion_model - self.assertTrue(is_diffusion_model("/dataset/FLUX.1-dev")) - self.assertTrue(is_diffusion_model("/models/stable-diffusion-2-1")) - self.assertTrue(is_diffusion_model("/models/stable-diffusion-xl-base-1.0")) - self.assertFalse(is_diffusion_model("/models/Qwen3-8B")) - - -if __name__ == "__main__": - unittest.main() + assert is_diffusion_model("/dataset/FLUX.1-dev") + assert is_diffusion_model("/models/stable-diffusion-2-1") + assert is_diffusion_model("/models/stable-diffusion-xl-base-1.0") + assert is_diffusion_model("/models/Qwen3-8B") is False diff --git 
a/test/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py index c489b37b2..d0f5bed53 100644 --- a/test/test_cuda/test_exllamav2_backend.py +++ b/test/test_cuda/test_exllamav2_backend.py @@ -1,12 +1,6 @@ import shutil -import sys -import unittest import pytest - -sys.path.insert(0, "../..") - - import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -14,157 +8,118 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel, require_package_version_ut +from ..helpers import get_model_path, model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +class TestAutoRoundexllamaBackend: + save_dir = "./saved" -class TestAutoRoundexllamaBackend(unittest.TestCase): + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - @classmethod - def setUpClass(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() + # Yield to hand control to the test methods + yield - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] - - @classmethod - def tearDownClass(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel - def test_gptqmodel_exllmav2_4bits_asym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_gptqmodel_exllmav2_4bits_asym(self, dataloader): + model_path = get_model_path("facebook/opt-125m") bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, dataset=self.llm_dataloader + model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, dataset=dataloader ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") quantization_config = AutoRoundConfig(backend="gptqmodel:exllamav2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", 
quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @require_autogptq @require_package_version_ut("torch", "<2.6.0") - def test_gptq_exllamav2_4bits_sym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_gptq_exllamav2_4bits_sym(self, dataloader): + model_path = get_model_path("facebook/opt-125m") bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model quantization_config = AutoRoundConfig(backend="gptq:exllamav2") ## or exllamav2 model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) + assert result["results"]["lambada_openai"]["acc,none"] > 0.27 torch.cuda.empty_cache() - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_autogptq @require_package_version_ut("torch", "<2.6.0") def test_gptq_exllamav2_4bits_sym_group_size(self): + model_path = get_model_path("facebook/opt-125m") for group_size in [-1, 32, 64, 128, 256, 1024]: ## 384, 768 has accuracy issue print(f"!!!!!!!!!!!!!!!!!{group_size}!!!!!!!!!!!!!!!!!") - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = 
AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, group_size, True autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, iters=1, nsamples=1, group_size=group_size, sym=sym, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_round" ) ##will convert to gptq model quantization_config = AutoRoundConfig(backend="gptq:exllamav2") ## or exllamav2 model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=64, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.15) + assert result["results"]["lambada_openai"]["acc,none"] > 0.15 torch.cuda.empty_cache() - shutil.rmtree(self.save_folder, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() + shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cuda/test_export.py b/test/test_cuda/test_export.py index 297b20193..c8f87b4bf 100644 --- a/test/test_cuda/test_export.py +++ b/test/test_cuda/test_export.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoConfig, AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer @@ -11,42 +9,37 @@ from auto_round import AutoRound from auto_round.testing_utils import require_awq, require_optimum, require_package_version_ut +from ..helpers import get_model_path, get_tiny_model -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +class TestAutoRound: + save_dir = "./saved" + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "facebook/opt-125m" - self.save_dir = "./saved" - self.llm_dataloader = LLMDataLoader() + # Yield to hand control to the test methods + yield - @classmethod - def tearDownClass(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_optimum - def test_autogptq_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autogptq_format(self, dataloader): + model_path = get_model_path("facebook/opt-125m") bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -65,10 +58,10 @@ def test_autogptq_format(self): 
shutil.rmtree("./saved", ignore_errors=True) @require_optimum - def test_autogptq_format_fp_layers(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autogptq_format_fp_layers(self, tiny_opt_model_path, dataloader): layer_config = {} + model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path) + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path) for n, m in model.named_modules(): if "q_proj" in n: layer_config[n] = {"bits": 16} @@ -82,7 +75,7 @@ def test_autogptq_format_fp_layers(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -102,9 +95,10 @@ def test_autogptq_format_fp_layers(self): # "there there there there there there") shutil.rmtree("./saved", ignore_errors=True) - def test_autogptq_format_qsave_fp_layers(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autogptq_format_qsave_fp_layers(self, dataloader): + model_path = get_model_path("facebook/opt-125m") + model = AutoModelForCausalLM.from_pretrained(model_path) + layer_config = {} for n, m in model.named_modules(): if "q_proj" in n: @@ -112,14 +106,13 @@ def test_autogptq_format_qsave_fp_layers(self): bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "./saved" @@ -153,19 +146,16 @@ def test_autogptq_format_qsave_fp_layers(self): ##print(res) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoround_format(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -186,19 +176,17 @@ def test_autoround_format(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") - def test_autoawq_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoawq_format(self, dataloader): + model_path = get_model_path("facebook/opt-125m") bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -220,23 +208,21 @@ def test_autoawq_format(self): @require_optimum @require_awq @require_package_version_ut("transformers", "<4.57.0") - def test_autoawq_format_fp_qsave_layers(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + def test_autoawq_format_fp_qsave_layers(self, dataloader): + 
model_path = get_model_path("facebook/opt-125m") layer_config = { "model.decoder.layers.0.self_attn.k_proj": {"bits": 16}, "model.decoder.layers.9.self_attn.v_proj": {"bits": 16}, } - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "./saved/test_export" @@ -261,19 +247,16 @@ def test_autoawq_format_fp_qsave_layers(self): shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_3bit_asym_torch_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoround_3bit_asym_torch_format(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 3, 128, False autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -290,19 +273,16 @@ def test_autoround_3bit_asym_torch_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_3bit_sym_torch_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoround_3bit_sym_torch_format(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 3, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -322,21 +302,24 @@ def test_autoround_3bit_sym_torch_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_awq_lmhead_export(self): + def test_awq_lmhead_export(self, dataloader): bits, sym, group_size = 4, False, 128 - model_name = "/models/phi-2" + model_name = get_model_path("microsoft/phi-2") + tiny_model = get_tiny_model(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) layer_config = { "lm_head": {"bits": 4}, # set lm_head quant } autoround = AutoRound( - model=model_name, + model=tiny_model, + tokenizer=tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") @@ -346,21 +329,24 @@ def test_awq_lmhead_export(self): assert isinstance(lm_head, WQLinear_GEMM), "Illegal AWQ quantization for lm_head layer" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_gptq_lmhead_export(self): + def test_gptq_lmhead_export(self, tiny_qwen_model_path, dataloader): bits, sym, group_size = 4, True, 128 - model_name = "/models/phi-2" + model_name = get_model_path("microsoft/phi-2") + tiny_model = get_tiny_model(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) layer_config = { "lm_head": {"bits": 4}, # set lm_head 
quant } autoround = AutoRound( - model=model_name, + model=tiny_model, + tokenizer=tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -376,7 +362,3 @@ def test_gptq_lmhead_export(self): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0]) print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_fp8_input.py b/test/test_cuda/test_fp8_input.py index 5258fe183..9e1c1cc3a 100644 --- a/test/test_cuda/test_fp8_input.py +++ b/test/test_cuda/test_fp8_input.py @@ -1,30 +1,43 @@ import os import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate +from auto_round.utils import llm_load_model +from ..helpers import get_model_path, get_tiny_model -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.save_dir = "./saved" - @classmethod - def tearDownClass(self): - shutil.rmtree(self.save_dir, ignore_errors=True) +class TestAutoRound: + save_dir = "./saved" + + def tiny_fp8_model(self): + model_name = get_model_path("qwen/Qwen3-0.6B-FP8") + model, tokenizer = llm_load_model(model_name) + model.model.layers = model.model.layers[:3] + return model, tokenizer + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) def test_small_model_rtn_generation(self): - model_name = "/models/Qwen3-0.6B-FP8" - ar = AutoRound(model=model_name, iters=0) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) ar.quantize_and_save(output_dir=self.save_dir) model = AutoModelForCausalLM.from_pretrained(self.save_dir, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.save_dir) @@ -34,8 +47,8 @@ def test_small_model_rtn_generation(self): shutil.rmtree(self.save_dir, ignore_errors=True) def test_gguf_imatrix(self): - model_name = "/models/Qwen3-0.6B-FP8" - ar = AutoRound(model=model_name, iters=0) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) ar.quantize_and_save(format="gguf:q2_k_s", output_dir=self.save_dir) # from llama_cpp import Llama # @@ -51,56 +64,55 @@ def test_gguf_imatrix(self): # print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) def test_small_model_rtn(self): - model_name = "/models/Qwen3-0.6B-FP8" - ar = AutoRound(model=model_name, iters=0) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) _, folder = ar.quantize_and_save(output_dir=self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - 
self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.25) + assert result["results"]["lambada_openai"]["acc,none"] > 0.25 shutil.rmtree(self.save_dir, ignore_errors=True) def test_small_model_iters1(self): - model_name = "/models/Qwen3-0.6B-FP8" - ar = AutoRound(model=model_name, iters=1) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=1) _, folder = ar.quantize_and_save(output_dir=self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.25) + assert result["results"]["lambada_openai"]["acc,none"] > 0.25 shutil.rmtree(self.save_dir, ignore_errors=True) def test_medium_model_rtn(self): - model_name = "/models/Qwen3-8B-FP8" - ar = AutoRound(model=model_name, iters=0) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) _, folder = ar.quantize_and_save(output_dir=self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.55) + assert result["results"]["lambada_openai"]["acc,none"] > 0.55 shutil.rmtree(self.save_dir, ignore_errors=True) def test_medium_model_rtn_with_lm_head(self): - model_name = "/models/Qwen3-8B-FP8" + model, tokenizer = self.tiny_fp8_model() layer_config = {"lm_head": {"bits": 4}} - ar = AutoRound(model=model_name, iters=0, layer_config=layer_config) + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0, layer_config=layer_config) _, folder = ar.quantize_and_save(output_dir=self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.55) + assert result["results"]["lambada_openai"]["acc,none"] > 0.55 shutil.rmtree(self.save_dir, ignore_errors=True) def test_fp8_model_gguf(self): from llama_cpp import Llama - model_name = "Qwen/Qwen3-0.6B-FP8" - - ar = AutoRound(model=model_name, iters=0) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q4_0") for file in os.listdir(self.save_dir): if file.endswith(".gguf"): @@ -110,7 +122,8 @@ def test_fp8_model_gguf(self): print(output) shutil.rmtree(self.save_dir, ignore_errors=True) - ar = AutoRound(model=model_name, iters=1) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=1) ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q3_k_s") for file in os.listdir(self.save_dir): if file.endswith(".gguf"): @@ -121,14 +134,10 @@ def test_fp8_model_gguf(self): shutil.rmtree(self.save_dir, ignore_errors=True) def test_diff_datatype(self): - model_name = "/models/Qwen3-0.6B-FP8" for scheme in ["NVFP4", "MXFP4"]: + model, tokenizer = self.tiny_fp8_model() for iters in [0, 1]: print(f"Testing scheme: {scheme}, iters: {iters}") - ar = AutoRound(model=model_name, iters=iters, scheme=scheme) + ar = AutoRound(model=model, tokenizer=tokenizer, iters=iters, scheme=scheme) 
ar.quantize_and_save(output_dir=self.save_dir) shutil.rmtree(self.save_dir, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_get_block_name.py b/test/test_cuda/test_get_block_name.py index cc9297653..829ac1e46 100644 --- a/test/test_cuda/test_get_block_name.py +++ b/test/test_cuda/test_get_block_name.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from diffusers import AutoPipelineForText2Image @@ -20,13 +18,13 @@ from auto_round.utils import get_block_names, is_pure_text_model -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): pass @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("runs", ignore_errors=True) def check_block_names(self, block_names, prefixs=[], n_layers=[]): @@ -195,11 +193,7 @@ def test_flux(self): block_names = get_block_names(model) self.check_block_names(block_names, ["transformer_blocks", "single_transformer_blocks"], [19, 38]) - self.assertTrue(any(["context_embedder" not in n for n in block_names])) + assert any(["context_embedder" not in n for n in block_names]) block_names = get_block_names(model, quant_vision=True) self.check_block_names(block_names, ["transformer_blocks", "single_transformer_blocks"], [19, 38]) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_gguf.py b/test/test_cuda/test_gguf.py index 312e561cf..174deab2f 100644 --- a/test/test_cuda/test_gguf.py +++ b/test/test_cuda/test_gguf.py @@ -1,9 +1,8 @@ import os import shutil import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer @@ -11,35 +10,37 @@ from auto_round import AutoRound from auto_round.testing_utils import require_gguf +from ..helpers import get_model_path, get_tiny_model, save_tiny_model -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +class TestAutoRound: + save_dir = "./saved" + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") -class TestAutoRound(unittest.TestCase): - @classmethod - def tearDownClass(self): + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gguf - def test_gguf_format(self): - model_name = "Qwen/Qwen2.5-0.5B-Instruct" + def test_gguf_format(self, tiny_qwen_model_path, dataloader): bits, group_size, sym = 4, 32, False autoround = AutoRound( - model_name, + tiny_qwen_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, nsamples=2, - dataset=LLMDataLoader(), + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -54,34 +55,36 @@ def test_gguf_format(self): shutil.rmtree("./saved", ignore_errors=True) save_dir = os.path.join(os.path.dirname(__file__), "saved") - model_path = "Qwen/Qwen2.5-0.5B-Instruct" res = os.system( - f"cd ../.. && {sys.executable} -m auto_round --model {model_path} --iter 2 " + f"cd .. 
&& {sys.executable} -m auto_round --model {tiny_qwen_model_path} --iter 2 " f"--output_dir {save_dir} --nsample 2 --format gguf:q4_0 --device 0" ) print(save_dir) - self.assertFalse(res > 0 or res == -1, msg="qwen2 tuning fail") + assert not (res > 0 or res == -1), "qwen2 tuning fail" from llama_cpp import Llama - gguf_file = os.listdir("saved/Qwen2.5-0.5B-Instruct-gguf")[0] - llm = Llama(f"saved/Qwen2.5-0.5B-Instruct-gguf/{gguf_file}", n_gpu_layers=-1) + gguf_file = os.listdir("saved/tmp_tiny_qwen_model_path-gguf")[0] + llm = Llama(f"saved/tmp_tiny_qwen_model_path-gguf/{gguf_file}", n_gpu_layers=-1) output = llm("There is a girl who likes adventure,", max_tokens=32) print(output) shutil.rmtree("./saved", ignore_errors=True) @require_gguf - def test_q2_k_export(self): + def test_q2_k_export(self, dataloader): bits, group_size, sym = 2, 16, False - model_name = "Qwen/Qwen2.5-1.5B-Instruct" + model_path = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") + model = get_tiny_model(model_path) + tokenizer = AutoTokenizer.from_pretrained(model_path) autoround = AutoRound( - model_name, + model, + tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=1, - dataset=LLMDataLoader(), + dataset=dataloader, data_type="int_asym_dq", ) autoround.quantize() @@ -94,20 +97,13 @@ def test_q2_k_export(self): inputs = autoround.tokenizer(text, return_tensors="pt").to(model.device) result = autoround.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0]) print(result) - - from auto_round.eval.evaluation import simple_evaluate_user_model - - result = simple_evaluate_user_model(model, autoround.tokenizer, batch_size=16, tasks="piqa") - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.45) - shutil.rmtree(quantized_model_path, ignore_errors=True) @require_gguf - def test_basic_usage(self): - model_name = "Qwen/Qwen2.5-0.5B-Instruct" + def test_basic_usage(self, tiny_qwen_model_path): python_path = sys.executable res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {model_name} --eval_task_by_task" + f"cd .. 
&& {python_path} -m auto_round --model {tiny_qwen_model_path} --eval_task_by_task" f" --tasks piqa,openbookqa --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0 --eval_model_dtype bf16" ) if res > 0 or res == -1: @@ -116,7 +112,7 @@ def test_basic_usage(self): @require_gguf def test_q4_0(self): - model_name = "Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") bits, group_size, sym = 4, 32, True autoround = AutoRound(model_name, bits=bits, group_size=group_size, sym=sym, iters=1, data_type="int") autoround.quantize() @@ -132,12 +128,12 @@ def test_q4_0(self): from auto_round.eval.evaluation import simple_evaluate_user_model result = simple_evaluate_user_model(model, autoround.tokenizer, batch_size=16, tasks="piqa") - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.54) + assert result["results"]["piqa"]["acc,none"] > 0.54 shutil.rmtree(quantized_model_path, ignore_errors=True) @require_gguf def test_q4_1(self): - model_name = "Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") bits, group_size, sym = 4, 32, False autoround = AutoRound(model=model_name, bits=bits, group_size=group_size, sym=sym, iters=1, data_type="int") autoround.quantize() @@ -153,36 +149,28 @@ def test_q4_1(self): from auto_round.eval.evaluation import simple_evaluate_user_model result = simple_evaluate_user_model(model, autoround.tokenizer, batch_size=16, tasks="piqa") - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.54) + assert result["results"]["piqa"]["acc,none"] > 0.54 shutil.rmtree("./saved", ignore_errors=True) @require_gguf def test_all_format(self): - from auto_round.export.export_to_gguf.config import GGUF_CONFIG + for model_name in ["qwen/Qwen3-8B", "meta-llama/Llama-3.1-8B-Instruct", "meta-llama/Llama-3.2-3B"]: + for gguf_format in ["gguf:q5_0", "gguf:q5_1", "gguf:q3_k_m", "gguf:q5_k_m", "gguf:q6_k", "gguf:q8_0"]: + model_path = get_model_path(model_name) + tiny_model_path = "tmp_tiny_model" + tiny_model_path = save_tiny_model(model_path, tiny_model_path, num_layers=2) + ar = AutoRound(tiny_model_path, scheme=gguf_format, iters=0, nsamples=1, seqlen=16) + ar.quantize_and_save(output_dir=self.save_dir, format=gguf_format) - python_path = sys.executable - for model_name in ["/models/Qwen3-8B/", "/models/Llama-3.2-3B/", "/models/Meta-Llama-3.1-8B-Instruct"]: - for gguf_format in GGUF_CONFIG.keys(): - print(model_name, gguf_format) - res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {model_name} " - f" --bs 16 --iters 1 --nsamples 1 --format fake,{gguf_format}" - ) - if res > 0 or res == -1: - assert False, "cmd line test fail, please have a check" - shutil.rmtree("../../tmp_autoround", ignore_errors=True) - - res = os.system( - f"cd ../..
&& {python_path} -m auto_round --model {model_name} " - f" --bs 16 --iters 0 --nsamples 1 --format {gguf_format}" - ) - if res > 0 or res == -1: - assert False, "cmd line test fail, please have a check" - shutil.rmtree("../../tmp_autoround", ignore_errors=True) + ar = AutoRound(tiny_model_path, scheme=gguf_format, iters=1, nsamples=1, seqlen=16) + ar.quantize_and_save(output_dir=self.save_dir, format=gguf_format) + + shutil.rmtree(tiny_model_path, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_gguf def test_vlm_gguf(self): - model_name = "/models/Qwen2.5-VL-7B-Instruct" + model_name = "/models/Qwen2-VL-2B-Instruct" from auto_round import AutoRoundMLLM from auto_round.utils import mllm_load_model @@ -197,11 +185,11 @@ def test_vlm_gguf(self): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - self.assertTrue("mmproj-model.gguf" in os.listdir("./saved")) - file_size = os.path.getsize("./saved/Qwen2.5-VL-7B-Instruct-Q4_0.gguf") / 1024**2 - self.assertAlmostEqual(file_size, 4242, delta=5.0) + assert "mmproj-model.gguf" in os.listdir("./saved") + file_size = os.path.getsize("./saved/Qwen2-VL-2B-Instruct-Q4_0.gguf") / 1024**2 + assert abs(file_size - 4242) < 5.0 file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2 - self.assertAlmostEqual(file_size, 2580, delta=5.0) + assert abs(file_size - 2580) < 5.0 shutil.rmtree("./saved", ignore_errors=True) model_name = "/models/gemma-3-12b-it" @@ -218,41 +206,9 @@ def test_vlm_gguf(self): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m") - self.assertTrue("mmproj-model.gguf" in os.listdir("./saved")) + assert "mmproj-model.gguf" in os.listdir("./saved") file_size = os.path.getsize("./saved/gemma-3-12B-it-Q4_K_M.gguf") / 1024**2 - self.assertAlmostEqual(file_size, 6568, delta=5.0) + assert abs(file_size - 6568) < 5.0 file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2 - self.assertAlmostEqual(file_size, 1599, delta=5.0) + assert abs(file_size - 1599) < 5.0 shutil.rmtree(quantized_model_path, ignore_errors=True) - - # @require_gguf - # def test_llama_4(self): - # model_name = "/dataset/Llama-4-Scout-17B-16E-Instruct/" - # from auto_round import AutoRoundMLLM - # from auto_round.utils import mllm_load_model - # model, processor, tokenizer, image_processor = mllm_load_model(model_name, use_auto_mapping=False) - # autoround = AutoRoundMLLM( - # model, - # tokenizer=tokenizer, - # processor=processor, - # image_processor=image_processor, - # device="auto", - # iters=0, - # ) - # quantized_model_path = "/dataset/Llam-4-test" - # shutil.rmtree(quantized_model_path, ignore_errors=True) - # autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - # self.assertTrue("mmproj-model.gguf" in os.listdir(quantized_model_path)) - # file_size = ( - # os.path.getsize(os.path.join(quantized_model_path, "Llama-4-Scout-17B-16E-Instruct-16x17B-Q4_0.gguf")) - # / 1024**2 - # ) - # self.assertAlmostEqual(file_size, 58093.62, delta=1.0) - # file_size = os.path.getsize(os.path.join(quantized_model_path, "mmproj-model.gguf")) / 1024**2 - # self.assertAlmostEqual(file_size, 3326.18, delta=5.0) - # shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_main_func.py b/test/test_cuda/test_main_func.py index 571fc10f5..ac8b8b91e 100644 --- a/test/test_cuda/test_main_func.py +++
b/test/test_cuda/test_main_func.py @@ -1,10 +1,8 @@ import copy import re import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from lm_eval.utils import make_table # pylint: disable=E0401 @@ -15,6 +13,8 @@ from auto_round.eval.evaluation import simple_evaluate from auto_round.testing_utils import require_awq, require_gptqmodel, require_optimum, require_package_version_ut +from ..helpers import get_model_path + def get_accuracy(data): match = re.search(r"\|acc\s+\|[↑↓]\s+\|\s+([\d.]+)\|", data) @@ -26,21 +26,27 @@ def get_accuracy(data): return 0.0 -class TestMainFunc(unittest.TestCase): - @classmethod - def setUpClass(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" +class TestMainFunc: + save_dir = "./saved" + tasks = "lambada_openai" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - @classmethod - def tearDownClass(self): + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel @require_optimum def test_backend(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=4, group_size=128) @@ -68,7 +74,7 @@ def test_backend(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_backend_awq(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=4, group_size=128) @@ -83,10 +89,10 @@ def test_backend_awq(self): assert accuracy > 0.35 shutil.rmtree("./saved", ignore_errors=True) - @unittest.skipIf(torch.cuda.is_available() is False, "Skipping because no cuda") + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @require_gptqmodel def test_fp_layers(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) from auto_round.compressors.utils import get_fp_layer_names @@ -107,11 +113,11 @@ def test_fp_layers(self): assert accuracy > 0.35 shutil.rmtree("./saved", ignore_errors=True) - @unittest.skipIf(torch.cuda.is_available() is False, "Skipping because no cuda") + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_fp_layers_awq(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) from auto_round.compressors.utils import get_fp_layer_names @@ -132,18 +138,17 @@ def test_fp_layers_awq(self): assert accuracy > 0.35 shutil.rmtree("./saved", ignore_errors=True) - 
@unittest.skipIf(torch.cuda.is_available() is False, "Skipping because no cuda") - def test_undivided_group_size_tuning(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_undivided_group_size_tuning(self, tiny_opt_model_path): + model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path, torch_dtype=torch.float16, device_map="auto") + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path) autoround = AutoRound(model, tokenizer, bits=4, group_size=127, nsamples=2, iters=2) autoround.quantize() @require_gptqmodel def test_adam(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRoundAdam(model, tokenizer, bits=4, group_size=128) @@ -164,7 +169,7 @@ def test_autoround_asym(self): ##need to install false except ImportError as e: print("skip autoround asym test, as autoround is not installed from source") return - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=4, group_size=128, sym=False) @@ -179,12 +184,12 @@ def test_autoround_asym(self): ##need to install false assert accuracy > 0.35 shutil.rmtree("./saved", ignore_errors=True) - def test_attention_mask_lm_head(self): + def test_attention_mask_lm_head(self, tiny_qwen_moe_model_path): from transformers import AutoTokenizer - model_name = "/models/Qwen3-8B" + # model_name = "/models/Qwen3-8B" # model_name = "/models/Qwen3-0.6B" - tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_moe_model_path) text = ["haha", "hello world"] res = tokenizer(text, return_tensors="pt", max_length=8, padding="max_length", truncation=True) res.data.pop("attention_mask") @@ -196,14 +201,13 @@ def test_attention_mask_lm_head(self): data.append(res.data) from auto_round import AutoRound - ar = AutoRound(model_name, iters=1, dataset=data, seqlen=8, quant_lm_head=True) + ar = AutoRound(tiny_qwen_moe_model_path, iters=1, dataset=data, seqlen=8, quant_lm_head=True) ar.quantize() - def test_low_cpu_mem_usage(self): + def test_low_cpu_mem_usage(self, tiny_opt_model_path): bits, group_size = 4, 32 - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path, torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path, trust_remote_code=True) quantized_model_path = "./saved" autoround = AutoRound( model, @@ -216,7 +220,3 @@ def test_low_cpu_mem_usage(self): ) autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_marlin_backend.py b/test/test_cuda/test_marlin_backend.py index 
26d3ddca2..8d7594086 100644 --- a/test/test_cuda/test_marlin_backend.py +++ b/test/test_cuda/test_marlin_backend.py @@ -1,29 +1,32 @@ import shutil -import sys -import unittest import pytest - -sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate_user_model +from ..helpers import get_model_path, model_infer + -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 +class TestAutoRoundMarlinBackend: + model_name = get_model_path("facebook/opt-125m") + save_dir = "./saved" - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + # Yield to hand control to the test methods + yield -class TestAutoRoundMarlinBackend(unittest.TestCase): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) - def test_marlin_group_size(self): + def test_marlin_group_size(self, dataloader): for group_size in [-1, 64]: print(f"{group_size}!!!!!!!!!!!!!!!!!") model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) @@ -37,21 +40,21 @@ def test_marlin_group_size(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") quantization_config = AutoRoundConfig(backend="marlin") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.14) + assert result["results"]["lambada_openai"]["acc,none"] > 0.14 for group_size in [32, 128]: print(f"{group_size}!!!!!!!!!!!!!!!!!") @@ -66,60 +69,23 @@ def test_marlin_group_size(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") quantization_config = AutoRoundConfig(backend="marlin") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) -
self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.14) - - @classmethod - def setUpClass(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + assert result["results"]["lambada_openai"]["acc,none"] > 0.14 - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] - - @classmethod - def tearDownClass(self): - shutil.rmtree("./saved", ignore_errors=True) - shutil.rmtree("runs", ignore_errors=True) - - def test_marlin_4bits_sym_with_zp_m_1(self): + def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -131,32 +97,32 @@ def test_marlin_4bits_sym_with_zp_m_1(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") quantization_config = AutoRoundConfig(backend="marlin") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) + assert result["results"]["lambada_openai"]["acc,none"] > 0.27 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) + assert result["results"]["lambada_openai"]["acc,none"] > 0.27 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ 
-172,41 +138,37 @@ def test_marlin_4bits_sym_with_zp_m_1(self): # sym=sym, # iters=1, # seqlen=2, - # dataset=self.llm_dataloader, + # dataset=dataloader, # ) - # quantized_model_path = self.save_folder + # quantized_model_path = self.save_dir # autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") # # quantization_config = AutoRoundConfig(backend="marlin") # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype=torch.float16, # device_map="auto", # quantization_config=quantization_config # ) # - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - # self.model_infer(model, tokenizer) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + # model_infer(model, tokenizer) # result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - # self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.27) + # assert result['results']['lambada_openai']['acc,none'] > 0.27 # torch.cuda.empty_cache() # # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype=torch.bfloat16, # device_map="auto", # quantization_config=quantization_config # ) # - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - # self.model_infer(model, tokenizer) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + # model_infer(model, tokenizer) # result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - # self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.27) + # assert result['results']['lambada_openai']['acc,none'] > 0.27 # torch.cuda.empty_cache() # shutil.rmtree("./saved", ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_mix_bits.py b/test/test_cuda/test_mix_bits.py index 4f7d39d8c..6988709d5 100644 --- a/test/test_cuda/test_mix_bits.py +++ b/test/test_cuda/test_mix_bits.py @@ -1,14 +1,9 @@ import json import os import shutil -import sys -import unittest - -from parameterized import parameterized - -sys.path.insert(0, "../..") from pathlib import Path +import pytest import torch from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer @@ -19,32 +14,27 @@ require_package_version_ut, ) +from ..helpers import get_model_path -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +class TestAutoRound: + save_dir = "./saved" + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "/models/opt-125m" - self.save_dir = "./saved" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() + # Yield to hand control to the test methods + yield - @classmethod - def tearDownClass(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel - def 
test_mixed_gptqmodel(self): + def test_mixed_gptqmodel(self, tiny_opt_model_path, dataloader): scheme = "W4A16" layer_config = { "k_proj": {"bits": 8}, # part name @@ -54,12 +44,12 @@ def test_mixed_gptqmodel(self): "model.decoder.layers.0.self_attn.q_proj": {"bits": 8}, # full name } autoround = AutoRound( - model=self.model_name, + model=tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -73,7 +63,7 @@ def test_mixed_gptqmodel(self): print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_gptqmodel_convert_to_ar(self): + def test_mixed_gptqmodel_convert_to_ar(self, tiny_opt_model_path, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 4}, # set lm_head quant @@ -81,12 +71,12 @@ def test_mixed_gptqmodel_convert_to_ar(self): "model.decoder.layers.0.self_attn.q_proj": {"bits": 8}, # full name } autoround = AutoRound( - model=self.model_name, + model=tiny_opt_model_path, scheme="W4A16", iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -101,7 +91,7 @@ def test_mixed_gptqmodel_convert_to_ar(self): print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_autoround_format(self): + def test_mixed_autoround_format(self, tiny_opt_model_path, dataloader): layer_config = { "k_proj": {"bits": 8}, "q_proj": {"bits": 3}, @@ -109,11 +99,11 @@ def test_mixed_autoround_format(self): "fc1": {"bits": 16}, } autoround = AutoRound( - model=self.model_name, + model=tiny_opt_model_path, scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "self.save_dir" @@ -129,18 +119,17 @@ def test_mixed_autoround_format(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") - def test_fallback_regex_for_awq_format(self): - model_name = "facebook/opt-125m" + def test_fallback_regex_for_awq_format(self, tiny_opt_model_path, dataloader): layer_config = { "lm_head": {"bits": 16}, "fc1": {"bits": 16}, } autoround = AutoRound( - model=model_name, + model=tiny_opt_model_path, scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "self.save_dir" @@ -155,18 +144,18 @@ def test_fallback_regex_for_awq_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_ar_format_part_name_hf_loading(self): + def test_mixed_ar_format_part_name_hf_loading(self, tiny_opt_model_path, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 16}, # full name ".*fc1.*": {"bits": 16}, # standard regex } autoround = AutoRound( - model=self.model_name, + model=tiny_opt_model_path, scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "self.save_dir" @@ -216,18 +205,19 @@ def test_mixed_ar_format_part_name_hf_loading(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def 
test_mixed_MXFP_autoround_format_loading(self): + def test_mixed_MXFP_autoround_format_loading(self, dataloader): layer_config = { "k_proj": {"bits": 8, "act_bits": 8}, "lm_head": {"bits": 16, "act_bits": 16}, "fc1": {"bits": 8, "act_bits": 8}, } + model_path = get_model_path("facebook/opt-125m") autoround = AutoRound( - self.model_name, + model_path, scheme="MXFP4", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -242,21 +232,21 @@ def test_mixed_MXFP_autoround_format_loading(self): result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.32) + assert result["results"]["lambada_openai"]["acc,none"] > 0.32 shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_autoround_format_vllm(self): + def test_mixed_autoround_format_vllm(self, tiny_opt_model_path, dataloader): layer_config = { "self_attn": {"bits": 8}, "lm_head": {"bits": 16}, } autoround = AutoRound( - self.model, - self.tokenizer, + tiny_opt_model_path, scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -285,18 +275,18 @@ def test_mixed_autoround_format_vllm(self): print(f"{prompt}: {generated_text}") shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_llmcompressor_format_vllm(self): + def test_mixed_llmcompressor_format_vllm(self, tiny_opt_model_path, dataloader): layer_config = { "self_attn": {"bits": 16, "act_bits": 16}, "lm_head": {"bits": 16, "act_bits": 16}, "fc1": {"bits": 16, "act_bits": 16}, } autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme="NVFP4", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -323,7 +313,3 @@ def test_mixed_llmcompressor_format_vllm(self): print(f"{prompt}: {generated_text}") assert "!!!"
not in generated_text shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_multiple_card.py b/test/test_cuda/test_multiple_card.py index 5dac584fe..e09975a19 100644 --- a/test/test_cuda/test_multiple_card.py +++ b/test/test_cuda/test_multiple_card.py @@ -1,11 +1,7 @@ import re import shutil -import sys -import unittest - -sys.path.insert(0, "../..") - +import pytest import torch from lm_eval.utils import make_table # pylint: disable=E0401 from transformers import AutoModelForCausalLM, AutoTokenizer @@ -14,6 +10,8 @@ from auto_round.eval.evaluation import simple_evaluate from auto_round.testing_utils import multi_card, require_gptqmodel, require_greater_than_050 +from ..helpers import get_model_path, get_tiny_model + def get_accuracy(data): match = re.search(r"\|acc\s+\|[↑↓]\s+\|\s+([\d.]+)\|", data) @@ -27,15 +25,21 @@ def get_accuracy(data): # import os # os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" - - @classmethod - def tearDownClass(self): - shutil.rmtree(self.save_dir, ignore_errors=True) +class TestAutoRound: + save_dir = "./saved" + tasks = "lambada_openai" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @multi_card @@ -57,10 +61,9 @@ def test_device_map_str(self): shutil.rmtree("./saved", ignore_errors=True) @multi_card - def test_layer_norm(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_layer_norm(self, tiny_opt_model_path): + model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path, torch_dtype=torch.float16) + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path) device_map = {"norm": "cuda:1"} autoround = AutoRound( model, tokenizer, iters=2, device_map=device_map, nsamples=7, seqlen=32, enable_norm_bias_tuning=True @@ -68,10 +71,9 @@ def test_layer_norm(self): autoround.quantize() @multi_card - def test_rms_norm(self): - model_name = "/models/Qwen2-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_rms_norm(self, tiny_qwen_model_path): + model = AutoModelForCausalLM.from_pretrained(tiny_qwen_model_path, torch_dtype=torch.float16) + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_model_path) device_map = {"norm": "cuda:1"} autoround = AutoRound( model, tokenizer, iters=2, device_map=device_map, nsamples=7, seqlen=32, enable_norm_bias_tuning=True @@ -79,10 +81,9 @@ def test_rms_norm(self): autoround.quantize() @multi_card - def test_act_quantization(self): - model_name = "/models/Qwen2-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_act_quantization(self, tiny_qwen_model_path): + model = AutoModelForCausalLM.from_pretrained(tiny_qwen_model_path, torch_dtype=torch.float16) + tokenizer = 
AutoTokenizer.from_pretrained(tiny_qwen_model_path) device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "1"} autoround = AutoRound( model, tokenizer, iters=2, device_map=device_map, nsamples=7, seqlen=32, act_bits=4, act_dynamic=False @@ -91,9 +92,9 @@ def test_act_quantization(self): @multi_card def test_lm_head(self): - model_name = "/models/Qwen2.5-7B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) - tokenizer = AutoTokenizer.from_pretrained(model_name) + model_path = get_model_path("qwen/Qwen2.5-7B-Instruct") + model = get_tiny_model(model_path) + tokenizer = AutoTokenizer.from_pretrained(model_path) device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "1", "lm_head": 1} layer_config = {"lm_head": {"bits": 4}} autoround = AutoRound( @@ -109,10 +110,9 @@ def test_lm_head(self): autoround.quantize() @multi_card - def test_device_map(self): - model_name = "/models/Qwen2-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_device_map(self, tiny_qwen_model_path): + model = AutoModelForCausalLM.from_pretrained(tiny_qwen_model_path, torch_dtype=torch.float16) + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_model_path) device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "cpu"} autoround = AutoRound(model, tokenizer, iters=2, device_map=device_map, nsamples=7, seqlen=32) autoround.quantize() @@ -210,12 +210,11 @@ def test_device_map(self): torch.cuda.empty_cache() @multi_card - def test_device_map_dict(self): + def test_device_map_dict(self, tiny_opt_model_path): device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "1"} bits, group_size, sym = 4, 128, False - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path, torch_dtype=torch.float16, device_map="auto") + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path) autoround = AutoRound( model, tokenizer, @@ -229,9 +228,8 @@ def test_device_map_dict(self): autoround.quantize() # test model_name - model_name = "/models/opt-125m" autoround = AutoRound( - model_name, + tiny_opt_model_path, tokenizer, bits=bits, group_size=group_size, @@ -244,7 +242,7 @@ def test_device_map_dict(self): # test rtn autoround = AutoRound( - model_name, + tiny_opt_model_path, tokenizer, bits=bits, group_size=group_size, @@ -356,29 +354,25 @@ def test_device_map_for_triton(self): @multi_card def test_mllm_device_map(self): - model_name = "/models/Qwen2-VL-2B-Instruct/" + model_name = get_model_path("qwen/Qwen2-VL-2B-Instruct/") from auto_round import AutoRoundMLLM device_map = "0,1" ar = AutoRoundMLLM(model_name, device_map=device_map) - self.assertEqual(ar.device, "cuda:0") - self.assertEqual(ar.device_map, device_map) + assert ar.device == "cuda:0" + assert ar.device_map == device_map device_map = 1 ar = AutoRoundMLLM(ar.model, ar.tokenizer, processor=ar.processor, device_map=device_map) - self.assertEqual(ar.device, "cuda:1") - self.assertEqual(ar.device_map, device_map) + assert ar.device == "cuda:1" + assert ar.device_map == device_map device_map = "auto" ar = AutoRoundMLLM(ar.model, ar.tokenizer, processor=ar.processor, device_map=device_map) - self.assertEqual(ar.device, "cuda") - 
self.assertEqual(ar.device_map, device_map) + assert ar.device == "cuda" + assert ar.device_map == device_map device_map = {"model.language_model.layers": 0, "model.visual.blocks": 1} ar = AutoRoundMLLM(ar.model, ar.tokenizer, processor=ar.processor, device_map=device_map) - self.assertEqual(ar.model.model.language_model.layers[0].self_attn.q_proj.tuning_device, "cuda:0") - self.assertEqual(ar.model.model.visual.blocks[0].mlp.fc1.tuning_device, "cuda:1") - - -if __name__ == "__main__": - unittest.main() + assert ar.model.model.language_model.layers[0].self_attn.q_proj.tuning_device == "cuda:0" + assert ar.model.model.visual.blocks[0].mlp.fc1.tuning_device == "cuda:1" diff --git a/test/test_cuda/test_multiple_card_calib.py b/test/test_cuda/test_multiple_card_calib.py index 8d97046fb..fedb3f328 100644 --- a/test/test_cuda/test_multiple_card_calib.py +++ b/test/test_cuda/test_multiple_card_calib.py @@ -2,9 +2,8 @@ import re import shutil import sys -import unittest -sys.path.insert(0, "../..") +import pytest from auto_round.testing_utils import multi_card @@ -19,14 +18,20 @@ def get_accuracy(data): return 0.0 -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" +class TestAutoRound: + save_dir = "./saved" + tasks = "lambada_openai" - @classmethod - def tearDownClass(self): + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -36,7 +41,7 @@ def test_multiple_card_calib(self): ##test llm script res = os.system( - f"cd ../.. && {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --iters 1 --nsamples 1 --output_dir None" + f"cd .. && {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --iters 1 --nsamples 1 --output_dir None" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" @@ -47,11 +52,7 @@ def test_multiple_card_nvfp4(self): ##test llm script res = os.system( - f"cd ../.. && {python_path} -m auto_round --model facebook/opt-125m --scheme NVFP4 --devices '0,1' --iters 1 --nsamples 1 --enable_torch_compile --low_gpu_mem_usage" + f"cd .. 
&& {python_path} -m auto_round --model facebook/opt-125m --scheme NVFP4 --devices '0,1' --iters 1 --nsamples 1 --enable_torch_compile --low_gpu_mem_usage" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_mxfp_and_nvfp_quant.py b/test/test_cuda/test_mxfp_and_nvfp_quant.py index 0dc43b093..808fa4a28 100644 --- a/test/test_cuda/test_mxfp_and_nvfp_quant.py +++ b/test/test_cuda/test_mxfp_and_nvfp_quant.py @@ -12,6 +12,8 @@ from auto_round.export.export_to_autoround import qlinear_fp as ar_qlinear_fp from auto_round.testing_utils import has_module +from ..helpers import get_model_path + testing_schemes = [AutoRoundFormat.MXFP8.value, AutoRoundFormat.MXFP4.value, AutoRoundFormat.NVFP4.value] QMODULE_MAPPING = { AutoRoundFormat.MXFP8.value: ar_qmodules.MXFP8QuantLinear, @@ -22,15 +24,14 @@ @pytest.mark.parametrize("scheme", testing_schemes) @torch.inference_mode() -def test_e2e_quant_and_infer(scheme): +def test_e2e_quant_and_infer(scheme, tiny_qwen_model_path): # Use a temporary directory for saving the quantized model with tempfile.TemporaryDirectory() as temp_dir: - model_name = "Qwen/Qwen2.5-0.5B-Instruct" # Load the tokenizer and model - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_model_path, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained( - model_name, + tiny_qwen_model_path, device_map="cpu", torch_dtype="auto", trust_remote_code=True, diff --git a/test/test_cuda/test_mxfp_nvfp.py b/test/test_cuda/test_mxfp_nvfp.py index 552016f17..41c996b95 100644 --- a/test/test_cuda/test_mxfp_nvfp.py +++ b/test/test_cuda/test_mxfp_nvfp.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer @@ -11,37 +9,34 @@ from auto_round import AutoRound from auto_round.testing_utils import require_awq, require_optimum +from ..helpers import get_model_path, get_tiny_model -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +class TestAutoRound: + save_dir = "./saved" + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "facebook/opt-125m" - self.save_dir = "./saved" - self.llm_dataloader = LLMDataLoader() + # Yield to hand control to the test methods + yield - @classmethod - def tearDownClass(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_fp8input_mxfp4_llmcompressor_format(self): - model_name = "/models/Qwen3-0.6B-FP8" + def test_fp8input_mxfp4_llmcompressor_format(self, dataloader): + model_name = get_model_path("qwen/Qwen3-0.6B-FP8") scheme = "mxfp4" ar = AutoRound( model=model_name, iters=2, seqlen=2, scheme=scheme, - dataset=self.llm_dataloader, + dataset=dataloader, ) compressed_model, _ = ar.quantize_and_save(output_dir=self.save_dir, format="llm_compressor") tmp_layer = compressed_model.model.layers[3].self_attn.q_proj @@ -59,18 +54,18 @@ def 
test_fp8input_mxfp4_llmcompressor_format(self): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree(self.save_dir, ignore_errors=True) - def test_nvfp4_llmcompressor_format(self): + def test_nvfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): scheme = "nvfp4" autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_global_scale") @@ -110,38 +105,37 @@ def test_nvfp4_llmcompressor_format(self): # if "France" in prompt: # assert "Paris" in generated_text - def test_nvfp4_moe_actmax_rtn(self): - model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" + def test_nvfp4_moe_actmax_rtn(self, tiny_deepseek_v2_model_path, dataloader): + # model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" scheme = "nvfp4" autoround = AutoRound( - model_name, + tiny_deepseek_v2_model_path, scheme=scheme, iters=0, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = self.save_dir autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") - def test_nvfp4_moe_actmax_ar(self): - model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" + def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader): scheme = "nvfp4" autoround = AutoRound( - model_name, + tiny_deepseek_v2_model_path, scheme=scheme, iters=1, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = self.save_dir autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") - def test_qwen_moe_quant_infer(self): - model_name = "/models/Qwen1.5-MoE-A2.7B" + def test_qwen_moe_quant_infer(self, dataloader): + model_name = get_model_path("qwen/Qwen1.5-MoE-A2.7B") layer_config = { "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, } @@ -152,7 +146,7 @@ def test_qwen_moe_quant_infer(self): iters=1, seqlen=3, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -163,9 +157,5 @@ def test_qwen_moe_quant_infer(self): result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa") print(result["results"]["piqa"]["acc,none"]) - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.49) + assert result["results"]["piqa"]["acc,none"] > 0.49 shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_qbits.py b/test/test_cuda/test_qbits.py index d73d474d6..37e119b2c 100644 --- a/test/test_cuda/test_qbits.py +++ b/test/test_cuda/test_qbits.py @@ -1,48 +1,28 @@ import shutil -import sys -import unittest - -sys.path.insert(0, "../..") +import pytest from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundConfig from auto_round.testing_utils import require_gptqmodel, require_itrex +from ..helpers import get_model_path, model_infer -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = 
"/models/opt-125m" - self.save_folder = "./saved" - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] +class TestAutoRound: + save_dir = "./saved" - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) + # Yield to hand control to the test methods + yield - @classmethod - def tearDownClass(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) ## require torch 2.6 @@ -58,7 +38,7 @@ def test_load_gptq_model_8bits(self): quantization_config=quantization_config, ) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) @require_itrex def test_load_gptq_model_2bits(self): @@ -72,12 +52,13 @@ def test_load_gptq_model_2bits(self): quantization_config=quantization_config, ) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) @require_itrex def test_mixed_precision(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_path = get_model_path("facebook/opt-125m") + model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) layer_config = {} layer_config["model.decoder.layers.0.self_attn.k_proj"] = {"bits": 8} @@ -90,27 +71,29 @@ def test_mixed_precision(self): autoround = AutoRound( model, tokenizer, bits=bits, group_size=group_size, iters=1, nsamples=1, sym=sym, layer_config=layer_config ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, + self.save_dir, torch_dtype=torch.float16, device_map="cpu", ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) print(res) assert "!!!" 
not in res - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_gptqmodel - def test_autoround_sym(self): + def test_autoround_sym(self, tiny_opt_model_path): for bits in [4]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained( + tiny_opt_model_path, torch_dtype="auto", trust_remote_code=True + ) + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path, trust_remote_code=True) bits, group_size, sym = bits, 128, True autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2) quantized_model_path = "./saved" @@ -126,4 +109,4 @@ def test_autoround_sym(self): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) print(res) assert "!!!" not in res - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cuda/test_scheme.py b/test/test_cuda/test_scheme.py index 1c603c7ed..2ed5527bd 100644 --- a/test/test_cuda/test_scheme.py +++ b/test/test_cuda/test_scheme.py @@ -1,103 +1,104 @@ import shutil -import sys -import unittest +import pytest + +from auto_round import AutoRound from auto_round.schemes import QuantizationScheme -sys.path.insert(0, "../..") +from ..helpers import get_model_path -from auto_round import AutoRound +class TestAutoRound: + save_dir = "./saved" -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - @classmethod - def tearDownClass(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) # Tuning tests - def test_gguf(self): - ar = AutoRound("/models/Qwen3-0.6B", scheme="W2A16", nsamples=1, iters=1) - ar.quantize_and_save(self.save_folder, format="gguf:q4_k_m") - self.assertEqual(ar.bits, 4) - shutil.rmtree(self.save_folder, ignore_errors=True) - - def test_w4a16(self): - ar = AutoRound(self.model_name, scheme="W4A16", nsamples=1, iters=1) - self.assertEqual(ar.bits, 4) + def test_gguf(self, tiny_qwen_model_path): + ar = AutoRound(tiny_qwen_model_path, scheme="W2A16", nsamples=1, iters=1) + ar.quantize_and_save(self.save_dir, format="gguf:q4_k_m") + assert ar.bits == 4 + shutil.rmtree(self.save_dir, ignore_errors=True) + + def test_w4a16(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1) + assert ar.bits == 4 ar.quantize() - def test_w2a16(self): - ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=1) - self.assertEqual(ar.bits, 2) + def test_w2a16(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=1) + assert ar.bits == 2 ar.quantize() - def test_mxfp4(self): - ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=1) - self.assertEqual(ar.bits, 4) - self.assertEqual(ar.act_bits, 4) - self.assertEqual(ar.data_type, "mx_fp") - self.assertEqual(ar.act_data_type, 
"mx_fp_rceil") + def test_mxfp4(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1) + assert ar.bits == 4 + assert ar.act_bits == 4 + assert ar.data_type == "mx_fp" + assert ar.act_data_type == "mx_fp_rceil" ar.quantize() - def test_fp8_static(self): - ar = AutoRound(self.model_name, scheme="FP8_STATIC", nsamples=1, iters=1) - self.assertEqual(ar.bits, 8) - self.assertEqual(ar.act_bits, 8) - self.assertEqual(ar.data_type, "fp") - self.assertEqual(ar.act_data_type, "fp") - self.assertEqual(ar.group_size, -1) - self.assertEqual(ar.act_dynamic, False) + def test_fp8_static(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="FP8_STATIC", nsamples=1, iters=1) + assert ar.bits == 8 + assert ar.act_bits == 8 + assert ar.data_type == "fp" + assert ar.act_data_type == "fp" + assert ar.group_size == -1 + assert ar.act_dynamic is False ar.quantize() ## RTN tests - def test_w2a16_rtn(self): - ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=0) - self.assertEqual(ar.bits, 2) + def test_w2a16_rtn(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0) + assert ar.bits == 2 ar.quantize() - def test_mxfp4_rtn(self): - ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=0) - self.assertEqual(ar.bits, 4) - self.assertEqual(ar.act_bits, 4) - self.assertEqual(ar.data_type, "mx_fp") - self.assertEqual(ar.act_data_type, "mx_fp_rceil") + def test_mxfp4_rtn(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=0) + assert ar.bits == 4 + assert ar.act_bits == 4 + assert ar.data_type == "mx_fp" + assert ar.act_data_type == "mx_fp_rceil" ar.quantize() - def test_fp8_static_rtn(self): - ar = AutoRound(self.model_name, scheme="FP8_STATIC", nsamples=1, iters=0) - self.assertEqual(ar.bits, 8) - self.assertEqual(ar.act_bits, 8) - self.assertEqual(ar.data_type, "fp") - self.assertEqual(ar.act_data_type, "fp") - self.assertEqual(ar.group_size, -1) - self.assertEqual(ar.act_dynamic, False) + def test_fp8_static_rtn(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="FP8_STATIC", nsamples=1, iters=0) + assert ar.bits == 8 + assert ar.act_bits == 8 + assert ar.data_type == "fp" + assert ar.act_data_type == "fp" + assert ar.group_size == -1 + assert ar.act_dynamic is False ar.quantize() def test_scheme_in_layer_config(self): + model_path = get_model_path("facebook/opt-125m") layer_config = { "model.decoder.layers.2.self_attn": {"bits": 2}, "model.decoder.layers.3.self_attn.v_proj": "W8A16", "model.decoder.layers.4.self_attn.k_proj": QuantizationScheme.from_dict({"group_size": 64}), } - ar = AutoRound(self.model_name, scheme="W3A16", nsamples=1, iters=1, layer_config=layer_config) + ar = AutoRound(model_path, scheme="W3A16", nsamples=1, iters=1, layer_config=layer_config) ar.quantize() for n, m in ar.model.named_modules(): if n == "model.decoder.layers.2.self_attn.q_proj": - self.assertEqual(m.bits, 2) + assert m.bits == 2 if n == "model.decoder.layers.2.self_attn.k_proj": - self.assertEqual(m.bits, 2) + assert m.bits == 2 if n == "model.decoder.layers.3.self_attn.v_proj": - self.assertEqual(m.bits, 8) + assert m.bits == 8 if n == "model.decoder.layers.4.self_attn.k_proj": - self.assertEqual(m.group_size, 64) - - -if __name__ == "__main__": - unittest.main() + assert m.group_size == 64 diff --git a/test/test_cuda/test_support_vlms.py b/test/test_cuda/test_support_vlms.py index 5a2759021..3358c8226 100644 --- 
a/test/test_cuda/test_support_vlms.py +++ b/test/test_cuda/test_support_vlms.py @@ -1,10 +1,8 @@ import os import shutil import sys -import unittest - -sys.path.insert(0, "../..") +import pytest import requests from PIL import Image @@ -12,15 +10,15 @@ from auto_round.testing_utils import require_gptqmodel, require_package_version_ut, require_vlm_env -class TestSupportVLMS(unittest.TestCase): +class TestSupportVLMS: @classmethod - def setUpClass(self): + def setup_class(self): self.save_dir = os.path.join(os.path.dirname(__file__), "ut_saved") self.python_path = sys.executable self.device = 0 @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_dir, ignore_errors=True) @require_gptqmodel @@ -28,10 +26,10 @@ def test_qwen2(self): model_path = "/models/Qwen2-VL-2B-Instruct/" # test tune res = os.system( - f"cd ../.. && {self.python_path} -m auto_round --mllm " + f"cd .. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}" ) - self.assertFalse(res > 0 or res == -1, msg="qwen2 tuning fail") + assert not (res > 0 or res == -1), "qwen2 tuning fail" # test infer quantized_model_path = os.path.join(self.save_dir, "Qwen2-VL-2B-Instruct-w4g128") @@ -83,10 +81,10 @@ def test_phi3(self): model_path = "/models/Phi-3.5-vision-instruct/" ## test tune res = os.system( - f"cd ../.. && {self.python_path} -m auto_round --mllm " + f"cd .. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}" ) - self.assertFalse(res > 0 or res == -1, msg="Phi-3.5 tuning fail") + assert not (res > 0 or res == -1), "Phi-3.5 tuning fail" ## test infer from transformers import AutoModelForCausalLM, AutoProcessor @@ -131,12 +129,12 @@ def test_phi3_vision_awq(self): model_path = "/models/Phi-3.5-vision-instruct/" ## test tune res = os.system( - f"cd ../.. && {self.python_path} -m auto_round --mllm " + f"cd .. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --quant_nontext_module " f"--nsample 64 --seqlen 32 " f"--format auto_awq --output_dir {self.save_dir} --device {self.device}" ) - self.assertFalse(res > 0 or res == -1, msg="Phi-3.5 tuning fail") + assert not (res > 0 or res == -1), "Phi-3.5 tuning fail" ## test infer from transformers import AutoModelForCausalLM, AutoProcessor @@ -179,20 +177,16 @@ def test_glm(self): model_path = "/models/glm-4v-9b/" ## test tune res = os.system( - f"cd ../.. && {self.python_path} -m auto_round " + f"cd .. && {self.python_path} -m auto_round " f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}" ) - self.assertFalse(res > 0 or res == -1, msg="glm-4v-9b tuning fail") + assert not (res > 0 or res == -1), "glm-4v-9b tuning fail" def test_granite_vision(self): model_path = "/models/granite-vision-3.2-2b" ## test tune res = os.system( - f"cd ../.. && {self.python_path} -m auto_round " + f"cd .. 
&& {self.python_path} -m auto_round " f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}" ) - self.assertFalse(res > 0 or res == -1, msg="granite-vision-3.2-2b tuning fail") - - -if __name__ == "__main__": - unittest.main() + assert not (res > 0 or res == -1), "granite-vision-3.2-2b tuning fail" diff --git a/test/test_cuda/test_torch_backend.py b/test/test_cuda/test_torch_backend.py index 3f7cb4141..a7eb30552 100644 --- a/test/test_cuda/test_torch_backend.py +++ b/test/test_cuda/test_torch_backend.py @@ -1,12 +1,6 @@ import shutil -import sys -import unittest import pytest - -sys.path.insert(0, "../..") - - import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -14,58 +8,30 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel +from ..helpers import get_model_path, model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRoundTorchBackend(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() +class TestAutoRoundTorchBackend: - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] + save_dir = "./saved" - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] + # Yield to hand control to the test methods + yield - @classmethod - def tearDownClass(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_torch_4bits_asym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_torch_4bits_asym(self, dataloader): + model_path = get_model_path("facebook/opt-125m") + model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( model, @@ -75,9 +41,9 @@ def test_torch_4bits_asym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir 
autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") quantization_config = AutoRoundConfig(backend="torch") @@ -85,28 +51,29 @@ def test_torch_4bits_asym(self): quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) - def test_torch_4bits_sym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_torch_4bits_sym(self, dataloader): + model_path = get_model_path("facebook/opt-125m") + model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( model, @@ -116,9 +83,9 @@ def test_torch_4bits_sym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model quantization_config = AutoRoundConfig(backend="torch") @@ -126,14 +93,10 @@ def test_torch_4bits_sym(self): quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28) + assert result["results"]["lambada_openai"]["acc,none"] > 0.28 torch.cuda.empty_cache() - shutil.rmtree(self.save_folder, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() + shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cuda/test_transformers.py b/test/test_cuda/test_transformers.py index 6f953339d..f37fe94ff 100644 --- 
a/test/test_cuda/test_transformers.py +++ b/test/test_cuda/test_transformers.py @@ -14,8 +14,8 @@ import gc import os import tempfile -import unittest +import pytest from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from transformers.testing_utils import ( require_accelerate, @@ -27,6 +27,8 @@ ) from transformers.utils import is_torch_available +from ..helpers import get_model_path + if is_torch_available(): import torch @@ -34,7 +36,7 @@ # @slow @require_torch_gpu @require_accelerate -class AutoRoundTest(unittest.TestCase): +class AutoRoundTest: model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc" input_text = "There is a girl who likes adventure," EXPECTED_OUTPUTS = set() @@ -53,7 +55,7 @@ class AutoRoundTest(unittest.TestCase): # called only once for all test in this class @classmethod - def setUpClass(cls): + def setup_class(cls): """ Setup quantized model """ @@ -74,12 +76,12 @@ def test_quantized_model(self): """ input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) output = self.quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) - self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + assert self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS - def test_raise_if_non_quantized(self): - model_id = "facebook/opt-125m" + def test_raise_if_non_quantized(self, tiny_opt_model_path): + model_id = tiny_opt_model_path quantization_config = AutoRoundConfig(bits=4) - with self.assertRaises(ValueError): + with pytest.raises(ValueError): _ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config) def test_quantized_model_bf16(self): @@ -96,7 +98,7 @@ def test_quantized_model_bf16(self): ) output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) - self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + assert self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS @require_intel_extension_for_pytorch def test_quantized_model_on_cpu(self): @@ -108,7 +110,7 @@ def test_quantized_model_on_cpu(self): quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto") output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) - self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + assert self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS def test_save_pretrained(self): """ @@ -131,7 +133,7 @@ def test_save_pretrained(self): input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) output = model.generate(**input_ids, max_new_tokens=40, do_sample=False) - self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + assert self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS @require_torch_multi_gpu def test_quantized_model_multi_gpu(self): @@ -144,7 +146,7 @@ def test_quantized_model_multi_gpu(self): ) input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(quantized_model.device) output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) - self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + assert self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS def test_convert_from_gptq(self): """ @@ 
-185,7 +187,7 @@ def test_mixed_bits(self): """ Simple test that checks if auto-round work properly with mixed bits """ - model_name = "facebook/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) layer_config = { @@ -203,7 +205,3 @@ def test_mixed_bits(self): text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_triton_backend.py b/test/test_cuda/test_triton_backend.py index 7cbc8719d..ac5436f47 100644 --- a/test/test_cuda/test_triton_backend.py +++ b/test/test_cuda/test_triton_backend.py @@ -1,8 +1,6 @@ import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -10,56 +8,22 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_greater_than_050 +from ..helpers import model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRoundTritonBackend(unittest.TestCase): +class TestAutoRoundTritonBackend: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/models/opt-125m" self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_greater_than_050 - def test_tritonv2_4bits_asym(self): + def test_tritonv2_4bits_asym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False @@ -71,7 +35,7 @@ def test_tritonv2_4bits_asym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") @@ -82,10 +46,10 @@ def test_tritonv2_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") 
print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.34) + assert result["results"]["lambada_openai"]["acc,none"] > 0.34 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -93,10 +57,10 @@ def test_tritonv2_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.34) + assert result["results"]["lambada_openai"]["acc,none"] > 0.34 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ -115,10 +79,10 @@ def test_tritonv2_2bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.19) + assert result["results"]["lambada_openai"]["acc,none"] > 0.19 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -126,15 +90,15 @@ def test_tritonv2_2bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.19) + assert result["results"]["lambada_openai"]["acc,none"] > 0.19 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @require_greater_than_050 - def test_tritonv2_4bits_sym(self): + def test_tritonv2_4bits_sym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -146,7 +110,7 @@ def test_tritonv2_4bits_sym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path) @@ -157,10 +121,10 @@ def test_tritonv2_4bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.26) + assert result["results"]["lambada_openai"]["acc,none"] > 0.26 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -168,10 +132,10 @@ def test_tritonv2_4bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.26) + assert result["results"]["lambada_openai"]["acc,none"] > 0.26 torch.cuda.empty_cache() 
shutil.rmtree("./saved", ignore_errors=True) @@ -191,10 +155,10 @@ def test_tritonv2_8bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) + assert result["results"]["lambada_openai"]["acc,none"] > 0.27 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -202,10 +166,10 @@ def test_tritonv2_8bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) + assert result["results"]["lambada_openai"]["acc,none"] > 0.27 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ -230,10 +194,10 @@ def test_tritonv2_2bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.18) + assert result["results"]["lambada_openai"]["acc,none"] > 0.18 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -241,13 +205,9 @@ def test_tritonv2_2bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.18) + assert result["results"]["lambada_openai"]["acc,none"] > 0.18 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_vlms.py b/test/test_cuda/test_vlms.py index d06c48ff5..c8a4adb53 100644 --- a/test/test_cuda/test_vlms.py +++ b/test/test_cuda/test_vlms.py @@ -2,26 +2,22 @@ import os import re import shutil -import sys -import unittest +import pytest import requests - -sys.path.insert(0, "../..") - from PIL import Image from auto_round import AutoRoundConfig from auto_round.testing_utils import require_gptqmodel, require_optimum, require_vlm_env -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.save_dir = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_dir, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -123,12 +119,12 @@ def test_mm_block_name(self): model = MllamaForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True, device_map="auto") block_name = get_block_names(model, quant_vision=True) - self.assertTrue(len(block_name) == 3) - self.assertTrue(any(["vision_model.global_transformer.layers.0" not in n for n in block_name])) - self.assertTrue(any(["vision_model.transformer.layers.0" not in n for n in block_name])) + assert len(block_name) == 3 + assert 
any(["vision_model.global_transformer.layers.0" not in n for n in block_name]) + assert any(["vision_model.transformer.layers.0" not in n for n in block_name]) block_name = get_block_names(model, quant_vision=False) - self.assertTrue(len(block_name) == 1) - self.assertTrue(get_block_names(model) == block_name) + assert len(block_name) == 1 + assert get_block_names(model) == block_name def test_mllm_detect(self): from auto_round.utils import is_mllm_model, llm_load_model, mllm_load_model @@ -144,18 +140,14 @@ def test_mllm_detect(self): "/models/InternVL3-1B", "/models/pixtral-12b", ]: - self.assertTrue(is_mllm_model(model_name)) + assert is_mllm_model(model_name) try: model, _, _, _ = mllm_load_model(model_name) except: continue - self.assertTrue(is_mllm_model(model)) + assert is_mllm_model(model) for model_name in ["/models/glm-4-9b-chat", "/models/Qwen2.5-1.5B-Instruct/"]: - self.assertFalse(is_mllm_model(model_name)) + assert not is_mllm_model(model_name) model, _ = llm_load_model(model_name) - self.assertFalse(is_mllm_model(model)) - - -if __name__ == "__main__": - unittest.main() + assert not is_mllm_model(model) diff --git a/test/test_hpu/__init__.py b/test/test_hpu/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_hpu/_test_helpers.py b/test/test_hpu/_test_helpers.py deleted file mode 100644 index 48e8398d7..000000000 --- a/test/test_hpu/_test_helpers.py +++ /dev/null @@ -1,43 +0,0 @@ -import pytest - - -def is_pytest_mode_compile(): - return pytest.mode == "compile" - - -def is_pytest_mode_lazy(): - return pytest.mode == "lazy" - - -def model_infer(model, tokenizer, apply_chat_template=False): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - if apply_chat_template: - texts = [] - for prompt in prompts: - messages = [{"role": "user", "content": prompt}] - text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - texts.append(text) - prompts = texts - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] diff --git a/test/test_hpu/test_auto_round.py b/test/test_hpu/test_auto_round.py index 2bb7983e5..d2e33dd03 100644 --- a/test/test_hpu/test_auto_round.py +++ b/test/test_hpu/test_auto_round.py @@ -1,16 +1,17 @@ import pytest import torch -from _test_helpers import is_pytest_mode_compile, is_pytest_mode_lazy from auto_round.utils import is_hpex_available +from ..helpers import get_model_path, is_pytest_mode_compile, is_pytest_mode_lazy + def run_opt_125m_on_hpu(): from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound - model_name = "facebook/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -55,7 
+56,7 @@ def test_w4a8(data_type): from auto_round import AutoRound - model_name = "facebook/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="auto", diff --git a/test/test_hpu/test_inference.py b/test/test_hpu/test_inference.py index e0a0ef321..95c680c2d 100644 --- a/test/test_hpu/test_inference.py +++ b/test/test_hpu/test_inference.py @@ -1,23 +1,12 @@ import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - def is_hpex_available(): try: import habana_frameworks.torch.core as htcore # pylint: disable=E0401 @@ -28,16 +17,15 @@ def is_hpex_available(): # TODO: This test case is temporarily commented out since it not tested for a long time. We need to add it back and change it into pytest format. -# class TestAutoRound(unittest.TestCase): +# class TestAutoRound: # @classmethod -# def setUpClass(self): +# def setup_class(self): # model_name = "facebook/opt-125m" # self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) # self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) -# self.llm_dataloader = LLMDataLoader() # @classmethod -# def tearDownClass(self): +# def teardown_class(self): # shutil.rmtree("./saved", ignore_errors=True) # shutil.rmtree("runs", ignore_errors=True) @@ -57,7 +45,7 @@ def is_hpex_available(): # sym=sym, # iters=2, # seqlen=2, -# dataset=self.llm_dataloader, +# dataset=dataloader, # ) # autoround.quantize() # quantized_model_path = "./saved" @@ -86,7 +74,7 @@ def is_hpex_available(): # sym=sym, # iters=2, # seqlen=2, -# dataset=self.llm_dataloader, +# dataset=dataloader, # ) # autoround.quantize() # quantized_model_path = "./saved" diff --git a/test/test_xpu/__init__.py b/test/test_xpu/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py index 8052a8af0..d857e3bdc 100644 --- a/test/test_xpu/test_autoround.py +++ b/test/test_xpu/test_autoround.py @@ -1,39 +1,29 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundConfig +from ..helpers import get_model_path -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(3): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRoundXPU(unittest.TestCase): +class TestAutoRoundXPU: @classmethod - def setUpClass(self): - - self.llm_dataloader = LLMDataLoader() + def setup_class(self): + pass @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) pass - def test_gptq_format(self): - model_name = "facebook/opt-125m" + def test_gptq_format(self, dataloader): + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="auto", trust_remote_code=True, device_map="auto" ) @@ -48,7 +38,7 @@ def test_gptq_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) 
         quantized_model_path = "./saved"
         autoround.quantize_and_save(output_dir=quantized_model_path)
@@ -65,8 +55,8 @@ def test_gptq_format(self):
         print(res)
         assert "!!!" not in res

-    def test_awq_format(self):
-        model_name = "facebook/opt-125m"
+    def test_awq_format(self, dataloader):
+        model_name = get_model_path("facebook/opt-125m")
         model = AutoModelForCausalLM.from_pretrained(
             model_name, torch_dtype="auto", trust_remote_code=True, device_map="xpu"
         )
@@ -80,7 +70,7 @@ def test_awq_format(self):
             sym=sym,
             iters=2,
             seqlen=2,
-            dataset=self.llm_dataloader,
+            dataset=dataloader,
         )
         quantized_model_path = "./saved"
         autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq")
@@ -97,7 +87,3 @@ def test_awq_format(self):
         res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])
         print(res)
         assert "!!!" not in res
-
-
-if __name__ == "__main__":
-    unittest.main()
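Reviewer note (not part of the patch): the relocated tests import `get_model_path`, `get_tiny_model`, `model_infer`, `is_pytest_mode_compile`/`is_pytest_mode_lazy` from a shared `test/helpers.py` and rely on `conftest.py` fixtures (`dataloader`, `tiny_opt_model_path`, `tiny_qwen_model_path`, `tiny_deepseek_v2_model_path`, ...) that do not appear in this part of the diff. The sketch below is only an illustration of what those shared pieces might look like: `model_infer`, the pytest-mode helpers, and the dummy dataloader mirror the per-file copies deleted above, while `get_model_path`'s local-mirror fallback and the `tiny_opt_model_path` fixture are assumptions, not code from this patch.

# --- Illustrative sketch only; assumed layout, not part of the patch. ---
# test/helpers.py
import os

import pytest
import torch


def is_pytest_mode_compile():
    # mirrors the deleted test_hpu/_test_helpers.py; pytest.mode is set by the repo's conftest
    return pytest.mode == "compile"


def is_pytest_mode_lazy():
    return pytest.mode == "lazy"


def get_model_path(model_id: str) -> str:
    # Assumption: prefer a local mirror under /models, otherwise fall back to the hub id.
    local = os.path.join("/models", os.path.basename(model_id.rstrip("/")))
    return local if os.path.exists(local) else model_id


def model_infer(model, tokenizer):
    # Greedy 5-token generation over a fixed prompt; mirrors the per-file helpers removed above.
    prompts = ["Hello,my name is"]
    inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True)
    outputs = model.generate(
        input_ids=inputs["input_ids"].to(model.device),
        attention_mask=inputs["attention_mask"].to(model.device),
        do_sample=False,
        max_new_tokens=5,
    )
    generated = [out[len(inp):] for inp, out in zip(inputs["input_ids"], outputs)]
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    for prompt, text in zip(prompts, decoded):
        print(f"Prompt: {prompt}")
        print(f"Generated: {text}")
        print("-" * 50)
    return decoded[0]


# test/conftest.py
class _DummyDataLoader:
    # Two dummy calibration batches; mirrors the removed LLMDataLoader classes.
    def __init__(self):
        self.batch_size = 1

    def __iter__(self):
        for _ in range(2):
            yield torch.ones([1, 10], dtype=torch.long)


@pytest.fixture
def dataloader():
    return _DummyDataLoader()


@pytest.fixture
def tiny_opt_model_path():
    # Assumed fixture: the real one may instead materialize a shrunken opt-125m checkpoint.
    return get_model_path("facebook/opt-125m")

The relative imports in the moved tests (`from ..helpers import ...`) imply that `test/` is treated as a package with one shared helpers module, which is why the per-directory `_test_helpers.py` and the duplicated `LLMDataLoader`/`model_infer` definitions could be deleted in this patch.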