diff --git a/.azure-pipelines/scripts/ut/run_ut.sh b/.azure-pipelines/scripts/ut/run_ut.sh index dcf1a7170..e7d3d9e00 100644 --- a/.azure-pipelines/scripts/ut/run_ut.sh +++ b/.azure-pipelines/scripts/ut/run_ut.sh @@ -19,8 +19,7 @@ cd /auto-round && uv pip install . echo "##[endgroup]" uv pip list -cd /auto-round/test/test_cpu || exit 1 -find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + +cd /auto-round/test || exit 1 export LD_LIBRARY_PATH=${HOME}/.venv/lib/:$LD_LIBRARY_PATH export FORCE_BF16=1 @@ -32,7 +31,7 @@ mkdir -p ${LOG_DIR} ut_log_name=${LOG_DIR}/ut.log # Split test files into 5 parts -find . -name "test*.py" | sort > all_tests.txt +find ./test_cpu -name "test*.py" | sort > all_tests.txt total_lines=$(wc -l < all_tests.txt) NUM_CHUNKS=5 q=$(( total_lines / NUM_CHUNKS )) diff --git a/.azure-pipelines/scripts/ut/run_ut_cuda.sh b/.azure-pipelines/scripts/ut/run_ut_cuda.sh index 18a9bb00d..0f111d3fa 100644 --- a/.azure-pipelines/scripts/ut/run_ut_cuda.sh +++ b/.azure-pipelines/scripts/ut/run_ut_cuda.sh @@ -27,16 +27,14 @@ function create_conda_env() { # install AutoRound cd ${REPO_PATH} - pip uninstall auto-round -y + uv pip install torch==2.8.0 torchvision uv pip install -r requirements.txt - sed -i '/^torch==/d;/^transformers==/d;/^lm-eval==/d' requirements.txt if [ -d "/proc/driver/nvidia" ]; then export PATH=/usr/local/cuda/bin${PATH:+:${PATH}} export LD_LIBRARY_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/nvidia/nvjitlink/lib:$LD_LIBRARY_PATH fi uv pip install --no-build-isolation . uv pip install pytest-cov pytest-html cmake==4.0.2 - uv pip install torch==2.8.0 torchvision } function print_test_results_table() { @@ -92,7 +90,7 @@ function run_unit_test() { # install unit test dependencies create_conda_env - cd ${REPO_PATH}/test/test_cuda + cd ${REPO_PATH}/test rm -rf .coverage* *.xml *.html uv pip install -v git+https://github.com/casper-hansen/AutoAWQ.git --no-build-isolation @@ -100,15 +98,15 @@ function run_unit_test() { uv pip install -r https://raw.githubusercontent.com/ModelCloud/GPTQModel/refs/heads/main/requirements.txt CMAKE_ARGS="-DGGML_CUDA=on -DLLAVA_BUILD=off" uv pip install llama-cpp-python uv pip install 'git+https://github.com/ggml-org/llama.cpp.git#subdirectory=gguf-py' - uv pip install -r requirements.txt - uv pip install -r requirements_diffusion.txt + uv pip install -r test_cuda/requirements.txt + uv pip install -r test_cuda/requirements_diffusion.txt pip list > ${LOG_DIR}/ut_pip_list.txt export COVERAGE_RCFILE=${REPO_PATH}/.azure-pipelines/scripts/ut/.coverage local auto_round_path=$(python -c 'import auto_round; print(auto_round.__path__[0])') # run unit tests individually with separate logs - for test_file in $(find . -name "test_*.py" ! -name "test_*vlms.py" ! -name "test_llmc*.py" | sort); do + for test_file in $(find ./test_cuda -name "test_*.py" ! -name "test_*vlms.py" ! -name "test_llmc*.py" | sort); do local test_basename=$(basename ${test_file} .py) local ut_log_name=${LOG_DIR}/unittest_cuda_${test_basename}.log echo "Running ${test_file}..." 
@@ -128,7 +126,7 @@ function run_unit_test() { function run_unit_test_vlm() { # install unit test dependencies create_conda_env - cd ${REPO_PATH}/test/test_cuda + cd ${REPO_PATH}/test rm -rf .coverage* *.xml *.html uv pip install git+https://github.com/haotian-liu/LLaVA.git@v1.2.2 --no-deps @@ -138,14 +136,14 @@ function run_unit_test_vlm() { uv pip install git+https://github.com/deepseek-ai/DeepSeek-VL2.git timm attrdict --no-deps uv pip install -v git+https://github.com/casper-hansen/AutoAWQ.git@v0.2.0 --no-build-isolation uv pip install flash-attn==2.7.4.post1 --no-build-isolation - uv pip install -r requirements_vlm.txt + uv pip install -r test_cuda/requirements_vlm.txt pip list > ${LOG_DIR}/vlm_ut_pip_list.txt export COVERAGE_RCFILE=${REPO_PATH}/.azure-pipelines/scripts/ut/.coverage local auto_round_path=$(python -c 'import auto_round; print(auto_round.__path__[0])') # run VLM unit tests individually with separate logs - for test_file in $(find . -name "test*vlms.py"); do + for test_file in $(find ./test_cuda -name "test*vlms.py"); do local test_basename=$(basename ${test_file} .py) local ut_log_name=${LOG_DIR}/unittest_cuda_vlm_${test_basename}.log echo "Running ${test_file}..." @@ -166,17 +164,17 @@ function run_unit_test_llmc() { # install unit test dependencies create_conda_env - cd ${REPO_PATH}/test/test_cuda + cd ${REPO_PATH}/test rm -rf .coverage* *.xml *.html - uv pip install -r requirements_llmc.txt + uv pip install -r test_cuda/requirements_llmc.txt pip list > ${LOG_DIR}/llmc_ut_pip_list.txt export COVERAGE_RCFILE=${REPO_PATH}/.azure-pipelines/scripts/ut/.coverage local auto_round_path=$(python -c 'import auto_round; print(auto_round.__path__[0])') # run unit tests individually with separate logs - for test_file in $(find . -name "test_llmc*.py" | sort); do + for test_file in $(find ./test_cuda -name "test_llmc*.py" | sort); do local test_basename=$(basename ${test_file} .py) local ut_log_name=${LOG_DIR}/unittest_cuda_llmc_${test_basename}.log echo "Running ${test_file}..." diff --git a/.azure-pipelines/scripts/ut/run_ut_hpu.sh b/.azure-pipelines/scripts/ut/run_ut_hpu.sh index 3c3bb6991..b370edfb5 100644 --- a/.azure-pipelines/scripts/ut/run_ut_hpu.sh +++ b/.azure-pipelines/scripts/ut/run_ut_hpu.sh @@ -7,8 +7,7 @@ export TQDM_MININTERVAL=60 pip install pytest-cov pytest-html pip list -cd /auto-round/test/test_hpu || exit 1 -find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + +cd /auto-round/test || exit 1 export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH export FORCE_BF16=1 @@ -19,8 +18,8 @@ LOG_DIR=/auto-round/log_dir mkdir -p ${LOG_DIR} ut_log_name=${LOG_DIR}/ut.log -find . -name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_lazy.sh -find . 
-name "test*.py" | sed "s,\.\/,python -m pytest --mode compile --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_compile.sh +find ./test_hpu -name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_lazy.sh +find ./test_hpu -name "test*.py" | sed "s,\.\/,python -m pytest --mode compile --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_compile.sh cat run_lazy.sh bash run_lazy.sh 2>&1 | tee ${ut_log_name} diff --git a/.azure-pipelines/scripts/ut/run_ut_xpu.sh b/.azure-pipelines/scripts/ut/run_ut_xpu.sh index 2ab0aef64..740937d18 100644 --- a/.azure-pipelines/scripts/ut/run_ut_xpu.sh +++ b/.azure-pipelines/scripts/ut/run_ut_xpu.sh @@ -12,8 +12,7 @@ echo "##[endgroup]" uv pip list # test ark cpu part only before external xpu available -cd /auto-round/test/test_ark || exit 1 -find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + +cd /auto-round/test || exit 1 export LD_LIBRARY_PATH=${HOME}/.venv/lib/:$LD_LIBRARY_PATH export COVERAGE_RCFILE=/auto-round/.azure-pipelines/scripts/ut/.coverage @@ -23,7 +22,7 @@ LOG_DIR=/auto-round/log_dir mkdir -p ${LOG_DIR} ut_log_name=${LOG_DIR}/ut.log -find . -name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run.sh +find ./test_ark -name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run.sh cat run.sh bash run.sh 2>&1 | tee "${ut_log_name}" diff --git a/auto_round/compressors/mllm/compressor.py b/auto_round/compressors/mllm/compressor.py index 6ea6d2cdf..31f97cbe3 100644 --- a/auto_round/compressors/mllm/compressor.py +++ b/auto_round/compressors/mllm/compressor.py @@ -206,6 +206,9 @@ def __init__( if hasattr(model, "name_or_path") and any([name in model.name_or_path for name in MISTRAL_3_2_MODELS]): template = "mistral3_2" if iters > 0: + # TODO: Remove after fixing https://github.com/huggingface/transformers/issues/43005 + model.config.model_type = model.config.to_dict()["model_type"] + if template is None and model.config.model_type not in TEMPLATES: self.template = None else: diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index f4bb15575..38f984663 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -1047,6 +1047,11 @@ def set_module(model, key, new_module): setattr(module, name_list[-1], new_module) +# For getting and setting attribution, such as 'lm_head.weight' +get_attr = get_module +set_attr = set_module + + def get_layer_features(layer): """Extracts input and output feature dimensions for supported layers.""" from auto_round.utils import deepspeed_exists diff --git a/test/README.md b/test/README.md new file mode 100644 index 000000000..9ccca0017 --- /dev/null +++ b/test/README.md @@ -0,0 +1,46 @@ +# Unit Test (UT) Guide + +This project uses `pytest` for unit testing. All test cases are under the `test/` directory. Below is a simple guide for new users to write and run UTs: + +## 1. 
Environment Setup
+- Python 3.8 or above is recommended.
+- Install dependencies:
+  ```sh
+  pip install -r ../requirements.txt
+  pip install pytest
+  ```
+
+## 2. Test Structure
+- Place your test files in the matching backend subdirectory under `test/` (e.g. `test_cpu/`, `test_cuda/`), and name them starting with `test_`.
+- Refer to the existing `test_*.py` files as examples.
+- Common fixtures (such as `tiny_opt_model`, `opt_model`, `opt_tokenizer`, `dataloader`) are defined in `conftest.py`/`fixtures.py` and are injected automatically; helper functions (such as `model_infer`) live in `helpers.py` and can be imported directly.
+- Example:
+  ```python
+  # test_example.py
+  from ..helpers import model_infer
+
+  def test_model_infer(tiny_opt_model, opt_tokenizer):
+      result = model_infer(tiny_opt_model, opt_tokenizer)
+      assert result is not None
+  ```
+
+## 3. Running Tests
+- In the `test/` directory, run:
+  ```sh
+  pytest
+  ```
+- You can specify a single file or test case:
+  ```sh
+  pytest test_xxx.py
+  pytest -k "test_func_name"
+  ```
+
+## 4. Debugging Tips
+- `conftest.py` adds the parent directory to `sys.path`, so you can debug without installing the local `auto-round` package.
+- You can directly import project source code in your test cases.
+
+## 5. Reference
+- Fixtures are defined in `conftest.py` and `fixtures.py`
+- Helper functions are in `helpers.py`
+
+If you have any questions, feel free to open an issue.
diff --git a/test/test_hpu/conftest.py b/test/conftest.py
similarity index 81%
rename from test/test_hpu/conftest.py
rename to test/conftest.py
index f4e9675bf..d21100824 100644
--- a/test/test_hpu/conftest.py
+++ b/test/conftest.py
@@ -1,9 +1,16 @@
 import os
+import sys
 from typing import Mapping
 
 import pytest
 
+from .fixtures import *
+# Easy debugging without installing auto-round.
+sys.path.insert(0, "..")
+
+
+### HPU-related configuration, usage: `pytest --mode=compile/lazy`
 
 def pytest_addoption(parser):
     parser.addoption(
         "--mode",
diff --git a/test/fixtures.py b/test/fixtures.py
new file mode 100644
index 000000000..c76040322
--- /dev/null
+++ b/test/fixtures.py
@@ -0,0 +1,169 @@
+import os
+import shutil
+
+import pytest
+import torch
+import transformers
+
+from .helpers import (
+    DataLoader,
+    deepseek_v2_name_or_path,
+    gemma_name_or_path,
+    get_tiny_model,
+    gptj_name_or_path,
+    lamini_name_or_path,
+    opt_name_or_path,
+    phi2_name_or_path,
+    qwen_2_5_vl_name_or_path,
+    qwen_moe_name_or_path,
+    qwen_name_or_path,
+    qwen_vl_name_or_path,
+    save_tiny_model,
+)
+
+
+# Create tiny model path fixtures for testing
+@pytest.fixture(scope="session")
+def tiny_opt_model_path():
+    model_name_or_path = opt_name_or_path
+    tiny_model_path = "./tmp/tiny_opt_model_path"
+    tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path)
+    yield tiny_model_path
+    shutil.rmtree(tiny_model_path)
+
+
+@pytest.fixture(scope="session")
+def tiny_lamini_model_path():
+    model_name_or_path = lamini_name_or_path
+    tiny_model_path = "./tmp/tiny_lamini_model_path"
+    tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path)
+    yield tiny_model_path
+    shutil.rmtree(tiny_model_path)
+
+
+@pytest.fixture(scope="session")
+def tiny_gptj_model_path():
+    model_name_or_path = gptj_name_or_path
+    tiny_model_path = "./tmp/tiny_gptj_model_path"
+    tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path)
+    yield tiny_model_path
+    shutil.rmtree(tiny_model_path)
+
+
+@pytest.fixture(scope="session")
+def tiny_phi2_model_path():
+    model_name_or_path = phi2_name_or_path
+    tiny_model_path = "./tmp/tiny_phi2_model_path"
+    tiny_model_path = save_tiny_model(model_name_or_path, 
tiny_model_path) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_deepseek_v2_model_path(): + model_name_or_path = deepseek_v2_name_or_path + tiny_model_path = "./tmp/tiny_deepseek_v2_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_gemma_model_path(): + model_name_or_path = gemma_name_or_path + tiny_model_path = "./tmp/tiny_gemma_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_qwen_model_path(): + model_name_or_path = qwen_name_or_path + tiny_model_path = "./tmp/tiny_qwen_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_untied_qwen_model_path(): + model_name_or_path = qwen_name_or_path + tiny_model_path = "./tmp/tiny_untied_qwen_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, force_untie=True) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_qwen_moe_model_path(): + model_name_or_path = qwen_moe_name_or_path + tiny_model_path = "./tmp/tiny_qwen_moe_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_qwen_vl_model_path(): + model_name_or_path = qwen_vl_name_or_path + tiny_model_path = "./tmp/tiny_qwen_vl_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2, is_mllm=True) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_qwen_2_5_vl_model_path(): + model_name_or_path = qwen_2_5_vl_name_or_path + tiny_model_path = "./tmp/tiny_qwen_2_5_vl_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2, is_mllm=True) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(autouse=True, scope="session") +def clean_tmp_model_folder(): + yield + shutil.rmtree("./tmp", ignore_errors=True) # unittest default workspace + shutil.rmtree("./tmp_autoround", ignore_errors=True) # autoround default workspace + + +# Create objective fixtures for testing +@pytest.fixture(scope="function") +def tiny_opt_model(): + model_name_or_path = opt_name_or_path + return get_tiny_model(model_name_or_path, num_layers=2) + + +@pytest.fixture(scope="function") +def opt_model(): + model_name_or_path = opt_name_or_path + model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="auto", trust_remote_code=True) + return model + + +@pytest.fixture(scope="session") +def opt_tokenizer(): + model_name_or_path = opt_name_or_path + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) + return tokenizer + + +@pytest.fixture(scope="function") +def model(): + model_name_or_path = opt_name_or_path + model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="auto", trust_remote_code=True) + return model + + +@pytest.fixture(scope="session") +def tokenizer(): + model_name_or_path = opt_name_or_path + tokenizer = 
transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
+    return tokenizer
+
+
+@pytest.fixture(scope="session")
+def dataloader():
+    return DataLoader()
diff --git a/test/helpers.py b/test/helpers.py
new file mode 100644
index 000000000..89b832c6d
--- /dev/null
+++ b/test/helpers.py
@@ -0,0 +1,236 @@
+import copy
+import os
+
+import pytest
+import torch
+import transformers
+
+from auto_round.utils import get_attr, llm_load_model, mllm_load_model, set_attr
+
+
+# Automatically choose a local path if it exists, otherwise fall back to the model name.
+def get_model_path(model_name: str) -> str:
+    ut_path = f"/tf_dataset/auto_round/models/{model_name}"
+    local_path = f"/models/{model_name.split('/')[-1]}"
+
+    if "DeepSeek-V2-Lite" in model_name and os.path.exists("/data0/deepseek-ai/DeepSeek-V2-Lite"):
+        return "/data0/deepseek-ai/DeepSeek-V2-Lite"
+
+    if os.path.exists(ut_path):
+        return ut_path
+    elif os.path.exists(local_path):
+        return local_path
+    else:
+        return model_name
+
+
+opt_name_or_path = get_model_path("facebook/opt-125m")
+qwen_name_or_path = get_model_path("Qwen/Qwen3-0.6B")
+lamini_name_or_path = get_model_path("MBZUAI/LaMini-GPT-124M")
+gptj_name_or_path = get_model_path("hf-internal-testing/tiny-random-GPTJForCausalLM")
+phi2_name_or_path = get_model_path("microsoft/phi-2")
+deepseek_v2_name_or_path = get_model_path("deepseek-ai/DeepSeek-V2-Lite")
+qwen_moe_name_or_path = get_model_path("Qwen/Qwen1.5-MoE-A2.7B")
+qwen_vl_name_or_path = get_model_path("Qwen/Qwen2-VL-2B-Instruct")
+qwen_2_5_vl_name_or_path = get_model_path("Qwen/Qwen2.5-VL-3B-Instruct")
+gemma_name_or_path = get_model_path("benzart/gemma-2b-it-fine-tuning-for-code-test")
+
+
+# Slice the model into a tiny model for speedup
+def get_tiny_model(model_name_or_path, num_layers=2, is_mllm=False, **kwargs):
+    """Generate a tiny model by slicing layers from the original model."""
+    model_name_or_path = get_model_path(model_name_or_path)
+
+    def slice_layers(module):
+        """Slice layers in the model."""
+        sliced = False
+        for name, child in module.named_children():
+            if isinstance(child, torch.nn.ModuleList) and len(child) > num_layers:
+                new_layers = torch.nn.ModuleList(child[:num_layers])
+                setattr(module, name, new_layers)
+                sliced = True
+            elif slice_layers(child):
+                sliced = True
+        return sliced
+
+    kwargs["dtype"] = "auto" if "dtype" not in kwargs else kwargs["dtype"]
+    kwargs["trust_remote_code"] = True if "trust_remote_code" not in kwargs else kwargs["trust_remote_code"]
+    if is_mllm:
+        model, processor, tokenizer, image_processor = mllm_load_model(model_name_or_path, **kwargs)
+        if hasattr(model.config, "vision_config"):
+            if hasattr(model.config.vision_config, "num_hidden_layers"):  # mistral, etc.
+                model.config.vision_config.num_hidden_layers = num_layers
+            elif hasattr(model.config.vision_config, "depth"):  # qwen vl
+                model.config.vision_config.depth = num_layers
+    else:
+        model, tokenizer = llm_load_model(model_name_or_path, **kwargs)
+
+    slice_layers(model)
+
+    if hasattr(model.config, "num_hidden_layers"):
+        model.config.num_hidden_layers = num_layers
+    if hasattr(model.config, "layer_types"):
+        model.config.layer_types = model.config.layer_types[:num_layers]
+
+    return model
+
+
+# For fixture usage only
+def save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2, is_mllm=False, force_untie=False, **kwargs):
+    """Generate a tiny model and save it to the specified path."""
+    model = get_tiny_model(model_name_or_path, num_layers=num_layers, is_mllm=is_mllm, **kwargs)
+    if force_untie:
+        if getattr(getattr(model, "config", None), "tie_word_embeddings", False):
+            model.config.tie_word_embeddings = False
+        for key in model._tied_weights_keys:
+            weight = get_attr(model, key)
+            set_attr(model, key, copy.deepcopy(weight))
+    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
+    test_path = os.path.dirname(__file__)
+    tiny_model_path = os.path.join(test_path, tiny_model_path.removeprefix("./"))
+    model.save_pretrained(tiny_model_path)
+    tokenizer.save_pretrained(tiny_model_path)
+    if is_mllm:
+        processor = transformers.AutoProcessor.from_pretrained(model_name_or_path, trust_remote_code=True)
+        image_processor = transformers.AutoImageProcessor.from_pretrained(model_name_or_path, trust_remote_code=True)
+        processor.save_pretrained(tiny_model_path)
+        image_processor.save_pretrained(tiny_model_path)
+    print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session")
+    return tiny_model_path
+
+
+# HPU mode checking
+def is_pytest_mode_compile():
+    return pytest.mode == "compile"
+
+
+def is_pytest_mode_lazy():
+    return pytest.mode == "lazy"
+
+
+# General model inference code
+def model_infer(model, tokenizer, apply_chat_template=False):
+    """Run model inference and print generated outputs."""
+    prompts = [
+        "Hello,my name is",
+        # "The president of the United States is",
+        # "The capital of France is",
+        # "The future of AI is",
+    ]
+    if apply_chat_template:
+        texts = []
+        for prompt in prompts:
+            messages = [{"role": "user", "content": prompt}]
+            text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            texts.append(text)
+        prompts = texts
+
+    inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True)
+
+    outputs = model.generate(
+        input_ids=inputs["input_ids"].to(model.device),
+        attention_mask=inputs["attention_mask"].to(model.device),
+        do_sample=False,  ## change this to follow official usage
+        max_new_tokens=5,
+    )
+    generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)]
+
+    decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+
+    for i, prompt in enumerate(prompts):
+        print(f"Prompt: {prompt}")
+        print(f"Generated: {decoded_outputs[i]}")
+        print("-" * 50)
+    return decoded_outputs[0]
+
+
+# Dummy dataloader for testing
+class DataLoader:
+    def __init__(self):
+        self.batch_size = 1
+
+    def __iter__(self):
+        for i in range(2):
+            yield torch.ones([1, 10], dtype=torch.long)
+
+
+fixed_input = torch.tensor([[10, 20, 30, 40, 50]], dtype=torch.long)
+
+
+def get_output(model_name_or_path):
+    """Get model output for fixed input."""
+    try:
+        model, tokenizer = llm_load_model(model_name_or_path)
+ except: + model, processor, tokenizer, image_processor = mllm_load_model(model_name_or_path) + outputs = model(fixed_input)[0] + return outputs.detach().cpu() + + +def is_model_outputs_similar(model_path_1, model_path_2, metric="cosine_similarity", threshold=0.98, k=5, verbose=True): + """ + Compare outputs from two models using specified metric and return pass/fail. + + Args: + model_path_1: Path to first model + model_path_2: Path to second model + metric: Metric to use - "mse", "cosine_similarity"/"cos_sim", or "topk" + threshold: Threshold value for pass/fail + k: K value for top-k metric (only used when metric="topk") + verbose: Whether to print detailed results + + Returns: + bool: True if metric passes threshold, False otherwise + """ + if verbose: + print(f"\n{'='*70}") + print("Comparing Model Outputs") + print(f"{'='*70}") + print(f"Model 1: {model_path_1}") + print(f"Model 2: {model_path_2}") + print(f"Metric: {metric} | Threshold: {threshold}" + (f" | K: {k}" if "top" in metric.lower() else "")) + print(f"{'='*70}\n") + + output_1 = get_output(model_path_1) + output_2 = get_output(model_path_2) + metric = metric.lower().replace("-", "_") + + # Calculate metric and check threshold + if metric == "mse": + value = torch.mean((output_1.float() - output_2.float()) ** 2).item() + passed = value <= threshold + if verbose: + print(f"MSE: {value:.6f} | Threshold: <= {threshold} | {'✓ PASS' if passed else '✗ FAIL'}\n") + + elif metric in ["cosine_similarity", "cos_sim", "cosine"]: + out1 = output_1.float().flatten() + out2 = output_2.float().flatten() + value = torch.nn.functional.cosine_similarity(out1.unsqueeze(0), out2.unsqueeze(0)).item() + passed = value >= threshold + if verbose: + print(f"Cosine Similarity: {value:.6f} | Threshold: >= {threshold} | {'✓ PASS' if passed else '✗ FAIL'}\n") + + elif metric in ["topk", "top_k"]: + _, topk_1 = torch.topk(output_1, k=min(k, output_1.size(-1)), dim=-1) + _, topk_2 = torch.topk(output_2, k=min(k, output_2.size(-1)), dim=-1) + + total_agreement = 0 + total_positions = topk_1.numel() // topk_1.size(-1) + + for i in range(topk_1.size(0)): + for j in range(topk_1.size(1)): + set1 = set(topk_1[i, j].tolist()) + set2 = set(topk_2[i, j].tolist()) + total_agreement += len(set1 & set2) / k + + value = total_agreement / total_positions + passed = value >= threshold + if verbose: + print( + f"Top-{k} Agreement: {value:.4%} | Threshold: >= {threshold:.4%} | {'✓ PASS' if passed else '✗ FAIL'}\n" + ) + + else: + raise ValueError(f"Unknown metric: {metric}. 
Choose from: 'mse', 'cosine_similarity', 'topk'") + + return passed diff --git a/test/test_ark/__init__.py b/test/test_ark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py index 09d8bf25a..bd4734609 100644 --- a/test/test_ark/test_model.py +++ b/test/test_ark/test_model.py @@ -2,65 +2,27 @@ import sys import pytest - -sys.path.insert(0, "../..") - import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate_user_model -from auto_round.testing_utils import require_autogptq, require_gptqmodel - -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +from ..helpers import get_model_path, model_infer class TestAutoRoundARKBackend: @classmethod def setup_class(self): - self.model_name = "facebook/opt-125m" + self.model_name = get_model_path("facebook/opt-125m") self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() @classmethod def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] - def main_op(self, format, bits, group_size, sym, dtype, device, fast_cfg=True, tar_acc=0.28): limit = 100 if device == "xpu": @@ -86,7 +48,7 @@ def main_op(self, format, bits, group_size, sym, dtype, device, fast_cfg=True, t ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=limit) print(result["results"]["lambada_openai"]["acc,none"]) assert result["results"]["lambada_openai"]["acc,none"] > tar_acc diff --git a/test/test_cpu/__init__.py b/test/test_cpu/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_cpu/_test_helpers.py b/test/test_cpu/_test_helpers.py deleted file mode 100644 index b4b8a5955..000000000 --- a/test/test_cpu/_test_helpers.py +++ /dev/null @@ -1,32 +0,0 @@ -def model_infer(model, tokenizer, apply_chat_template=False): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - if apply_chat_template: - texts = [] - for prompt in prompts: - messages = [{"role": "user", "content": prompt}] - text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - texts.append(text) - prompts = texts - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, 
truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] diff --git a/test/test_cpu/requirements.txt b/test/test_cpu/requirements.txt index 219189829..a54cc4e4e 100644 --- a/test/test_cpu/requirements.txt +++ b/test/test_cpu/requirements.txt @@ -3,7 +3,6 @@ modelscope gguf sentencepiece torchvision -parameterized pillow numba llmcompressor @ git+https://github.com/vllm-project/llm-compressor.git@main diff --git a/test/test_cpu/test_act_quantization.py b/test/test_cpu/test_act_quantization.py index 31ba51f1b..cd41c0985 100644 --- a/test/test_cpu/test_act_quantization.py +++ b/test/test_cpu/test_act_quantization.py @@ -1,87 +1,72 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 +class TestAutoRoundAct: + save_dir = "./saved" - def __iter__(self): - for i in range(3): - yield torch.ones([1, 10], dtype=torch.long) + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + # Yield to hand control to the test methods + yield -class TestAutoRoundAct(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - self.save_dir = "./saved" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() - - @classmethod - def tearDownClass(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_mx_fp4(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + def test_mx_fp4(self, tiny_opt_model, opt_tokenizer, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model, + opt_tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, act_bits=4, data_type="mx_fp", ) autoround.quantize() - def test_wint4fp8_dynamic(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + def test_wint4fp8_dynamic(self, tiny_opt_model, opt_tokenizer, dataloader): bits, group_size = 4, 128 autoround = AutoRound( - model, - tokenizer, + tiny_opt_model, + 
opt_tokenizer, bits=bits, group_size=group_size, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, act_bits=8, data_type="fp8", act_data_type="fp8", ) autoround.quantize() - def test_wint4fp8_static(self): + def test_wint4fp8_static(self, tiny_opt_model, opt_tokenizer, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( - self.model, - self.tokenizer, + tiny_opt_model, + opt_tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, act_bits=8, data_type="fp8_to_int_sym", act_dynamic=False, @@ -89,66 +74,42 @@ def test_wint4fp8_static(self): ) autoround.quantize() - def test_wfp8afp8_static(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + @pytest.mark.parametrize("act_group_size", [-1, 128]) + def test_wfp8afp8_static(self, act_group_size, tiny_opt_model, opt_tokenizer, dataloader): from auto_round.wrapper import WrapperWALayer - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) autoround = AutoRound( - model, - tokenizer, + tiny_opt_model, + opt_tokenizer, group_size=128, - act_group_size=-1, + act_group_size=act_group_size, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, data_type="fp8", act_dynamic=False, act_data_type="fp8", ) autoround.quantize() - self.assertTrue(isinstance(autoround.model.model.decoder.layers[2].self_attn.k_proj, WrapperWALayer)) - self.assertEqual(autoround.model.model.decoder.layers[2].self_attn.k_proj.orig_layer.act_scale.shape[0], 30) - self.assertEqual(autoround.model.model.decoder.layers[2].self_attn.k_proj.orig_layer.act_max.shape[0], 30) + k_proj = autoround.model.model.decoder.layers[1].self_attn.k_proj + assert isinstance(k_proj, WrapperWALayer), "k_proj should be WrapperWALayer" + if act_group_size == -1: + assert k_proj.orig_layer.act_scale.shape[0] == 20, "act_scale shape[0] should be 20" + assert k_proj.orig_layer.act_max.shape[0] == 20, "act_max shape[0] should be 20" + else: + assert k_proj.orig_layer.act_scale.shape[0] == int(2 * 10 * 768 / 128), "act_scale shape[0] is incorrect" + assert k_proj.orig_layer.act_max.shape[0] == int(2 * 10 * 768 / 128), "act_max shape[0] is incorrect" - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - autoround = AutoRound( - model, - tokenizer, - group_size=128, - act_group_size=128, - iters=0, - seqlen=2, - dataset=self.llm_dataloader, - data_type="fp8", - act_dynamic=False, - act_data_type="fp8", - ) - autoround.quantize() - self.assertTrue(isinstance(autoround.model.model.decoder.layers[2].self_attn.k_proj, WrapperWALayer)) - - self.assertEqual( - autoround.model.model.decoder.layers[2].self_attn.k_proj.orig_layer.act_scale.shape[0], - int(3 * 10 * 768 / 128), - ) - self.assertEqual( - autoround.model.model.decoder.layers[2].self_attn.k_proj.orig_layer.act_max.shape[0], - int(3 * 10 * 768 / 128), - ) - - def test_act_config_MXFP4_saving(self): + def test_act_config_MXFP4_saving(self, tiny_opt_model_path, dataloader): scheme = "MXFP4" layer_config = {"lm_head": {"act_bits": 8, "bits": 8}, "k_proj": {"act_bits": 8, "bits": 8}} autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, - 
dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -168,15 +129,15 @@ def test_act_config_MXFP4_saving(self): assert "sym" in kproj_config.keys() and kproj_config["sym"] shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_act_config_NVFP4_saving(self): + def test_act_config_NVFP4_saving(self, tiny_opt_model_path, dataloader): scheme = "NVFP4" layer_config = {"k_proj": {"act_bits": 16, "bits": 16}} autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -193,16 +154,16 @@ def test_act_config_NVFP4_saving(self): assert "sym" in kproj_config.keys() and kproj_config["sym"] shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_WOQ_config_INT_saving(self): + def test_WOQ_config_INT_saving(self, tiny_opt_model_path, dataloader): scheme = "W4A16" layer_config = {"k_proj": {"bits": 8}} autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, sym=False, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -223,7 +184,7 @@ def test_WOQ_config_INT_saving(self): assert "act_dynamic" in kproj_config.keys() and kproj_config["act_dynamic"] shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_act_config_FP8_saving(self): + def test_act_config_FP8_saving(self, tiny_opt_model_path, dataloader): scheme = "FP8_STATIC" layer_config = { "lm_head": {"act_bits": 8, "bits": 8}, @@ -237,11 +198,11 @@ def test_act_config_FP8_saving(self): }, } autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -262,7 +223,3 @@ def test_act_config_FP8_saving(self): assert "group_size" in kproj_config.keys() and kproj_config["group_size"] == 0 assert "sym" in kproj_config.keys() and kproj_config["sym"] shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_alg_ext.py b/test/test_cpu/test_alg_ext.py index b0c909bd3..0bfdfba47 100644 --- a/test/test_cpu/test_alg_ext.py +++ b/test/test_cpu/test_alg_ext.py @@ -1,37 +1,30 @@ -import copy -import shutil -import sys -import unittest - -from parameterized import parameterized - -sys.path.insert(0, "../..") - from auto_round import AutoRound +from ..helpers import qwen_name_or_path + -class TestAlgExt(unittest.TestCase): - def test_alg_ext(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" +class TestAlgExt: + def test_alg_ext(self, tiny_opt_model_path, tiny_qwen_model_path): + model_name = tiny_opt_model_path ar = AutoRound(model_name, scheme="W2A16", iters=1, nsamples=1, enable_alg_ext=True) ar.quantize() - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = tiny_qwen_model_path ar = AutoRound(model_name, scheme="gguf:q4_k_s", iters=1, nsamples=1, enable_alg_ext=True) ar.quantize() from auto_round.auto_scheme import AutoScheme scheme = AutoScheme(options=["mxfp4", "mxfp8"], avg_bits=5.5, ignore_scale_zp_bits=True) - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = tiny_qwen_model_path ar = AutoRound(model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True) 
ar.quantize() def test_alg_ext_import(self): from auto_round.alg_ext import wrapper_autoround - def test_all_support_dtype(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_all_support_dtype(self, tiny_opt_model_path): + model_name = tiny_opt_model_path for scheme in ["MXFP4", "NVFP4", "W2A16G64"]: ar = AutoRound( model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True diff --git a/test/test_cpu/test_asym.py b/test/test_cpu/test_asym.py index 842b208ed..32a0151b3 100644 --- a/test/test_cpu/test_asym.py +++ b/test/test_cpu/test_asym.py @@ -6,13 +6,14 @@ sys.path.insert(0, "../..") import torch -from _test_helpers import model_infer from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.utils import get_module +from ..helpers import get_model_path, model_infer + class LLMDataLoader: def __init__(self): @@ -27,7 +28,7 @@ class TestAutoRoundAsym(unittest.TestCase): @classmethod def setUpClass(self): # self.model_name = "/models/opt-125m" - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + self.model_name = get_model_path("facebook/opt-125m") self.save_folder = "./saved" @classmethod diff --git a/test/test_cpu/test_auto_scheme.py b/test/test_cpu/test_auto_scheme.py index cd38b220d..9d549076f 100644 --- a/test/test_cpu/test_auto_scheme.py +++ b/test/test_cpu/test_auto_scheme.py @@ -1,24 +1,28 @@ import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest + from auto_round import AutoRound, AutoRoundConfig, AutoScheme -class TestAutoScheme(unittest.TestCase): - @classmethod - def setUpClass(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" +class TestAutoScheme: + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield - @classmethod - def tearDownClass(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_auto_scheme_export(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_auto_scheme_export(self, tiny_opt_model_path): + model_name = tiny_opt_model_path scheme = AutoScheme(avg_bits=2, options=("W2A16"), nsamples=1, ignore_scale_zp_bits=True) ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) ar.quantize_and_save(self.save_dir) @@ -29,27 +33,23 @@ def test_auto_scheme_export(self): ar.quantize_and_save(self.save_dir) shutil.rmtree(self.save_dir, ignore_errors=True) - def test_layer_config(self): + def test_layer_config(self, tiny_opt_model_path): from auto_round.auto_scheme.utils import compute_avg_bits_for_model from auto_round.utils import get_module - target_bits = 3.0 - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "BF16")) - user_layer_config = {"model.decoder.layers.10.fc1": {"bits": 8, "group_size": 32, "sym": False}} + target_bits = 3.5 + model_name = tiny_opt_model_path + scheme = AutoScheme(avg_bits=target_bits, options=("W2A16", "W4A16", "BF16")) + user_layer_config = {"model.decoder.layers.1.fc1": {"bits": 8, "group_size": 32, "sym": False}} ar = 
AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["bits"], 8) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["sym"], False) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["group_size"], 32) - layer = get_module(model, "model.decoder.layers.10.fc1") - self.assertEqual(layer.bits, 8) - self.assertEqual(layer.sym, False) - self.assertEqual(layer.group_size, 32) + assert layer_config["model.decoder.layers.1.fc1"]["bits"] == 8 + assert layer_config["model.decoder.layers.1.fc1"]["sym"] is False + assert layer_config["model.decoder.layers.1.fc1"]["group_size"] == 32 + layer = get_module(model, "model.decoder.layers.1.fc1") + assert layer.bits == 8 + assert layer.sym is False + assert layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_autoopt.py b/test/test_cpu/test_autoopt.py index f9801217e..c14e04c0e 100644 --- a/test/test_cpu/test_autoopt.py +++ b/test/test_cpu/test_autoopt.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer @@ -11,48 +9,37 @@ from auto_round import AutoRoundAdam -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 +class TestAutoRound: - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + # Yield to hand control to the test methods + yield -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() - - @classmethod - def tearDownClass(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_Adam(self): + def test_Adam(self, tiny_opt_model, opt_tokenizer, dataloader): bits, group_size, sym = 4, 128, False from auto_round.utils import get_block_names - llm_block_names = get_block_names(self.model, quant_vision=True) + llm_block_names = get_block_names(tiny_opt_model, quant_vision=True) bits, group_size, sym, batch_size = 4, 128, False, 20 adamround = AutoRoundAdam( - self.model, - self.tokenizer, + tiny_opt_model, + opt_tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, batch_size=batch_size, - dataset=self.llm_dataloader, + dataset=dataloader, to_quant_block_names=llm_block_names, ) adamround.quantize() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_autoround.py b/test/test_cpu/test_autoround.py index 2790f8817..aa7aeca5e 100644 --- a/test/test_cpu/test_autoround.py +++ b/test/test_cpu/test_autoround.py @@ -1,56 +1,40 @@ import copy import shutil -import sys -import unittest - -from parameterized import parameterized - -sys.path.insert(0, 
"../..") +import pytest import torch -from _test_helpers import model_infer from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.utils import get_module - -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(3): - yield torch.ones([1, 10], dtype=torch.long) +from ..helpers import get_model_path, model_infer, opt_name_or_path, qwen_name_or_path -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def setup_class(self): + model_name = opt_name_or_path self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() self.save_folder = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_bits_setting(self): + def test_bits_setting(self, tiny_opt_model_path): layer_config = {"model.decoder.layers.0.self_attn.k_proj": {"data_type": "mx_fp8", "group_size": 32}} - autoround = AutoRound( - "/tf_dataset/auto_round/models/facebook/opt-125m", iters=2, seqlen=2, nsamples=1, layer_config=layer_config - ) + autoround = AutoRound(tiny_opt_model_path, iters=2, seqlen=2, nsamples=1, layer_config=layer_config) autoround.quantize() module = get_module(autoround.model, "model.decoder.layers.0.self_attn.k_proj") if module.bits != 8: raise ValueError(f"Expected bits to be 8, but got {module.bits}") - def test_layer_config(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_layer_config(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path layer_config = {"self_attn": {"bits": 4, "data_type": "nv_fp", "act_bits": 16, "group_size": 16}} autoround = AutoRound( model_name, @@ -58,15 +42,15 @@ def test_layer_config(self): scheme="NVFP4", iters=0, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, amp=False, ) autoround.quantize_and_save(self.save_folder, inplace=False, format="fake") shutil.rmtree(self.save_folder) - def test_remove_whole_block(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_remove_whole_block(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path layer_config = { "model.decoder.layers.0.self_attn.k_proj": {"bits": 32}, "model.decoder.layers.0.self_attn.v_proj": {"bits": 32}, @@ -83,45 +67,37 @@ def test_remove_whole_block(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() - def test_consecutive_quant(self): + def test_consecutive_quant(self, tiny_opt_model_path, tiny_phi2_model_path, dataloader): bits, group_size, sym = 4, -1, False autoround = AutoRound( - self.model, - self.tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - model = AutoModelForCausalLM.from_pretrained( - "/tf_dataset/auto_round/models/microsoft/phi-2", torch_dtype="auto", trust_remote_code=True - ) - tokenizer = AutoTokenizer.from_pretrained( - 
"/tf_dataset/auto_round/models/microsoft/phi-2", trust_remote_code=True - ) autoround = AutoRound( - model, - tokenizer, + tiny_phi2_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_mx_fp4(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_mx_fp4(self, dataloader): + model_name = opt_name_or_path bits, group_size, sym = 4, 32, False autoround = AutoRound( model_name, @@ -140,10 +116,10 @@ def test_mx_fp4(self): model, self.tokenizer, batch_size="auto:8", tasks="lambada_openai", limit=32 ) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.3) # 0.375 + assert result["results"]["lambada_openai"]["acc,none"] > 0.3 # 0.375 - def test_nv_fp4(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_nv_fp4(self, dataloader): + model_name = opt_name_or_path bits, group_size, sym = 4, 16, False autoround = AutoRound( model_name, @@ -152,7 +128,7 @@ def test_nv_fp4(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, data_type="nv_fp4", ) model, _ = autoround.quantize() @@ -160,10 +136,10 @@ def test_nv_fp4(self): model, self.tokenizer, batch_size="auto:8", tasks="lambada_openai", limit=32 ) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 - def test_w4g1(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_w4g1(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, -1, True autoround = AutoRound( model_name, @@ -172,13 +148,13 @@ def test_w4g1(self): sym=sym, iters=2, seqlen=10, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - @parameterized.expand([(2,), (3,), (4,)]) - def test_g128(self, bits): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + @pytest.mark.parametrize("bits", [2, 3, 4]) + def test_g128(self, bits, dataloader): + model_name = opt_name_or_path group_size, sym = 128, True autoround = AutoRound( model_name, @@ -187,7 +163,7 @@ def test_g128(self, bits): sym=sym, iters=2, seqlen=10, - dataset=self.llm_dataloader, + dataset=dataloader, ) model, _ = autoround.quantize() if bits > 2: @@ -195,9 +171,9 @@ def test_g128(self, bits): model, self.tokenizer, batch_size="auto:8", tasks="lambada_openai", limit=32 ) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.3) + assert result["results"]["lambada_openai"]["acc,none"] > 0.3 - def test_disable_quanted_input(self): + def test_disable_quanted_input(self, dataloader): bits, group_size, sym = 4, -1, True autoround = AutoRound( self.model, @@ -208,13 +184,13 @@ def test_disable_quanted_input(self): iters=2, seqlen=10, enable_quanted_input=False, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_enable_norm_bias_tuning_qwen3(self): + def test_enable_norm_bias_tuning_qwen3(self, tiny_qwen_model_path, dataloader): bits, group_size, sym = 4, 128, True - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = tiny_qwen_model_path model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = 
AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) autoround = AutoRound( @@ -226,11 +202,11 @@ def test_enable_norm_bias_tuning_qwen3(self): iters=2, seqlen=10, enable_norm_bias_tuning=True, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_enable_norm_bias_tuning(self): + def test_enable_norm_bias_tuning(self, dataloader): bits, group_size, sym = 4, -1, True autoround = AutoRound( self.model, @@ -242,11 +218,11 @@ def test_enable_norm_bias_tuning(self): seqlen=10, enable_quanted_input=False, enable_norm_bias_tuning=True, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_disable_minmax_tuning(self): + def test_disable_minmax_tuning(self, dataloader): bits, group_size, sym = 4, -1, True autoround = AutoRound( self.model, @@ -257,13 +233,13 @@ def test_disable_minmax_tuning(self): iters=2, seqlen=10, enable_minmax_tuning=False, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() # - def test_signround(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_signround(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, -1, False autoround = AutoRound( model_name, @@ -274,11 +250,11 @@ def test_signround(self): seqlen=10, enable_minmax_tuning=False, enable_quanted_input=False, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_lm_head_layer_config_way(self): + def test_lm_head_layer_config_way(self, dataloader): bits, group_size, sym = 4, -1, False layer_config = {"lm_head": {"data_type": "int"}} autoround = AutoRound( @@ -291,13 +267,13 @@ def test_lm_head_layer_config_way(self): seqlen=10, enable_minmax_tuning=False, enable_quanted_input=False, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() - def test_wa_quant(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_wa_quant(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym, act_bits = 4, 128, False, 4 autoround = AutoRound( model_name, @@ -306,14 +282,14 @@ def test_wa_quant(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, act_bits=act_bits, ) autoround.quantize() - def test_auto_device_map(self): + def test_auto_device_map(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, False - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="auto", trust_remote_code=True, device_map="auto" ) @@ -325,11 +301,11 @@ def test_auto_device_map(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_device_map_dict(self): + def test_device_map_dict(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, False device_map = {".*": "cpu"} autoround = AutoRound( @@ -340,13 +316,13 @@ def test_device_map_dict(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, device_map=device_map, ) autoround.quantize() # test model_name - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path autoround = AutoRound( model_name, self.tokenizer, @@ -355,14 +331,14 @@ def test_device_map_dict(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, 
device_map=device_map, ) autoround.quantize() - def test_fp32(self): + def test_fp32(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, False - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.float32, trust_remote_code=True, device_map="auto" ) @@ -374,12 +350,12 @@ def test_fp32(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, amp=False, ) autoround.quantize() - def test_tensor_reshape(self): + def test_tensor_reshape(self, dataloader): bits, group_size, sym = 4, 100, False autoround = AutoRound( self.model, @@ -389,12 +365,12 @@ def test_tensor_reshape(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_rtn(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_rtn(self, tiny_opt_model_path): + model_name = tiny_opt_model_path model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -412,9 +388,9 @@ def test_rtn(self): model_infer(model, tokenizer) shutil.rmtree(self.save_folder) - def test_embed_quant(self): + def test_embed_quant(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, True - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path layer_config = { "model.decoder.embed_tokens": {"bits": 4}, } @@ -426,14 +402,14 @@ def test_embed_quant(self): iters=2, seqlen=2, nsamples=3, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() - def test_fallback_layers(self): + def test_fallback_layers(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, True - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.float32, trust_remote_code=True, device_map="auto" ) @@ -450,7 +426,7 @@ def test_fallback_layers(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -475,17 +451,17 @@ def test_not_convert_modules(self): from auto_round_extension.ipex.qlinear_ipex_awq import QuantLinear - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct-AWQ" + model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct-AWQ") quantization_config = AutoRoundConfig() model = Qwen2VLForConditionalGeneration.from_pretrained( model_name, quantization_config=quantization_config, device_map="cpu", torch_dtype=torch.float16 ) - self.assertTrue(isinstance(model.visual.blocks[0].attn.qkv, torch.nn.Linear)) - self.assertFalse(isinstance(model.visual.merger.mlp[0], QuantLinear)) + assert isinstance(model.visual.blocks[0].attn.qkv, torch.nn.Linear) + assert not isinstance(model.visual.merger.mlp[0], QuantLinear) if hasattr(model.model, "language_model"): - self.assertTrue(isinstance(model.model.language_model.layers[0].self_attn.v_proj, QuantLinear)) + assert isinstance(model.model.language_model.layers[0].self_attn.v_proj, QuantLinear) else: - self.assertTrue(isinstance(model.model.layers[0].self_attn.v_proj, QuantLinear)) + assert isinstance(model.model.layers[0].self_attn.v_proj, QuantLinear) processor = AutoProcessor.from_pretrained(model_name, size=None) image_url = 
"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" @@ -520,8 +496,8 @@ def test_not_convert_modules(self): ) print(output_text) - def test_fallback_layers_regex_awq(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_fallback_layers_regex_awq(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -537,7 +513,7 @@ def test_fallback_layers_regex_awq(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -556,8 +532,8 @@ def test_fallback_layers_regex_awq(self): print(res) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_fallback_layers_regex_gptq(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_fallback_layers_regex_gptq(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -573,7 +549,7 @@ def test_fallback_layers_regex_gptq(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -592,8 +568,8 @@ def test_fallback_layers_regex_gptq(self): print(res) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_fallback_layers_regex_round(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_fallback_layers_regex_round(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -609,7 +585,7 @@ def test_fallback_layers_regex_round(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -628,13 +604,13 @@ def test_fallback_layers_regex_round(self): print(res) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_fallback_layers_regex_exception(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_fallback_layers_regex_exception(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) layer_config = {"model.decoder.layers.12.self_attn.k_proj": {"bits": 16}} - with self.assertRaises(ValueError): + with pytest.raises(ValueError): autoround = AutoRound( model, tokenizer=tokenizer, @@ -643,21 +619,11 @@ def test_fallback_layers_regex_exception(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() - # def test_fp8_model_input_rtn_generation(self): - # model_name = "Qwen/Qwen3-0.6B-FP8" - # ar = AutoRound(model=model_name, iters=0) - # ar.quantize_and_save(output_dir=self.save_folder) - # model = AutoModelForCausalLM.from_pretrained(self.save_folder, torch_dtype="auto", 
trust_remote_code=True) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - # text = "There is a girl who likes adventure," - # inputs = tokenizer(text, return_tensors="pt").to(model.device) - # print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) - def test_dequant_fp8_weight(self): from auto_round.utils import dequant_block_fp8_weight @@ -666,44 +632,44 @@ def test_dequant_fp8_weight(self): weight_scale = torch.randn(5, 56) block_size = [128, 128] dequant_weight = dequant_block_fp8_weight(weight, weight_scale, block_size) - self.assertEqual(dequant_weight.shape.numel(), 4207616) + assert dequant_weight.shape.numel() == 4207616 # test experts are stacked. weight = torch.randn([32, 5760, 1440]) weight_scale = torch.randn([32, 5760, 90]) block_size = [1, 16] dequant_weight = dequant_block_fp8_weight(weight, weight_scale, block_size) - self.assertEqual(len(dequant_weight.shape), 3) - self.assertEqual(dequant_weight.shape[0], 32) - self.assertEqual(dequant_weight.shape.numel(), 32 * 5760 * 1440) + assert len(dequant_weight.shape) == 3 + assert dequant_weight.shape[0] == 32 + assert dequant_weight.shape.numel() == 32 * 5760 * 1440 - def test_mixed_bit_setting(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - layer_config = {"model.decoder.layers.7.fc1": {"bits": 8, "act_bits": 8}} + def test_mixed_bit_setting(self, tiny_opt_model_path): + model_name = tiny_opt_model_path + layer_config = {"model.decoder.layers.1.fc1": {"bits": 8, "act_bits": 8}} ar = AutoRound(model_name, data_type="mx_fp4", act_bits=4, iters=0, layer_config=layer_config) ar.quantize() layer_config = ar.layer_config if ( - layer_config["model.decoder.layers.7.fc1"]["bits"] != 8 - or layer_config["model.decoder.layers.7.fc1"]["act_bits"] != 8 + layer_config["model.decoder.layers.1.fc1"]["bits"] != 8 + or layer_config["model.decoder.layers.1.fc1"]["act_bits"] != 8 ): raise ValueError("mixed bits is not correct") - def test_invalid_layer_config(self): - with self.assertRaises(ValueError): + def test_invalid_layer_config(self, tiny_opt_model_path): + with pytest.raises(ValueError): layer_config = {"model.decoder.layers.2.self_attnx": {"bits": 2}} ar = AutoRound( - "/tf_dataset/auto_round/models/facebook/opt-125m", + tiny_opt_model_path, scheme="W3A16", nsamples=1, iters=1, layer_config=layer_config, ) ar.quantize() - with self.assertRaises(ValueError): + with pytest.raises(ValueError): layer_config = {"model.decoder.layers.2.self_attn": {"bit": 2}} # should be bits ar = AutoRound( - "/tf_dataset/auto_round/models/facebook/opt-125m", + tiny_opt_model_path, scheme="W3A16", nsamples=1, iters=1, @@ -711,8 +677,8 @@ def test_invalid_layer_config(self): ) ar.quantize() - def test_quant_lm_head(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-8B" + def test_quant_lm_head(self, tiny_untied_qwen_model_path): + model_name = tiny_untied_qwen_model_path ar = AutoRound(model_name, quant_lm_head=True, iters=0, seqlen=8, nsamples=1, disable_opt_rtn=True) ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu") @@ -734,8 +700,8 @@ def test_quant_lm_head(self): assert "lm_head" in model.config.quantization_config.extra_config assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 - def test_quant_lm_head_layer_config(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-8B" + def test_quant_lm_head_layer_config(self, tiny_untied_qwen_model_path): + 
model_name = tiny_untied_qwen_model_path layer_config = {"lm_head": {"bits": 4}} ar = AutoRound( model_name, @@ -751,22 +717,22 @@ def test_quant_lm_head_layer_config(self): assert "lm_head" in model.config.quantization_config.extra_config assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 - def test_compressor(self): - model_name = "Qwen/Qwen2-VL-2B-Instruct" + def test_compressor(self, tiny_qwen_vl_model_path): + model_name = tiny_qwen_vl_model_path ar = AutoRound(model_name, enable_adam=True) - self.assertEqual(ar.optimizer, torch.optim.AdamW) - self.assertTrue(ar.mllm) + assert ar.optimizer == torch.optim.AdamW + assert ar.mllm # test old api from auto_round import AutoRoundMLLM ar = AutoRoundMLLM(model_name) - self.assertTrue(ar.mllm) + assert ar.mllm def test_attention_mask_in_dataset(self): from transformers import AutoTokenizer - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = qwen_name_or_path # model_name = "/models/Qwen3-0.6B" tokenizer = AutoTokenizer.from_pretrained(model_name) text = ["haha", "hello world"] @@ -784,7 +750,7 @@ def test_attention_mask_in_dataset(self): def test_attention_mask_via_tokenize_in_dataset(self): from transformers import AutoTokenizer - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = qwen_name_or_path # model_name = "/models/Qwen3-0.6B" tokenizer = AutoTokenizer.from_pretrained(model_name) text = ["haha", "hello world"] @@ -801,9 +767,9 @@ def test_attention_mask_via_tokenize_in_dataset(self): ar = AutoRound(model_name, iters=1, dataset=data, seqlen=8) ar.quantize() - def test_low_cpu_mem_usage(self): + def test_low_cpu_mem_usage(self, tiny_opt_model_path, dataloader): bits, group_size = 4, 32 - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) quantized_model_path = self.save_folder @@ -814,7 +780,7 @@ def test_low_cpu_mem_usage(self): group_size=group_size, iters=2, seqlen=10, - dataset=self.llm_dataloader, + dataset=dataloader, low_cpu_mem_usage=True, device_map="cpu", ) @@ -822,11 +788,7 @@ def test_low_cpu_mem_usage(self): shutil.rmtree(quantized_model_path, ignore_errors=True) def test_create_adam(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = qwen_name_or_path from auto_round import AutoRound ar = AutoRound(model=model_name, enable_adam=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_autoround_acc.py b/test/test_cpu/test_autoround_acc.py index 41b28e663..876d4a452 100644 --- a/test/test_cpu/test_autoround_acc.py +++ b/test/test_cpu/test_autoround_acc.py @@ -1,42 +1,29 @@ import copy import shutil -import sys -import unittest - -from auto_round.eval.evaluation import simple_evaluate - -sys.path.insert(0, "../..") from math import isclose +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound # pylint: disable=E0401 - -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +from ..helpers import gptj_name_or_path -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): - self.llm_dataloader = LLMDataLoader() + def setup_class(self): 
self.save_dir = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_dir, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_default_acc(self): - model_name = "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM" + def test_default_acc(self, dataloader): + model_name = gptj_name_or_path model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -50,7 +37,7 @@ def test_default_acc(self): sym=sym, iters=2, seqlen=10, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() out0 = model(inp) @@ -66,28 +53,19 @@ def test_default_acc(self): device="cpu", iters=2, seqlen=10, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround_1.quantize() out1 = model_tmp(inp) assert out0[0].equal(out1[0]) - self.assertTrue(isclose(float(out0[0][0][0][0]), -0.021002087742090225, rel_tol=5e-04)) + assert isclose(float(out0[0][0][0][0]), -0.021002087742090225, rel_tol=5e-04) - def test_3bits_asym_autoround(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_3bits_asym_autoround(self, tiny_opt_model_path): + model_name = tiny_opt_model_path bits, sym = 3, False autoround = AutoRound(model_name, bits=bits, sym=sym, iters=0) autoround.quantize_and_save(self.save_dir, format="auto_round", inplace=False) model_args = f"pretrained={self.save_dir}" - # res = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto", limit=10) - - # accuracy = res["results"]["lambada_openai"]["acc,none"] - # print(f"accuracy = {accuracy}") - # assert accuracy > 0.15 shutil.rmtree(self.save_dir, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_autoround_export_to_itrex.py b/test/test_cpu/test_autoround_export_to_itrex.py index d9b4f42c6..19f196270 100644 --- a/test/test_cpu/test_autoround_export_to_itrex.py +++ b/test/test_cpu/test_autoround_export_to_itrex.py @@ -1,15 +1,15 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound +from ..helpers import get_model_path, gptj_name_or_path + class SimpleDataLoader: def __init__(self): @@ -20,35 +20,23 @@ def __iter__(self): yield torch.randn([1, 30]) -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoroundExport(unittest.TestCase): +class TestAutoroundExport: approach = "weight_only" @classmethod - def setUpClass(self): + def setup_class(self): self.gptj = transformers.AutoModelForCausalLM.from_pretrained( - "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM", + gptj_name_or_path, torchscript=True, ) - self.tokenizer = transformers.AutoTokenizer.from_pretrained( - "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM", trust_remote_code=True - ) + self.tokenizer = transformers.AutoTokenizer.from_pretrained(gptj_name_or_path, trust_remote_code=True) self.gptj_no_jit = transformers.AutoModelForCausalLM.from_pretrained( - "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM", + gptj_name_or_path, ) - self.llm_dataloader 
= LLMDataLoader() self.lm_input = torch.ones([1, 10], dtype=torch.long) @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -64,11 +52,11 @@ def test_autoround_int_quant(self): out2 = model(self.lm_input) out3 = q_model(self.lm_input) out4 = compressed_model(self.lm_input) - self.assertTrue(torch.all(torch.isclose(out1[0], out2[0], atol=1e-1))) - self.assertFalse(torch.all(out1[0] == out2[0])) - self.assertTrue(torch.all(out2[0] == out3[0])) - self.assertTrue(torch.all(torch.isclose(out3[0], out4[0], atol=1e-3))) - self.assertTrue("transformer.h.0.attn.k_proj.qzeros" in compressed_model.state_dict().keys()) + assert torch.all(torch.isclose(out1[0], out2[0], atol=1e-1)) + assert not torch.all(out1[0] == out2[0]) + assert torch.all(out2[0] == out3[0]) + assert torch.all(torch.isclose(out3[0], out4[0], atol=1e-3)) + assert "transformer.h.0.attn.k_proj.qzeros" in compressed_model.state_dict().keys() model = copy.deepcopy(self.gptj) out6 = model(self.lm_input) @@ -78,19 +66,19 @@ def test_autoround_int_quant(self): compressed_model = compressed_model.to(torch.float32) out4 = q_model(self.lm_input) out5 = compressed_model(self.lm_input) - self.assertTrue(torch.all(out1[0] == out6[0])) - self.assertTrue(torch.all(torch.isclose(out4[0], out5[0], atol=5e-3))) + assert torch.all(out1[0] == out6[0]) + assert torch.all(torch.isclose(out4[0], out5[0], atol=5e-3)) def test_config(self): from auto_round.export.export_to_itrex import QuantConfig - config = QuantConfig.from_pretrained("/tf_dataset/auto_round/models/TheBloke/Llama-2-7B-Chat-GPTQ") + config = QuantConfig.from_pretrained(get_model_path("TheBloke/Llama-2-7B-Chat-GPTQ")) config.save_pretrained("quantization_config_dir") loaded_config = QuantConfig.from_pretrained("quantization_config_dir") - self.assertEqual(config.group_size, loaded_config.group_size) - self.assertEqual(config.desc_act, loaded_config.desc_act) - self.assertEqual(config.bits, loaded_config.bits) - self.assertEqual(config.sym, loaded_config.sym) + assert config.group_size == loaded_config.group_size + assert config.desc_act == loaded_config.desc_act + assert config.bits == loaded_config.bits + assert config.sym == loaded_config.sym def test_xpu_export(self): model = copy.deepcopy(self.gptj) @@ -106,12 +94,8 @@ def test_xpu_export(self): out3 = q_model(self.lm_input) out4 = compressed_model_xpu(self.lm_input) out5 = compressed_model_cpu(self.lm_input) - self.assertTrue(torch.all(torch.isclose(out1[0], out2[0], atol=1e-1))) - self.assertFalse(torch.all(out1[0] == out2[0])) - self.assertTrue(torch.all(out2[0] == out3[0])) - self.assertTrue(torch.all(torch.isclose(out3[0], out4[0], atol=1e-3))) - self.assertTrue(torch.all(torch.isclose(out4[0], out5[0], atol=1e-5))) - - -if __name__ == "__main__": - unittest.main() + assert torch.all(torch.isclose(out1[0], out2[0], atol=1e-1)) + assert not torch.all(out1[0] == out2[0]) + assert torch.all(out2[0] == out3[0]) + assert torch.all(torch.isclose(out3[0], out4[0], atol=1e-3)) + assert torch.all(torch.isclose(out4[0], out5[0], atol=1e-5)) diff --git a/test/test_cpu/test_block_names.py b/test/test_cpu/test_block_names.py index 501caee25..47c554317 100644 --- a/test/test_cpu/test_block_names.py +++ b/test/test_cpu/test_block_names.py @@ -1,25 +1,14 @@ import os import shutil -import sys -import unittest -sys.path.insert(0, ".") -sys.path.insert(0, "../..") +import pytest import torch import torch.nn as nn from transformers import 
AutoConfig, AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound - -class LLMDataLoader: - def __init__(self, input_size=10): - self.batch_size = 1 - self.input_size = input_size - - def __iter__(self): - for i in range(2): - yield torch.ones([1, self.input_size], dtype=torch.long) +from ..helpers import get_model_path, lamini_name_or_path # ================= simple multimodal model ================= @@ -116,15 +105,14 @@ def forward(self, x): return output -class TestQuantizationBlocks(unittest.TestCase): +class TestQuantizationBlocks: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/MBZUAI/LaMini-GPT-124M" + def setup_class(self): + self.model_name = lamini_name_or_path self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -133,7 +121,6 @@ def test_moe_quant(self): hidden_size = 10 num_groups = 2 experts_per_group = 2 - self.llm_dataloader = LLMDataLoader(input_size) self.model = NestedMoEModel(input_size, hidden_size, num_groups, experts_per_group) from auto_round.utils import get_block_names @@ -159,7 +146,7 @@ def test_multimodal_quant(self): assert block_names_wo_vision == llm_block_names assert len(block_names_wo_vision) != (block_names_with_vision) - def test_block_name_quant(self): + def test_block_name_quant(self, dataloader): self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) from auto_round.utils import get_block_names @@ -174,7 +161,7 @@ def test_block_name_quant(self): iters=2, seqlen=2, batch_size=batch_size, - dataset=self.llm_dataloader, + dataset=dataloader, to_quant_block_names=llm_block_names, ) autoround.quantize() @@ -191,33 +178,29 @@ def test_block_name_quant(self): assert quant_config.block_name_to_quantize is not None shutil.rmtree("./saved", ignore_errors=True) - def test_mm_block_name(self): + def test_mm_block_name(self, tiny_qwen_vl_model_path): from transformers import Qwen2VLForConditionalGeneration from auto_round.utils import get_block_names - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct" + model_name = tiny_qwen_vl_model_path model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True, device_map="auto") block_name = get_block_names(model, quant_vision=True) - self.assertTrue(len(block_name) == 2) - self.assertTrue(all(["visual.merger.mlp" not in n for n in block_name])) + assert len(block_name) == 2 + assert all(["visual.merger.mlp" not in n for n in block_name]) block_name = get_block_names(model, quant_vision=False) - self.assertTrue(len(block_name) == 1) - self.assertTrue(block_name == get_block_names(model)) + assert len(block_name) == 1 + assert block_name == get_block_names(model) def test_moe(self): from auto_round.utils import get_block_names - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen1.5-MoE-A2.7B" + model_name = get_model_path("Qwen/Qwen1.5-MoE-A2.7B") # config = AutoConfig.from_pretrained(model_name) model = AutoModelForCausalLM.from_pretrained(model_name) block_name = get_block_names(model) block_name_2 = get_block_names(model, quant_vision=True) - self.assertTrue(block_name == block_name_2) - self.assertTrue(len(block_name_2) == 1) - self.assertTrue("model.layers.23" == block_name_2[0][-1]) - - -if __name__ == "__main__": - 
unittest.main() + assert block_name == block_name_2 + assert len(block_name_2) == 1 + assert "model.layers.23" == block_name_2[0][-1] diff --git a/test/test_cpu/test_calib_dataset.py b/test/test_cpu/test_calib_dataset.py index 689cc705c..cb276147e 100644 --- a/test/test_cpu/test_calib_dataset.py +++ b/test/test_cpu/test_calib_dataset.py @@ -1,29 +1,19 @@ +import json import os import shutil -import sys -import unittest - -sys.path.insert(0, "../..") -import json +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound +from ..helpers import get_model_path, opt_name_or_path -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - -class TestLocalCalibDataset(unittest.TestCase): +class TestLocalCalibDataset: @classmethod - def setUpClass(self): + def setup_class(self): json_data = [{"text": "awefdsfsddfd"}, {"text": "fdfdfsdfdfdfd"}, {"text": "dfdsfsdfdfdfdf"}] os.makedirs("./saved", exist_ok=True) self.json_file = "./saved/tmp.json" @@ -38,7 +28,7 @@ def setUpClass(self): json.dump(item, jsonl_file, ensure_ascii=False) jsonl_file.write("\n") - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -71,7 +61,7 @@ def test_jsonl(self): autoround.quantize() def test_apply_chat_template(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) dataset = "NeelNanda/pile-10k:apply_chat_template:system_prompt=''" @@ -130,10 +120,6 @@ def test_combine_dataset2(self): # autoround.quantize() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_cli_usage.py b/test/test_cpu/test_cli_usage.py index 2b93f5131..b3aecf2f1 100644 --- a/test/test_cpu/test_cli_usage.py +++ b/test/test_cpu/test_cli_usage.py @@ -1,45 +1,44 @@ import os import shutil import sys -import unittest -sys.path.insert(0, "../..") +from ..helpers import get_model_path -class TestAutoRoundCmd(unittest.TestCase): +class TestAutoRoundCmd: @classmethod - def setUpClass(self): + def setup_class(self): pass @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) shutil.rmtree("../../saved", ignore_errors=True) shutil.rmtree("../../tmp_autoround", ignore_errors=True) - def test_auto_round_cmd(self): + def test_auto_round_cmd(self, tiny_opt_model_path, tiny_qwen_vl_model_path): python_path = sys.executable # Test llm script - res = os.system(f"cd ../.. && {python_path} -m auto_round -h") + res = os.system(f"cd .. && {python_path} -m auto_round -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. 
&& {python_path} -m auto_round --model '/tf_dataset/auto_round/models/facebook/opt-125m' --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa" + f"cd .. && {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --model '/tf_dataset/auto_round/models/facebook/opt-125m' --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" + f"cd .. && {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai" + f"cd .. && {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" @@ -47,28 +46,24 @@ def test_auto_round_cmd(self): # test mllm script # test auto_round_mllm --eval help - res = os.system(f"cd ../.. && {python_path} -m auto_round --eval -h") + res = os.system(f"cd .. && {python_path} -m auto_round --eval -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" # test auto_round_mllm --lmms help - res = os.system(f"cd ../.. && {python_path} -m auto_round --eval --lmms -h") + res = os.system(f"cd .. && {python_path} -m auto_round --eval --lmms -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --mllm --model /tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" + f"cd .. && {python_path} -m auto_round --mllm --model {tiny_qwen_vl_model_path} --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model /tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct --seqlen 32 --format auto_round" + f"cd .. 
&& {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model {tiny_qwen_vl_model_path} --seqlen 32 --format auto_round" " --quant_nontext_module --output_dir ./saved " ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_conv1d.py b/test/test_cpu/test_conv1d.py index edd28110f..1997026b3 100644 --- a/test/test_cpu/test_conv1d.py +++ b/test/test_cpu/test_conv1d.py @@ -1,38 +1,27 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch -from _test_helpers import model_infer from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound +from ..helpers import lamini_name_or_path, model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestQuantizationConv1d(unittest.TestCase): +class TestQuantizationConv1d: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/MBZUAI/LaMini-GPT-124M" + def setup_class(self): + self.model_name = lamini_name_or_path self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_quant(self): + def test_quant(self, dataloader): self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -43,7 +32,7 @@ def test_quant(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() @@ -51,7 +40,3 @@ def test_quant(self): model = AutoModelForCausalLM.from_pretrained("./saved", device_map="cpu", trust_remote_code=True) model_infer(model, self.tokenizer) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index 47b18f314..a2c9e1fa6 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -1,16 +1,14 @@ import os import shutil -import sys -import unittest -from parameterized import parameterized - -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound +from ..helpers import get_model_path, opt_name_or_path + def _get_folder_size(path: str) -> float: """Return folder size in GB.""" @@ -23,30 +21,20 @@ def _get_folder_size(path: str) -> float: return total_size / (1024**3) # convert to GB -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def setup_class(self): + self.model_name = opt_name_or_path self.save_dir = "./saved" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", 
ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_autogptq_format(self): + def test_autogptq_format(self, dataloader): for group_size in [-1, 32, 128]: bits, sym = 4, False model_name = self.model_name @@ -57,7 +45,7 @@ def test_autogptq_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" @@ -76,7 +64,7 @@ def test_autogptq_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_format(self): + def test_autoround_format(self, dataloader): for group_size in [-1, 32, 128]: bits, sym = 4, True model_name = self.model_name @@ -87,7 +75,7 @@ def test_autoround_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") @@ -102,7 +90,7 @@ def test_autoround_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_awq_format(self): + def test_autoround_awq_format(self, dataloader): for group_size in [-1, 32, 128]: bits, sym = 4, False model_name = self.model_name @@ -113,7 +101,7 @@ def test_autoround_awq_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" @@ -132,7 +120,7 @@ def test_autoround_awq_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoawq_format(self): + def test_autoawq_format(self, dataloader): for group_size in [-1, 32, 128]: bits, sym = 4, False autoround = AutoRound( @@ -143,7 +131,7 @@ def test_autoawq_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -163,7 +151,7 @@ def test_autoawq_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_3bit_asym_format(self): + def test_autoround_3bit_asym_format(self, dataloader): bits, group_size, sym = 3, 128, False autoround = AutoRound( self.model, @@ -173,7 +161,7 @@ def test_autoround_3bit_asym_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = self.save_dir @@ -187,7 +175,7 @@ def test_autoround_3bit_asym_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_autoround_3bit_sym_format(self): + def test_autoround_3bit_sym_format(self, dataloader): bits, group_size, sym = 3, 128, True autoround = AutoRound( self.model, @@ -197,7 +185,7 @@ def test_autoround_3bit_sym_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = self.save_dir @@ -211,7 +199,7 @@ def test_autoround_3bit_sym_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - @parameterized.expand([(None,), ("fp8",), ("float16")]) + @pytest.mark.parametrize("static_kv_dtype", ["fp8", "float16"]) def test_static_afp8_export(self, static_kv_dtype): import os @@ -237,10 +225,10 @@ def test_static_afp8_export(self, 
static_kv_dtype): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") - self.assertIn("model.decoder.layers.8.self_attn.k_proj.input_scale", f.keys()) - self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape, torch.Size([1])) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, torch.float8_e4m3fn) + assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() + assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape == torch.Size([1]) + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.float8_e4m3fn if static_kv_dtype is None: with torch.no_grad(): import transformers @@ -270,11 +258,11 @@ def test_static_afp8_export(self, static_kv_dtype): assert output is not None, "Output should not be None" if static_kv_dtype == "fp8": - self.assertIn("model.decoder.layers.8.self_attn.k_scale", f.keys()) - self.assertIn("model.decoder.layers.8.self_attn.v_scale", f.keys()) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_scale").shape, torch.Size([1])) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.k_scale").shape, torch.Size([1])) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype, torch.float32) + assert "model.decoder.layers.8.self_attn.k_scale" in f.keys() + assert "model.decoder.layers.8.self_attn.v_scale" in f.keys() + assert f.get_tensor("model.decoder.layers.5.self_attn.v_scale").shape == torch.Size([1]) + assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").shape == torch.Size([1]) + assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype == torch.float32 shutil.rmtree(quantized_model_path, ignore_errors=True) model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -296,10 +284,10 @@ def test_static_afp8_export(self, static_kv_dtype): autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") - self.assertIn("model.decoder.layers.8.self_attn.k_proj.input_scale", f.keys()) - self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape, torch.Size([1])) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, torch.float8_e4m3fn) + assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() + assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape == torch.Size([1]) + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.float8_e4m3fn shutil.rmtree(quantized_model_path, ignore_errors=True) def test_static_fp8_attn(self): @@ -321,22 +309,22 @@ def test_static_fp8_attn(self): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") - 
self.assertIn("model.decoder.layers.8.self_attn.k_proj.input_scale", f.keys()) - self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape, torch.Size([1])) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, torch.float8_e4m3fn) + assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() + assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape == torch.Size([1]) + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.float8_e4m3fn check_attrs = ["k_scale", "v_scale", "q_scale"] for attr in check_attrs: weight_name = f"model.decoder.layers.8.self_attn.{attr}" - self.assertIn(weight_name, f.keys()) - self.assertEqual(f.get_tensor(weight_name).shape, torch.Size([1])) - self.assertEqual(f.get_tensor(weight_name).dtype, torch.float32) + assert weight_name in f.keys() + assert f.get_tensor(weight_name).shape == torch.Size([1]) + assert f.get_tensor(weight_name).dtype == torch.float32 shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_awq_lmhead_export(self): + def test_awq_lmhead_export(self, dataloader): bits, sym, group_size = 4, False, 128 - model_name = "/tf_dataset/auto_round/models/microsoft/phi-2" + model_name = get_model_path("microsoft/phi-2") layer_config = { "lm_head": {"bits": 4}, # set lm_head quant "layer": {"bits": 16}, @@ -350,7 +338,7 @@ def test_awq_lmhead_export(self): nsamples=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") @@ -360,10 +348,10 @@ def test_awq_lmhead_export(self): assert isinstance(lm_head, WQLinear_GEMM), "Illegal AWQ quantization for lm_head layer" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_gptq_lmhead_export(self): + def test_gptq_lmhead_export(self, dataloader): bits, sym, group_size = 4, True, 128 # Note that, to save UT tuning time, the local model is intentionally kept lightweight, using only 2 hidden layers. 
- model_name = "/tf_dataset/auto_round/models/microsoft/phi-2" + model_name = get_model_path("microsoft/phi-2") layer_config = { "lm_head": {"bits": 4}, # set lm_head quant "layer": {"bits": 16}, @@ -377,7 +365,7 @@ def test_gptq_lmhead_export(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -393,7 +381,3 @@ def test_gptq_lmhead_export(self): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0]) print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_generation.py b/test/test_cpu/test_generation.py index 5018d1610..e1e9dc3f1 100644 --- a/test/test_cpu/test_generation.py +++ b/test/test_cpu/test_generation.py @@ -1,39 +1,29 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound +from ..helpers import opt_name_or_path -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRoundFormatGeneration(unittest.TestCase): +class TestAutoRoundFormatGeneration: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def setup_class(self): + self.model_name = opt_name_or_path self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() self.save_folder = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_4bits_sym(self): + def test_4bits_sym(self, dataloader): bits = 4 group_size = 128 sym = True @@ -45,7 +35,7 @@ def test_4bits_sym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder @@ -72,7 +62,7 @@ def test_4bits_sym(self): print(res) assert "!!!" 
not in res - def test_autoround_sym(self): + def test_autoround_sym(self, dataloader): for bits in [4]: model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -85,7 +75,7 @@ def test_autoround_sym(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index 53b199c41..92e9d620e 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -1,43 +1,32 @@ import os import shutil import sys -import unittest - -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound +from ..helpers import get_model_path, get_tiny_model -class LLMDataLoader: - - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - -class TestGGUF(unittest.TestCase): +class TestGGUF: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" + def setup_class(self): + self.model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_basic_usage(self): + def test_basic_usage(self, tiny_gemma_model_path, tiny_qwen_model_path): python_path = sys.executable res = os.system( - f"cd ../.. && {python_path} -m auto_round --model /tf_dataset/auto_round/models/benzart/gemma-2b-it-fine-tuning-for-code-test " + f"cd .. && {python_path} -m auto_round --model {tiny_gemma_model_path} " f" --bs 16 --iters 0 --nsamples 1 --format gguf:q4_k_m" ) if res > 0 or res == -1: @@ -45,7 +34,7 @@ def test_basic_usage(self): shutil.rmtree("./saved", ignore_errors=True) res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {self.model_name}" + f"cd .. 
&& {python_path} -m auto_round --model {tiny_qwen_model_path}" f" --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" ) if res > 0 or res == -1: @@ -73,39 +62,12 @@ def test_q4_0(self): inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - # from auto_round.eval.evaluation import simple_evaluate_user_model - # result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="openbookqa", eval_model_dtype="bf16") - # # 0.246 - # self.assertGreater(result['results']['openbookqa']['acc,none'], 0.23) shutil.rmtree("./saved", ignore_errors=True) - # def test_q4_1(self): - # bits, group_size, sym = 4, 32, False - # autoround = AutoRound( - # self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, data_type="int", nsamples=1 - # ) - # quantized_model_path = "./saved" - # - # autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q4_1") - # gguf_file = os.listdir(quantized_model_path)[0] - # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - # text = "There is a girl who likes adventure," - # inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - # print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - # - # # from auto_round.eval.evaluation import simple_evaluate_user_model - # # result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="openbookqa", eval_model_dtype="bf16") - # # # 0.23 - # # self.assertGreater(result['results']['openbookqa']['acc,none'], 0.22) - # shutil.rmtree("./saved", ignore_errors=True) - def test_func(self): bits, group_size, sym = 4, 128, True autoround = AutoRound( self.model_name, - # bits=bits, - # group_size=group_size, - # sym=sym, iters=1, nsamples=1, seqlen=10, @@ -113,8 +75,8 @@ def test_func(self): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_1") - self.assertTrue(autoround.group_size == 32) - self.assertFalse(autoround.sym) + assert autoround.group_size == 32 + assert not autoround.sym gguf_file = os.listdir("saved")[0] model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") text = "There is a girl who likes adventure," @@ -122,80 +84,8 @@ def test_func(self): print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) shutil.rmtree("./saved", ignore_errors=True) - # model_name = "Qwen/Qwen2.5-1.5B-Instruct" - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - # autoround = AutoRound( - # model, - # self.tokenizer, - # bits=3, - # group_size=16, - # sym=True, - # iters=1, - # nsamples=1, - # data_type="int_sym_dq", - # super_group_size=16, - # super_bits=6, - # ) - quantized_model_path = "./saved" - # autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_k_s") - # from auto_round.eval.evaluation import simple_evaluate_user_model - # gguf_file = os.listdir("saved")[0] - # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - # result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="lambada_openai", eval_model_dtype="bf16") - # self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.5) - shutil.rmtree("./saved", ignore_errors=True) - - # - # def 
test_q5_k(self): - # model_name = "Qwen/Qwen2.5-1.5B-Instruct" - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - # autoround = AutoRound( - # model, - # self.tokenizer, - # bits=5, - # group_size=32, - # sym=False, - # iters=1, - # nsamples=1, - # data_type="int_asym_dq", - # super_group_size=8, - # super_bits=6, - # ) - # quantized_model_path = "./saved" - # autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_k_s") - # gguf_file = os.listdir("saved")[0] - # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - # text = "There is a girl who likes adventure," - # inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - # print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - # shutil.rmtree("./saved", ignore_errors=True) - - # def test_q6_k(self): - # model_name = "Qwen/Qwen2.5-1.5B-Instruct" - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - # autoround = AutoRound( - # model, - # self.tokenizer, - # bits=6, - # group_size=16, - # sym=True, - # iters=1, - # nsamples=1, - # data_type="int_sym_dq", - # super_group_size=16, - # super_bits=8, - # ) - # quantized_model_path = "./saved" - # autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_k") - # gguf_file = os.listdir("saved")[0] - # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - # text = "There is a girl who likes adventure," - # inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - # print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - # shutil.rmtree("./saved", ignore_errors=True) - def test_gguf_baseline(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) autoround = AutoRound( model, @@ -218,31 +108,9 @@ def test_gguf_baseline(self): inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) shutil.rmtree("./saved", ignore_errors=True) - # - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - # autoround = AutoRound( - # model, - # self.tokenizer, - # bits=5, - # group_size=32, - # sym=True, - # iters=0, - # nsamples=8, - # data_type="int_asym_dq", - # super_group_size=8, - # super_bits=6, - # disable_opt_rtn=True, - # ) - # quantized_model_path = "./saved" - # autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q5_k_s,fake") - # model = AutoModelForCausalLM.from_pretrained(quantized_model_path + "/fake", device_map="auto") - # text = "There is a girl who likes adventure," - # inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - # print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - # shutil.rmtree("./saved", ignore_errors=True) - def test_q4_k_m(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" + def test_q4_k_m(self, dataloader): + model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = 
AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) layer_config = { @@ -265,21 +133,21 @@ def test_q4_k_m(self): iters=0, seqlen=1, nsamples=8, - dataset=self.llm_dataloader, + dataset=dataloader, disable_opt_rtn=True, ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") - self.assertEqual(autoround.layer_config["model.layers.11.self_attn.v_proj"]["super_group_size"], 16) - self.assertEqual(autoround.layer_config["model.layers.11.self_attn.v_proj"]["data_type"], "int_sym_dq") - self.assertEqual(autoround.layer_config["model.layers.7.self_attn.v_proj"]["data_type"], "int_asym_dq") - self.assertEqual(autoround.model.model.layers[0].self_attn.v_proj.bits, 6) - self.assertEqual(autoround.model.model.layers[12].self_attn.v_proj.bits, 4) - self.assertEqual(autoround.model.model.embed_tokens.bits, 6) - self.assertEqual(autoround.model.model.embed_tokens.group_size, 16) - self.assertEqual(autoround.model.model.layers[12].mlp.gate_proj.bits, 3) - self.assertEqual(autoround.model.model.layers[10].mlp.gate_proj.bits, 8) - self.assertEqual(autoround.layer_config["model.layers.10.mlp.gate_proj"]["mostly"], "gguf:q8_0") + assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["super_group_size"] == 16 + assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["data_type"] == "int_sym_dq" + assert autoround.layer_config["model.layers.7.self_attn.v_proj"]["data_type"] == "int_asym_dq" + assert autoround.model.model.layers[0].self_attn.v_proj.bits == 6 + assert autoround.model.model.layers[12].self_attn.v_proj.bits == 4 + assert autoround.model.model.embed_tokens.bits == 6 + assert autoround.model.model.embed_tokens.group_size == 16 + assert autoround.model.model.layers[12].mlp.gate_proj.bits == 3 + assert autoround.model.model.layers[10].mlp.gate_proj.bits == 8 + assert autoround.layer_config["model.layers.10.mlp.gate_proj"]["mostly"] == "gguf:q8_0" shutil.rmtree("./saved", ignore_errors=True) model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -288,13 +156,13 @@ def test_q4_k_m(self): autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") shutil.rmtree("./saved", ignore_errors=True) - def test_all_format(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" + def test_all_format(self, tiny_qwen_model_path): + model_name = tiny_qwen_model_path python_path = sys.executable # for gguf_format in ["gguf:q4_0", "gguf:q4_1", "gguf:q4_k_m", "gguf:q6_k"]: for gguf_format in ["gguf:q4_k_m"]: res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {model_name} " + f"cd .. && {python_path} -m auto_round --model {model_name} " f" --bs 16 --iters 1 --nsamples 1 --seqlen 16 --format {gguf_format}" ) if res > 0 or res == -1: @@ -302,7 +170,7 @@ def test_all_format(self): shutil.rmtree("../../tmp_autoround", ignore_errors=True) res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {model_name}" + f"cd .. && {python_path} -m auto_round --model {model_name}" f" --bs 16 --iters 0 --nsamples 1 --seqlen 16 --format fake,{gguf_format}" ) if res > 0 or res == -1: @@ -311,7 +179,7 @@ def test_all_format(self): # test mixed q2_k_s res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {model_name}" + f"cd .. 
&& {python_path} -m auto_round --model {model_name}" f" --bs 16 --iters 0 --nsamples 1 --seqlen 16 --scheme GGUF:Q2_K_MIXED" ) if res > 0 or res == -1: @@ -319,7 +187,7 @@ def test_all_format(self): shutil.rmtree("../../tmp_autoround", ignore_errors=True) def test_vlm_gguf(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct" + model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") from auto_round import AutoRoundMLLM from auto_round.utils import mllm_load_model @@ -334,13 +202,13 @@ def test_vlm_gguf(self): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - self.assertTrue("mmproj-model.gguf" in os.listdir("./saved")) + assert "mmproj-model.gguf" in os.listdir("./saved") for file_name in os.listdir(quantized_model_path): file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2 if file_name == "mmproj-model.gguf": - self.assertAlmostEqual(file_size, 2537, delta=5.0) + assert abs(file_size - 2537) < 5.0 else: - self.assertAlmostEqual(file_size, 892, delta=5.0) + assert abs(file_size - 892) < 5.0 shutil.rmtree("./saved", ignore_errors=True) def test_qtype_setting(self): @@ -351,7 +219,7 @@ def test_qtype_setting(self): from auto_round.compressors.utils import set_layer_config from auto_round.export.export_to_gguf.config import ModelType - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) ar.formats = ["gguf:q4_0"] ar.layer_config, _, _ = set_layer_config( @@ -367,8 +235,8 @@ def test_qtype_setting(self): enable_gguf_official_mixed=True, is_mllm=ar.mllm, ) - self.assertTrue(ar.layer_config["model.embed_tokens"]["bits"] == 8) - self.assertTrue("lm_head" not in ar.layer_config) + assert ar.layer_config["model.embed_tokens"]["bits"] == 8 + assert "lm_head" not in ar.layer_config model_name = "Qwen/Qwen3-0.6B" ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) @@ -386,8 +254,8 @@ def test_qtype_setting(self): enable_gguf_official_mixed=True, is_mllm=ar.mllm, ) - self.assertTrue(ar.layer_config["model.embed_tokens"]["bits"] == 4) - self.assertTrue(ar.layer_config["lm_head"]["bits"] == 6 and ar.layer_config["lm_head"]["super_bits"] == 8) + assert ar.layer_config["model.embed_tokens"]["bits"] == 4 + assert ar.layer_config["lm_head"]["bits"] == 6 and ar.layer_config["lm_head"]["super_bits"] == 8 layer_config = { "model.embed_tokens": {"bits": 6, "super_bits": 8}, @@ -408,12 +276,8 @@ def test_qtype_setting(self): enable_gguf_official_mixed=True, is_mllm=ar.mllm, ) - self.assertTrue(ar.layer_config["lm_head"]["bits"] == 4) - self.assertTrue( - ar.layer_config["model.embed_tokens"]["bits"] == 6 + assert ( + ar.layer_config["lm_head"]["bits"] == 4 + and ar.layer_config["model.embed_tokens"]["bits"] == 6 and ar.layer_config["model.embed_tokens"]["super_bits"] == 8 ) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_gpt_oss.py b/test/test_cpu/test_gpt_oss.py deleted file mode 100644 index ccc997eba..000000000 --- a/test/test_cpu/test_gpt_oss.py +++ /dev/null @@ -1,72 +0,0 @@ -import pytest -from transformers import AutoConfig, AutoTokenizer -from transformers.models.gpt_oss.modeling_gpt_oss import GptOssForCausalLM - -from auto_round import AutoRound - - -@pytest.fixture -def setup_gpt_oss(): - """Fixture to set up the GPT-OSS model and tokenizer.""" - model_name = 
"/tf_dataset/auto_round/models/unsloth/gpt-oss-20b-BF16" - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) - config.num_hidden_layers = 1 # Reduce layers for testing - model = GptOssForCausalLM(config) - output_dir = "/tmp/test_quantized_gpt_oss" - return model, tokenizer, output_dir, config - - -def quantize_model(model, tokenizer, output_dir, scheme, iters=0): - """Helper function to quantize the model with the given scheme.""" - autoround = AutoRound( - model, - tokenizer, - scheme=scheme, - nsamples=2, - iters=iters, - fp_layers="self_attn,router,lm_head,mlp.gate", - ) - quantized_model, save_folder = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) - return quantized_model - - -def count_modules_by_type(model, target_module_name_or_class): - """Helper function to count modules of a specific type in the model.""" - cnt = 0 - for name, module in model.named_modules(): - if isinstance(target_module_name_or_class, str): - if target_module_name_or_class == module.__class__.__name__: - cnt += 1 - else: - if isinstance(module, target_module_name_or_class): - cnt += 1 - return cnt - - -@pytest.mark.parametrize("scheme", ["MXFP4", "MXFP8"]) -def test_quantization(setup_gpt_oss, scheme): - """Test quantization with the scheme.""" - model, tokenizer, output_dir, config = setup_gpt_oss - quantized_model = quantize_model(model, tokenizer, output_dir, scheme) - - # Ensure the quantized model is not None - assert quantized_model is not None, "Quantized model should not be None." - from auto_round.export.export_to_autoround.qlinear_fp import QuantLinear - from auto_round.modelling.gpt_oss import GPTOssSingleExpert - - single_expert_cnt = count_modules_by_type(quantized_model, GPTOssSingleExpert) - quant_linear_cnt = count_modules_by_type(quantized_model, QuantLinear) - assert ( - single_expert_cnt == config.num_local_experts - ), f"Expected {config.num_local_experts} GPTOssSingleExpert modules, found {single_expert_cnt}." - assert ( - quant_linear_cnt == config.num_hidden_layers * 3 * config.num_local_experts - ), f"Expected {config.num_hidden_layers * 3 * config.num_local_experts} QuantLinear modules, found {quant_linear_cnt}." - - print(f"[{scheme}] Total {GPTOssSingleExpert.__name__} modules: {single_expert_cnt}") - print(f"[{scheme}] Total {QuantLinear.__name__} modules: {quant_linear_cnt}") - # clean the output directory after test - import shutil - - shutil.rmtree(output_dir, ignore_errors=True) diff --git a/test/test_cpu/test_init.py b/test/test_cpu/test_init.py index 6ebee954d..01785d679 100644 --- a/test/test_cpu/test_init.py +++ b/test/test_cpu/test_init.py @@ -1,8 +1,8 @@ from auto_round import AutoRound -def test_torch_compile(): - ar = AutoRound(model="facebook/opt-125m", scheme="NVFP4", enable_torch_compile=True) +def test_torch_compile(tiny_opt_model_path): + ar = AutoRound(model=tiny_opt_model_path, scheme="NVFP4", enable_torch_compile=True) assert not ar.enable_torch_compile, "NVFP4 cannot work with torch.compile." - ar = AutoRound(model="facebook/opt-125m", scheme="FP8_STATIC", enable_torch_compile=True) + ar = AutoRound(model=tiny_opt_model_path, scheme="FP8_STATIC", enable_torch_compile=True) assert not ar.enable_torch_compile, "FP8_STATIC cannot work with torch.compile." 
diff --git a/test/test_cpu/test_llmc_integration.py b/test/test_cpu/test_llmc_integration.py index 6dba09cfa..cea412327 100644 --- a/test/test_cpu/test_llmc_integration.py +++ b/test/test_cpu/test_llmc_integration.py @@ -85,7 +85,7 @@ def test_oneshot_application(recipe, tmp_path): assert weight_args.num_bits == 4 # Check a specific layer is quantized - targeted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj + targeted_linear_layer = model_loaded.model.layers[1].self_attn.q_proj assert hasattr(targeted_linear_layer, "quantization_scheme") # Check lm-head is not quantized diff --git a/test/test_cpu/test_llmcompressor.py b/test/test_cpu/test_llmcompressor.py index 051dfb075..614701943 100644 --- a/test/test_cpu/test_llmcompressor.py +++ b/test/test_cpu/test_llmcompressor.py @@ -1,25 +1,24 @@ import os import shutil -import sys -import unittest - -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound +from ..helpers import get_model_path, opt_name_or_path + -class TestLLMC(unittest.TestCase): +class TestLLMC: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/stas/tiny-random-llama-2" + def setup_class(self): + self.model_name = get_model_path("stas/tiny-random-llama-2") self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -42,7 +41,7 @@ def test_llmcompressor_w8a8(self): def test_llmcompressor_fp8(self): ## quantize the model - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path autoround = AutoRound( model_name, scheme="FP8_STATIC", @@ -59,14 +58,14 @@ def test_llmcompressor_fp8(self): import json config = json.load(open("./saved/config.json")) - self.assertIn("group_0", config["quantization_config"]["config_groups"]) - self.assertEqual(config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"], 8) - self.assertEqual(config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"], "channel") - self.assertEqual(config["quantization_config"]["quant_method"], "compressed-tensors") + assert "group_0" in config["quantization_config"]["config_groups"] + assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8 + assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "channel" + assert config["quantization_config"]["quant_method"] == "compressed-tensors" def test_autoround_llmcompressor_fp8(self): ## quantize the model - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path autoround = AutoRound( model_name, scheme="FP8_STATIC", @@ -80,14 +79,8 @@ def test_autoround_llmcompressor_fp8(self): import json config = json.load(open("./saved/config.json")) - self.assertIn("group_0", config["quantization_config"]["config_groups"]) - self.assertEqual(config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"], 8) - self.assertEqual(config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"], "tensor") - self.assertEqual( - 
config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["strategy"], "tensor" - ) - self.assertEqual(config["quantization_config"]["quant_method"], "compressed-tensors") - - -if __name__ == "__main__": - unittest.main() + assert "group_0" in config["quantization_config"]["config_groups"] + assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8 + assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "tensor" + assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["strategy"] == "tensor" + assert config["quantization_config"]["quant_method"] == "compressed-tensors" diff --git a/test/test_cpu/test_load_awq_gptq.py b/test/test_cpu/test_load_awq_gptq.py index 4fb6bb977..6dc295b4e 100644 --- a/test/test_cpu/test_load_awq_gptq.py +++ b/test/test_cpu/test_load_awq_gptq.py @@ -1,46 +1,21 @@ import shutil -import sys -import unittest - -sys.path.insert(0, "../..") +import pytest from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer +from ..helpers import get_model_path, model_infer -class TestAutoRound(unittest.TestCase): - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) +class TestAutoRound: @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("runs", ignore_errors=True) def test_load_gptq_no_dummy_gidx_model(self): - model_name = "/tf_dataset/auto_round/models/ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1" + model_name = get_model_path("ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1") quantization_config = AutoRoundConfig() - with self.assertRaises(NotImplementedError) as cm: + with pytest.raises(NotImplementedError): model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="auto", @@ -50,7 +25,7 @@ def test_load_gptq_no_dummy_gidx_model(self): ) def test_load_awq(self): - model_name = "/tf_dataset/auto_round/models/casperhansen/opt-125m-awq" + model_name = get_model_path("casperhansen/opt-125m-awq") quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( model_name, @@ -60,4 +35,4 @@ def test_load_awq(self): quantization_config=quantization_config, ) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) diff --git a/test/test_cpu/test_mix_bits.py b/test/test_cpu/test_mix_bits.py index 2c73d42cd..6cc390637 100644 --- a/test/test_cpu/test_mix_bits.py +++ b/test/test_cpu/test_mix_bits.py @@ -1,19 +1,17 @@ import json import os import shutil -import sys -import unittest from pathlib import Path -from parameterized import parameterized - -sys.path.insert(0, "../..") +import pytest import torch from 
transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound from auto_round.testing_utils import require_gptqmodel +from ..helpers import opt_name_or_path + def _get_folder_size(path: str) -> float: """Return folder size in GB.""" @@ -26,31 +24,21 @@ def _get_folder_size(path: str) -> float: return total_size / (1024**3) # convert to GB -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def setup_class(self): + self.model_name = opt_name_or_path self.save_dir = ".saved/" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel - def test_mixed_gptqmodel(self): + def test_mixed_gptqmodel(self, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 4}, # set lm_head quant @@ -64,7 +52,7 @@ def test_mixed_gptqmodel(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -79,7 +67,7 @@ def test_mixed_gptqmodel(self): assert "!!!" not in model.tokenizer.decode(result) # string output shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_gptqmodel_convert_to_ar(self): + def test_mixed_gptqmodel_convert_to_ar(self, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 4}, # set lm_head quant @@ -93,7 +81,7 @@ def test_mixed_gptqmodel_convert_to_ar(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -108,7 +96,7 @@ def test_mixed_gptqmodel_convert_to_ar(self): print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_autoround_format(self): + def test_mixed_autoround_format(self, dataloader): layer_config = { "k_proj": {"bits": 8}, "q_proj": {"bits": 3}, @@ -120,7 +108,7 @@ def test_mixed_autoround_format(self): scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "./saved" @@ -134,7 +122,7 @@ def test_mixed_autoround_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_fallback_regex_for_awq_format(self): + def test_fallback_regex_for_awq_format(self, dataloader): layer_config = { "lm_head": {"bits": 16}, "fc1": {"bits": 16}, @@ -144,7 +132,7 @@ def test_fallback_regex_for_awq_format(self): scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "./saved" @@ -159,7 +147,7 @@ def test_fallback_regex_for_awq_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) 
shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_ar_format_part_name_hf_loading(self): + def test_mixed_ar_format_part_name_hf_loading(self, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 16}, # full name @@ -170,7 +158,7 @@ def test_mixed_ar_format_part_name_hf_loading(self): scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "./saved" @@ -220,7 +208,7 @@ def test_mixed_ar_format_part_name_hf_loading(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_MXFP_autoround_format_loading(self): + def test_mixed_MXFP_autoround_format_loading(self, dataloader): layer_config = { "k_proj": {"bits": 8, "act_bits": 8}, "lm_head": {"bits": 16, "act_bits": 16}, @@ -231,7 +219,7 @@ def test_mixed_MXFP_autoround_format_loading(self): scheme="MXFP4", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -246,9 +234,5 @@ def test_mixed_MXFP_autoround_format_loading(self): result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.14) + assert result["results"]["lambada_openai"]["acc,none"] > 0.14 shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_mllm.py b/test/test_cpu/test_mllm.py index 8510adca5..2eb1d3e2f 100644 --- a/test/test_cpu/test_mllm.py +++ b/test/test_cpu/test_mllm.py @@ -1,14 +1,12 @@ -import sys -import unittest - -sys.path.insert(0, "../..") - import shutil +import pytest from transformers import AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration from auto_round import AutoRoundMLLM +from ..helpers import get_model_path, opt_name_or_path + class FakeDataLoader: def __init__(self): @@ -27,23 +25,21 @@ def __iter__(self): yield self.data -class TestAutoRoundMLLM(unittest.TestCase): +class TestAutoRoundMLLM: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct" + def setup_class(self): + self.model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") self.dataset = FakeDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - return super().tearDownClass() - - def test_tune(self): + def test_tune(self, tiny_qwen_vl_model_path): bits, group_size = 4, 128 autoround = AutoRoundMLLM( - model=self.model_name, + model=tiny_qwen_vl_model_path, bits=bits, group_size=group_size, nsamples=1, @@ -56,11 +52,11 @@ def test_tune(self): autoround.save_quantized("./saved/", format="auto_gptq", inplace=False) autoround.save_quantized("./saved/", format="auto_round", inplace=False) - def test_quant_vision(self): ## bug need to fix - tokenizer = AutoTokenizer.from_pretrained(self.model_name) - processor = AutoProcessor.from_pretrained(self.model_name, trust_remote_code=True) + def test_quant_vision(self, tiny_qwen_vl_model_path): ## bug need to fix + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_vl_model_path) + processor = AutoProcessor.from_pretrained(tiny_qwen_vl_model_path, trust_remote_code=True) model = 
Qwen2VLForConditionalGeneration.from_pretrained( - self.model_name, trust_remote_code=True, device_map="auto" + tiny_qwen_vl_model_path, trust_remote_code=True, device_map="auto" ) bits, group_size = 4, 128 autoround = AutoRoundMLLM( @@ -105,17 +101,17 @@ class Myclass: dataset = MLLM_DATASET["liuhaotian/llava"]( template=Myclass(), model=None, tokenzier=None, dataset_path="liuhaotian/llava", seqlen=32, nsamples=32 ) - self.assertEqual(len(dataset.questions), 32) + assert len(dataset.questions) == 32 dataset = MLLM_DATASET["liuhaotian/llava"]( template=Myclass(), model=None, tokenzier=None, dataset_path="liuhaotian/llava", seqlen=2048, nsamples=512 ) - self.assertEqual(len(dataset.questions), 512) + assert len(dataset.questions) == 512 - def test_diff_dataset(self): - tokenizer = AutoTokenizer.from_pretrained(self.model_name) - processor = AutoProcessor.from_pretrained(self.model_name, trust_remote_code=True) + def test_diff_dataset(self, tiny_qwen_vl_model_path): + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_vl_model_path) + processor = AutoProcessor.from_pretrained(tiny_qwen_vl_model_path, trust_remote_code=True) model = Qwen2VLForConditionalGeneration.from_pretrained( - self.model_name, trust_remote_code=True, device_map="auto" + tiny_qwen_vl_model_path, trust_remote_code=True, device_map="auto" ) bits, group_size = 4, 128 dataset = ["dataset test", "list test"] @@ -133,19 +129,17 @@ def test_diff_dataset(self): ) autoround.quantize() - def test_pure_text_model_check(self): + def test_pure_text_model_check(self, tiny_qwen_vl_model_path): from transformers import AutoModelForCausalLM from auto_round.utils import is_pure_text_model model = Qwen2VLForConditionalGeneration.from_pretrained( - self.model_name, trust_remote_code=True, device_map="auto" + tiny_qwen_vl_model_path, trust_remote_code=True, device_map="auto" ) - self.assertFalse(is_pure_text_model(model)) - model = AutoModelForCausalLM.from_pretrained( - "/tf_dataset/auto_round/models/facebook/opt-125m", trust_remote_code=True - ) - self.assertTrue(is_pure_text_model(model)) + assert not is_pure_text_model(model) + model = AutoModelForCausalLM.from_pretrained(opt_name_or_path, trust_remote_code=True) + assert is_pure_text_model(model) def test_str_input(self): tokenizer = AutoTokenizer.from_pretrained(self.model_name) @@ -211,10 +205,10 @@ def test_str_input(self): ) print(output_text[0]) - def test_qwen2_5(self): + def test_qwen2_5(self, tiny_qwen_2_5_vl_model_path): from auto_round.utils import mllm_load_model - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-VL-3B-Instruct" + model_name = tiny_qwen_2_5_vl_model_path model, processor, tokenizer, image_processor = mllm_load_model(model_name) autoround = AutoRoundMLLM( model, @@ -264,8 +258,3 @@ def test_qwen2_5(self): output_text = processor.batch_decode( generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False ) - print(output_text) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_model_scope.py b/test/test_cpu/test_model_scope.py index 6da33cdc3..7edcab156 100644 --- a/test/test_cpu/test_model_scope.py +++ b/test/test_cpu/test_model_scope.py @@ -1,30 +1,19 @@ import copy import os import shutil -import sys -import unittest - -sys.path.insert(0, "../..") +import pytest import torch from auto_round import AutoRound +from ..helpers import get_model_path -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(3): - yield torch.ones([1, 10], 
dtype=torch.long) - -class TestModelScope(unittest.TestCase): +class TestModelScope: @classmethod - def setUpClass(self): + def setup_class(self): self.saved_path = "./saved" - self.dataset = LLMDataLoader() self.source_path, self.cache_path = "/tf_dataset/auto_round/modelscope", "/home/hostuser/.cache/modelscope" if os.path.exists(self.source_path): @@ -33,28 +22,20 @@ def setUpClass(self): shutil.copytree(self.source_path, self.cache_path, dirs_exist_ok=True) @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) if os.path.exists(self.cache_path): shutil.rmtree(self.cache_path, ignore_errors=True) - return super().tearDownClass() - - def test_llm(self): - model_name = "Qwen/Qwen2.5-0.5B-Instruct" - autoround = AutoRound( - model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset - ) + def test_llm(self, dataloader): + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") + autoround = AutoRound(model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=dataloader) autoround.quantize_and_save() - def test_mllm(self): - model_name = "Qwen/Qwen2-VL-2B-Instruct" + def test_mllm(self, dataloader): + model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") autoround = AutoRound( - model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset, batch_size=2 + model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=dataloader, batch_size=2 ) autoround.quantize_and_save(self.saved_path) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_moe_model.py b/test/test_cpu/test_moe_model.py index c88571346..c30ab0e39 100644 --- a/test/test_cpu/test_moe_model.py +++ b/test/test_cpu/test_moe_model.py @@ -6,29 +6,37 @@ from auto_round import AutoRound +from ..helpers import get_model_path + +gpt_oss_name_or_path = get_model_path("unsloth/gpt-oss-20b-BF16") +llama4_name_or_path = get_model_path("meta-llama/Llama-4-Scout-17B-16E-Instruct") + +# local path for debug +# llama4_name_or_path = get_model_path("/dataset/Llama-4-Scout-17B-16E-Instruct") + @pytest.fixture def setup_gpt_oss(): """Fixture to set up the GPT-OSS model and tokenizer.""" - model_name = "/tf_dataset/auto_round/models/unsloth/gpt-oss-20b-BF16" + model_name = gpt_oss_name_or_path tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) config.num_hidden_layers = 1 # Reduce layers for testing model = GptOssForCausalLM(config) - output_dir = "/tmp/test_quantized_gpt_oss" + output_dir = "./tmp/test_quantized_gpt_oss" return model, tokenizer, output_dir, config @pytest.fixture def setup_llama4(): """Fixture to set up the llama4 model and tokenizer.""" - model_name = "/tf_dataset/auto_round/models/meta-llama/Llama-4-Scout-17B-16E-Instruct" + model_name = llama4_name_or_path tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) config.vision_config.num_hidden_layers = 2 # Reduce layers for testing config.text_config.num_hidden_layers = 2 model = Llama4ForConditionalGeneration(config) - output_dir = "/tmp/test_quantized_llama4" + output_dir = "./tmp/test_quantized_llama4" return model, tokenizer, output_dir, config @@ -46,23 +54,52 @@ def quantize_model(model, tokenizer, output_dir, scheme, iters=0): return quantized_model 
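Note: throughout these test files the hard-coded "/tf_dataset/auto_round/models/..." checkpoints are replaced by get_model_path(...) lookups and tiny-model fixtures imported from ..helpers and conftest.py, whose implementations are not part of this diff. A minimal sketch of what get_model_path() presumably does, assuming pre-downloaded checkpoints live under /tf_dataset/auto_round/models (an assumption based on the paths removed above), is:

import os

def get_model_path(name_or_path: str) -> str:
    # Assumed CI cache root; the real helper in test/helpers.py may differ.
    local_root = "/tf_dataset/auto_round/models"
    local_path = os.path.join(local_root, name_or_path)
    # Prefer the pre-downloaded checkpoint when it exists; otherwise return the
    # plain model id so transformers resolves it from the Hugging Face hub.
    return local_path if os.path.isdir(local_path) else name_or_path

With a helper like this, get_model_path("facebook/opt-125m") resolves to the cached copy on CI hosts and falls back to the hub id elsewhere.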
-def test_gptoss(setup_gpt_oss): +def count_modules_by_type(model, target_module_name_or_class): + """Helper function to count modules of a specific type in the model.""" + cnt = 0 + for name, module in model.named_modules(): + if isinstance(target_module_name_or_class, str): + if target_module_name_or_class == module.__class__.__name__: + cnt += 1 + else: + if isinstance(module, target_module_name_or_class): + cnt += 1 + return cnt + + +@pytest.mark.parametrize("scheme", ["MXFP4", "MXFP8"]) +def test_gptoss(setup_gpt_oss, scheme): model, tokenizer, output_dir, config = setup_gpt_oss # Below parameter is set to be same as the full model # Remove it to avoid mismatch during quantized model loading delattr(model.config, "layer_types") - quantized_model = quantize_model(model, tokenizer, output_dir, "MXFP4") + quantized_model = quantize_model(model, tokenizer, output_dir, scheme) # Ensure the quantized model is not None assert quantized_model is not None, "Quantized model should not be None." - - loaded_model = GptOssForCausalLM.from_pretrained(output_dir) - for n, m in quantized_model.named_modules(): - if m.__class__.__name__ == "QuantLinear": - loaded_m = loaded_model.get_submodule(n) - assert (loaded_m.weight_packed.to("cpu") == m.weight_packed.to("cpu")).all() + from auto_round.export.export_to_autoround.qlinear_fp import QuantLinear + from auto_round.modelling.gpt_oss import GPTOssSingleExpert + + single_expert_cnt = count_modules_by_type(quantized_model, GPTOssSingleExpert) + quant_linear_cnt = count_modules_by_type(quantized_model, QuantLinear) + assert ( + single_expert_cnt == config.num_local_experts + ), f"Expected {config.num_local_experts} GPTOssSingleExpert modules, found {single_expert_cnt}." + assert ( + quant_linear_cnt == config.num_hidden_layers * 3 * config.num_local_experts + ), f"Expected {config.num_hidden_layers * 3 * config.num_local_experts} QuantLinear modules, found {quant_linear_cnt}." 
+ + print(f"[{scheme}] Total {GPTOssSingleExpert.__name__} modules: {single_expert_cnt}") + print(f"[{scheme}] Total {QuantLinear.__name__} modules: {quant_linear_cnt}") + + if scheme == "MXFP4": + loaded_model = GptOssForCausalLM.from_pretrained(output_dir) + for n, m in quantized_model.named_modules(): + if m.__class__.__name__ == "QuantLinear": + loaded_m = loaded_model.get_submodule(n) + assert (loaded_m.weight_packed.to("cpu") == m.weight_packed.to("cpu")).all() # clean the output directory after test shutil.rmtree(output_dir, ignore_errors=True) diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index f38fe3eb6..7e0600f05 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -1,16 +1,14 @@ import os import shutil -import sys -import unittest -from parameterized import parameterized - -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound +from ..helpers import is_model_outputs_similar + def _get_folder_size(path: str) -> float: """Return folder size in GB.""" @@ -23,31 +21,18 @@ def _get_folder_size(path: str) -> float: return total_size / (1024**3) # convert to GB -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRoundFP(unittest.TestCase): +class TestAutoRoundFP: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def setup_class(self): self.save_dir = "./saved" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto") - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_nvfp4_moe_actmax_rtn(self): - model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" + def test_nvfp4_moe_actmax_rtn(self, tiny_deepseek_v2_model_path, dataloader): + model_name = tiny_deepseek_v2_model_path layer_config = { "self_attn": {"bits": 16, "act_bits": 16}, "mlp.shared_experts": {"bits": 16, "act_bits": 16}, @@ -62,7 +47,7 @@ def test_nvfp4_moe_actmax_rtn(self): iters=0, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) compressed_model, _ = autoround.quantize() @@ -73,8 +58,8 @@ def test_nvfp4_moe_actmax_rtn(self): ), "Illegal NVFP4 quantization for lm_head layer" shutil.rmtree(self.save_dir, ignore_errors=True) - def test_nvfp4_moe_actmax_ar(self): - model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" + def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader): + model_name = tiny_deepseek_v2_model_path layer_config = { "q_proj": {"bits": 16, "act_bits": 16}, "mlp.shared_experts": {"bits": 16, "act_bits": 16}, @@ -89,7 +74,7 @@ def test_nvfp4_moe_actmax_ar(self): iters=1, seqlen=3, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) compressed_model, _ = autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") @@ -102,17 +87,11 @@ def test_nvfp4_moe_actmax_ar(self): and lm_head.weight_scale.dtype is torch.float8_e4m3fn ), "Illegal NVFP4 packing for lm_head layer" quantized_model_path = self.save_dir - 
model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) - from auto_round.eval.evaluation import simple_evaluate_user_model - - result = simple_evaluate_user_model(model, tokenizer, batch_size=4, tasks="piqa", limit=4) - print(result["results"]["piqa"]["acc,none"]) - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.49) + assert is_model_outputs_similar(model_name, quantized_model_path) shutil.rmtree(self.save_dir, ignore_errors=True) - def test_mxfp4_moe_ar(self): - model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" + def test_mxfp4_moe_ar(self, tiny_deepseek_v2_model_path, dataloader): + model_name = tiny_deepseek_v2_model_path layer_config = { "q_proj": {"bits": 16, "act_bits": 16, "data_type": "float"}, "mlp.shared_experts": {"bits": 16, "act_bits": 16, "data_type": "float"}, @@ -127,7 +106,7 @@ def test_mxfp4_moe_ar(self): iters=1, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) compressed_model, _ = autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") @@ -139,8 +118,8 @@ def test_mxfp4_moe_ar(self): ), "Illegal MXFP4 packing for lm_head layer" shutil.rmtree(self.save_dir, ignore_errors=True) - def test_mxfp4_llmcompressor_format(self): - model_name = self.model_name + def test_mxfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "MXFP4" @@ -151,15 +130,15 @@ def test_mxfp4_llmcompressor_format(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize() compressed_model = autoround.save_quantized( output_dir=quantized_model_path, inplace=True, format="llm_compressor" ) - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj + skip_layer = compressed_model.model.decoder.layers[1].self_attn.k_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_packed") @@ -179,8 +158,8 @@ def test_mxfp4_llmcompressor_format(self): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_rtn_mxfp4_llmcompressor_format(self): - model_name = self.model_name + def test_rtn_mxfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "MXFP4" @@ -191,15 +170,15 @@ def test_rtn_mxfp4_llmcompressor_format(self): iters=0, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize() compressed_model = autoround.save_quantized( output_dir=quantized_model_path, inplace=True, format="llm_compressor" ) - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj + skip_layer = compressed_model.model.decoder.layers[1].self_attn.k_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_packed") @@ -219,8 +198,8 @@ def test_rtn_mxfp4_llmcompressor_format(self): ), f"Invalid 
MXFP4 quantization configuration: {quantization_config}" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mxfp8_llmcompressor_format(self): - model_name = self.model_name + def test_mxfp8_llmcompressor_format(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "MXFP8" @@ -229,11 +208,11 @@ scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight") @@ -250,14 +229,14 @@ and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 8 ), f"Invalid MXFP8 quantization configuration: {quantization_config}" folder_size_gb = _get_folder_size(quantized_model_path) - # Original opt-125m is < 0.5GB -> quantized mxfp8 model should be smaller but not empty + # The tiny opt test model is < 0.1GB -> the quantized mxfp8 model should be smaller but not empty assert ( - 0.15 < folder_size_gb < 0.2 - ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)" + 0.05 < folder_size_gb < 0.1 + ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.05~0.1 GB)" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_nvfp4_llmcompressor_format(self): - model_name = self.model_name + def test_nvfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "NVFP4" @@ -266,11 +245,11 @@ scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_global_scale") @@ -287,14 +266,14 @@ and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4 ), f"Invalid NVFP4 quantization configuration: {quantization_config}" folder_size_gb = _get_folder_size(quantized_model_path) - # Original opt-125m is < 0.5GB -> quantized nvfp4 model should be smaller but not empty + # The tiny opt test model is < 0.1GB -> the quantized nvfp4 model should be smaller but not empty assert ( - 0.1 < folder_size_gb < 0.15 - ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)" + 0.05 < folder_size_gb < 0.1 + ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.05~0.1 GB)" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_nvfp4_autoround_format(self): - model_name = self.model_name + def test_nvfp4_autoround_format(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "NVFP4" @@ -303,11 +282,11 @@ scheme=scheme, 
iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_global_scale") @@ -318,8 +297,8 @@ def test_nvfp4_autoround_format(self): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_nvfp4_autoround_save_quantized(self): - model_name = self.model_name + def test_nvfp4_autoround_save_quantized(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "NVFP4" @@ -328,12 +307,12 @@ def test_nvfp4_autoround_save_quantized(self): scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize() compressed_model = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_global_scale") @@ -344,10 +323,10 @@ def test_nvfp4_autoround_save_quantized(self): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_qwen_moe_quant_infer(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen1.5-MoE-A2.7B" + def test_qwen_moe_quant_infer(self, tiny_qwen_moe_model_path, dataloader): + model_name = tiny_qwen_moe_model_path layer_config = { - "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, + "layers.0": {"bits": 16, "act_bits": 16}, } scheme = "nvfp4" autoround = AutoRound( @@ -356,21 +335,16 @@ def test_qwen_moe_quant_infer(self): iters=1, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="cpu") - tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) - from auto_round.eval.evaluation import simple_evaluate_user_model - - result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa", limit=10) - print(result["results"]["piqa"]["acc,none"]) - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.60) - shutil.rmtree(quantized_model_path, ignore_errors=True) + assert is_model_outputs_similar(model_name, quantized_model_path) + shutil.rmtree(self.save_dir, ignore_errors=True) - @parameterized.expand( + @pytest.mark.parametrize( + "scheme, static_kv_dtype, static_attention_dtype", [ # scheme, static_kv_dtype, static_attention_dtype ("MXFP4", None, "fp8"), @@ -379,11 +353,11 @@ def test_qwen_moe_quant_infer(self): ("MXFP8", "fp8", None), ("NVFP4", None, "fp8"), ("NVFP4", "fp8", None), - ] + ], ) - def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): - model_name = self.model_name - from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype, tiny_opt_model_path, dataloader): + 
model_name = tiny_opt_model_path + from transformers import AutoConfig from transformers.models.opt.modeling_opt import OPTForCausalLM config = AutoConfig.from_pretrained(model_name) @@ -397,7 +371,7 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): scheme=scheme, iters=0, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, static_kv_dtype=static_kv_dtype, static_attention_dtype=static_attention_dtype, ) @@ -433,7 +407,3 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): getattr(attn, "q_scale", None) is not None ), f"Missing q_scale in attention for scheme={scheme}, static_attention_dtype={static_attention_dtype}" shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_mxfp_save_load.py b/test/test_cpu/test_mxfp_save_load.py index aca5c7592..bf3e9853b 100644 --- a/test/test_cpu/test_mxfp_save_load.py +++ b/test/test_cpu/test_mxfp_save_load.py @@ -14,6 +14,8 @@ from auto_round.inference.backend import MX_TENSOR_DATA_TYPES from auto_round.testing_utils import has_module +from ..helpers import get_model_path + testing_scheme_name_lst = [ AutoRoundFormat.MXFP8.value, AutoRoundFormat.MXFP4.value, @@ -35,7 +37,7 @@ def test_e2e_quant_and_load(scheme_name, weight_data_type, act_data_type): # Use a temporary directory for saving the quantized model with tempfile.TemporaryDirectory() as temp_dir: - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") config = AutoConfig.from_pretrained(model_name) config.num_hidden_layers = 2 # Use a smaller model for testing # Fix configuration validation issues diff --git a/test/test_cpu/test_scheme.py b/test/test_cpu/test_scheme.py index c2d165639..7a60a9ccd 100644 --- a/test/test_cpu/test_scheme.py +++ b/test/test_cpu/test_scheme.py @@ -1,134 +1,119 @@ import shutil -import sys -import unittest -import torch +import transformers -sys.path.insert(0, "../..") from auto_round import AutoRound from auto_round.schemes import QuantizationScheme +from ..helpers import get_model_path, get_tiny_model, opt_name_or_path, qwen_name_or_path -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def setup_class(self): self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_gguf(self): + def test_gguf(self, tiny_qwen_model_path, dataloader): ar = AutoRound( - "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B", + tiny_qwen_model_path, scheme="W2A16", nsamples=1, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) ar.quantize_and_save(self.save_folder, format="gguf:q4_k_m") - self.assertEqual(ar.bits, 4) + assert ar.bits == 4 shutil.rmtree(self.save_folder, ignore_errors=True) - def test_w4a16(self): - ar = AutoRound(self.model_name, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader) - self.assertEqual(ar.bits, 4) + def test_w4a16(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1, seqlen=2, 
dataset=dataloader) + assert ar.bits == 4 ar.quantize() - def test_w2a16_rtn(self): - ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=self.llm_dataloader) - self.assertEqual(ar.bits, 2) + def test_w2a16_rtn(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=dataloader) + assert ar.bits == 2 ar.quantize() - def test_mxfp4(self): - ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader) - self.assertEqual(ar.bits, 4) - self.assertEqual(ar.act_bits, 4) - self.assertEqual(ar.data_type, "mx_fp") - self.assertEqual(ar.act_data_type, "mx_fp_rceil") + def test_mxfp4(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + assert ar.bits == 4 + assert ar.act_bits == 4 + assert ar.data_type == "mx_fp" + assert ar.act_data_type == "mx_fp_rceil" ar.quantize() - def test_vllm(self): + def test_vllm(self, tiny_qwen_vl_model_path): from auto_round import AutoRoundMLLM - ar = AutoRoundMLLM( - "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct", scheme="W2A16", nsamples=1, iters=1, seqlen=2 - ) - self.assertEqual(ar.bits, 2) - self.assertEqual(ar.act_bits, 16) - - def test_nvfp4(self): - ar = AutoRound(self.model_name, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader) - self.assertEqual(ar.bits, 4) - self.assertEqual(ar.act_bits, 4) - self.assertEqual(ar.data_type, "nv_fp") - self.assertEqual(ar.act_data_type, "nv_fp4_with_static_gs") + ar = AutoRoundMLLM(tiny_qwen_vl_model_path, scheme="W2A16", nsamples=1, iters=1, seqlen=2) + assert ar.bits == 2 + assert ar.act_bits == 16 + + def test_nvfp4(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + assert ar.bits == 4 + assert ar.act_bits == 4 + assert ar.data_type == "nv_fp" + assert ar.act_data_type == "nv_fp4_with_static_gs" ar.quantize() - def test_all_scheme(self): + def test_all_scheme(self, tiny_opt_model_path, tiny_qwen_model_path, dataloader): import copy preset_schemes = ["W8A16", "MXFP8", "FPW8A16", "FP8_STATIC", "GGUF:Q2_K_S", "GGUF:Q4_K_M"] for scheme in preset_schemes: - model_name = self.model_name + model_name = tiny_opt_model_path if "gguf" in scheme.lower(): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" + model_name = tiny_qwen_model_path print(f"scheme={scheme}") - ar = AutoRound(model_name, scheme=scheme, nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader) + ar = AutoRound(model_name, scheme=scheme, nsamples=1, iters=1, seqlen=2, dataset=dataloader) ar.quantize_and_save(self.save_folder) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_scheme_in_layer_config(self): + def test_scheme_in_layer_config(self, dataloader): + model = get_tiny_model(opt_name_or_path, num_layers=5) + tokenizer = transformers.AutoTokenizer.from_pretrained(opt_name_or_path, trust_remote_code=True) layer_config = { "model.decoder.layers.2.self_attn": {"bits": 2}, "model.decoder.layers.3.self_attn.v_proj": "W8A16", "model.decoder.layers.4.self_attn.k_proj": QuantizationScheme.from_dict({"group_size": 64}), } ar = AutoRound( - "/tf_dataset/auto_round/models/facebook/opt-125m", + model, + tokenizer, scheme="W3A16", nsamples=1, iters=1, layer_config=layer_config, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) ar.quantize() 
for n, m in ar.model.named_modules(): if n == "model.decoder.layers.2.self_attn.q_proj": - self.assertEqual(m.bits, 2) + assert m.bits == 2 if n == "model.decoder.layers.2.self_attn.k_proj": - self.assertEqual(m.bits, 2) + assert m.bits == 2 if n == "model.decoder.layers.3.self_attn.v_proj": - self.assertEqual(m.bits, 8) + assert m.bits == 8 if n == "model.decoder.layers.4.self_attn.k_proj": - self.assertEqual(m.group_size, 64) + assert m.group_size == 64 def test_parse_available_devices(self): from auto_round.utils.device import parse_available_devices device_list = parse_available_devices("auto") - self.assertTrue(len(device_list) == 1 and "cpu" in device_list) + assert len(device_list) == 1 and "cpu" in device_list device_list = parse_available_devices("a:cuda:0,b:cuda:1,c:cpu") - self.assertTrue(len(device_list) == 3) - self.assertEqual(device_list, ["cuda:0", "cuda:1", "cpu"]) + assert len(device_list) == 3 + assert device_list == ["cuda:0", "cuda:1", "cpu"] device_list = parse_available_devices("0,1") - self.assertTrue(len(device_list) == 1 and "cpu" in device_list) - - -if __name__ == "__main__": - unittest.main() + assert len(device_list) == 1 and "cpu" in device_list diff --git a/test/test_cpu/test_script.py b/test/test_cpu/test_script.py deleted file mode 100644 index 01bbba644..000000000 --- a/test/test_cpu/test_script.py +++ /dev/null @@ -1,21 +0,0 @@ -import os -import sys -import unittest - -sys.path.insert(0, "../..") - - -class TestScript(unittest.TestCase): - def test_default(self): - os.system( - """ - cd ../.. && - python -m auto_round - --iters 2 - --deployment_device fake - --output_dir ./tmp_script_test""" - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_torch_backend.py b/test/test_cpu/test_torch_backend.py index d1e9bd293..0be8f76e6 100644 --- a/test/test_cpu/test_torch_backend.py +++ b/test/test_cpu/test_torch_backend.py @@ -1,11 +1,6 @@ import shutil -import sys -import unittest import pytest - -sys.path.insert(0, "../..") - import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -13,56 +8,22 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel +from ..helpers import get_model_path, model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - -class TestAutoRoundTorchBackend(unittest.TestCase): +class TestAutoRoundTorchBackend: @classmethod - def setUpClass(self): - self.model_name = "facebook/opt-125m" + def setup_class(self): + self.model_name = get_model_path("facebook/opt-125m") self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - 
print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_torch_4bits_asym(self): + def test_torch_4bits_asym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False @@ -74,7 +35,7 @@ def test_torch_4bits_asym(self): sym=sym, iters=0, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") @@ -85,10 +46,10 @@ def test_torch_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -96,14 +57,14 @@ def test_torch_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) - def test_torch_4bits_sym(self): + def test_torch_4bits_sym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 32, True @@ -115,7 +76,7 @@ def test_torch_4bits_sym(self): sym=sym, iters=0, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model @@ -126,13 +87,9 @@ def test_torch_4bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28) + assert result["results"]["lambada_openai"]["acc,none"] > 0.28 torch.cuda.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_utils.py b/test/test_cpu/test_utils.py index e70a4b7b4..3dec97010 100644 --- a/test/test_cpu/test_utils.py +++ b/test/test_cpu/test_utils.py @@ -1,7 +1,5 @@ -import sys from unittest.mock import patch -sys.path.insert(0, "../..") import auto_round.utils.device as auto_round_utils diff --git a/test/test_cpu/test_woq_linear.py b/test/test_cpu/test_woq_linear.py index e077c7a21..8f5bedc2c 100644 --- 
a/test/test_cpu/test_woq_linear.py +++ b/test/test_cpu/test_woq_linear.py @@ -1,9 +1,6 @@ -import sys - import pytest import torch -sys.path.insert(0, "../..") from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear diff --git a/test/test_cuda/__init__.py b/test/test_cuda/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_cuda/_test_helpers.py b/test/test_cuda/_test_helpers.py deleted file mode 100644 index b4b8a5955..000000000 --- a/test/test_cuda/_test_helpers.py +++ /dev/null @@ -1,32 +0,0 @@ -def model_infer(model, tokenizer, apply_chat_template=False): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - if apply_chat_template: - texts = [] - for prompt in prompts: - messages = [{"role": "user", "content": prompt}] - text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - texts.append(text) - prompts = texts - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] diff --git a/test/test_cuda/requirements.txt b/test/test_cuda/requirements.txt index e7dd4e0d8..071eb233e 100644 --- a/test/test_cuda/requirements.txt +++ b/test/test_cuda/requirements.txt @@ -6,7 +6,6 @@ intel-extension-for-pytorch lm-eval>=0.4.9.1 optimum pandas -parameterized pillow torchvision numba diff --git a/test/test_cuda/test_2_3bits.py b/test/test_cuda/test_2_3bits.py index 2ea407f20..1b305f494 100644 --- a/test/test_cuda/test_2_3bits.py +++ b/test/test_cuda/test_2_3bits.py @@ -1,10 +1,8 @@ import copy import re import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from lm_eval.utils import make_table # pylint: disable=E0401 @@ -14,6 +12,8 @@ from auto_round.eval.evaluation import simple_evaluate, simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_greater_than_050, require_greater_than_051 +from ..helpers import get_model_path, model_infer + def get_accuracy(data): match = re.search(r"\|acc\s+\|[↑↓]\s+\|\s+([\d.]+)\|", data) @@ -25,49 +25,27 @@ def get_accuracy(data): return 0.0 -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" +class TestAutoRound: + save_dir = "./saved" + tasks = "lambada_openai" - @classmethod - def tearDownClass(self): - shutil.rmtree("./saved", ignore_errors=True) - shutil.rmtree("runs", ignore_errors=True) - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - 
do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + # Yield to hand control to the test methods + yield - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) @require_greater_than_051 def test_3bits_autoround(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) - autoround = AutoRound(model, tokenizer, bits=3) + model_name = get_model_path("facebook/opt-125m") + autoround = AutoRound(model_name, bits=3) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model @@ -77,18 +55,16 @@ def test_3bits_autoround(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_dir) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.3) ## 0.3130 + assert result["results"]["lambada_openai"]["acc,none"] > 0.3 @require_greater_than_051 def test_3bits_asym_autoround(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + model_name = get_model_path("facebook/opt-125m") bits, sym = 3, False - autoround = AutoRound(model, tokenizer, bits=bits, sym=sym) + autoround = AutoRound(model_name, bits=bits, sym=sym) autoround.quantize_and_save(self.save_dir, format="auto_round", inplace=False) model_args = f"pretrained={self.save_dir}" res = simple_evaluate( @@ -106,10 +82,8 @@ def test_3bits_asym_autoround(self): @require_greater_than_050 def test_norm_bias_tuning(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) - autoround = AutoRound(model, tokenizer, bits=2, group_size=64, enable_norm_bias_tuning=True) + model_name = get_model_path("facebook/opt-125m") + autoround = AutoRound(model_name, bits=2, group_size=64, enable_norm_bias_tuning=True) autoround.quantize() ##test auto_round format @@ -123,10 +97,8 @@ def test_norm_bias_tuning(self): @require_greater_than_050 def test_2bits_autoround(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) - autoround = AutoRound(model, tokenizer, bits=2, group_size=64) + model_name = get_model_path("facebook/opt-125m") + autoround = AutoRound(model_name, bits=2, group_size=64) 
autoround.quantize() ##test auto_round format @@ -145,7 +117,3 @@ def test_2bits_autoround(self): accuracy = get_accuracy(res) assert accuracy > 0.17 shutil.rmtree("./saved", ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_alg_ext.py b/test/test_cuda/test_alg_ext.py index c83d6f3b4..6cdbc82ab 100644 --- a/test/test_cuda/test_alg_ext.py +++ b/test/test_cuda/test_alg_ext.py @@ -1,30 +1,34 @@ import shutil import sys -import unittest - -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate_user_model +from ..helpers import get_model_path -class TestAlgExt(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" +class TestAlgExt: + save_folder = "./saved" - @classmethod - def tearDownClass(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) def test_2bits(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") ar = AutoRound(model=model_name, bits=2, group_size=64, enable_alg_ext=True) ar.quantize_and_save(self.save_folder) model = AutoModelForCausalLM.from_pretrained( @@ -36,39 +40,39 @@ def test_2bits(self): result = simple_evaluate_user_model(model, tokenizer, batch_size=64, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) # wo alg ext 0.2078, with 0.2371 - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.22) + assert result["results"]["lambada_openai"]["acc,none"] > 0.22 shutil.rmtree(self.save_folder, ignore_errors=True) - def test_cli(self): + def test_cli(self, tiny_opt_model_path): import os - model_name = "/models/opt-125m" python_path = sys.executable res = os.system( - f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {model_name} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits" + f"cd .. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits --nsamples 1 --seqlen 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {model_name} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile" + f"cd .. 
&& CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile --nsamples 1 --seqlen 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" - def test_all_support_dtype(self): + def test_all_support_dtype(self, tiny_qwen_model_path): from auto_round.auto_scheme import AutoScheme - model_name = "/models/Qwen3-0.6B" for scheme in ["MXFP4", "NVFP4", "W2A16G64", "gguf:q2_k_s,gguf:q4_k_s"]: avg_bits = 2 if scheme == "W2A16G64" else 4 scheme = AutoScheme(options=scheme, avg_bits=avg_bits, ignore_scale_zp_bits=True) ar = AutoRound( - model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True + tiny_qwen_model_path, + scheme=scheme, + iters=1, + nsamples=1, + seqlen=32, + enable_alg_ext=True, + enable_torch_compile=True, ) ar.quantize() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_asym.py b/test/test_cuda/test_asym.py index c41c0d5d8..1eda6f146 100644 --- a/test/test_cuda/test_asym.py +++ b/test/test_cuda/test_asym.py @@ -3,16 +3,16 @@ import sys import unittest -sys.path.insert(0, "../..") - +import pytest import torch -from _test_helpers import model_infer from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.utils import get_module +from ..helpers import model_infer + class LLMDataLoader: def __init__(self): @@ -23,140 +23,138 @@ def __iter__(self): yield torch.ones([1, 10], dtype=torch.long) -class TestAutoRoundAsym(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "/models/opt-125m" - # self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - self.save_folder = "./saved" +class TestAutoRoundAsym: + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield - @classmethod - def tearDownClass(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_asym_group_size(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_group_size(self, tiny_opt_model_path): for group_size in [32, 64, 128]: bits, sym = 4, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1) - ar.quantize_and_save(format="auto_round", output_dir=self.save_folder) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1 + ) + ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) # TODO when ark is ready, uncomment the following lines to do inference test # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - 
shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) - def test_asym_bits(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_bits(self, tiny_opt_model_path): for bits in [2, 3, 8]: group_size, sym = 128, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1) - ar.quantize_and_save(format="auto_round", output_dir=self.save_folder) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1 + ) + ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) # TODO when ark is ready, uncomment the following lines to do inference test # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) # use parameters later - def test_asym_format(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_format(self, tiny_opt_model_path): for format in ["auto_round", "auto_round:auto_gptq", "auto_round:gptqmodel"]: bits, group_size, sym = 4, 128, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1 + ) # TODO when ark is ready, uncomment the following lines to do inference test - ar.quantize_and_save(format=format, output_dir=self.save_folder) + ar.quantize_and_save(format=format, output_dir=self.save_dir) # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) - def test_asym_group_size_with_tuning(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_group_size_with_tuning(self, tiny_opt_model_path): for group_size in [32, 64, 128]: bits, sym = 4, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1) - ar.quantize_and_save(format="auto_round", output_dir=self.save_folder) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1 + ) + ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) # TODO when ark is ready, uncomment the following lines to do inference test # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) - def test_asym_bits_with_tuning(self): - model_name = self.model_name - model = 
AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_bits_with_tuning(self, tiny_opt_model_path): for bits in [2, 3, 8]: group_size, sym = 128, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1) - ar.quantize_and_save(format="auto_round", output_dir=self.save_folder) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1 + ) + ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) # TODO when ark is ready, uncomment the following lines to do inference test # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) # use parameters later - def test_asym_format_with_tuning(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_format_with_tuning(self, tiny_opt_model_path): for format in ["auto_round", "auto_round:auto_gptq", "auto_round:gptqmodel"]: bits, group_size, sym = 4, 128, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1 + ) # TODO when ark is ready, uncomment the following lines to do inference test - ar.quantize_and_save(format=format, output_dir=self.save_folder) + ar.quantize_and_save(format=format, output_dir=self.save_dir) # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) diff --git a/test/test_cuda/test_auto_round_format.py b/test/test_cuda/test_auto_round_format.py index 55fc1690f..9604ffff1 100644 --- a/test/test_cuda/test_auto_round_format.py +++ b/test/test_cuda/test_auto_round_format.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer @@ -18,84 +16,41 @@ require_package_version_ut, ) +from ..helpers import get_model_path, get_tiny_model, model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "facebook/opt-125m" - - self.llm_dataloader = LLMDataLoader() - self.save_folder = "./saved" - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - ##texts = [] - # for prompt in prompts: - # messages = [ - # {"role": "user", "content": prompt} - # ] - # text = tokenizer.apply_chat_template( - # messages, - # tokenize=False, - # add_generation_prompt=True - # ) - # 
texts.append(text) - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) +class TestAutoRound: + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) + # Yield to hand control to the test methods + yield - @classmethod - def tearDownClass(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_greater_than_050 @require_package_version_ut("transformers", "<4.57.0") - def test_autoround_asym(self): + def test_autoround_asym(self, tiny_opt_model_path, dataloader): for bits in [2, 3, 4, 8]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + # model_name = get_model_path("facebook/opt-125m") bits, group_size, sym = bits, 128, False autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") @@ -108,12 +63,11 @@ def test_autoround_asym(self): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) print(res) assert "!!!" 
not in res - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_autogptq def test_mixed_precision(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_name = get_model_path("facebook/opt-125m") layer_config = {} layer_config["model.decoder.layers.0.self_attn.k_proj"] = {"bits": 8} @@ -123,85 +77,80 @@ def test_mixed_precision(self): } ## 3bits when using asym will have some issue layer_config["model.decoder.layers.6.self_attn.out_proj"] = {"bits": 2, "group_size": 32} bits, group_size, sym = 4, 128, True - autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config) - quantized_model_path = self.save_folder + autoround = AutoRound(model_name, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config) + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") quantization_config = AutoRoundConfig(backend="auto") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.32) + assert result["results"]["lambada_openai"]["acc,none"] > 0.32 @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_awq_backend(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_name = get_model_path("facebook/opt-125m") bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + model_name, bits=bits, group_size=group_size, iters=1, nsamples=1, sym=sym, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") quantization_config = AutoRoundConfig(backend="auto") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.18) + assert result["results"]["lambada_openai"]["acc,none"] > 0.18 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_dir, 
torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) - shutil.rmtree(self.save_folder, ignore_errors=True) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_greater_than_050 def test_tritonv2_bf16(self): - model_name = "OPEA/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" + model_name = get_model_path("OPEA/Meta-Llama-3.1-8B-Instruct-int4-sym-inc") quantization_config = AutoRoundConfig(backend="tritonv2") - model = AutoModelForCausalLM.from_pretrained( + model = get_tiny_model( model_name, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(model_name) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) torch.cuda.empty_cache() @require_ipex - def test_autoround_gptq_sym_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoround_gptq_sym_format(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" @@ -244,19 +193,16 @@ def test_autoround_gptq_sym_format(self): @require_awq @require_ipex @require_package_version_ut("transformers", "<4.57.0") - def test_autoround_awq_sym_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoround_awq_sym_format(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" @@ -283,20 +229,17 @@ def test_autoround_awq_sym_format(self): shutil.rmtree("./saved", ignore_errors=True) @require_greater_than_050 - def test_autoround_sym(self): + def test_autoround_sym(self, tiny_opt_model_path, dataloader): for bits in [2, 3, 4, 8]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = bits, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" @@ -311,11 +254,11 @@ def test_autoround_sym(self): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) print(res) assert "!!!" 
not in res - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_greater_than_050 def test_load_gptq_model_3bits(self): - model_name = "LucasSantiago257/gemma-2b-2bits-gptq" + model_name = get_model_path("LucasSantiago257/gemma-2b-2bits-gptq") quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( model_name, @@ -325,8 +268,4 @@ def test_load_gptq_model_3bits(self): quantization_config=quantization_config, ) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.model_infer(model, tokenizer) - - -if __name__ == "__main__": - unittest.main() + model_infer(model, tokenizer) diff --git a/test/test_cuda/test_auto_scheme.py b/test/test_cuda/test_auto_scheme.py index 681e3b29b..259bc4450 100644 --- a/test/test_cuda/test_auto_scheme.py +++ b/test/test_cuda/test_auto_scheme.py @@ -1,73 +1,78 @@ import copy import re import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest +import transformers + from auto_round import AutoRound, AutoRoundConfig, AutoScheme from auto_round.auto_scheme.utils import compute_avg_bits_for_model from auto_round.eval.evaluation import simple_evaluate from auto_round.testing_utils import multi_card from auto_round.utils import get_module +from ..helpers import get_model_path, get_tiny_model + + +class TestAutoScheme: + save_dir = "./saved" -class TestAutoScheme(unittest.TestCase): - @classmethod - def setUpClass(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - @classmethod - def tearDownClass(self): + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_gguf_k_0(self): - model_name = "/models/Qwen3-0.6B" + def test_gguf_k_0(self, tiny_qwen_model_path): target_bits = 5.5 scheme = AutoScheme(avg_bits=target_bits, options=("GGUF:Q4_K_M", "GGUF:Q8_0")) - ar = AutoRound(model=model_name, scheme=scheme, iters=1, enable_alg_ext=True) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=1, enable_alg_ext=True) ar.quantize_and_save(self.save_dir, format="gguf:q2_k_s") shutil.rmtree(self.save_dir, ignore_errors=True) - def test_gguf_k_1(self): - model_name = "/models/Qwen3-0.6B" + def test_gguf_k_1(self, tiny_qwen_model_path): target_bits = 3.5 scheme = AutoScheme(avg_bits=target_bits, options=("GGUF:Q2_K_S", "GGUF:Q4_1")) - ar = AutoRound(model=model_name, scheme=scheme, iters=1, enable_alg_ext=True) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=1, enable_alg_ext=True) ar.quantize_and_save(self.save_dir, format="gguf:q2_k_s") shutil.rmtree(self.save_dir, ignore_errors=True) # - def test_embedding_fallback(self): - model_name = "/models/Qwen3-0.6B" + def test_embedding_fallback(self, tiny_qwen_model_path): target_bits = 5.0 scheme = AutoScheme(avg_bits=target_bits, options=("GGUF:Q4_K_M", "GGUF:Q8_0")) - ar = AutoRound(model=model_name, scheme=scheme, iters=1, enable_alg_ext=True) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=1, enable_alg_ext=True) ar.quantize_and_save(self.save_dir, format="gguf:q2_k_s") shutil.rmtree(self.save_dir, ignore_errors=True) - def test_gguf_export(self): - model_name = 
"/models/Qwen3-0.6B" + def test_gguf_export(self, tiny_qwen_model_path): target_bits = 3 scheme = AutoScheme(avg_bits=target_bits, options=("GGUF:Q2_K_S", "GGUF:Q4_K_M"), ignore_scale_zp_bits=True) - ar = AutoRound(model=model_name, scheme=scheme, iters=0) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0) ar.quantize_and_save(self.save_dir, format="gguf:q2_k_s") shutil.rmtree(self.save_dir, ignore_errors=True) def test_gguf(self): - model_name = "/models/Qwen3-8B" + model_name = get_model_path("qwen/Qwen3-8B") + model = get_tiny_model(model_name) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) target_bits = 3 scheme = AutoScheme(avg_bits=target_bits, options=("GGUF:Q2_K_S", "GGUF:Q4_K_M"), ignore_scale_zp_bits=True) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, disable_opt_rtn=True) + ar = AutoRound(model=model, tokenizer=tokenizer, scheme=scheme, iters=0, nsamples=1, disable_opt_rtn=True) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model, ignore_scale_zp_bits=True) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 def test_shared_layers(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") from transformers import AutoModelForCausalLM, AutoTokenizer model = AutoModelForCausalLM.from_pretrained(model_name) @@ -79,7 +84,7 @@ def test_shared_layers(self): from auto_round.auto_scheme.utils import parse_shared_layers res = parse_shared_layers(model, shared_layers) - self.assertEqual(len(res), 24) + assert len(res) == 24 assert [ "model.decoder.layers.2.self_attn.out_proj", "model.decoder.layers.2.self_attn.q_proj", @@ -101,68 +106,61 @@ def test_shared_layers(self): else: bits.append(module.bits) bits = set(bits) - self.assertEqual(len(bits), 1) + assert len(bits) == 1 print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 # @multi_card - def test_multi_card(self): - model_name = "/models/Qwen3-0.6B" + def test_multi_card(self, tiny_qwen_model_path): target_bits = 4.5 for device_map in ["auto", "0,1", "0", None]: scheme = AutoScheme(avg_bits=target_bits, options=("NVFP4")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, device_map=device_map) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1, device_map=device_map) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 @multi_card - def test_multi_card_1(self): - model_name = "/models/Qwen3-0.6B" + def test_multi_card_1(self, tiny_qwen_model_path): target_bits = 4.5 from transformers import AutoModelForCausalLM, AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(model_name) - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto") scheme = AutoScheme(avg_bits=target_bits, options=("NVFP4")) - ar = AutoRound(model=model, tokenizer=tokenizer, scheme=scheme, iters=0, nsamples=1) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - def test_non_low_gpu_mem_usage(self): - model_name = "/models/Qwen3-0.6B" + def test_non_low_gpu_mem_usage(self, tiny_qwen_model_path): target_bits = 4.5 # for device_map in ["auto", "0,1", "0", None]: scheme = 
AutoScheme(avg_bits=target_bits, options=("NVFP4"), low_gpu_mem_usage=False, device_map="auto") - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 @multi_card - def test_dict_device_map(self): - model_name = "/models/Qwen3-8B" + def test_dict_device_map(self, tiny_qwen_model_path): target_bits = 8.25 device_map = {"up_proj": 0, "down_proj": 1} scheme = AutoScheme(avg_bits=target_bits, options=("MXFP8")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, device_map=device_map) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1, device_map=device_map) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - def test_min_target_bits(self): - model_name = "/models/opt-125m" + def test_min_target_bits(self, tiny_opt_model_path): target_bits = 4.644 scheme = AutoScheme(avg_bits=target_bits, options=("MXFP4", "W8A16")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) + ar = AutoRound(model=tiny_opt_model_path, scheme=scheme, iters=0, nsamples=1) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) @@ -170,102 +168,97 @@ def test_min_target_bits(self): # def test_max_target_bits(self): - model_name = "/models/opt-125m" target_bits = 8.025 + model_path = get_model_path("facebook/opt-125m") scheme = AutoScheme(avg_bits=target_bits, options=("MXFP4", "W8A16")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) + ar = AutoRound(model=model_path, scheme=scheme, iters=0, nsamples=1) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - def test_patch_scheme(self): - model_name = "/models/opt-125m" + def test_patch_scheme(self, tiny_opt_model_path): target_bits = 5 scheme = AutoScheme(avg_bits=target_bits, options=("MXFP4", "W8A16")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, group_size=32) + ar = AutoRound(model=tiny_opt_model_path, scheme=scheme, iters=0, nsamples=1, group_size=32) model, layer_config = ar.quantize() for n, m in model.named_modules(): if hasattr(m, "group_size"): - self.assertEqual(m.group_size, 32) + assert m.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 def test_layer_config(self): target_bits = 3.0 - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "BF16")) user_layer_config = {"model.decoder.layers.10.fc1": {"bits": 8, "group_size": 32, "sym": False}} ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["bits"], 8) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["sym"], False) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["group_size"], 32) + assert layer_config["model.decoder.layers.10.fc1"]["bits"] == 8 + assert layer_config["model.decoder.layers.10.fc1"]["sym"] is False + assert 
layer_config["model.decoder.layers.10.fc1"]["group_size"] == 32 layer = get_module(model, "model.decoder.layers.10.fc1") - self.assertEqual(layer.bits, 8) - self.assertEqual(layer.sym, False) - self.assertEqual(layer.group_size, 32) + assert layer.bits == 8 + assert layer.sym is False + assert layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 target_bits = 5.5 - model_name = "/models/opt-125m" scheme = AutoScheme(avg_bits=target_bits, options=("mxfp4", "mxfp8")) user_layer_config = {"model.decoder.layers.10.fc1": {"bits": 8, "group_size": 32, "sym": False}} ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["bits"], 8) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["sym"], False) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["group_size"], 32) + assert layer_config["model.decoder.layers.10.fc1"]["bits"] == 8 + assert layer_config["model.decoder.layers.10.fc1"]["sym"] is False + assert layer_config["model.decoder.layers.10.fc1"]["group_size"] == 32 layer = get_module(model, "model.decoder.layers.10.fc1") - self.assertEqual(layer.orig_layer.bits, 8) - self.assertEqual(layer.orig_layer.sym, False) - self.assertEqual(layer.orig_layer.group_size, 32) + assert layer.orig_layer.bits == 8 + assert layer.orig_layer.sym is False + assert layer.orig_layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - def test_lm_head_and_mix_dtype(self): - model_name = "/models/Qwen3-8B" + def test_lm_head_and_mix_dtype(self, tiny_untied_qwen_model_path): + model_name = tiny_untied_qwen_model_path + model = get_tiny_model(model_name) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) target_bits = 6 scheme = AutoScheme(avg_bits=target_bits, options=("MXFP4", "MXFP8")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, quant_lm_head=True) + ar = AutoRound(model=model, tokenizer=tokenizer, scheme=scheme, iters=0, nsamples=1, quant_lm_head=True) model, layer_config = ar.quantize() - self.assertLessEqual(layer_config["lm_head"]["bits"], 8) + assert layer_config["lm_head"]["bits"] <= 8 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - def test_auto_scheme_export(self): - model_name = "/models/opt-125m" + def test_auto_scheme_export(self, tiny_qwen_model_path): + model_name = get_model_path("facebook/opt-125m") scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "W8A16", "BF16")) ar = AutoRound(model=model_name, scheme=scheme) ar.quantize_and_save(self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.25) + assert result["results"]["lambada_openai"]["acc,none"] > 0.25 shutil.rmtree(self.save_dir, ignore_errors=True) - model_name = "/models/Qwen3-0.6B" scheme = AutoScheme(avg_bits=3, options=("gguf:q2_k_s,gguf:q4_k_s"), nsamples=1, ignore_scale_zp_bits=True) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, 
iters=0, nsamples=1) ar.quantize_and_save(self.save_dir) shutil.rmtree(self.save_dir, ignore_errors=True) def test_enable_torch_compile(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") scheme = AutoScheme(avg_bits=2, options=("W2A16"), ignore_scale_zp_bits=True) ar = AutoRound(model=model_name, scheme=scheme, enable_torch_compile=True) ar.quantize_and_save(self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.10) + assert result["results"]["lambada_openai"]["acc,none"] > 0.10 shutil.rmtree(self.save_dir, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_calib_dataset.py b/test/test_cuda/test_calib_dataset.py index b66f60127..bdee2ebeb 100644 --- a/test/test_cuda/test_calib_dataset.py +++ b/test/test_cuda/test_calib_dataset.py @@ -1,46 +1,19 @@ +import json import os import shutil -import sys -import unittest - -sys.path.insert(0, "../..") -import json +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -class TestLocalCalibDataset(unittest.TestCase): - @classmethod - def setUpClass(self): - json_data = [{"text": "awefdsfsddfd"}, {"text": "fdfdfsdfdfdfd"}, {"text": "dfdsfsdfdfdfdf"}] - os.makedirs("./saved", exist_ok=True) - self.json_file = "./saved/tmp.json" - with open(self.json_file, "w") as json_file: - json.dump(json_data, json_file, indent=4) - - jsonl_data = [{"text": "哈哈,開心點"}, {"text": "hello world"}] - os.makedirs("./saved", exist_ok=True) - self.jsonl_file = "./saved/tmp.jsonl" - with open(self.jsonl_file, "w") as jsonl_file: - for item in jsonl_data: - json.dump(item, jsonl_file, ensure_ascii=False) - jsonl_file.write("\n") - - model_name = "facebook/opt-125m" - self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - - def test_combine_dataset(self): +class TestLocalCalibDataset: + def test_combine_dataset(self, tiny_opt_model_path): dataset = "NeelNanda/pile-10k" + ",BAAI/CCI3-HQ" + ",madao33/new-title-chinese" bits, group_size, sym = 4, 128, True autoround = AutoRound( - self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset ) autoround.quantize() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_conv1d.py b/test/test_cuda/test_conv1d.py index e617bf55e..11f80a1b2 100644 --- a/test/test_cuda/test_conv1d.py +++ b/test/test_cuda/test_conv1d.py @@ -1,53 +1,52 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch -from _test_helpers import model_infer from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound from auto_round.testing_utils import require_gptqmodel +from ..helpers import get_model_path, get_tiny_model, model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +class TestQuantizationConv1d: + @pytest.fixture(autouse=True, scope="class") + def 
setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + # Yield to hand control to the test methods + yield -class TestQuantizationConv1d(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "MBZUAI/LaMini-GPT-124M" - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel - def test_quant(self): - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + def test_quant(self, dataloader): + model_name = get_model_path("MBZUAI/LaMini-GPT-124M") + model = get_tiny_model(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True from auto_round import AutoRoundConfig autoround = AutoRound( - self.model, - self.tokenizer, + model, + tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() @@ -55,7 +54,3 @@ def test_quant(self): model = AutoModelForCausalLM.from_pretrained("./saved", device_map="cuda", trust_remote_code=True) - model_infer(model, self.tokenizer) + model_infer(model, tokenizer) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_diffusion.py b/test/test_cuda/test_diffusion.py index 9a5a8bfd3..a3a90d14e 100644 --- a/test/test_cuda/test_diffusion.py +++ b/test/test_cuda/test_diffusion.py @@ -2,13 +2,9 @@ import os import re import shutil -import sys -import unittest +import pytest import requests - -sys.path.insert(0, "../..") - from diffusers import AutoPipelineForText2Image from PIL import Image @@ -16,13 +12,20 @@ from auto_round.testing_utils import require_gptqmodel, require_optimum, require_vlm_env -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "/dataset/FLUX.1-dev" +class TestAutoRound: + model_name = "/dataset/FLUX.1-dev" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - @classmethod - def tearDownClass(self): + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_optimum @@ -73,11 +76,7 @@ def test_diffusion_rtn(self): def test_diffusion_model_checker(self): from auto_round.utils import is_diffusion_model - self.assertTrue(is_diffusion_model("/dataset/FLUX.1-dev")) - self.assertTrue(is_diffusion_model("/models/stable-diffusion-2-1")) - self.assertTrue(is_diffusion_model("/models/stable-diffusion-xl-base-1.0")) - self.assertFalse(is_diffusion_model("/models/Qwen3-8B")) - - -if __name__ == "__main__": - unittest.main() + assert is_diffusion_model("/dataset/FLUX.1-dev") + assert is_diffusion_model("/models/stable-diffusion-2-1") + assert is_diffusion_model("/models/stable-diffusion-xl-base-1.0") + assert is_diffusion_model("/models/Qwen3-8B") is False diff --git 
a/test/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py index c489b37b2..d0f5bed53 100644 --- a/test/test_cuda/test_exllamav2_backend.py +++ b/test/test_cuda/test_exllamav2_backend.py @@ -1,12 +1,6 @@ import shutil -import sys -import unittest import pytest - -sys.path.insert(0, "../..") - - import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -14,157 +8,118 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel, require_package_version_ut +from ..helpers import get_model_path, model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +class TestAutoRoundexllamaBackend: + save_dir = "./saved" -class TestAutoRoundexllamaBackend(unittest.TestCase): + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - @classmethod - def setUpClass(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() + # Yield to hand control to the test methods + yield - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] - - @classmethod - def tearDownClass(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel - def test_gptqmodel_exllmav2_4bits_asym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_gptqmodel_exllmav2_4bits_asym(self, dataloader): + model_path = get_model_path("facebook/opt-125m") bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, dataset=self.llm_dataloader + model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, dataset=dataloader ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") quantization_config = AutoRoundConfig(backend="gptqmodel:exllamav2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", 
quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @require_autogptq @require_package_version_ut("torch", "<2.6.0") - def test_gptq_exllamav2_4bits_sym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_gptq_exllamav2_4bits_sym(self, dataloader): + model_path = get_model_path("facebook/opt-125m") bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model quantization_config = AutoRoundConfig(backend="gptq:exllamav2") ## or exllamav2 model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) + assert result["results"]["lambada_openai"]["acc,none"] > 0.27 torch.cuda.empty_cache() - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_autogptq @require_package_version_ut("torch", "<2.6.0") def test_gptq_exllamav2_4bits_sym_group_size(self): + model_path = get_model_path("facebook/opt-125m") for group_size in [-1, 32, 64, 128, 256, 1024]: ## 384, 768 has accuracy issue print(f"!!!!!!!!!!!!!!!!!{group_size}!!!!!!!!!!!!!!!!!") - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = 
AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, group_size, True autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, iters=1, nsamples=1, group_size=group_size, sym=sym, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_round" ) ##will convert to gptq model quantization_config = AutoRoundConfig(backend="gptq:exllamav2") ## or exllamav2 model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=64, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.15) + assert result["results"]["lambada_openai"]["acc,none"] > 0.15 torch.cuda.empty_cache() - shutil.rmtree(self.save_folder, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() + shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cuda/test_export.py b/test/test_cuda/test_export.py index 297b20193..c8f87b4bf 100644 --- a/test/test_cuda/test_export.py +++ b/test/test_cuda/test_export.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoConfig, AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer @@ -11,42 +9,37 @@ from auto_round import AutoRound from auto_round.testing_utils import require_awq, require_optimum, require_package_version_ut +from ..helpers import get_model_path, get_tiny_model -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +class TestAutoRound: + save_dir = "./saved" + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "facebook/opt-125m" - self.save_dir = "./saved" - self.llm_dataloader = LLMDataLoader() + # Yield to hand control to the test methods + yield - @classmethod - def tearDownClass(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_optimum - def test_autogptq_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autogptq_format(self, dataloader): + model_path = get_model_path("facebook/opt-125m") bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -65,10 +58,10 @@ def test_autogptq_format(self): 
shutil.rmtree("./saved", ignore_errors=True) @require_optimum - def test_autogptq_format_fp_layers(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autogptq_format_fp_layers(self, tiny_opt_model_path, dataloader): layer_config = {} + model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path) + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path) for n, m in model.named_modules(): if "q_proj" in n: layer_config[n] = {"bits": 16} @@ -82,7 +75,7 @@ def test_autogptq_format_fp_layers(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -102,9 +95,10 @@ def test_autogptq_format_fp_layers(self): # "there there there there there there") shutil.rmtree("./saved", ignore_errors=True) - def test_autogptq_format_qsave_fp_layers(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autogptq_format_qsave_fp_layers(self, dataloader): + model_path = get_model_path("facebook/opt-125m") + model = AutoModelForCausalLM.from_pretrained(model_path) + layer_config = {} for n, m in model.named_modules(): if "q_proj" in n: @@ -112,14 +106,13 @@ def test_autogptq_format_qsave_fp_layers(self): bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "./saved" @@ -153,19 +146,16 @@ def test_autogptq_format_qsave_fp_layers(self): ##print(res) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoround_format(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -186,19 +176,17 @@ def test_autoround_format(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") - def test_autoawq_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoawq_format(self, dataloader): + model_path = get_model_path("facebook/opt-125m") bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -220,23 +208,21 @@ def test_autoawq_format(self): @require_optimum @require_awq @require_package_version_ut("transformers", "<4.57.0") - def test_autoawq_format_fp_qsave_layers(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + def test_autoawq_format_fp_qsave_layers(self, dataloader): + 
model_path = get_model_path("facebook/opt-125m") layer_config = { "model.decoder.layers.0.self_attn.k_proj": {"bits": 16}, "model.decoder.layers.9.self_attn.v_proj": {"bits": 16}, } - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "./saved/test_export" @@ -261,19 +247,16 @@ def test_autoawq_format_fp_qsave_layers(self): shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_3bit_asym_torch_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoround_3bit_asym_torch_format(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 3, 128, False autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -290,19 +273,16 @@ def test_autoround_3bit_asym_torch_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_3bit_sym_torch_format(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoround_3bit_sym_torch_format(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 3, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -322,21 +302,24 @@ def test_autoround_3bit_sym_torch_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_awq_lmhead_export(self): + def test_awq_lmhead_export(self, dataloader): bits, sym, group_size = 4, False, 128 - model_name = "/models/phi-2" + model_name = get_model_path("microsoft/phi-2") + tiny_model = get_tiny_model(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) layer_config = { "lm_head": {"bits": 4}, # set lm_head quant } autoround = AutoRound( - model=model_name, + model=tiny_model, + tokenizer=tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") @@ -346,21 +329,24 @@ def test_awq_lmhead_export(self): assert isinstance(lm_head, WQLinear_GEMM), "Illegal AWQ quantization for lm_head layer" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_gptq_lmhead_export(self): + def test_gptq_lmhead_export(self, tiny_qwen_model_path, dataloader): bits, sym, group_size = 4, True, 128 - model_name = "/models/phi-2" + model_name = get_model_path("microsoft/phi-2") + tiny_model = get_tiny_model(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) layer_config = { "lm_head": {"bits": 4}, # set lm_head 
quant } autoround = AutoRound( - model=model_name, + model=tiny_model, + tokenizer=tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -376,7 +362,3 @@ def test_gptq_lmhead_export(self): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0]) print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_fp8_input.py b/test/test_cuda/test_fp8_input.py index 5258fe183..9e1c1cc3a 100644 --- a/test/test_cuda/test_fp8_input.py +++ b/test/test_cuda/test_fp8_input.py @@ -1,30 +1,43 @@ import os import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate +from auto_round.utils import llm_load_model +from ..helpers import get_model_path, get_tiny_model -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.save_dir = "./saved" - @classmethod - def tearDownClass(self): - shutil.rmtree(self.save_dir, ignore_errors=True) +class TestAutoRound: + save_dir = "./saved" + + def tiny_fp8_model(self): + model_name = get_model_path("qwen/Qwen3-0.6B-FP8") + model, tokenizer = llm_load_model(model_name) + model.model.layers = model.model.layers[:3] + return model, tokenizer + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) def test_small_model_rtn_generation(self): - model_name = "/models/Qwen3-0.6B-FP8" - ar = AutoRound(model=model_name, iters=0) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) ar.quantize_and_save(output_dir=self.save_dir) model = AutoModelForCausalLM.from_pretrained(self.save_dir, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.save_dir) @@ -34,8 +47,8 @@ def test_small_model_rtn_generation(self): shutil.rmtree(self.save_dir, ignore_errors=True) def test_gguf_imatrix(self): - model_name = "/models/Qwen3-0.6B-FP8" - ar = AutoRound(model=model_name, iters=0) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) ar.quantize_and_save(format="gguf:q2_k_s", output_dir=self.save_dir) # from llama_cpp import Llama # @@ -51,56 +64,55 @@ def test_gguf_imatrix(self): # print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) def test_small_model_rtn(self): - model_name = "/models/Qwen3-0.6B-FP8" - ar = AutoRound(model=model_name, iters=0) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) _, folder = ar.quantize_and_save(output_dir=self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - 
self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.25) + assert result["results"]["lambada_openai"]["acc,none"] > 0.25 shutil.rmtree(self.save_dir, ignore_errors=True) def test_small_model_iters1(self): - model_name = "/models/Qwen3-0.6B-FP8" - ar = AutoRound(model=model_name, iters=1) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=1) _, folder = ar.quantize_and_save(output_dir=self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.25) + assert result["results"]["lambada_openai"]["acc,none"] > 0.25 shutil.rmtree(self.save_dir, ignore_errors=True) def test_medium_model_rtn(self): - model_name = "/models/Qwen3-8B-FP8" - ar = AutoRound(model=model_name, iters=0) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) _, folder = ar.quantize_and_save(output_dir=self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.55) + assert result["results"]["lambada_openai"]["acc,none"] > 0.55 shutil.rmtree(self.save_dir, ignore_errors=True) def test_medium_model_rtn_with_lm_head(self): - model_name = "/models/Qwen3-8B-FP8" + model, tokenizer = self.tiny_fp8_model() layer_config = {"lm_head": {"bits": 4}} - ar = AutoRound(model=model_name, iters=0, layer_config=layer_config) + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0, layer_config=layer_config) _, folder = ar.quantize_and_save(output_dir=self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.55) + assert result["results"]["lambada_openai"]["acc,none"] > 0.55 shutil.rmtree(self.save_dir, ignore_errors=True) def test_fp8_model_gguf(self): from llama_cpp import Llama - model_name = "Qwen/Qwen3-0.6B-FP8" - - ar = AutoRound(model=model_name, iters=0) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q4_0") for file in os.listdir(self.save_dir): if file.endswith(".gguf"): @@ -110,7 +122,8 @@ def test_fp8_model_gguf(self): print(output) shutil.rmtree(self.save_dir, ignore_errors=True) - ar = AutoRound(model=model_name, iters=1) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=1) ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q3_k_s") for file in os.listdir(self.save_dir): if file.endswith(".gguf"): @@ -121,14 +134,10 @@ def test_fp8_model_gguf(self): shutil.rmtree(self.save_dir, ignore_errors=True) def test_diff_datatype(self): - model_name = "/models/Qwen3-0.6B-FP8" for scheme in ["NVFP4", "MXFP4"]: + model, tokenizer = self.tiny_fp8_model() for iters in [0, 1]: print(f"Testing scheme: {scheme}, iters: {iters}") - ar = AutoRound(model=model_name, iters=iters, scheme=scheme) + ar = AutoRound(model=model, tokenizer=tokenizer, iters=iters, scheme=scheme) 
ar.quantize_and_save(output_dir=self.save_dir) shutil.rmtree(self.save_dir, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_get_block_name.py b/test/test_cuda/test_get_block_name.py index cc9297653..829ac1e46 100644 --- a/test/test_cuda/test_get_block_name.py +++ b/test/test_cuda/test_get_block_name.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from diffusers import AutoPipelineForText2Image @@ -20,13 +18,13 @@ from auto_round.utils import get_block_names, is_pure_text_model -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): pass @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("runs", ignore_errors=True) def check_block_names(self, block_names, prefixs=[], n_layers=[]): @@ -195,11 +193,7 @@ def test_flux(self): block_names = get_block_names(model) self.check_block_names(block_names, ["transformer_blocks", "single_transformer_blocks"], [19, 38]) - self.assertTrue(any(["context_embedder" not in n for n in block_names])) + assert any(["context_embedder" not in n for n in block_names]) block_names = get_block_names(model, quant_vision=True) self.check_block_names(block_names, ["transformer_blocks", "single_transformer_blocks"], [19, 38]) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_gguf.py b/test/test_cuda/test_gguf.py index 312e561cf..174deab2f 100644 --- a/test/test_cuda/test_gguf.py +++ b/test/test_cuda/test_gguf.py @@ -1,9 +1,8 @@ import os import shutil import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer @@ -11,35 +10,37 @@ from auto_round import AutoRound from auto_round.testing_utils import require_gguf +from ..helpers import get_model_path, get_tiny_model, save_tiny_model -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +class TestAutoRound: + save_dir = "./saved" + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") -class TestAutoRound(unittest.TestCase): - @classmethod - def tearDownClass(self): + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gguf - def test_gguf_format(self): - model_name = "Qwen/Qwen2.5-0.5B-Instruct" + def test_gguf_format(self, tiny_qwen_model_path, dataloader): bits, group_size, sym = 4, 32, False autoround = AutoRound( - model_name, + tiny_qwen_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, nsamples=2, - dataset=LLMDataLoader(), + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -54,34 +55,36 @@ def test_gguf_format(self): shutil.rmtree("./saved", ignore_errors=True) save_dir = os.path.join(os.path.dirname(__file__), "saved") - model_path = "Qwen/Qwen2.5-0.5B-Instruct" res = os.system( - f"cd ../.. && {sys.executable} -m auto_round --model {model_path} --iter 2 " + f"cd .. 
&& {sys.executable} -m auto_round --model {tiny_qwen_model_path} --iter 2 " f"--output_dir {save_dir} --nsample 2 --format gguf:q4_0 --device 0" ) print(save_dir) - self.assertFalse(res > 0 or res == -1, msg="qwen2 tuning fail") + assert not (res > 0 or res == -1), "qwen2 tuning fail" from llama_cpp import Llama - gguf_file = os.listdir("saved/Qwen2.5-0.5B-Instruct-gguf")[0] - llm = Llama(f"saved/Qwen2.5-0.5B-Instruct-gguf/{gguf_file}", n_gpu_layers=-1) + gguf_file = os.listdir("saved/tmp_tiny_qwen_model_path-gguf")[0] + llm = Llama(f"saved/tmp_tiny_qwen_model_path-gguf/{gguf_file}", n_gpu_layers=-1) output = llm("There is a girl who likes adventure,", max_tokens=32) print(output) shutil.rmtree("./saved", ignore_errors=True) @require_gguf - def test_q2_k_export(self): + def test_q2_k_export(self, dataloader): bits, group_size, sym = 2, 16, False - model_name = "Qwen/Qwen2.5-1.5B-Instruct" + model_path = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") + model = get_tiny_model(model_path) + tokenizer = AutoTokenizer.from_pretrained(model_path) autoround = AutoRound( - model_name, + model, + tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=1, - dataset=LLMDataLoader(), + dataset=dataloader, data_type="int_asym_dq", ) autoround.quantize() @@ -94,20 +97,13 @@ def test_q2_k_export(self): inputs = autoround.tokenizer(text, return_tensors="pt").to(model.device) result = autoround.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0]) print(result) - - from auto_round.eval.evaluation import simple_evaluate_user_model - - result = simple_evaluate_user_model(model, autoround.tokenizer, batch_size=16, tasks="piqa") - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.45) - shutil.rmtree(quantized_model_path, ignore_errors=True) @require_gguf - def test_basic_usage(self): - model_name = "Qwen/Qwen2.5-0.5B-Instruct" + def test_basic_usage(self, tiny_qwen_model_path): python_path = sys.executable res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {model_name} --eval_task_by_task" + f"cd .. 
&& {python_path} -m auto_round --model {tiny_qwen_model_path} --eval_task_by_task" f" --tasks piqa,openbookqa --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0 --eval_model_dtype bf16" ) if res > 0 or res == -1: @@ -116,7 +112,7 @@ def test_basic_usage(self): @require_gguf def test_q4_0(self): - model_name = "Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") bits, group_size, sym = 4, 32, True autoround = AutoRound(model_name, bits=bits, group_size=group_size, sym=sym, iters=1, data_type="int") autoround.quantize() @@ -132,12 +128,12 @@ def test_q4_0(self): from auto_round.eval.evaluation import simple_evaluate_user_model result = simple_evaluate_user_model(model, autoround.tokenizer, batch_size=16, tasks="piqa") - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.54) + assert result["results"]["piqa"]["acc,none"] > 0.54 shutil.rmtree(quantized_model_path, ignore_errors=True) @require_gguf def test_q4_1(self): - model_name = "Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") bits, group_size, sym = 4, 32, False autoround = AutoRound(model=model_name, bits=bits, group_size=group_size, sym=sym, iters=1, data_type="int") autoround.quantize() @@ -153,36 +149,28 @@ def test_q4_1(self): from auto_round.eval.evaluation import simple_evaluate_user_model result = simple_evaluate_user_model(model, autoround.tokenizer, batch_size=16, tasks="piqa") - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.54) + assert result["results"]["piqa"]["acc,none"] > 0.54 shutil.rmtree("./saved", ignore_errors=True) @require_gguf def test_all_format(self): - from auto_round.export.export_to_gguf.config import GGUF_CONFIG + for model_name in ["qwen/Qwen3-8B", "meta-llama/Llama-3.1-8B-Instruct", "meta-llama/Llama-3.2-3B"]: + for gguf_format in ["gguf:q5_0", "gguf:q5_1", "gguf:q3_k_m", "gguf:q5_k_m", "gguf:q6_k", "gguf:q8_0"]: + model_path = get_model_path(model_name) + tiny_model_path = "tmp_tiny_model" + tiny_model_path = save_tiny_model(model_path, tiny_model_path, num_layers=2) + ar = AutoRound(tiny_model_path, scheme=gguf_format, iters=0, nsamples=1, seqlen=16) + ar.quantize_and_save(output_dir=self.save_dir, format=gguf_format) - python_path = sys.executable - for model_name in ["/models/Qwen3-8B/", "/models/Llama-3.2-3B/", "/models/Meta-Llama-3.1-8B-Instruct"]: - for gguf_format in GGUF_CONFIG.keys(): - print(model_name, gguf_format) - res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {model_name} " - f" --bs 16 --iters 1 --nsamples 1 --format fake,{gguf_format}" - ) - if res > 0 or res == -1: - assert False, "cmd line test fail, please have a check" - shutil.rmtree("../../tmp_autoround", ignore_errors=True) - - res = os.system( - f"cd ../..
&& {python_path} -m auto_round --model {model_name} " - f" --bs 16 --iters 0 --nsamples 1 --format {gguf_format}" - ) - if res > 0 or res == -1: - assert False, "cmd line test fail, please have a check" - shutil.rmtree("../../tmp_autoround", ignore_errors=True) + ar = AutoRound(tiny_model_path, scheme=gguf_format, iters=1, nsamples=1, seqlen=16) + ar.quantize_and_save(output_dir=self.save_dir, format=gguf_format) + + shutil.rmtree(tiny_model_path, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_gguf def test_vlm_gguf(self): - model_name = "/models/Qwen2.5-VL-7B-Instruct" + model_name = "/models/Qwen2-VL-2B-Instruct" from auto_round import AutoRoundMLLM from auto_round.utils import mllm_load_model @@ -197,11 +185,11 @@ def test_vlm_gguf(self): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - self.assertTrue("mmproj-model.gguf" in os.listdir("./saved")) - file_size = os.path.getsize("./saved/Qwen2.5-VL-7B-Instruct-Q4_0.gguf") / 1024**2 - self.assertAlmostEqual(file_size, 4242, delta=5.0) + assert "mmproj-model.gguf" in os.listdir("./saved") + file_size = os.path.getsize("./saved/Qwen2-VL-2B-Instruct-Q4_0.gguf") / 1024**2 + assert abs(file_size - 4242) < 5.0 file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2 - self.assertAlmostEqual(file_size, 2580, delta=5.0) + assert abs(file_size - 2580) < 5.0 shutil.rmtree("./saved", ignore_errors=True) model_name = "/models/gemma-3-12b-it" @@ -218,41 +206,9 @@ def test_vlm_gguf(self): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m") - self.assertTrue("mmproj-model.gguf" in os.listdir("./saved")) + assert "mmproj-model.gguf" in os.listdir("./saved") file_size = os.path.getsize("./saved/gemma-3-12B-it-Q4_K_M.gguf") / 1024**2 - self.assertAlmostEqual(file_size, 6568, delta=5.0) + assert abs(file_size - 6568) < 5.0 file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2 - self.assertAlmostEqual(file_size, 1599, delta=5.0) + assert abs(file_size - 1599) < 5.0 shutil.rmtree(quantized_model_path, ignore_errors=True) - - # @require_gguf - # def test_llama_4(self): - # model_name = "/dataset/Llama-4-Scout-17B-16E-Instruct/" - # from auto_round import AutoRoundMLLM - # from auto_round.utils import mllm_load_model - # model, processor, tokenizer, image_processor = mllm_load_model(model_name, use_auto_mapping=False) - # autoround = AutoRoundMLLM( - # model, - # tokenizer=tokenizer, - # processor=processor, - # image_processor=image_processor, - # device="auto", - # iters=0, - # ) - # quantized_model_path = "/dataset/Llam-4-test" - # shutil.rmtree(quantized_model_path, ignore_errors=True) - # autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - # self.assertTrue("mmproj-model.gguf" in os.listdir(quantized_model_path)) - # file_size = ( - # os.path.getsize(os.path.join(quantized_model_path, "Llama-4-Scout-17B-16E-Instruct-16x17B-Q4_0.gguf")) - # / 1024**2 - # ) - # self.assertAlmostEqual(file_size, 58093.62, delta=1.0) - # file_size = os.path.getsize(os.path.join(quantized_model_path, "mmproj-model.gguf")) / 1024**2 - # self.assertAlmostEqual(file_size, 3326.18, delta=5.0) - # shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_main_func.py b/test/test_cuda/test_main_func.py index 571fc10f5..ac8b8b91e 100644 --- a/test/test_cuda/test_main_func.py +++
b/test/test_cuda/test_main_func.py @@ -1,10 +1,8 @@ import copy import re import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from lm_eval.utils import make_table # pylint: disable=E0401 @@ -15,6 +13,8 @@ from auto_round.eval.evaluation import simple_evaluate from auto_round.testing_utils import require_awq, require_gptqmodel, require_optimum, require_package_version_ut +from ..helpers import get_model_path + def get_accuracy(data): match = re.search(r"\|acc\s+\|[↑↓]\s+\|\s+([\d.]+)\|", data) @@ -26,21 +26,27 @@ def get_accuracy(data): return 0.0 -class TestMainFunc(unittest.TestCase): - @classmethod - def setUpClass(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" +class TestMainFunc: + save_dir = "./saved" + tasks = "lambada_openai" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - @classmethod - def tearDownClass(self): + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel @require_optimum def test_backend(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=4, group_size=128) @@ -68,7 +74,7 @@ def test_backend(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_backend_awq(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=4, group_size=128) @@ -83,10 +89,10 @@ def test_backend_awq(self): assert accuracy > 0.35 shutil.rmtree("./saved", ignore_errors=True) - @unittest.skipIf(torch.cuda.is_available() is False, "Skipping because no cuda") + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @require_gptqmodel def test_fp_layers(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) from auto_round.compressors.utils import get_fp_layer_names @@ -107,11 +113,11 @@ def test_fp_layers(self): assert accuracy > 0.35 shutil.rmtree("./saved", ignore_errors=True) - @unittest.skipIf(torch.cuda.is_available() is False, "Skipping because no cuda") + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_fp_layers_awq(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) from auto_round.compressors.utils import get_fp_layer_names @@ -132,18 +138,17 @@ def test_fp_layers_awq(self): assert accuracy > 0.35 shutil.rmtree("./saved", ignore_errors=True) - 
@unittest.skipIf(torch.cuda.is_available() is False, "Skipping because no cuda") - def test_undivided_group_size_tuning(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") + def test_undivided_group_size_tuning(self, tiny_opt_model_path): + model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path, torch_dtype=torch.float16, device_map="auto") + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path) autoround = AutoRound(model, tokenizer, bits=4, group_size=127, nsamples=2, iters=2) autoround.quantize() @require_gptqmodel def test_adam(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRoundAdam(model, tokenizer, bits=4, group_size=128) @@ -164,7 +169,7 @@ def test_autoround_asym(self): ##need to install false except ImportError as e: print("skip autoround asym test, as autoround is not installed from source") return - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=4, group_size=128, sym=False) @@ -179,12 +184,12 @@ def test_autoround_asym(self): ##need to install false assert accuracy > 0.35 shutil.rmtree("./saved", ignore_errors=True) - def test_attention_mask_lm_head(self): + def test_attention_mask_lm_head(self, tiny_qwen_moe_model_path): from transformers import AutoTokenizer - model_name = "/models/Qwen3-8B" + # model_name = "/models/Qwen3-8B" # model_name = "/models/Qwen3-0.6B" - tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_moe_model_path) text = ["haha", "hello world"] res = tokenizer(text, return_tensors="pt", max_length=8, padding="max_length", truncation=True) res.data.pop("attention_mask") @@ -196,14 +201,13 @@ def test_attention_mask_lm_head(self): data.append(res.data) from auto_round import AutoRound - ar = AutoRound(model_name, iters=1, dataset=data, seqlen=8, quant_lm_head=True) + ar = AutoRound(tiny_qwen_moe_model_path, iters=1, dataset=data, seqlen=8, quant_lm_head=True) ar.quantize() - def test_low_cpu_mem_usage(self): + def test_low_cpu_mem_usage(self, tiny_opt_model_path): bits, group_size = 4, 32 - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path, torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path, trust_remote_code=True) quantized_model_path = "./saved" autoround = AutoRound( model, @@ -216,7 +220,3 @@ def test_low_cpu_mem_usage(self): ) autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_marlin_backend.py b/test/test_cuda/test_marlin_backend.py index 
26d3ddca2..8d7594086 100644 --- a/test/test_cuda/test_marlin_backend.py +++ b/test/test_cuda/test_marlin_backend.py @@ -1,29 +1,32 @@ import shutil -import sys -import unittest import pytest - -sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate_user_model +from ..helpers import get_model_path, model_infer + -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 +class TestAutoRoundMarlinBackend: + model_name = get_model_path("facebook/opt-125m") + save_dir = "./saved" - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + # Yield to hand control to the test methods + yield -class TestAutoRoundMarlinBackend(unittest.TestCase): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) - def test_marlin_group_size(self): + def test_marlin_group_size(self, dataloader): for group_size in [-1, 64]: print(f"{group_size}!!!!!!!!!!!!!!!!!") model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) @@ -37,21 +40,21 @@ def test_marlin_group_size(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") quantization_config = AutoRoundConfig(backend="marlin") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.14) + assert result["results"]["lambada_openai"]["acc,none"] > 0.14 for group_size in [32, 128]: print(f"{group_size}!!!!!!!!!!!!!!!!!") @@ -66,60 +69,23 @@ def test_marlin_group_size(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") quantization_config = AutoRoundConfig(backend="marlin") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) -
self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.14) - - @classmethod - def setUpClass(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + assert result["results"]["lambada_openai"]["acc,none"] > 0.14 - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] - - @classmethod - def tearDownClass(self): - shutil.rmtree("./saved", ignore_errors=True) - shutil.rmtree("runs", ignore_errors=True) - - def test_marlin_4bits_sym_with_zp_m_1(self): + def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -131,32 +97,32 @@ def test_marlin_4bits_sym_with_zp_m_1(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") quantization_config = AutoRoundConfig(backend="marlin") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) + assert result["results"]["lambada_openai"]["acc,none"] > 0.27 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) + assert result["results"]["lambada_openai"]["acc,none"] > 0.27 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ 
-172,41 +138,37 @@ def test_marlin_4bits_sym_with_zp_m_1(self): # sym=sym, # iters=1, # seqlen=2, - # dataset=self.llm_dataloader, + # dataset=dataloader, # ) - # quantized_model_path = self.save_folder + # quantized_model_path = self.save_dir # autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") # # quantization_config = AutoRoundConfig(backend="marlin") # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype=torch.float16, # device_map="auto", # quantization_config=quantization_config # ) # - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - # self.model_infer(model, tokenizer) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + # model_infer(model, tokenizer) # result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - # self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.27) + # assert result['results']['lambada_openai']['acc,none'] > 0.27 # torch.cuda.empty_cache() # # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype=torch.bfloat16, # device_map="auto", # quantization_config=quantization_config # ) # - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - # self.model_infer(model, tokenizer) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + # model_infer(model, tokenizer) # result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - # self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.27) + # assert result['results']['lambada_openai']['acc,none'] > 0.27 # torch.cuda.empty_cache() # shutil.rmtree("./saved", ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_mix_bits.py b/test/test_cuda/test_mix_bits.py index 4f7d39d8c..6988709d5 100644 --- a/test/test_cuda/test_mix_bits.py +++ b/test/test_cuda/test_mix_bits.py @@ -1,14 +1,9 @@ import json import os import shutil -import sys -import unittest - -from parameterized import parameterized - -sys.path.insert(0, "../..") from pathlib import Path +import pytest import torch from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer @@ -19,32 +14,27 @@ require_package_version_ut, ) +from ..helpers import get_model_path -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +class TestAutoRound: + save_dir = "./saved" + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "/models/opt-125m" - self.save_dir = "./saved" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() + # Yield to hand control to the test methods + yield - @classmethod - def tearDownClass(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel - def 
test_mixed_gptqmodel(self): + def test_mixed_gptqmodel(self, tiny_opt_model_path, dataloader): scheme = "W4A16" layer_config = { "k_proj": {"bits": 8}, # part name @@ -54,12 +44,12 @@ def test_mixed_gptqmodel(self): "model.decoder.layers.0.self_attn.q_proj": {"bits": 8}, # full name } autoround = AutoRound( - model=self.model_name, + model=tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -73,7 +63,7 @@ def test_mixed_gptqmodel(self): print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_gptqmodel_convert_to_ar(self): + def test_mixed_gptqmodel_convert_to_ar(self, tiny_opt_model_path, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 4}, # set lm_head quant @@ -81,12 +71,12 @@ def test_mixed_gptqmodel_convert_to_ar(self): "model.decoder.layers.0.self_attn.q_proj": {"bits": 8}, # full name } autoround = AutoRound( - model=self.model_name, + model=tiny_opt_model_path, scheme="W4A16", iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -101,7 +91,7 @@ def test_mixed_gptqmodel_convert_to_ar(self): print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_autoround_format(self): + def test_mixed_autoround_format(self, tiny_opt_model_path, dataloader): layer_config = { "k_proj": {"bits": 8}, "q_proj": {"bits": 3}, @@ -109,11 +99,11 @@ def test_mixed_autoround_format(self): "fc1": {"bits": 16}, } autoround = AutoRound( - model=self.model_name, + model=tiny_opt_model_path, scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "self.save_dir" @@ -129,18 +119,17 @@ def test_mixed_autoround_format(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") - def test_fallback_regex_for_awq_format(self): - model_name = "facebook/opt-125m" + def test_fallback_regex_for_awq_format(self, tiny_opt_model_path, dataloader): layer_config = { "lm_head": {"bits": 16}, "fc1": {"bits": 16}, } autoround = AutoRound( - model=model_name, + model=tiny_opt_model_path, scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "self.save_dir" @@ -155,18 +144,18 @@ def test_fallback_regex_for_awq_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_ar_format_part_name_hf_loading(self): + def test_mixed_ar_format_part_name_hf_loading(self, tiny_opt_model_path, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 16}, # full name ".*fc1.*": {"bits": 16}, # standard regex } autoround = AutoRound( - model=self.model_name, + model=tiny_opt_model_path, scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "self.save_dir" @@ -216,18 +205,19 @@ def test_mixed_ar_format_part_name_hf_loading(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def 
test_mixed_MXFP_autoround_format_loading(self): + def test_mixed_MXFP_autoround_format_loading(self, dataloader): layer_config = { "k_proj": {"bits": 8, "act_bits": 8}, "lm_head": {"bits": 16, "act_bits": 16}, "fc1": {"bits": 8, "act_bits": 8}, } + model_path = get_model_path("facebook/opt-125m") autoround = AutoRound( - self.model_name, + model_path, scheme="MXFP4", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -242,21 +232,21 @@ def test_mixed_MXFP_autoround_format_loading(self): result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.32) + assert result["results"]["lambada_openai"]["acc,none"] > 0.32 shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_autoround_format_vllm(self): + def test_mixed_autoround_format_vllm(self, tiny_opt_model_path, dataloader): layer_config = { "self_attn": {"bits": 8}, "lm_head": {"bits": 16}, } autoround = AutoRound( - self.model, - self.tokenizer, + tiny_opt_model_path, scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -285,18 +275,18 @@ def test_mixed_autoround_format_vllm(self): print(f"{prompt}: {generated_text}") shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_llmcompressor_format_vllm(self): + def test_mixed_llmcompressor_format_vllm(self, tiny_opt_model_path, dataloader): layer_config = { "self_attn": {"bits": 16, "act_bits": 16}, "lm_head": {"bits": 16, "act_bits": 16}, "fc1": {"bits": 16, "act_bits": 16}, } autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme="NVFP4", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -323,7 +313,3 @@ def test_mixed_llmcompressor_format_vllm(self): print(f"{prompt}: {generated_text}") assert "!!!"
not in generated_text shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_multiple_card.py b/test/test_cuda/test_multiple_card.py index 5dac584fe..e09975a19 100644 --- a/test/test_cuda/test_multiple_card.py +++ b/test/test_cuda/test_multiple_card.py @@ -1,11 +1,7 @@ import re import shutil -import sys -import unittest - -sys.path.insert(0, "../..") - +import pytest import torch from lm_eval.utils import make_table # pylint: disable=E0401 from transformers import AutoModelForCausalLM, AutoTokenizer @@ -14,6 +10,8 @@ from auto_round.eval.evaluation import simple_evaluate from auto_round.testing_utils import multi_card, require_gptqmodel, require_greater_than_050 +from ..helpers import get_model_path, get_tiny_model + def get_accuracy(data): match = re.search(r"\|acc\s+\|[↑↓]\s+\|\s+([\d.]+)\|", data) @@ -27,15 +25,21 @@ def get_accuracy(data): # import os # os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" - - @classmethod - def tearDownClass(self): - shutil.rmtree(self.save_dir, ignore_errors=True) +class TestAutoRound: + save_dir = "./saved" + tasks = "lambada_openai" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @multi_card @@ -57,10 +61,9 @@ def test_device_map_str(self): shutil.rmtree("./saved", ignore_errors=True) @multi_card - def test_layer_norm(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_layer_norm(self, tiny_opt_model_path): + model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path, torch_dtype=torch.float16) + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path) device_map = {"norm": "cuda:1"} autoround = AutoRound( model, tokenizer, iters=2, device_map=device_map, nsamples=7, seqlen=32, enable_norm_bias_tuning=True @@ -68,10 +71,9 @@ def test_layer_norm(self): autoround.quantize() @multi_card - def test_rms_norm(self): - model_name = "/models/Qwen2-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_rms_norm(self, tiny_qwen_model_path): + model = AutoModelForCausalLM.from_pretrained(tiny_qwen_model_path, torch_dtype=torch.float16) + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_model_path) device_map = {"norm": "cuda:1"} autoround = AutoRound( model, tokenizer, iters=2, device_map=device_map, nsamples=7, seqlen=32, enable_norm_bias_tuning=True @@ -79,10 +81,9 @@ def test_rms_norm(self): autoround.quantize() @multi_card - def test_act_quantization(self): - model_name = "/models/Qwen2-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_act_quantization(self, tiny_qwen_model_path): + model = AutoModelForCausalLM.from_pretrained(tiny_qwen_model_path, torch_dtype=torch.float16) + tokenizer = 
AutoTokenizer.from_pretrained(tiny_qwen_model_path) device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "1"} autoround = AutoRound( model, tokenizer, iters=2, device_map=device_map, nsamples=7, seqlen=32, act_bits=4, act_dynamic=False @@ -91,9 +92,9 @@ def test_act_quantization(self): @multi_card def test_lm_head(self): - model_name = "/models/Qwen2.5-7B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) - tokenizer = AutoTokenizer.from_pretrained(model_name) + model_path = get_model_path("qwen/Qwen2.5-7B-Instruct") + model = get_tiny_model(model_path) + tokenizer = AutoTokenizer.from_pretrained(model_path) device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "1", "lm_head": 1} layer_config = {"lm_head": {"bits": 4}} autoround = AutoRound( @@ -109,10 +110,9 @@ def test_lm_head(self): autoround.quantize() @multi_card - def test_device_map(self): - model_name = "/models/Qwen2-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_device_map(self, tiny_qwen_model_path): + model = AutoModelForCausalLM.from_pretrained(tiny_qwen_model_path, torch_dtype=torch.float16) + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_model_path) device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "cpu"} autoround = AutoRound(model, tokenizer, iters=2, device_map=device_map, nsamples=7, seqlen=32) autoround.quantize() @@ -210,12 +210,11 @@ def test_device_map(self): torch.cuda.empty_cache() @multi_card - def test_device_map_dict(self): + def test_device_map_dict(self, tiny_opt_model_path): device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "1"} bits, group_size, sym = 4, 128, False - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path, torch_dtype=torch.float16, device_map="auto") + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path) autoround = AutoRound( model, tokenizer, @@ -229,9 +228,8 @@ def test_device_map_dict(self): autoround.quantize() # test model_name - model_name = "/models/opt-125m" autoround = AutoRound( - model_name, + tiny_opt_model_path, tokenizer, bits=bits, group_size=group_size, @@ -244,7 +242,7 @@ def test_device_map_dict(self): # test rtn autoround = AutoRound( - model_name, + tiny_opt_model_path, tokenizer, bits=bits, group_size=group_size, @@ -356,29 +354,25 @@ def test_device_map_for_triton(self): @multi_card def test_mllm_device_map(self): - model_name = "/models/Qwen2-VL-2B-Instruct/" + model_name = get_model_path("qwen/Qwen2-VL-2B-Instruct/") from auto_round import AutoRoundMLLM device_map = "0,1" ar = AutoRoundMLLM(model_name, device_map=device_map) - self.assertEqual(ar.device, "cuda:0") - self.assertEqual(ar.device_map, device_map) + assert ar.device == "cuda:0" + assert ar.device_map == device_map device_map = 1 ar = AutoRoundMLLM(ar.model, ar.tokenizer, processor=ar.processor, device_map=device_map) - self.assertEqual(ar.device, "cuda:1") - self.assertEqual(ar.device_map, device_map) + assert ar.device == "cuda:1" + assert ar.device_map == device_map device_map = "auto" ar = AutoRoundMLLM(ar.model, ar.tokenizer, processor=ar.processor, device_map=device_map) - self.assertEqual(ar.device, "cuda") - 
self.assertEqual(ar.device_map, device_map) + assert ar.device == "cuda" + assert ar.device_map == device_map device_map = {"model.language_model.layers": 0, "model.visual.blocks": 1} ar = AutoRoundMLLM(ar.model, ar.tokenizer, processor=ar.processor, device_map=device_map) - self.assertEqual(ar.model.model.language_model.layers[0].self_attn.q_proj.tuning_device, "cuda:0") - self.assertEqual(ar.model.model.visual.blocks[0].mlp.fc1.tuning_device, "cuda:1") - - -if __name__ == "__main__": - unittest.main() + assert ar.model.model.language_model.layers[0].self_attn.q_proj.tuning_device == "cuda:0" + assert ar.model.model.visual.blocks[0].mlp.fc1.tuning_device == "cuda:1" diff --git a/test/test_cuda/test_multiple_card_calib.py b/test/test_cuda/test_multiple_card_calib.py index 8d97046fb..fedb3f328 100644 --- a/test/test_cuda/test_multiple_card_calib.py +++ b/test/test_cuda/test_multiple_card_calib.py @@ -2,9 +2,8 @@ import re import shutil import sys -import unittest -sys.path.insert(0, "../..") +import pytest from auto_round.testing_utils import multi_card @@ -19,14 +18,20 @@ def get_accuracy(data): return 0.0 -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" +class TestAutoRound: + save_dir = "./saved" + tasks = "lambada_openai" - @classmethod - def tearDownClass(self): + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -36,7 +41,7 @@ def test_multiple_card_calib(self): ##test llm script res = os.system( - f"cd ../.. && {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --iters 1 --nsamples 1 --output_dir None" + f"cd .. && {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --iters 1 --nsamples 1 --output_dir None" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" @@ -47,11 +52,7 @@ def test_multiple_card_nvfp4(self): ##test llm script res = os.system( - f"cd ../.. && {python_path} -m auto_round --model facebook/opt-125m --scheme NVFP4 --devices '0,1' --iters 1 --nsamples 1 --enable_torch_compile --low_gpu_mem_usage" + f"cd .. 
&& {python_path} -m auto_round --model facebook/opt-125m --scheme NVFP4 --devices '0,1' --iters 1 --nsamples 1 --enable_torch_compile --low_gpu_mem_usage" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_mxfp_and_nvfp_quant.py b/test/test_cuda/test_mxfp_and_nvfp_quant.py index 0dc43b093..808fa4a28 100644 --- a/test/test_cuda/test_mxfp_and_nvfp_quant.py +++ b/test/test_cuda/test_mxfp_and_nvfp_quant.py @@ -12,6 +12,8 @@ from auto_round.export.export_to_autoround import qlinear_fp as ar_qlinear_fp from auto_round.testing_utils import has_module +from ..helpers import get_model_path + testing_schemes = [AutoRoundFormat.MXFP8.value, AutoRoundFormat.MXFP4.value, AutoRoundFormat.NVFP4.value] QMODULE_MAPPING = { AutoRoundFormat.MXFP8.value: ar_qmodules.MXFP8QuantLinear, @@ -22,15 +24,14 @@ @pytest.mark.parametrize("scheme", testing_schemes) @torch.inference_mode() -def test_e2e_quant_and_infer(scheme): +def test_e2e_quant_and_infer(scheme, tiny_qwen_model_path): # Use a temporary directory for saving the quantized model with tempfile.TemporaryDirectory() as temp_dir: - model_name = "Qwen/Qwen2.5-0.5B-Instruct" # Load the tokenizer and model - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_model_path, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained( - model_name, + tiny_qwen_model_path, device_map="cpu", torch_dtype="auto", trust_remote_code=True, diff --git a/test/test_cuda/test_mxfp_nvfp.py b/test/test_cuda/test_mxfp_nvfp.py index 552016f17..41c996b95 100644 --- a/test/test_cuda/test_mxfp_nvfp.py +++ b/test/test_cuda/test_mxfp_nvfp.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer @@ -11,37 +9,34 @@ from auto_round import AutoRound from auto_round.testing_utils import require_awq, require_optimum +from ..helpers import get_model_path, get_tiny_model -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +class TestAutoRound: + save_dir = "./saved" + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "facebook/opt-125m" - self.save_dir = "./saved" - self.llm_dataloader = LLMDataLoader() + # Yield to hand control to the test methods + yield - @classmethod - def tearDownClass(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_fp8input_mxfp4_llmcompressor_format(self): - model_name = "/models/Qwen3-0.6B-FP8" + def test_fp8input_mxfp4_llmcompressor_format(self, dataloader): + model_name = get_model_path("qwen/Qwen3-0.6B-FP8") scheme = "mxfp4" ar = AutoRound( model=model_name, iters=2, seqlen=2, scheme=scheme, - dataset=self.llm_dataloader, + dataset=dataloader, ) compressed_model, _ = ar.quantize_and_save(output_dir=self.save_dir, format="llm_compressor") tmp_layer = compressed_model.model.layers[3].self_attn.q_proj @@ -59,18 +54,18 @@ def 
test_fp8input_mxfp4_llmcompressor_format(self): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree(self.save_dir, ignore_errors=True) - def test_nvfp4_llmcompressor_format(self): + def test_nvfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): scheme = "nvfp4" autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_global_scale") @@ -110,38 +105,37 @@ def test_nvfp4_llmcompressor_format(self): # if "France" in prompt: # assert "Paris" in generated_text - def test_nvfp4_moe_actmax_rtn(self): - model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" + def test_nvfp4_moe_actmax_rtn(self, tiny_deepseek_v2_model_path, dataloader): + # model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" scheme = "nvfp4" autoround = AutoRound( - model_name, + tiny_deepseek_v2_model_path, scheme=scheme, iters=0, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = self.save_dir autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") - def test_nvfp4_moe_actmax_ar(self): - model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" + def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader): scheme = "nvfp4" autoround = AutoRound( - model_name, + tiny_deepseek_v2_model_path, scheme=scheme, iters=1, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = self.save_dir autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") - def test_qwen_moe_quant_infer(self): - model_name = "/models/Qwen1.5-MoE-A2.7B" + def test_qwen_moe_quant_infer(self, dataloader): + model_name = get_model_path("qwen/Qwen1.5-MoE-A2.7B") layer_config = { "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, } @@ -152,7 +146,7 @@ def test_qwen_moe_quant_infer(self): iters=1, seqlen=3, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -163,9 +157,5 @@ def test_qwen_moe_quant_infer(self): result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa") print(result["results"]["piqa"]["acc,none"]) - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.49) + assert result["results"]["piqa"]["acc,none"] > 0.49 shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_qbits.py b/test/test_cuda/test_qbits.py index d73d474d6..37e119b2c 100644 --- a/test/test_cuda/test_qbits.py +++ b/test/test_cuda/test_qbits.py @@ -1,48 +1,28 @@ import shutil -import sys -import unittest - -sys.path.insert(0, "../..") +import pytest from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundConfig from auto_round.testing_utils import require_gptqmodel, require_itrex +from ..helpers import get_model_path, model_infer -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = 
"/models/opt-125m" - self.save_folder = "./saved" - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] +class TestAutoRound: + save_dir = "./saved" - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) + # Yield to hand control to the test methods + yield - @classmethod - def tearDownClass(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) ## require torch 2.6 @@ -58,7 +38,7 @@ def test_load_gptq_model_8bits(self): quantization_config=quantization_config, ) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) @require_itrex def test_load_gptq_model_2bits(self): @@ -72,12 +52,13 @@ def test_load_gptq_model_2bits(self): quantization_config=quantization_config, ) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) @require_itrex def test_mixed_precision(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_path = get_model_path("facebook/opt-125m") + model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) layer_config = {} layer_config["model.decoder.layers.0.self_attn.k_proj"] = {"bits": 8} @@ -90,27 +71,29 @@ def test_mixed_precision(self): autoround = AutoRound( model, tokenizer, bits=bits, group_size=group_size, iters=1, nsamples=1, sym=sym, layer_config=layer_config ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, + self.save_dir, torch_dtype=torch.float16, device_map="cpu", ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) print(res) assert "!!!" 
not in res - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_gptqmodel - def test_autoround_sym(self): + def test_autoround_sym(self, tiny_opt_model_path): for bits in [4]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained( + tiny_opt_model_path, torch_dtype="auto", trust_remote_code=True + ) + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path, trust_remote_code=True) bits, group_size, sym = bits, 128, True autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2) quantized_model_path = "./saved" @@ -126,4 +109,4 @@ def test_autoround_sym(self): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) print(res) assert "!!!" not in res - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cuda/test_scheme.py b/test/test_cuda/test_scheme.py index 1c603c7ed..2ed5527bd 100644 --- a/test/test_cuda/test_scheme.py +++ b/test/test_cuda/test_scheme.py @@ -1,103 +1,104 @@ import shutil -import sys -import unittest +import pytest + +from auto_round import AutoRound from auto_round.schemes import QuantizationScheme -sys.path.insert(0, "../..") +from ..helpers import get_model_path -from auto_round import AutoRound +class TestAutoRound: + save_dir = "./saved" -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - @classmethod - def tearDownClass(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) # Tuning tests - def test_gguf(self): - ar = AutoRound("/models/Qwen3-0.6B", scheme="W2A16", nsamples=1, iters=1) - ar.quantize_and_save(self.save_folder, format="gguf:q4_k_m") - self.assertEqual(ar.bits, 4) - shutil.rmtree(self.save_folder, ignore_errors=True) - - def test_w4a16(self): - ar = AutoRound(self.model_name, scheme="W4A16", nsamples=1, iters=1) - self.assertEqual(ar.bits, 4) + def test_gguf(self, tiny_qwen_model_path): + ar = AutoRound(tiny_qwen_model_path, scheme="W2A16", nsamples=1, iters=1) + ar.quantize_and_save(self.save_dir, format="gguf:q4_k_m") + assert ar.bits == 4 + shutil.rmtree(self.save_dir, ignore_errors=True) + + def test_w4a16(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1) + assert ar.bits == 4 ar.quantize() - def test_w2a16(self): - ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=1) - self.assertEqual(ar.bits, 2) + def test_w2a16(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=1) + assert ar.bits == 2 ar.quantize() - def test_mxfp4(self): - ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=1) - self.assertEqual(ar.bits, 4) - self.assertEqual(ar.act_bits, 4) - self.assertEqual(ar.data_type, "mx_fp") - self.assertEqual(ar.act_data_type, 
"mx_fp_rceil") + def test_mxfp4(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1) + assert ar.bits == 4 + assert ar.act_bits == 4 + assert ar.data_type == "mx_fp" + assert ar.act_data_type == "mx_fp_rceil" ar.quantize() - def test_fp8_static(self): - ar = AutoRound(self.model_name, scheme="FP8_STATIC", nsamples=1, iters=1) - self.assertEqual(ar.bits, 8) - self.assertEqual(ar.act_bits, 8) - self.assertEqual(ar.data_type, "fp") - self.assertEqual(ar.act_data_type, "fp") - self.assertEqual(ar.group_size, -1) - self.assertEqual(ar.act_dynamic, False) + def test_fp8_static(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="FP8_STATIC", nsamples=1, iters=1) + assert ar.bits == 8 + assert ar.act_bits == 8 + assert ar.data_type == "fp" + assert ar.act_data_type == "fp" + assert ar.group_size == -1 + assert ar.act_dynamic is False ar.quantize() ## RTN tests - def test_w2a16_rtn(self): - ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=0) - self.assertEqual(ar.bits, 2) + def test_w2a16_rtn(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0) + assert ar.bits == 2 ar.quantize() - def test_mxfp4_rtn(self): - ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=0) - self.assertEqual(ar.bits, 4) - self.assertEqual(ar.act_bits, 4) - self.assertEqual(ar.data_type, "mx_fp") - self.assertEqual(ar.act_data_type, "mx_fp_rceil") + def test_mxfp4_rtn(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=0) + assert ar.bits == 4 + assert ar.act_bits == 4 + assert ar.data_type == "mx_fp" + assert ar.act_data_type == "mx_fp_rceil" ar.quantize() - def test_fp8_static_rtn(self): - ar = AutoRound(self.model_name, scheme="FP8_STATIC", nsamples=1, iters=0) - self.assertEqual(ar.bits, 8) - self.assertEqual(ar.act_bits, 8) - self.assertEqual(ar.data_type, "fp") - self.assertEqual(ar.act_data_type, "fp") - self.assertEqual(ar.group_size, -1) - self.assertEqual(ar.act_dynamic, False) + def test_fp8_static_rtn(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="FP8_STATIC", nsamples=1, iters=0) + assert ar.bits == 8 + assert ar.act_bits == 8 + assert ar.data_type == "fp" + assert ar.act_data_type == "fp" + assert ar.group_size == -1 + assert ar.act_dynamic is False ar.quantize() def test_scheme_in_layer_config(self): + model_path = get_model_path("facebook/opt-125m") layer_config = { "model.decoder.layers.2.self_attn": {"bits": 2}, "model.decoder.layers.3.self_attn.v_proj": "W8A16", "model.decoder.layers.4.self_attn.k_proj": QuantizationScheme.from_dict({"group_size": 64}), } - ar = AutoRound(self.model_name, scheme="W3A16", nsamples=1, iters=1, layer_config=layer_config) + ar = AutoRound(model_path, scheme="W3A16", nsamples=1, iters=1, layer_config=layer_config) ar.quantize() for n, m in ar.model.named_modules(): if n == "model.decoder.layers.2.self_attn.q_proj": - self.assertEqual(m.bits, 2) + assert m.bits == 2 if n == "model.decoder.layers.2.self_attn.k_proj": - self.assertEqual(m.bits, 2) + assert m.bits == 2 if n == "model.decoder.layers.3.self_attn.v_proj": - self.assertEqual(m.bits, 8) + assert m.bits == 8 if n == "model.decoder.layers.4.self_attn.k_proj": - self.assertEqual(m.group_size, 64) - - -if __name__ == "__main__": - unittest.main() + assert m.group_size == 64 diff --git a/test/test_cuda/test_support_vlms.py b/test/test_cuda/test_support_vlms.py index 5a2759021..3358c8226 100644 --- 
a/test/test_cuda/test_support_vlms.py +++ b/test/test_cuda/test_support_vlms.py @@ -1,10 +1,8 @@ import os import shutil import sys -import unittest - -sys.path.insert(0, "../..") +import pytest import requests from PIL import Image @@ -12,15 +10,15 @@ from auto_round.testing_utils import require_gptqmodel, require_package_version_ut, require_vlm_env -class TestSupportVLMS(unittest.TestCase): +class TestSupportVLMS: @classmethod - def setUpClass(self): + def setup_class(self): self.save_dir = os.path.join(os.path.dirname(__file__), "ut_saved") self.python_path = sys.executable self.device = 0 @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_dir, ignore_errors=True) @require_gptqmodel @@ -28,10 +26,10 @@ def test_qwen2(self): model_path = "/models/Qwen2-VL-2B-Instruct/" # test tune res = os.system( - f"cd ../.. && {self.python_path} -m auto_round --mllm " + f"cd .. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}" ) - self.assertFalse(res > 0 or res == -1, msg="qwen2 tuning fail") + assert not (res > 0 or res == -1), "qwen2 tuning fail" # test infer quantized_model_path = os.path.join(self.save_dir, "Qwen2-VL-2B-Instruct-w4g128") @@ -83,10 +81,10 @@ def test_phi3(self): model_path = "/models/Phi-3.5-vision-instruct/" ## test tune res = os.system( - f"cd ../.. && {self.python_path} -m auto_round --mllm " + f"cd .. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}" ) - self.assertFalse(res > 0 or res == -1, msg="Phi-3.5 tuning fail") + assert not (res > 0 or res == -1), "Phi-3.5 tuning fail" ## test infer from transformers import AutoModelForCausalLM, AutoProcessor @@ -131,12 +129,12 @@ def test_phi3_vision_awq(self): model_path = "/models/Phi-3.5-vision-instruct/" ## test tune res = os.system( - f"cd ../.. && {self.python_path} -m auto_round --mllm " + f"cd .. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --quant_nontext_module " f"--nsample 64 --seqlen 32 " f"--format auto_awq --output_dir {self.save_dir} --device {self.device}" ) - self.assertFalse(res > 0 or res == -1, msg="Phi-3.5 tuning fail") + assert not (res > 0 or res == -1), "Phi-3.5 tuning fail" ## test infer from transformers import AutoModelForCausalLM, AutoProcessor @@ -179,20 +177,16 @@ def test_glm(self): model_path = "/models/glm-4v-9b/" ## test tune res = os.system( - f"cd ../.. && {self.python_path} -m auto_round " + f"cd .. && {self.python_path} -m auto_round " f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}" ) - self.assertFalse(res > 0 or res == -1, msg="glm-4v-9b tuning fail") + assert not (res > 0 or res == -1), "glm-4v-9b tuning fail" def test_granite_vision(self): model_path = "/models/granite-vision-3.2-2b" ## test tune res = os.system( - f"cd ../.. && {self.python_path} -m auto_round " + f"cd .. 
&& {self.python_path} -m auto_round " f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}" ) - self.assertFalse(res > 0 or res == -1, msg="granite-vision-3.2-2b tuning fail") - - -if __name__ == "__main__": - unittest.main() + assert not (res > 0 or res == -1), "granite-vision-3.2-2b tuning fail" diff --git a/test/test_cuda/test_torch_backend.py b/test/test_cuda/test_torch_backend.py index 3f7cb4141..a7eb30552 100644 --- a/test/test_cuda/test_torch_backend.py +++ b/test/test_cuda/test_torch_backend.py @@ -1,12 +1,6 @@ import shutil -import sys -import unittest import pytest - -sys.path.insert(0, "../..") - - import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -14,58 +8,30 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel +from ..helpers import get_model_path, model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRoundTorchBackend(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() +class TestAutoRoundTorchBackend: - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] + save_dir = "./saved" - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] + # Yield to hand control to the test methods + yield - @classmethod - def tearDownClass(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_torch_4bits_asym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_torch_4bits_asym(self, dataloader): + model_path = get_model_path("facebook/opt-125m") + model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( model, @@ -75,9 +41,9 @@ def test_torch_4bits_asym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir 
autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") quantization_config = AutoRoundConfig(backend="torch") @@ -85,28 +51,29 @@ def test_torch_4bits_asym(self): quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) - def test_torch_4bits_sym(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_torch_4bits_sym(self, dataloader): + model_path = get_model_path("facebook/opt-125m") + model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( model, @@ -116,9 +83,9 @@ def test_torch_4bits_sym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model quantization_config = AutoRoundConfig(backend="torch") @@ -126,14 +93,10 @@ def test_torch_4bits_sym(self): quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28) + assert result["results"]["lambada_openai"]["acc,none"] > 0.28 torch.cuda.empty_cache() - shutil.rmtree(self.save_folder, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() + shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cuda/test_transformers.py b/test/test_cuda/test_transformers.py index 6f953339d..f37fe94ff 100644 --- 
a/test/test_cuda/test_transformers.py +++ b/test/test_cuda/test_transformers.py @@ -14,8 +14,8 @@ import gc import os import tempfile -import unittest +import pytest from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from transformers.testing_utils import ( require_accelerate, @@ -27,6 +27,8 @@ ) from transformers.utils import is_torch_available +from ..helpers import get_model_path + if is_torch_available(): import torch @@ -34,7 +36,7 @@ # @slow @require_torch_gpu @require_accelerate -class AutoRoundTest(unittest.TestCase): +class AutoRoundTest: model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc" input_text = "There is a girl who likes adventure," EXPECTED_OUTPUTS = set() @@ -53,7 +55,7 @@ class AutoRoundTest(unittest.TestCase): # called only once for all test in this class @classmethod - def setUpClass(cls): + def setup_class(cls): """ Setup quantized model """ @@ -74,12 +76,12 @@ def test_quantized_model(self): """ input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) output = self.quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) - self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + assert self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS - def test_raise_if_non_quantized(self): - model_id = "facebook/opt-125m" + def test_raise_if_non_quantized(self, tiny_opt_model_path): + model_id = tiny_opt_model_path quantization_config = AutoRoundConfig(bits=4) - with self.assertRaises(ValueError): + with pytest.raises(ValueError): _ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config) def test_quantized_model_bf16(self): @@ -96,7 +98,7 @@ def test_quantized_model_bf16(self): ) output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) - self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + assert self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS @require_intel_extension_for_pytorch def test_quantized_model_on_cpu(self): @@ -108,7 +110,7 @@ def test_quantized_model_on_cpu(self): quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto") output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) - self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + assert self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS def test_save_pretrained(self): """ @@ -131,7 +133,7 @@ def test_save_pretrained(self): input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) output = model.generate(**input_ids, max_new_tokens=40, do_sample=False) - self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + assert self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS @require_torch_multi_gpu def test_quantized_model_multi_gpu(self): @@ -144,7 +146,7 @@ def test_quantized_model_multi_gpu(self): ) input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(quantized_model.device) output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) - self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + assert self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS def test_convert_from_gptq(self): """ @@ 
-185,7 +187,7 @@ def test_mixed_bits(self): """ Simple test that checks if auto-round work properly with mixed bits """ - model_name = "facebook/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) layer_config = { @@ -203,7 +205,3 @@ def test_mixed_bits(self): text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_triton_backend.py b/test/test_cuda/test_triton_backend.py index 7cbc8719d..ac5436f47 100644 --- a/test/test_cuda/test_triton_backend.py +++ b/test/test_cuda/test_triton_backend.py @@ -1,8 +1,6 @@ import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -10,56 +8,22 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_greater_than_050 +from ..helpers import model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRoundTritonBackend(unittest.TestCase): +class TestAutoRoundTritonBackend: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/models/opt-125m" self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_greater_than_050 - def test_tritonv2_4bits_asym(self): + def test_tritonv2_4bits_asym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False @@ -71,7 +35,7 @@ def test_tritonv2_4bits_asym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") @@ -82,10 +46,10 @@ def test_tritonv2_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") 
print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.34) + assert result["results"]["lambada_openai"]["acc,none"] > 0.34 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -93,10 +57,10 @@ def test_tritonv2_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.34) + assert result["results"]["lambada_openai"]["acc,none"] > 0.34 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ -115,10 +79,10 @@ def test_tritonv2_2bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.19) + assert result["results"]["lambada_openai"]["acc,none"] > 0.19 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -126,15 +90,15 @@ def test_tritonv2_2bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.19) + assert result["results"]["lambada_openai"]["acc,none"] > 0.19 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @require_greater_than_050 - def test_tritonv2_4bits_sym(self): + def test_tritonv2_4bits_sym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -146,7 +110,7 @@ def test_tritonv2_4bits_sym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path) @@ -157,10 +121,10 @@ def test_tritonv2_4bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.26) + assert result["results"]["lambada_openai"]["acc,none"] > 0.26 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -168,10 +132,10 @@ def test_tritonv2_4bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.26) + assert result["results"]["lambada_openai"]["acc,none"] > 0.26 torch.cuda.empty_cache() 
shutil.rmtree("./saved", ignore_errors=True) @@ -191,10 +155,10 @@ def test_tritonv2_8bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) + assert result["results"]["lambada_openai"]["acc,none"] > 0.27 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -202,10 +166,10 @@ def test_tritonv2_8bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) + assert result["results"]["lambada_openai"]["acc,none"] > 0.27 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ -230,10 +194,10 @@ def test_tritonv2_2bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.18) + assert result["results"]["lambada_openai"]["acc,none"] > 0.18 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -241,13 +205,9 @@ def test_tritonv2_2bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.18) + assert result["results"]["lambada_openai"]["acc,none"] > 0.18 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_vlms.py b/test/test_cuda/test_vlms.py index d06c48ff5..c8a4adb53 100644 --- a/test/test_cuda/test_vlms.py +++ b/test/test_cuda/test_vlms.py @@ -2,26 +2,22 @@ import os import re import shutil -import sys -import unittest +import pytest import requests - -sys.path.insert(0, "../..") - from PIL import Image from auto_round import AutoRoundConfig from auto_round.testing_utils import require_gptqmodel, require_optimum, require_vlm_env -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.save_dir = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_dir, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -123,12 +119,12 @@ def test_mm_block_name(self): model = MllamaForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True, device_map="auto") block_name = get_block_names(model, quant_vision=True) - self.assertTrue(len(block_name) == 3) - self.assertTrue(any(["vision_model.global_transformer.layers.0" not in n for n in block_name])) - self.assertTrue(any(["vision_model.transformer.layers.0" not in n for n in block_name])) + assert len(block_name) == 3 + assert 
any(["vision_model.global_transformer.layers.0" not in n for n in block_name]) + assert any(["vision_model.transformer.layers.0" not in n for n in block_name]) block_name = get_block_names(model, quant_vision=False) - self.assertTrue(len(block_name) == 1) - self.assertTrue(get_block_names(model) == block_name) + assert len(block_name) == 1 + assert get_block_names(model) == block_name def test_mllm_detect(self): from auto_round.utils import is_mllm_model, llm_load_model, mllm_load_model @@ -144,18 +140,14 @@ def test_mllm_detect(self): "/models/InternVL3-1B", "/models/pixtral-12b", ]: - self.assertTrue(is_mllm_model(model_name)) + assert is_mllm_model(model_name) try: model, _, _, _ = mllm_load_model(model_name) except: continue - self.assertTrue(is_mllm_model(model)) + assert is_mllm_model(model) for model_name in ["/models/glm-4-9b-chat", "/models/Qwen2.5-1.5B-Instruct/"]: - self.assertFalse(is_mllm_model(model_name)) + assert not is_mllm_model(model_name) model, _ = llm_load_model(model_name) - self.assertFalse(is_mllm_model(model)) - - -if __name__ == "__main__": - unittest.main() + assert not is_mllm_model(model) diff --git a/test/test_hpu/__init__.py b/test/test_hpu/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_hpu/_test_helpers.py b/test/test_hpu/_test_helpers.py deleted file mode 100644 index 48e8398d7..000000000 --- a/test/test_hpu/_test_helpers.py +++ /dev/null @@ -1,43 +0,0 @@ -import pytest - - -def is_pytest_mode_compile(): - return pytest.mode == "compile" - - -def is_pytest_mode_lazy(): - return pytest.mode == "lazy" - - -def model_infer(model, tokenizer, apply_chat_template=False): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - if apply_chat_template: - texts = [] - for prompt in prompts: - messages = [{"role": "user", "content": prompt}] - text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - texts.append(text) - prompts = texts - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] diff --git a/test/test_hpu/test_auto_round.py b/test/test_hpu/test_auto_round.py index 2bb7983e5..d2e33dd03 100644 --- a/test/test_hpu/test_auto_round.py +++ b/test/test_hpu/test_auto_round.py @@ -1,16 +1,17 @@ import pytest import torch -from _test_helpers import is_pytest_mode_compile, is_pytest_mode_lazy from auto_round.utils import is_hpex_available +from ..helpers import get_model_path, is_pytest_mode_compile, is_pytest_mode_lazy + def run_opt_125m_on_hpu(): from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound - model_name = "facebook/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -55,7 
+56,7 @@ def test_w4a8(data_type): from auto_round import AutoRound - model_name = "facebook/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="auto", diff --git a/test/test_hpu/test_inference.py b/test/test_hpu/test_inference.py index e0a0ef321..95c680c2d 100644 --- a/test/test_hpu/test_inference.py +++ b/test/test_hpu/test_inference.py @@ -1,23 +1,12 @@ import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - def is_hpex_available(): try: import habana_frameworks.torch.core as htcore # pylint: disable=E0401 @@ -28,16 +17,15 @@ def is_hpex_available(): # TODO: This test case is temporarily commented out since it not tested for a long time. We need to add it back and change it into pytest format. -# class TestAutoRound(unittest.TestCase): +# class TestAutoRound: # @classmethod -# def setUpClass(self): +# def setup_class(self): # model_name = "facebook/opt-125m" # self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) # self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) -# self.llm_dataloader = LLMDataLoader() # @classmethod -# def tearDownClass(self): +# def teardown_class(self): # shutil.rmtree("./saved", ignore_errors=True) # shutil.rmtree("runs", ignore_errors=True) @@ -57,7 +45,7 @@ def is_hpex_available(): # sym=sym, # iters=2, # seqlen=2, -# dataset=self.llm_dataloader, +# dataset=dataloader, # ) # autoround.quantize() # quantized_model_path = "./saved" @@ -86,7 +74,7 @@ def is_hpex_available(): # sym=sym, # iters=2, # seqlen=2, -# dataset=self.llm_dataloader, +# dataset=dataloader, # ) # autoround.quantize() # quantized_model_path = "./saved" diff --git a/test/test_xpu/__init__.py b/test/test_xpu/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py index 8052a8af0..d857e3bdc 100644 --- a/test/test_xpu/test_autoround.py +++ b/test/test_xpu/test_autoround.py @@ -1,39 +1,29 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundConfig +from ..helpers import get_model_path -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(3): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRoundXPU(unittest.TestCase): +class TestAutoRoundXPU: @classmethod - def setUpClass(self): - - self.llm_dataloader = LLMDataLoader() + def setup_class(self): + pass @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) pass - def test_gptq_format(self): - model_name = "facebook/opt-125m" + def test_gptq_format(self, dataloader): + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="auto", trust_remote_code=True, device_map="auto" ) @@ -48,7 +38,7 @@ def test_gptq_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) 
         quantized_model_path = "./saved"
         autoround.quantize_and_save(output_dir=quantized_model_path)
@@ -65,8 +55,8 @@ def test_gptq_format(self):
         print(res)
         assert "!!!" not in res

-    def test_awq_format(self):
-        model_name = "facebook/opt-125m"
+    def test_awq_format(self, dataloader):
+        model_name = get_model_path("facebook/opt-125m")
         model = AutoModelForCausalLM.from_pretrained(
             model_name, torch_dtype="auto", trust_remote_code=True, device_map="xpu"
         )
@@ -80,7 +70,7 @@ def test_awq_format(self):
             sym=sym,
             iters=2,
             seqlen=2,
-            dataset=self.llm_dataloader,
+            dataset=dataloader,
         )
         quantized_model_path = "./saved"
         autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq")
@@ -97,7 +87,3 @@ def test_awq_format(self):
         res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])
         print(res)
         assert "!!!" not in res
-
-
-if __name__ == "__main__":
-    unittest.main()
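Reviewer note (not part of the patch): the relocated tests import `get_model_path`, `get_tiny_model`, `model_infer`, `is_pytest_mode_compile`/`is_pytest_mode_lazy` from a shared `test/helpers.py` and rely on `conftest.py` fixtures (`dataloader`, `tiny_opt_model_path`, `tiny_qwen_model_path`, `tiny_deepseek_v2_model_path`, ...) that do not appear in this part of the diff. The sketch below is only an illustration of what those shared pieces might look like: `model_infer`, the pytest-mode helpers, and the dummy dataloader mirror the per-file copies deleted above, while `get_model_path`'s local-mirror fallback and the `tiny_opt_model_path` fixture are assumptions, not code from this patch.

# --- Illustrative sketch only; assumed layout, not part of the patch. ---
# test/helpers.py
import os

import pytest
import torch


def is_pytest_mode_compile():
    # mirrors the deleted test_hpu/_test_helpers.py; pytest.mode is set by the repo's conftest
    return pytest.mode == "compile"


def is_pytest_mode_lazy():
    return pytest.mode == "lazy"


def get_model_path(model_id: str) -> str:
    # Assumption: prefer a local mirror under /models, otherwise fall back to the hub id.
    local = os.path.join("/models", os.path.basename(model_id.rstrip("/")))
    return local if os.path.exists(local) else model_id


def model_infer(model, tokenizer):
    # Greedy 5-token generation over a fixed prompt; mirrors the per-file helpers removed above.
    prompts = ["Hello,my name is"]
    inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True)
    outputs = model.generate(
        input_ids=inputs["input_ids"].to(model.device),
        attention_mask=inputs["attention_mask"].to(model.device),
        do_sample=False,
        max_new_tokens=5,
    )
    generated = [out[len(inp):] for inp, out in zip(inputs["input_ids"], outputs)]
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    for prompt, text in zip(prompts, decoded):
        print(f"Prompt: {prompt}")
        print(f"Generated: {text}")
        print("-" * 50)
    return decoded[0]


# test/conftest.py
class _DummyDataLoader:
    # Two dummy calibration batches; mirrors the removed LLMDataLoader classes.
    def __init__(self):
        self.batch_size = 1

    def __iter__(self):
        for _ in range(2):
            yield torch.ones([1, 10], dtype=torch.long)


@pytest.fixture
def dataloader():
    return _DummyDataLoader()


@pytest.fixture
def tiny_opt_model_path():
    # Assumed fixture: the real one may instead materialize a shrunken opt-125m checkpoint.
    return get_model_path("facebook/opt-125m")

The relative imports in the moved tests (`from ..helpers import ...`) imply that `test/` is treated as a package with one shared helpers module, which is why the per-directory `_test_helpers.py` and the duplicated `LLMDataLoader`/`model_infer` definitions could be deleted in this patch.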