From ed413217e4abe559f54ef883f03f1c5eb3251792 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 18 Dec 2025 03:40:02 -0500 Subject: [PATCH 01/24] initial implementation Signed-off-by: He, Xin3 --- test/README.md | 0 test/{test_hpu => }/conftest.py | 14 ++ test/fixtures.py | 63 ++++++++ .../{test_hpu/_test_helpers.py => helpers.py} | 22 +++ test/test_ark/test_model.py | 78 +++------- test/test_cpu/__init__.py | 0 test/test_cpu/_test_helpers.py | 32 ---- test/test_cpu/test_act_quantization.py | 135 ++++++----------- test/test_cpu/test_alg_ext.py | 21 +-- test/test_cpu/test_auto_scheme.py | 52 +++---- test/test_cpu/test_autoopt.py | 43 ++---- test/test_cpu/test_autoround.py | 141 ++++++++---------- test/test_cpu/test_autoround_acc.py | 44 ++---- .../test_autoround_export_to_itrex.py | 42 ++---- test/test_cpu/test_block_names.py | 35 ++--- test/test_cpu/test_calib_dataset.py | 26 +--- test/test_cpu/test_cli_usage.py | 13 +- test/test_cpu/test_conv1d.py | 31 +--- test/test_cpu/test_export.py | 81 ++++------ test/test_cpu/test_generation.py | 28 +--- test/test_cpu/test_gguf_format.py | 49 ++---- test/test_cpu/test_llmcompressor.py | 27 ++-- test/test_cpu/test_load_awq_gptq.py | 35 +---- test/test_cpu/test_mix_bits.py | 51 +++---- test/test_cpu/test_mllm.py | 22 +-- test/test_cpu/test_model_scope.py | 27 +--- test/test_cpu/test_mxfp_nvfp.py | 71 ++++----- test/test_cpu/test_scheme.py | 90 +++++------ test/test_cpu/test_script.py | 10 +- test/test_cpu/test_torch_backend.py | 65 ++------ test/test_cpu/test_utils.py | 2 - test/test_cpu/test_woq_linear.py | 3 - test/test_cuda/__init__.py | 0 test/test_cuda/_test_helpers.py | 32 ---- test/test_cuda/test_2_3bits.py | 44 +----- test/test_cuda/test_alg_ext.py | 14 +- test/test_cuda/test_auto_round_format.py | 87 +++-------- test/test_cuda/test_auto_scheme.py | 45 +++--- test/test_cuda/test_calib_dataset.py | 15 +- test/test_cuda/test_conv1d.py | 29 +--- test/test_cuda/test_diffusion.py | 16 +- test/test_cuda/test_exllamav2_backend.py | 68 ++------- test/test_cuda/test_export.py | 64 +++----- test/test_cuda/test_fp8_input.py | 14 +- test/test_cuda/test_get_block_name.py | 14 +- test/test_cuda/test_gguf.py | 28 +--- test/test_cuda/test_main_func.py | 14 +- test/test_cuda/test_marlin_backend.py | 74 ++------- test/test_cuda/test_mix_bits.py | 60 +++----- test/test_cuda/test_multiple_card.py | 32 ++-- test/test_cuda/test_multiple_card_calib.py | 13 +- test/test_cuda/test_mxfp_nvfp.py | 44 ++---- test/test_cuda/test_qbits.py | 42 +----- test/test_cuda/test_scheme.py | 73 ++++----- test/test_cuda/test_support_vlms.py | 14 +- test/test_cuda/test_torch_backend.py | 66 ++------ test/test_cuda/test_transformers.py | 10 +- test/test_cuda/test_triton_backend.py | 78 +++------- test/test_cuda/test_vlms.py | 16 +- test/test_hpu/__init__.py | 0 test/test_hpu/test_auto_round.py | 3 +- test/test_hpu/test_inference.py | 24 +-- test/test_xpu/__init__.py | 0 test/test_xpu/test_autoround.py | 28 +--- 64 files changed, 782 insertions(+), 1632 deletions(-) create mode 100644 test/README.md rename test/{test_hpu => }/conftest.py (72%) create mode 100644 test/fixtures.py rename test/{test_hpu/_test_helpers.py => helpers.py} (63%) create mode 100644 test/test_cpu/__init__.py delete mode 100644 test/test_cpu/_test_helpers.py create mode 100644 test/test_cuda/__init__.py delete mode 100644 test/test_cuda/_test_helpers.py create mode 100644 test/test_hpu/__init__.py create mode 100644 test/test_xpu/__init__.py diff --git a/test/README.md b/test/README.md new file mode 100644 
index 000000000..e69de29bb
diff --git a/test/test_hpu/conftest.py b/test/conftest.py
similarity index 72%
rename from test/test_hpu/conftest.py
rename to test/conftest.py
index f4e9675bf..ebe377e48 100644
--- a/test/test_hpu/conftest.py
+++ b/test/conftest.py
@@ -1,9 +1,23 @@
 import os
+import sys
 from typing import Mapping
 
 import pytest
 
+from .fixtures import (
+    dataloader,
+    model,
+    tiny_opt_model,
+    tiny_opt_model_path,
+    tokenizer,
+)
+from .helpers import model_infer
 
+# Allow easy debugging without installing auto-round.
+sys.path.insert(0, "..")
+
+
+### HPU-related configuration, usage: `pytest --mode=compile/lazy`
 def pytest_addoption(parser):
     parser.addoption(
         "--mode",
diff --git a/test/fixtures.py b/test/fixtures.py
new file mode 100644
index 000000000..615e579a8
--- /dev/null
+++ b/test/fixtures.py
@@ -0,0 +1,55 @@
+import shutil
+
+import pytest
+import torch
+import transformers
+
+from .helpers import opt_name_or_path
+
+
+class DataLoader:
+    def __init__(self):
+        self.batch_size = 1
+
+    def __iter__(self):
+        for i in range(2):
+            yield torch.ones([1, 10], dtype=torch.long)
+
+
+@pytest.fixture(scope="session")
+def tiny_opt_model_path():
+    tiny_opt_model_path = "./tmp_tiny_opt_model_path"
+    model = transformers.AutoModelForCausalLM.from_pretrained(opt_name_or_path, dtype="auto", trust_remote_code=True)
+    model.config.num_hidden_layers = 3
+    setattr(model.model.decoder, "layers", model.model.decoder.layers[:3])
+    tokenizer = transformers.AutoTokenizer.from_pretrained(opt_name_or_path, trust_remote_code=True)
+    model.save_pretrained(tiny_opt_model_path)
+    tokenizer.save_pretrained(tiny_opt_model_path)
+    print("[Fixture]: built tiny model path for testing in session")
+    yield tiny_opt_model_path
+    shutil.rmtree(tiny_opt_model_path)
+
+
+@pytest.fixture(scope="function")
+def tiny_opt_model():
+    model = transformers.AutoModelForCausalLM.from_pretrained(opt_name_or_path, dtype="auto", trust_remote_code=True)
+    model.config.num_hidden_layers = 3
+    setattr(model.model.decoder, "layers", model.model.decoder.layers[:3])
+    return model
+
+
+@pytest.fixture(scope="function")
+def model():
+    model = transformers.AutoModelForCausalLM.from_pretrained(opt_name_or_path, dtype="auto", trust_remote_code=True)
+    return model
+
+
+@pytest.fixture(scope="session")
+def tokenizer():
+    tokenizer = transformers.AutoTokenizer.from_pretrained(opt_name_or_path, trust_remote_code=True)
+    return tokenizer
+
+
+@pytest.fixture(scope="session")
+def dataloader():
+    return DataLoader()
diff --git a/test/test_hpu/_test_helpers.py b/test/helpers.py
similarity index 63%
rename from test/test_hpu/_test_helpers.py
rename to test/helpers.py
index 48e8398d7..97870eba6 100644
--- a/test/test_hpu/_test_helpers.py
+++ b/test/helpers.py
@@ -1,6 +1,27 @@
+import os
+
 import pytest
 
+# Automatically choose the local model path or the model name.
+opt_name_or_path = "/tf_dataset/auto_round/models/facebook/opt-125m" +if not os.path.exists(opt_name_or_path): + opt_name_or_path = "facebook/opt-125m" + +qwen_name_or_path = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" +if not os.path.exists(qwen_name_or_path): + qwen_name_or_path = "Qwen/Qwen3-0.6B" + +lamini_name_or_path = "/tf_dataset/auto_round/models/MBZUAI/LaMini-GPT-124M" +if not os.path.exists(lamini_name_or_path): + lamini_name_or_path = "MBZUAI/LaMini-GPT-124M" + +gptj_name_or_path = "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM" +if not os.path.exists(gptj_name_or_path): + gptj_name_or_path = "hf-internal-testing/tiny-random-GPTJForCausalLM" + + +# HPU mode checking def is_pytest_mode_compile(): return pytest.mode == "compile" @@ -9,6 +30,7 @@ def is_pytest_mode_lazy(): return pytest.mode == "lazy" +# General model inference code def model_infer(model, tokenizer, apply_chat_template=False): prompts = [ "Hello,my name is", diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py index 911a186c0..622f4a6dd 100644 --- a/test/test_ark/test_model.py +++ b/test/test_ark/test_model.py @@ -1,11 +1,6 @@ import shutil -import sys -import unittest import pytest - -sys.path.insert(0, "../..") - import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -13,58 +8,25 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel - -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +from ..helpers import model_infer -class TestAutoRoundTorchBackend(unittest.TestCase): +class TestAutoRoundTorchBackend: - @classmethod - def setUpClass(self): - self.model_name = "facebook/opt-125m" - self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] + # Yield to hand control to the test methods + yield - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] - - @classmethod - def tearDownClass(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_torch_4bits_sym_cpu(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_torch_4bits_sym_cpu(self, 
model, tokenizer, dataloader): bits, group_size, sym = 4, 32, True autoround = AutoRound( model, @@ -74,7 +36,7 @@ def test_torch_4bits_sym_cpu(self): sym=sym, iters=0, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") @@ -85,16 +47,14 @@ def test_torch_4bits_sym_cpu(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000) print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28) shutil.rmtree("./saved", ignore_errors=True) - def test_torch_4bits_sym_xpu(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_torch_4bits_sym_xpu(self, model, tokenizer, dataloader): bits, group_size, sym = 4, 32, True autoround = AutoRound( model, @@ -104,7 +64,7 @@ def test_torch_4bits_sym_xpu(self): sym=sym, iters=0, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model @@ -115,13 +75,9 @@ def test_torch_4bits_sym_xpu(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000) print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28) torch.xpu.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/__init__.py b/test/test_cpu/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_cpu/_test_helpers.py b/test/test_cpu/_test_helpers.py deleted file mode 100644 index b4b8a5955..000000000 --- a/test/test_cpu/_test_helpers.py +++ /dev/null @@ -1,32 +0,0 @@ -def model_infer(model, tokenizer, apply_chat_template=False): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - if apply_chat_template: - texts = [] - for prompt in prompts: - messages = [{"role": "user", "content": prompt}] - text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - texts.append(text) - prompts = texts - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] diff --git a/test/test_cpu/test_act_quantization.py b/test/test_cpu/test_act_quantization.py 
index 31ba51f1b..0483c027d 100644 --- a/test/test_cpu/test_act_quantization.py +++ b/test/test_cpu/test_act_quantization.py @@ -1,87 +1,72 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 +class TestAutoRoundAct: + save_dir = "./saved" - def __iter__(self): - for i in range(3): - yield torch.ones([1, 10], dtype=torch.long) + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + # Yield to hand control to the test methods + yield -class TestAutoRoundAct(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - self.save_dir = "./saved" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() - - @classmethod - def tearDownClass(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_mx_fp4(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + def test_mx_fp4(self, tiny_opt_model, tokenizer, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, + tiny_opt_model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, act_bits=4, data_type="mx_fp", ) autoround.quantize() - def test_wint4fp8_dynamic(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + def test_wint4fp8_dynamic(self, tiny_opt_model, tokenizer, dataloader): bits, group_size = 4, 128 autoround = AutoRound( - model, + tiny_opt_model, tokenizer, bits=bits, group_size=group_size, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, act_bits=8, data_type="fp8", act_data_type="fp8", ) autoround.quantize() - def test_wint4fp8_static(self): + def test_wint4fp8_static(self, tiny_opt_model, tokenizer, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( - self.model, - self.tokenizer, + tiny_opt_model, + tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, act_bits=8, data_type="fp8_to_int_sym", act_dynamic=False, @@ -89,66 +74,42 @@ def test_wint4fp8_static(self): ) autoround.quantize() - def test_wfp8afp8_static(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + @pytest.mark.parametrize("act_group_size", [-1, 128]) + def test_wfp8afp8_static(self, act_group_size, tiny_opt_model, tokenizer, dataloader): from auto_round.wrapper import WrapperWALayer - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = 
AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) autoround = AutoRound( - model, + tiny_opt_model, tokenizer, group_size=128, - act_group_size=-1, + act_group_size=act_group_size, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, data_type="fp8", act_dynamic=False, act_data_type="fp8", ) autoround.quantize() - self.assertTrue(isinstance(autoround.model.model.decoder.layers[2].self_attn.k_proj, WrapperWALayer)) - self.assertEqual(autoround.model.model.decoder.layers[2].self_attn.k_proj.orig_layer.act_scale.shape[0], 30) - self.assertEqual(autoround.model.model.decoder.layers[2].self_attn.k_proj.orig_layer.act_max.shape[0], 30) + k_proj = autoround.model.model.decoder.layers[2].self_attn.k_proj + assert isinstance(k_proj, WrapperWALayer), "k_proj should be WrapperWALayer" + if act_group_size == -1: + assert k_proj.orig_layer.act_scale.shape[0] == 20, "act_scale shape[0] should be 20" + assert k_proj.orig_layer.act_max.shape[0] == 20, "act_max shape[0] should be 20" + else: + assert k_proj.orig_layer.act_scale.shape[0] == int(2 * 10 * 768 / 128), "act_scale shape[0] is incorrect" + assert k_proj.orig_layer.act_max.shape[0] == int(2 * 10 * 768 / 128), "act_max shape[0] is incorrect" - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - autoround = AutoRound( - model, - tokenizer, - group_size=128, - act_group_size=128, - iters=0, - seqlen=2, - dataset=self.llm_dataloader, - data_type="fp8", - act_dynamic=False, - act_data_type="fp8", - ) - autoround.quantize() - self.assertTrue(isinstance(autoround.model.model.decoder.layers[2].self_attn.k_proj, WrapperWALayer)) - - self.assertEqual( - autoround.model.model.decoder.layers[2].self_attn.k_proj.orig_layer.act_scale.shape[0], - int(3 * 10 * 768 / 128), - ) - self.assertEqual( - autoround.model.model.decoder.layers[2].self_attn.k_proj.orig_layer.act_max.shape[0], - int(3 * 10 * 768 / 128), - ) - - def test_act_config_MXFP4_saving(self): + def test_act_config_MXFP4_saving(self, tiny_opt_model_path, dataloader): scheme = "MXFP4" layer_config = {"lm_head": {"act_bits": 8, "bits": 8}, "k_proj": {"act_bits": 8, "bits": 8}} autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -168,15 +129,15 @@ def test_act_config_MXFP4_saving(self): assert "sym" in kproj_config.keys() and kproj_config["sym"] shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_act_config_NVFP4_saving(self): + def test_act_config_NVFP4_saving(self, tiny_opt_model_path, dataloader): scheme = "NVFP4" layer_config = {"k_proj": {"act_bits": 16, "bits": 16}} autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -193,16 +154,16 @@ def test_act_config_NVFP4_saving(self): assert "sym" in kproj_config.keys() and kproj_config["sym"] shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_WOQ_config_INT_saving(self): + def test_WOQ_config_INT_saving(self, tiny_opt_model_path, dataloader): scheme = "W4A16" layer_config = {"k_proj": {"bits": 8}} autoround = AutoRound( - self.model_name, + tiny_opt_model_path, 
scheme=scheme, iters=2, seqlen=2, sym=False, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -223,7 +184,7 @@ def test_WOQ_config_INT_saving(self): assert "act_dynamic" in kproj_config.keys() and kproj_config["act_dynamic"] shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_act_config_FP8_saving(self): + def test_act_config_FP8_saving(self, tiny_opt_model_path, dataloader): scheme = "FP8_STATIC" layer_config = { "lm_head": {"act_bits": 8, "bits": 8}, @@ -237,11 +198,11 @@ def test_act_config_FP8_saving(self): }, } autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -262,7 +223,3 @@ def test_act_config_FP8_saving(self): assert "group_size" in kproj_config.keys() and kproj_config["group_size"] == 0 assert "sym" in kproj_config.keys() and kproj_config["sym"] shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_alg_ext.py b/test/test_cpu/test_alg_ext.py index b0c909bd3..504b7d0f8 100644 --- a/test/test_cpu/test_alg_ext.py +++ b/test/test_cpu/test_alg_ext.py @@ -1,29 +1,22 @@ -import copy -import shutil -import sys -import unittest - -from parameterized import parameterized - -sys.path.insert(0, "../..") - from auto_round import AutoRound +from ..helpers import opt_name_or_path, qwen_name_or_path + -class TestAlgExt(unittest.TestCase): +class TestAlgExt: def test_alg_ext(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path ar = AutoRound(model_name, scheme="W2A16", iters=1, nsamples=1, enable_alg_ext=True) ar.quantize() - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = qwen_name_or_path ar = AutoRound(model_name, scheme="gguf:q4_k_s", iters=1, nsamples=1, enable_alg_ext=True) ar.quantize() from auto_round.auto_scheme import AutoScheme scheme = AutoScheme(options=["mxfp4", "mxfp8"], avg_bits=5.5, ignore_scale_zp_bits=True) - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = qwen_name_or_path ar = AutoRound(model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True) ar.quantize() @@ -31,7 +24,7 @@ def test_alg_ext_import(self): from auto_round.alg_ext import wrapper_autoround def test_all_support_dtype(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path for scheme in ["MXFP4", "NVFP4", "W2A16G64"]: ar = AutoRound( model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True diff --git a/test/test_cpu/test_auto_scheme.py b/test/test_cpu/test_auto_scheme.py index cd38b220d..b6c20826e 100644 --- a/test/test_cpu/test_auto_scheme.py +++ b/test/test_cpu/test_auto_scheme.py @@ -1,24 +1,28 @@ import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest + from auto_round import AutoRound, AutoRoundConfig, AutoScheme -class TestAutoScheme(unittest.TestCase): - @classmethod - def setUpClass(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" +class TestAutoScheme: + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + 
yield - @classmethod - def tearDownClass(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_auto_scheme_export(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_auto_scheme_export(self, tiny_opt_model_path): + model_name = tiny_opt_model_path scheme = AutoScheme(avg_bits=2, options=("W2A16"), nsamples=1, ignore_scale_zp_bits=True) ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) ar.quantize_and_save(self.save_dir) @@ -29,27 +33,23 @@ def test_auto_scheme_export(self): ar.quantize_and_save(self.save_dir) shutil.rmtree(self.save_dir, ignore_errors=True) - def test_layer_config(self): + def test_layer_config(self, tiny_opt_model_path): from auto_round.auto_scheme.utils import compute_avg_bits_for_model from auto_round.utils import get_module target_bits = 3.0 - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "BF16")) - user_layer_config = {"model.decoder.layers.10.fc1": {"bits": 8, "group_size": 32, "sym": False}} + user_layer_config = {"model.decoder.layers.1.fc1": {"bits": 8, "group_size": 32, "sym": False}} ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["bits"], 8) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["sym"], False) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["group_size"], 32) - layer = get_module(model, "model.decoder.layers.10.fc1") - self.assertEqual(layer.bits, 8) - self.assertEqual(layer.sym, False) - self.assertEqual(layer.group_size, 32) + assert layer_config["model.decoder.layers.1.fc1"]["bits"] == 8 + assert layer_config["model.decoder.layers.1.fc1"]["sym"] == False + assert layer_config["model.decoder.layers.1.fc1"]["group_size"] == 32 + layer = get_module(model, "model.decoder.layers.1.fc1") + assert layer.bits == 8 + assert layer.sym == False + assert layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_autoopt.py b/test/test_cpu/test_autoopt.py index f9801217e..472711155 100644 --- a/test/test_cpu/test_autoopt.py +++ b/test/test_cpu/test_autoopt.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer @@ -11,48 +9,37 @@ from auto_round import AutoRoundAdam -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 +class TestAutoRound: - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + # Yield to hand control to the test methods + yield -class TestAutoRound(unittest.TestCase): - @classmethod - def setUpClass(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - self.tokenizer = 
AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() - - @classmethod - def tearDownClass(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_Adam(self): + def test_Adam(self, tiny_opt_model, tokenizer, dataloader): bits, group_size, sym = 4, 128, False from auto_round.utils import get_block_names - llm_block_names = get_block_names(self.model, quant_vision=True) + llm_block_names = get_block_names(tiny_opt_model, quant_vision=True) bits, group_size, sym, batch_size = 4, 128, False, 20 adamround = AutoRoundAdam( - self.model, - self.tokenizer, + tiny_opt_model, + tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2, batch_size=batch_size, - dataset=self.llm_dataloader, + dataset=dataloader, to_quant_block_names=llm_block_names, ) adamround.quantize() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_autoround.py b/test/test_cpu/test_autoround.py index 2790f8817..1f1f85f55 100644 --- a/test/test_cpu/test_autoround.py +++ b/test/test_cpu/test_autoround.py @@ -1,41 +1,28 @@ import copy import shutil -import sys -import unittest - -from parameterized import parameterized - -sys.path.insert(0, "../..") +import pytest import torch -from _test_helpers import model_infer +from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.utils import get_module +from ..helpers import model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(3): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() self.save_folder = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -49,7 +36,7 @@ def test_bits_setting(self): if module.bits != 8: raise ValueError(f"Expected bits to be 8, but got {module.bits}") - def test_layer_config(self): + def test_layer_config(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" layer_config = {"self_attn": {"bits": 4, "data_type": "nv_fp", "act_bits": 16, "group_size": 16}} autoround = AutoRound( @@ -58,14 +45,14 @@ def test_layer_config(self): scheme="NVFP4", iters=0, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, amp=False, ) autoround.quantize_and_save(self.save_folder, inplace=False, format="fake") shutil.rmtree(self.save_folder) - def test_remove_whole_block(self): + def test_remove_whole_block(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" layer_config = { "model.decoder.layers.0.self_attn.k_proj": {"bits": 32}, @@ -83,12 +70,12 @@ def test_remove_whole_block(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, 
layer_config=layer_config, ) autoround.quantize() - def test_consecutive_quant(self): + def test_consecutive_quant(self, dataloader): bits, group_size, sym = 4, -1, False autoround = AutoRound( self.model, @@ -98,7 +85,7 @@ def test_consecutive_quant(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() @@ -116,11 +103,11 @@ def test_consecutive_quant(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_mx_fp4(self): + def test_mx_fp4(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, 32, False autoround = AutoRound( @@ -142,7 +129,7 @@ def test_mx_fp4(self): print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.3) # 0.375 - def test_nv_fp4(self): + def test_nv_fp4(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, 16, False autoround = AutoRound( @@ -152,7 +139,7 @@ def test_nv_fp4(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, data_type="nv_fp4", ) model, _ = autoround.quantize() @@ -162,7 +149,7 @@ def test_nv_fp4(self): print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) - def test_w4g1(self): + def test_w4g1(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, -1, True autoround = AutoRound( @@ -172,12 +159,12 @@ def test_w4g1(self): sym=sym, iters=2, seqlen=10, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() @parameterized.expand([(2,), (3,), (4,)]) - def test_g128(self, bits): + def test_g128(self, bits, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" group_size, sym = 128, True autoround = AutoRound( @@ -187,7 +174,7 @@ def test_g128(self, bits): sym=sym, iters=2, seqlen=10, - dataset=self.llm_dataloader, + dataset=dataloader, ) model, _ = autoround.quantize() if bits > 2: @@ -197,7 +184,7 @@ def test_g128(self, bits): print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.3) - def test_disable_quanted_input(self): + def test_disable_quanted_input(self, dataloader): bits, group_size, sym = 4, -1, True autoround = AutoRound( self.model, @@ -208,11 +195,11 @@ def test_disable_quanted_input(self): iters=2, seqlen=10, enable_quanted_input=False, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_enable_norm_bias_tuning_qwen3(self): + def test_enable_norm_bias_tuning_qwen3(self, dataloader): bits, group_size, sym = 4, 128, True model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -226,11 +213,11 @@ def test_enable_norm_bias_tuning_qwen3(self): iters=2, seqlen=10, enable_norm_bias_tuning=True, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_enable_norm_bias_tuning(self): + def test_enable_norm_bias_tuning(self, dataloader): bits, group_size, sym = 4, -1, True autoround = AutoRound( self.model, @@ -242,11 +229,11 @@ def test_enable_norm_bias_tuning(self): seqlen=10, enable_quanted_input=False, enable_norm_bias_tuning=True, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def 
test_disable_minmax_tuning(self): + def test_disable_minmax_tuning(self, dataloader): bits, group_size, sym = 4, -1, True autoround = AutoRound( self.model, @@ -257,12 +244,12 @@ def test_disable_minmax_tuning(self): iters=2, seqlen=10, enable_minmax_tuning=False, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() # - def test_signround(self): + def test_signround(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, -1, False autoround = AutoRound( @@ -274,11 +261,11 @@ def test_signround(self): seqlen=10, enable_minmax_tuning=False, enable_quanted_input=False, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_lm_head_layer_config_way(self): + def test_lm_head_layer_config_way(self, dataloader): bits, group_size, sym = 4, -1, False layer_config = {"lm_head": {"data_type": "int"}} autoround = AutoRound( @@ -291,12 +278,12 @@ def test_lm_head_layer_config_way(self): seqlen=10, enable_minmax_tuning=False, enable_quanted_input=False, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() - def test_wa_quant(self): + def test_wa_quant(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym, act_bits = 4, 128, False, 4 autoround = AutoRound( @@ -306,12 +293,12 @@ def test_wa_quant(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, act_bits=act_bits, ) autoround.quantize() - def test_auto_device_map(self): + def test_auto_device_map(self, dataloader): bits, group_size, sym = 4, 128, False model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained( @@ -325,11 +312,11 @@ def test_auto_device_map(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() - def test_device_map_dict(self): + def test_device_map_dict(self, dataloader): bits, group_size, sym = 4, 128, False device_map = {".*": "cpu"} autoround = AutoRound( @@ -340,7 +327,7 @@ def test_device_map_dict(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, device_map=device_map, ) autoround.quantize() @@ -355,12 +342,12 @@ def test_device_map_dict(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, device_map=device_map, ) autoround.quantize() - def test_fp32(self): + def test_fp32(self, dataloader): bits, group_size, sym = 4, 128, False model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained( @@ -374,12 +361,12 @@ def test_fp32(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, amp=False, ) autoround.quantize() - def test_tensor_reshape(self): + def test_tensor_reshape(self, dataloader): bits, group_size, sym = 4, 100, False autoround = AutoRound( self.model, @@ -389,7 +376,7 @@ def test_tensor_reshape(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() @@ -412,7 +399,7 @@ def test_rtn(self): model_infer(model, tokenizer) shutil.rmtree(self.save_folder) - def test_embed_quant(self): + def test_embed_quant(self, dataloader): bits, group_size, sym = 4, 128, True model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" layer_config = { @@ -426,12 +413,12 @@ def test_embed_quant(self): iters=2, seqlen=2, nsamples=3, - dataset=self.llm_dataloader, + dataset=dataloader, 
layer_config=layer_config, ) autoround.quantize() - def test_fallback_layers(self): + def test_fallback_layers(self, dataloader): bits, group_size, sym = 4, 128, True model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained( @@ -450,7 +437,7 @@ def test_fallback_layers(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -520,7 +507,7 @@ def test_not_convert_modules(self): ) print(output_text) - def test_fallback_layers_regex_awq(self): + def test_fallback_layers_regex_awq(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -537,7 +524,7 @@ def test_fallback_layers_regex_awq(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -556,7 +543,7 @@ def test_fallback_layers_regex_awq(self): print(res) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_fallback_layers_regex_gptq(self): + def test_fallback_layers_regex_gptq(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -573,7 +560,7 @@ def test_fallback_layers_regex_gptq(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -592,7 +579,7 @@ def test_fallback_layers_regex_gptq(self): print(res) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_fallback_layers_regex_round(self): + def test_fallback_layers_regex_round(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -609,7 +596,7 @@ def test_fallback_layers_regex_round(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -628,7 +615,7 @@ def test_fallback_layers_regex_round(self): print(res) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_fallback_layers_regex_exception(self): + def test_fallback_layers_regex_exception(self, dataloader): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -643,7 +630,7 @@ def test_fallback_layers_regex_exception(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -666,16 +653,16 @@ def test_dequant_fp8_weight(self): weight_scale = torch.randn(5, 56) block_size = [128, 128] dequant_weight = dequant_block_fp8_weight(weight, weight_scale, block_size) - self.assertEqual(dequant_weight.shape.numel(), 4207616) + assert dequant_weight.shape.numel() == 4207616 # test experts are stacked. 
weight = torch.randn([32, 5760, 1440]) weight_scale = torch.randn([32, 5760, 90]) block_size = [1, 16] dequant_weight = dequant_block_fp8_weight(weight, weight_scale, block_size) - self.assertEqual(len(dequant_weight.shape), 3) - self.assertEqual(dequant_weight.shape[0], 32) - self.assertEqual(dequant_weight.shape.numel(), 32 * 5760 * 1440) + assert len(dequant_weight.shape) == 3 + assert dequant_weight.shape[0] == 32 + assert dequant_weight.shape.numel() == 32 * 5760 * 1440 def test_mixed_bit_setting(self): model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" @@ -754,7 +741,7 @@ def test_quant_lm_head_layer_config(self): def test_compressor(self): model_name = "Qwen/Qwen2-VL-2B-Instruct" ar = AutoRound(model_name, enable_adam=True) - self.assertEqual(ar.optimizer, torch.optim.AdamW) + assert ar.optimizer == torch.optim.AdamW self.assertTrue(ar.mllm) # test old api @@ -801,7 +788,7 @@ def test_attention_mask_via_tokenize_in_dataset(self): ar = AutoRound(model_name, iters=1, dataset=data, seqlen=8) ar.quantize() - def test_low_cpu_mem_usage(self): + def test_low_cpu_mem_usage(self, dataloader): bits, group_size = 4, 32 model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -814,7 +801,7 @@ def test_low_cpu_mem_usage(self): group_size=group_size, iters=2, seqlen=10, - dataset=self.llm_dataloader, + dataset=dataloader, low_cpu_mem_usage=True, device_map="cpu", ) @@ -826,7 +813,3 @@ def test_create_adam(self): from auto_round import AutoRound ar = AutoRound(model=model_name, enable_adam=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_autoround_acc.py b/test/test_cpu/test_autoround_acc.py index 41b28e663..721a5c8ed 100644 --- a/test/test_cpu/test_autoround_acc.py +++ b/test/test_cpu/test_autoround_acc.py @@ -1,42 +1,29 @@ import copy import shutil -import sys -import unittest - -from auto_round.eval.evaluation import simple_evaluate - -sys.path.insert(0, "../..") from math import isclose +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound # pylint: disable=E0401 - -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +from ..helpers import gptj_name_or_path -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): - self.llm_dataloader = LLMDataLoader() + def setup_class(self): self.save_dir = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_dir, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_default_acc(self): - model_name = "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM" + def test_default_acc(self, dataloader): + model_name = gptj_name_or_path model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -50,7 +37,7 @@ def test_default_acc(self): sym=sym, iters=2, seqlen=10, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() out0 = model(inp) @@ -66,7 +53,7 @@ def test_default_acc(self): device="cpu", iters=2, seqlen=10, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround_1.quantize() out1 = 
model_tmp(inp)
@@ -74,20 +61,11 @@
         assert out0[0].equal(out1[0])
         self.assertTrue(isclose(float(out0[0][0][0][0]), -0.021002087742090225, rel_tol=5e-04))
 
-    def test_3bits_asym_autoround(self):
-        model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
+    def test_3bits_asym_autoround(self, tiny_opt_model_path):
+        model_name = tiny_opt_model_path
         bits, sym = 3, False
         autoround = AutoRound(model_name, bits=bits, sym=sym, iters=0)
         autoround.quantize_and_save(self.save_dir, format="auto_round", inplace=False)
         model_args = f"pretrained={self.save_dir}"
-        # res = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto", limit=10)
-
-        # accuracy = res["results"]["lambada_openai"]["acc,none"]
-        # print(f"accuracy = {accuracy}")
-        # assert accuracy > 0.15
 
         shutil.rmtree(self.save_dir, ignore_errors=True)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/test/test_cpu/test_autoround_export_to_itrex.py b/test/test_cpu/test_autoround_export_to_itrex.py
index d9b4f42c6..d4cc2a73c 100644
--- a/test/test_cpu/test_autoround_export_to_itrex.py
+++ b/test/test_cpu/test_autoround_export_to_itrex.py
@@ -1,15 +1,15 @@
 import copy
 import shutil
-import sys
-import unittest
 
-sys.path.insert(0, "../..")
+import pytest
 import torch
 import transformers
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from auto_round import AutoRound
+from ..helpers import gptj_name_or_path
+
 
 class SimpleDataLoader:
     def __init__(self):
@@ -20,35 +20,23 @@ def __iter__(self):
             yield torch.randn([1, 30])
 
 
-class LLMDataLoader:
-    def __init__(self):
-        self.batch_size = 1
-
-    def __iter__(self):
-        for i in range(2):
-            yield torch.ones([1, 10], dtype=torch.long)
-
-
-class TestAutoroundExport(unittest.TestCase):
+class TestAutoroundExport:
     approach = "weight_only"
 
     @classmethod
-    def setUpClass(self):
+    def setup_class(self):
         self.gptj = transformers.AutoModelForCausalLM.from_pretrained(
-            "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM",
+            gptj_name_or_path,
             torchscript=True,
         )
-        self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-            "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM", trust_remote_code=True
-        )
+        self.tokenizer = transformers.AutoTokenizer.from_pretrained(gptj_name_or_path, trust_remote_code=True)
         self.gptj_no_jit = transformers.AutoModelForCausalLM.from_pretrained(
-            "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM",
+            gptj_name_or_path,
         )
-        self.llm_dataloader = LLMDataLoader()
         self.lm_input = torch.ones([1, 10], dtype=torch.long)
 
     @classmethod
-    def tearDownClass(self):
+    def teardown_class(self):
         shutil.rmtree("./saved", ignore_errors=True)
         shutil.rmtree("runs", ignore_errors=True)
 
@@ -87,10 +75,10 @@ def test_config(self):
         config = QuantConfig.from_pretrained("/tf_dataset/auto_round/models/TheBloke/Llama-2-7B-Chat-GPTQ")
         config.save_pretrained("quantization_config_dir")
         loaded_config = QuantConfig.from_pretrained("quantization_config_dir")
-        self.assertEqual(config.group_size, loaded_config.group_size)
-        self.assertEqual(config.desc_act, loaded_config.desc_act)
-        self.assertEqual(config.bits, loaded_config.bits)
-        self.assertEqual(config.sym, loaded_config.sym)
+        assert config.group_size == loaded_config.group_size
+        assert config.desc_act == loaded_config.desc_act
+        assert config.bits == loaded_config.bits
+        assert config.sym == loaded_config.sym
 
     def test_xpu_export(self):
         model = copy.deepcopy(self.gptj)
@@ -111,7 +99,3 @@ def test_xpu_export(self):
         self.assertTrue(torch.all(out2[0] == out3[0]))
         self.assertTrue(torch.all(torch.isclose(out3[0], out4[0], atol=1e-3)))
         self.assertTrue(torch.all(torch.isclose(out4[0], out5[0], atol=1e-5)))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/test/test_cpu/test_block_names.py b/test/test_cpu/test_block_names.py
index 501caee25..8d5f935d9 100644
--- a/test/test_cpu/test_block_names.py
+++ b/test/test_cpu/test_block_names.py
@@ -1,25 +1,15 @@
 import os
 import shutil
-import sys
-import unittest
+
+import pytest
 
-sys.path.insert(0, ".")
-sys.path.insert(0, "../..")
 import torch
 import torch.nn as nn
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 
 from auto_round import AutoRound
-
-class LLMDataLoader:
-    def __init__(self, input_size=10):
-        self.batch_size = 1
-        self.input_size = input_size
-
-    def __iter__(self):
-        for i in range(2):
-            yield torch.ones([1, self.input_size], dtype=torch.long)
+from ..helpers import lamini_name_or_path
 
 
 # ================= simple multimodal model =================
@@ -116,24 +107,22 @@ def forward(self, x):
         return output
 
 
-class TestQuantizationBlocks(unittest.TestCase):
+class TestQuantizationBlocks:
     @classmethod
-    def setUpClass(self):
-        self.model_name = "/tf_dataset/auto_round/models/MBZUAI/LaMini-GPT-124M"
+    def setup_class(self):
+        self.model_name = lamini_name_or_path
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
-        self.llm_dataloader = LLMDataLoader()
 
     @classmethod
-    def tearDownClass(self):
+    def teardown_class(self):
         shutil.rmtree("./saved", ignore_errors=True)
         shutil.rmtree("runs", ignore_errors=True)
 
-    def test_moe_quant(self):
+    def test_moe_quant(self, dataloader):
         input_size = 10
         hidden_size = 10
         num_groups = 2
         experts_per_group = 2
-        self.llm_dataloader = LLMDataLoader(input_size)
         self.model = NestedMoEModel(input_size, hidden_size, num_groups, experts_per_group)
 
         from auto_round.utils import get_block_names
@@ -159,7 +148,7 @@ def test_multimodal_quant(self):
         assert block_names_wo_vision == llm_block_names
         assert len(block_names_wo_vision) != (block_names_with_vision)
 
-    def test_block_name_quant(self):
+    def test_block_name_quant(self, dataloader):
         self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True)
         from auto_round.utils import get_block_names
 
@@ -174,7 +163,7 @@ def test_block_name_quant(self):
             iters=2,
             seqlen=2,
             batch_size=batch_size,
-            dataset=self.llm_dataloader,
+            dataset=dataloader,
             to_quant_block_names=llm_block_names,
         )
         autoround.quantize()
@@ -217,7 +206,3 @@ def test_moe(self):
         self.assertTrue(block_name == block_name_2)
         self.assertTrue(len(block_name_2) == 1)
         self.assertTrue("model.layers.23" == block_name_2[0][-1])
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/test/test_cpu/test_calib_dataset.py b/test/test_cpu/test_calib_dataset.py
index 689cc705c..fc95966b6 100644
--- a/test/test_cpu/test_calib_dataset.py
+++ b/test/test_cpu/test_calib_dataset.py
@@ -1,29 +1,17 @@
+import json
 import os
 import shutil
-import sys
-import unittest
-
-sys.path.insert(0, "../..")
-import json
+import pytest
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from auto_round import AutoRound
 
 
-class LLMDataLoader:
-    def __init__(self):
-        self.batch_size = 1
-
-    def __iter__(self):
-        for i in range(2):
-            yield torch.ones([1, 10], dtype=torch.long)
-
-
-class TestLocalCalibDataset(unittest.TestCase):
+class TestLocalCalibDataset:
     @classmethod
-    def setUpClass(self):
+ def setup_class(self): json_data = [{"text": "awefdsfsddfd"}, {"text": "fdfdfsdfdfdfd"}, {"text": "dfdsfsdfdfdfdf"}] os.makedirs("./saved", exist_ok=True) self.json_file = "./saved/tmp.json" @@ -130,10 +118,6 @@ def test_combine_dataset2(self): # autoround.quantize() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_cli_usage.py b/test/test_cpu/test_cli_usage.py index 2b93f5131..ffc04d8f1 100644 --- a/test/test_cpu/test_cli_usage.py +++ b/test/test_cpu/test_cli_usage.py @@ -1,18 +1,17 @@ import os import shutil import sys -import unittest -sys.path.insert(0, "../..") +import pytest -class TestAutoRoundCmd(unittest.TestCase): +class TestAutoRoundCmd: @classmethod - def setUpClass(self): + def setup_class(self): pass @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) shutil.rmtree("../../saved", ignore_errors=True) @@ -68,7 +67,3 @@ def test_auto_round_cmd(self): ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_conv1d.py b/test/test_cpu/test_conv1d.py index edd28110f..1997026b3 100644 --- a/test/test_cpu/test_conv1d.py +++ b/test/test_cpu/test_conv1d.py @@ -1,38 +1,27 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch -from _test_helpers import model_infer from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound +from ..helpers import lamini_name_or_path, model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestQuantizationConv1d(unittest.TestCase): +class TestQuantizationConv1d: @classmethod - def setUpClass(self): - self.model_name = "/tf_dataset/auto_round/models/MBZUAI/LaMini-GPT-124M" + def setup_class(self): + self.model_name = lamini_name_or_path self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_quant(self): + def test_quant(self, dataloader): self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( @@ -43,7 +32,7 @@ def test_quant(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() @@ -51,7 +40,3 @@ def test_quant(self): model = AutoModelForCausalLM.from_pretrained("./saved", device_map="cpu", trust_remote_code=True) model_infer(model, self.tokenizer) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index 57b30354b..866a7d396 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -1,12 +1,9 @@ import os import shutil -import sys -import unittest -from parameterized import parameterized - -sys.path.insert(0, "../..") +import pytest import torch +from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoRoundConfig, 
AutoTokenizer from auto_round import AutoRound @@ -23,30 +20,20 @@ def _get_folder_size(path: str) -> float: return total_size / (1024**3) # convert to GB -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" self.save_dir = "./saved" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_autogptq_format(self): + def test_autogptq_format(self, dataloader): for group_size in [-1, 32, 128]: bits, sym = 4, False model_name = self.model_name @@ -57,7 +44,7 @@ def test_autogptq_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" @@ -76,7 +63,7 @@ def test_autogptq_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_format(self): + def test_autoround_format(self, dataloader): for group_size in [-1, 32, 128]: bits, sym = 4, True model_name = self.model_name @@ -87,7 +74,7 @@ def test_autoround_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") @@ -102,7 +89,7 @@ def test_autoround_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_awq_format(self): + def test_autoround_awq_format(self, dataloader): for group_size in [-1, 32, 128]: bits, sym = 4, False model_name = self.model_name @@ -113,7 +100,7 @@ def test_autoround_awq_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" @@ -132,7 +119,7 @@ def test_autoround_awq_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoawq_format(self): + def test_autoawq_format(self, dataloader): for group_size in [-1, 32, 128]: bits, sym = 4, False autoround = AutoRound( @@ -143,7 +130,7 @@ def test_autoawq_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -163,7 +150,7 @@ def test_autoawq_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_3bit_asym_format(self): + def test_autoround_3bit_asym_format(self, dataloader): bits, group_size, sym = 3, 128, False autoround = AutoRound( self.model, @@ -173,7 +160,7 @@ def test_autoround_3bit_asym_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = self.save_dir @@ -187,7 +174,7 @@ def test_autoround_3bit_asym_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) 
shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_autoround_3bit_sym_format(self): + def test_autoround_3bit_sym_format(self, dataloader): bits, group_size, sym = 3, 128, True autoround = AutoRound( self.model, @@ -197,7 +184,7 @@ def test_autoround_3bit_sym_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = self.save_dir @@ -239,8 +226,8 @@ def test_static_afp8_export(self, static_kv_dtype): f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") self.assertIn("model.decoder.layers.8.self_attn.k_proj.input_scale", f.keys()) self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape, torch.Size([1])) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, torch.float8_e4m3fn) + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape == torch.Size([1]) + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.float8_e4m3fn if static_kv_dtype is None: with torch.no_grad(): import transformers @@ -272,9 +259,9 @@ def test_static_afp8_export(self, static_kv_dtype): if static_kv_dtype == "fp8": self.assertIn("model.decoder.layers.8.self_attn.k_scale", f.keys()) self.assertIn("model.decoder.layers.8.self_attn.v_scale", f.keys()) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_scale").shape, torch.Size([1])) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.k_scale").shape, torch.Size([1])) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype, torch.float32) + assert f.get_tensor("model.decoder.layers.5.self_attn.v_scale").shape == torch.Size([1]) + assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").shape == torch.Size([1]) + assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype == torch.float32 shutil.rmtree(quantized_model_path, ignore_errors=True) model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -298,8 +285,8 @@ def test_static_afp8_export(self, static_kv_dtype): f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") self.assertIn("model.decoder.layers.8.self_attn.k_proj.input_scale", f.keys()) self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape, torch.Size([1])) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, torch.float8_e4m3fn) + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape == torch.Size([1]) + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.float8_e4m3fn shutil.rmtree(quantized_model_path, ignore_errors=True) def test_static_fp8_attn(self): @@ -323,18 +310,18 @@ def test_static_fp8_attn(self): f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") self.assertIn("model.decoder.layers.8.self_attn.k_proj.input_scale", f.keys()) self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape, torch.Size([1])) - self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, 
torch.float8_e4m3fn) + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape == torch.Size([1]) + assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.float8_e4m3fn check_attrs = ["k_scale", "v_scale", "q_scale"] for attr in check_attrs: weight_name = f"model.decoder.layers.8.self_attn.{attr}" self.assertIn(weight_name, f.keys()) - self.assertEqual(f.get_tensor(weight_name).shape, torch.Size([1])) - self.assertEqual(f.get_tensor(weight_name).dtype, torch.float32) + assert f.get_tensor(weight_name).shape == torch.Size([1]) + assert f.get_tensor(weight_name).dtype == torch.float32 shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_awq_lmhead_export(self): + def test_awq_lmhead_export(self, dataloader): bits, sym, group_size = 4, False, 128 model_name = "/tf_dataset/auto_round/models/microsoft/phi-2" layer_config = { @@ -350,7 +337,7 @@ def test_awq_lmhead_export(self): nsamples=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") @@ -368,7 +355,7 @@ def test_awq_lmhead_export(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_gptq_lmhead_export(self): + def test_gptq_lmhead_export(self, dataloader): bits, sym, group_size = 4, True, 128 # Note that, to save UT tuning time, the local model is intentionally kept lightweight, using only 2 hidden layers. model_name = "/tf_dataset/auto_round/models/microsoft/phi-2" @@ -385,7 +372,7 @@ def test_gptq_lmhead_export(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -401,7 +388,3 @@ def test_gptq_lmhead_export(self): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0]) print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_generation.py b/test/test_cpu/test_generation.py index 5018d1610..4c72db93c 100644 --- a/test/test_cpu/test_generation.py +++ b/test/test_cpu/test_generation.py @@ -1,39 +1,27 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRoundFormatGeneration(unittest.TestCase): +class TestAutoRoundFormatGeneration: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() self.save_folder = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_4bits_sym(self): + def test_4bits_sym(self, dataloader): bits = 4 group_size = 128 
sym = True @@ -45,7 +33,7 @@ def test_4bits_sym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder @@ -72,7 +60,7 @@ def test_4bits_sym(self): print(res) assert "!!!" not in res - def test_autoround_sym(self): + def test_autoround_sym(self, dataloader): for bits in [4]: model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -85,7 +73,7 @@ def test_autoround_sym(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index 53b199c41..393e11dba 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -1,36 +1,23 @@ import os import shutil import sys -import unittest - -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -class LLMDataLoader: - - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestGGUF(unittest.TestCase): +class TestGGUF: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -241,7 +228,7 @@ def test_gguf_baseline(self): # print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) # shutil.rmtree("./saved", ignore_errors=True) - def test_q4_k_m(self): + def test_q4_k_m(self, dataloader): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -265,21 +252,21 @@ def test_q4_k_m(self): iters=0, seqlen=1, nsamples=8, - dataset=self.llm_dataloader, + dataset=dataloader, disable_opt_rtn=True, ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") - self.assertEqual(autoround.layer_config["model.layers.11.self_attn.v_proj"]["super_group_size"], 16) - self.assertEqual(autoround.layer_config["model.layers.11.self_attn.v_proj"]["data_type"], "int_sym_dq") - self.assertEqual(autoround.layer_config["model.layers.7.self_attn.v_proj"]["data_type"], "int_asym_dq") - self.assertEqual(autoround.model.model.layers[0].self_attn.v_proj.bits, 6) - self.assertEqual(autoround.model.model.layers[12].self_attn.v_proj.bits, 4) - self.assertEqual(autoround.model.model.embed_tokens.bits, 6) - self.assertEqual(autoround.model.model.embed_tokens.group_size, 16) - self.assertEqual(autoround.model.model.layers[12].mlp.gate_proj.bits, 3) - self.assertEqual(autoround.model.model.layers[10].mlp.gate_proj.bits, 8) - self.assertEqual(autoround.layer_config["model.layers.10.mlp.gate_proj"]["mostly"], "gguf:q8_0") + assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["super_group_size"] == 16 + assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["data_type"] == 
"int_sym_dq" + assert autoround.layer_config["model.layers.7.self_attn.v_proj"]["data_type"] == "int_asym_dq" + assert autoround.model.model.layers[0].self_attn.v_proj.bits == 6 + assert autoround.model.model.layers[12].self_attn.v_proj.bits == 4 + assert autoround.model.model.embed_tokens.bits == 6 + assert autoround.model.model.embed_tokens.group_size == 16 + assert autoround.model.model.layers[12].mlp.gate_proj.bits == 3 + assert autoround.model.model.layers[10].mlp.gate_proj.bits == 8 + assert autoround.layer_config["model.layers.10.mlp.gate_proj"]["mostly"] == "gguf:q8_0" shutil.rmtree("./saved", ignore_errors=True) model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -413,7 +400,3 @@ def test_qtype_setting(self): ar.layer_config["model.embed_tokens"]["bits"] == 6 and ar.layer_config["model.embed_tokens"]["super_bits"] == 8 ) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_llmcompressor.py b/test/test_cpu/test_llmcompressor.py index 051dfb075..ebe531f75 100644 --- a/test/test_cpu/test_llmcompressor.py +++ b/test/test_cpu/test_llmcompressor.py @@ -1,25 +1,22 @@ import os import shutil -import sys -import unittest - -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -class TestLLMC(unittest.TestCase): +class TestLLMC: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/tf_dataset/auto_round/models/stas/tiny-random-llama-2" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -60,9 +57,9 @@ def test_llmcompressor_fp8(self): config = json.load(open("./saved/config.json")) self.assertIn("group_0", config["quantization_config"]["config_groups"]) - self.assertEqual(config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"], 8) - self.assertEqual(config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"], "channel") - self.assertEqual(config["quantization_config"]["quant_method"], "compressed-tensors") + assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8 + assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "channel" + assert config["quantization_config"]["quant_method"] == "compressed-tensors" def test_autoround_llmcompressor_fp8(self): ## quantize the model @@ -81,13 +78,9 @@ def test_autoround_llmcompressor_fp8(self): config = json.load(open("./saved/config.json")) self.assertIn("group_0", config["quantization_config"]["config_groups"]) - self.assertEqual(config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"], 8) - self.assertEqual(config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"], "tensor") + assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8 + assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "tensor" self.assertEqual( config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["strategy"], "tensor" ) - 
self.assertEqual(config["quantization_config"]["quant_method"], "compressed-tensors") - - -if __name__ == "__main__": - unittest.main() + assert config["quantization_config"]["quant_method"] == "compressed-tensors" diff --git a/test/test_cpu/test_load_awq_gptq.py b/test/test_cpu/test_load_awq_gptq.py index 4fb6bb977..e78266182 100644 --- a/test/test_cpu/test_load_awq_gptq.py +++ b/test/test_cpu/test_load_awq_gptq.py @@ -1,40 +1,15 @@ import shutil -import sys -import unittest - -sys.path.insert(0, "../..") +import pytest from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer +from ..helpers import model_infer -class TestAutoRound(unittest.TestCase): - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) +class TestAutoRound: @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("runs", ignore_errors=True) def test_load_gptq_no_dummy_gidx_model(self): @@ -60,4 +35,4 @@ def test_load_awq(self): quantization_config=quantization_config, ) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) diff --git a/test/test_cpu/test_mix_bits.py b/test/test_cpu/test_mix_bits.py index 2c73d42cd..71354feb9 100644 --- a/test/test_cpu/test_mix_bits.py +++ b/test/test_cpu/test_mix_bits.py @@ -1,14 +1,11 @@ import json import os import shutil -import sys -import unittest from pathlib import Path -from parameterized import parameterized - -sys.path.insert(0, "../..") +import pytest import torch +from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound @@ -26,31 +23,21 @@ def _get_folder_size(path: str) -> float: return total_size / (1024**3) # convert to GB -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" self.save_dir = ".saved/" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel - def test_mixed_gptqmodel(self): + def test_mixed_gptqmodel(self, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 4}, # set lm_head quant @@ -64,7 +51,7 @@ 
def test_mixed_gptqmodel(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -79,7 +66,7 @@ def test_mixed_gptqmodel(self): assert "!!!" not in model.tokenizer.decode(result) # string output shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_gptqmodel_convert_to_ar(self): + def test_mixed_gptqmodel_convert_to_ar(self, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 4}, # set lm_head quant @@ -93,7 +80,7 @@ def test_mixed_gptqmodel_convert_to_ar(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -108,7 +95,7 @@ def test_mixed_gptqmodel_convert_to_ar(self): print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_autoround_format(self): + def test_mixed_autoround_format(self, dataloader): layer_config = { "k_proj": {"bits": 8}, "q_proj": {"bits": 3}, @@ -120,7 +107,7 @@ def test_mixed_autoround_format(self): scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "./saved" @@ -134,7 +121,7 @@ def test_mixed_autoround_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_fallback_regex_for_awq_format(self): + def test_fallback_regex_for_awq_format(self, dataloader): layer_config = { "lm_head": {"bits": 16}, "fc1": {"bits": 16}, @@ -144,7 +131,7 @@ def test_fallback_regex_for_awq_format(self): scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "./saved" @@ -159,7 +146,7 @@ def test_fallback_regex_for_awq_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_ar_format_part_name_hf_loading(self): + def test_mixed_ar_format_part_name_hf_loading(self, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 16}, # full name @@ -170,7 +157,7 @@ def test_mixed_ar_format_part_name_hf_loading(self): scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "./saved" @@ -220,7 +207,7 @@ def test_mixed_ar_format_part_name_hf_loading(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_MXFP_autoround_format_loading(self): + def test_mixed_MXFP_autoround_format_loading(self, dataloader): layer_config = { "k_proj": {"bits": 8, "act_bits": 8}, "lm_head": {"bits": 16, "act_bits": 16}, @@ -231,7 +218,7 @@ def test_mixed_MXFP_autoround_format_loading(self): scheme="MXFP4", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -248,7 +235,3 @@ def test_mixed_MXFP_autoround_format_loading(self): print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.14) shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - 
unittest.main() diff --git a/test/test_cpu/test_mllm.py b/test/test_cpu/test_mllm.py index 8510adca5..25f2a209a 100644 --- a/test/test_cpu/test_mllm.py +++ b/test/test_cpu/test_mllm.py @@ -1,10 +1,6 @@ -import sys -import unittest - -sys.path.insert(0, "../..") - import shutil +import pytest from transformers import AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration from auto_round import AutoRoundMLLM @@ -27,18 +23,17 @@ def __iter__(self): yield self.data -class TestAutoRoundMLLM(unittest.TestCase): +class TestAutoRoundMLLM: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct" self.dataset = FakeDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - return super().tearDownClass() def test_tune(self): bits, group_size = 4, 128 @@ -105,11 +100,11 @@ class Myclass: dataset = MLLM_DATASET["liuhaotian/llava"]( template=Myclass(), model=None, tokenzier=None, dataset_path="liuhaotian/llava", seqlen=32, nsamples=32 ) - self.assertEqual(len(dataset.questions), 32) + assert len(dataset.questions) == 32 dataset = MLLM_DATASET["liuhaotian/llava"]( template=Myclass(), model=None, tokenzier=None, dataset_path="liuhaotian/llava", seqlen=2048, nsamples=512 ) - self.assertEqual(len(dataset.questions), 512) + assert len(dataset.questions) == 512 def test_diff_dataset(self): tokenizer = AutoTokenizer.from_pretrained(self.model_name) @@ -265,7 +260,3 @@ def test_qwen2_5(self): generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False ) print(output_text) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_model_scope.py b/test/test_cpu/test_model_scope.py index 6da33cdc3..0097b3584 100644 --- a/test/test_cpu/test_model_scope.py +++ b/test/test_cpu/test_model_scope.py @@ -1,30 +1,17 @@ import copy import os import shutil -import sys -import unittest - -sys.path.insert(0, "../..") +import pytest import torch from auto_round import AutoRound -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(3): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestModelScope(unittest.TestCase): +class TestModelScope: @classmethod - def setUpClass(self): + def setup_class(self): self.saved_path = "./saved" - self.dataset = LLMDataLoader() self.source_path, self.cache_path = "/tf_dataset/auto_round/modelscope", "/home/hostuser/.cache/modelscope" if os.path.exists(self.source_path): @@ -33,13 +20,12 @@ def setUpClass(self): shutil.copytree(self.source_path, self.cache_path, dirs_exist_ok=True) @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) if os.path.exists(self.cache_path): shutil.rmtree(self.cache_path, ignore_errors=True) - return super().tearDownClass() def test_llm(self): model_name = "Qwen/Qwen2.5-0.5B-Instruct" @@ -54,7 +40,3 @@ def test_mllm(self): model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset, batch_size=2 ) autoround.quantize_and_save(self.saved_path) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index ba9d3a1a8..1144d00d6 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++
b/test/test_cpu/test_mxfp_nvfp.py @@ -1,12 +1,9 @@ import os import shutil -import sys -import unittest -from parameterized import parameterized - -sys.path.insert(0, "../..") +import pytest import torch +from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound @@ -23,30 +20,20 @@ def _get_folder_size(path: str) -> float: return total_size / (1024**3) # convert to GB -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRoundFP(unittest.TestCase): +class TestAutoRoundFP: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" self.save_dir = "./saved" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto") self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_nvfp4_moe_actmax_rtn(self): + def test_nvfp4_moe_actmax_rtn(self, dataloader): model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" layer_config = { "self_attn": {"bits": 16, "act_bits": 16}, @@ -62,7 +49,7 @@ def test_nvfp4_moe_actmax_rtn(self): iters=0, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) compressed_model, _ = autoround.quantize() @@ -73,7 +60,7 @@ def test_nvfp4_moe_actmax_rtn(self): ), "Illegal NVFP4 quantization for lm_head layer" shutil.rmtree(self.save_dir, ignore_errors=True) - def test_nvfp4_moe_actmax_ar(self): + def test_nvfp4_moe_actmax_ar(self, dataloader): model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" layer_config = { "q_proj": {"bits": 16, "act_bits": 16}, @@ -89,7 +76,7 @@ def test_nvfp4_moe_actmax_ar(self): iters=1, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) compressed_model, _ = autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") @@ -111,7 +98,7 @@ def test_nvfp4_moe_actmax_ar(self): self.assertGreater(result["results"]["piqa"]["acc,none"], 0.7) shutil.rmtree(self.save_dir, ignore_errors=True) - def test_mxfp4_moe_ar(self): + def test_mxfp4_moe_ar(self, dataloader): model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" layer_config = { "q_proj": {"bits": 16, "act_bits": 16, "data_type": "float"}, @@ -127,7 +114,7 @@ def test_mxfp4_moe_ar(self): iters=1, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) compressed_model, _ = autoround.quantize_and_save(output_dir=self.save_dir, inplace=True, format="auto_round") @@ -139,7 +126,7 @@ def test_mxfp4_moe_ar(self): ), "Illegal MXFP4 packing for lm_head layer" shutil.rmtree(self.save_dir, ignore_errors=True) - def test_mxfp4_llmcompressor_format(self): + def test_mxfp4_llmcompressor_format(self, dataloader): model_name = self.model_name from transformers import AutoConfig @@ -151,7 +138,7 @@ def test_mxfp4_llmcompressor_format(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize() @@ -179,7 +166,7 @@ def 
test_mxfp4_llmcompressor_format(self): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_rtn_mxfp4_llmcompressor_format(self): + def test_rtn_mxfp4_llmcompressor_format(self, dataloader): model_name = self.model_name from transformers import AutoConfig @@ -191,7 +178,7 @@ def test_rtn_mxfp4_llmcompressor_format(self): iters=0, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize() @@ -219,7 +206,7 @@ def test_rtn_mxfp4_llmcompressor_format(self): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mxfp8_llmcompressor_format(self): + def test_mxfp8_llmcompressor_format(self, dataloader): model_name = self.model_name from transformers import AutoConfig @@ -229,7 +216,7 @@ def test_mxfp8_llmcompressor_format(self): scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") @@ -256,7 +243,7 @@ def test_mxfp8_llmcompressor_format(self): ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_nvfp4_llmcompressor_format(self): + def test_nvfp4_llmcompressor_format(self, dataloader): model_name = self.model_name from transformers import AutoConfig @@ -266,7 +253,7 @@ def test_nvfp4_llmcompressor_format(self): scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") @@ -293,7 +280,7 @@ def test_nvfp4_llmcompressor_format(self): ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_nvfp4_autoround_format(self): + def test_nvfp4_autoround_format(self, dataloader): model_name = self.model_name from transformers import AutoConfig @@ -303,7 +290,7 @@ def test_nvfp4_autoround_format(self): scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") @@ -318,7 +305,7 @@ def test_nvfp4_autoround_format(self): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_nvfp4_autoround_save_quantized(self): + def test_nvfp4_autoround_save_quantized(self, dataloader): model_name = self.model_name from transformers import AutoConfig @@ -328,7 +315,7 @@ def test_nvfp4_autoround_save_quantized(self): scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize() @@ -344,7 +331,7 @@ def test_nvfp4_autoround_save_quantized(self): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_qwen_moe_quant_infer(self): + def test_qwen_moe_quant_infer(self, dataloader): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen1.5-MoE-A2.7B" layer_config = { "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, @@ 
-356,7 +343,7 @@ def test_qwen_moe_quant_infer(self): iters=1, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -381,7 +368,7 @@ def test_qwen_moe_quant_infer(self): ("NVFP4", "fp8", None), ] ) - def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): + def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype, dataloader): model_name = self.model_name from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from transformers.models.opt.modeling_opt import OPTForCausalLM @@ -397,7 +384,7 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): scheme=scheme, iters=0, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, static_kv_dtype=static_kv_dtype, static_attention_dtype=static_attention_dtype, ) @@ -433,7 +420,3 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype): getattr(attn, "q_scale", None) is not None ), f"Missing q_scale in attention for scheme={scheme}, static_attention_dtype={static_attention_dtype}" shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_scheme.py b/test/test_cpu/test_scheme.py index c2d165639..71f02dc96 100644 --- a/test/test_cpu/test_scheme.py +++ b/test/test_cpu/test_scheme.py @@ -1,64 +1,52 @@ import shutil -import sys -import unittest +import pytest import torch -sys.path.insert(0, "../..") from auto_round import AutoRound from auto_round.schemes import QuantizationScheme -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_gguf(self): + def test_gguf(self, dataloader): ar = AutoRound( "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B", scheme="W2A16", nsamples=1, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) ar.quantize_and_save(self.save_folder, format="gguf:q4_k_m") - self.assertEqual(ar.bits, 4) + assert ar.bits == 4 shutil.rmtree(self.save_folder, ignore_errors=True) - def test_w4a16(self): - ar = AutoRound(self.model_name, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader) - self.assertEqual(ar.bits, 4) + def test_w4a16(self, dataloader): + ar = AutoRound(self.model_name, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + assert ar.bits == 4 ar.quantize() - def test_w2a16_rtn(self): - ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=self.llm_dataloader) - self.assertEqual(ar.bits, 2) + def test_w2a16_rtn(self, dataloader): + ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=dataloader) + assert ar.bits == 2 ar.quantize() - def test_mxfp4(self): - ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader) - self.assertEqual(ar.bits, 4) - self.assertEqual(ar.act_bits, 4) - self.assertEqual(ar.data_type, "mx_fp") - 
self.assertEqual(ar.act_data_type, "mx_fp_rceil") + def test_mxfp4(self, dataloader): + ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + assert ar.bits == 4 + assert ar.act_bits == 4 + assert ar.data_type == "mx_fp" + assert ar.act_data_type == "mx_fp_rceil" ar.quantize() def test_vllm(self): @@ -67,18 +55,18 @@ def test_vllm(self): ar = AutoRoundMLLM( "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct", scheme="W2A16", nsamples=1, iters=1, seqlen=2 ) - self.assertEqual(ar.bits, 2) - self.assertEqual(ar.act_bits, 16) - - def test_nvfp4(self): - ar = AutoRound(self.model_name, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader) - self.assertEqual(ar.bits, 4) - self.assertEqual(ar.act_bits, 4) - self.assertEqual(ar.data_type, "nv_fp") - self.assertEqual(ar.act_data_type, "nv_fp4_with_static_gs") + assert ar.bits == 2 + assert ar.act_bits == 16 + + def test_nvfp4(self, dataloader): + ar = AutoRound(self.model_name, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + assert ar.bits == 4 + assert ar.act_bits == 4 + assert ar.data_type == "nv_fp" + assert ar.act_data_type == "nv_fp4_with_static_gs" ar.quantize() - def test_all_scheme(self): + def test_all_scheme(self, dataloader): import copy preset_schemes = ["W8A16", "MXFP8", "FPW8A16", "FP8_STATIC", "GGUF:Q2_K_S", "GGUF:Q4_K_M"] @@ -87,11 +75,11 @@ def test_all_scheme(self): if "gguf" in scheme.lower(): model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" print(f"scheme={scheme}") - ar = AutoRound(model_name, scheme=scheme, nsamples=1, iters=1, seqlen=2, dataset=self.llm_dataloader) + ar = AutoRound(model_name, scheme=scheme, nsamples=1, iters=1, seqlen=2, dataset=dataloader) ar.quantize_and_save(self.save_folder) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_scheme_in_layer_config(self): + def test_scheme_in_layer_config(self, dataloader): layer_config = { "model.decoder.layers.2.self_attn": {"bits": 2}, "model.decoder.layers.3.self_attn.v_proj": "W8A16", @@ -104,19 +92,19 @@ def test_scheme_in_layer_config(self): iters=1, layer_config=layer_config, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) ar.quantize() for n, m in ar.model.named_modules(): if n == "model.decoder.layers.2.self_attn.q_proj": - self.assertEqual(m.bits, 2) + assert m.bits == 2 if n == "model.decoder.layers.2.self_attn.k_proj": - self.assertEqual(m.bits, 2) + assert m.bits == 2 if n == "model.decoder.layers.3.self_attn.v_proj": - self.assertEqual(m.bits, 8) + assert m.bits == 8 if n == "model.decoder.layers.4.self_attn.k_proj": - self.assertEqual(m.group_size, 64) + assert m.group_size == 64 def test_parse_available_devices(self): from auto_round.utils.device import parse_available_devices @@ -125,10 +113,6 @@ def test_parse_available_devices(self): self.assertTrue(len(device_list) == 1 and "cpu" in device_list) device_list = parse_available_devices("a:cuda:0,b:cuda:1,c:cpu") self.assertTrue(len(device_list) == 3) - self.assertEqual(device_list, ["cuda:0", "cuda:1", "cpu"]) + assert device_list == ["cuda:0", "cuda:1", "cpu"] device_list = parse_available_devices("0,1") self.assertTrue(len(device_list) == 1 and "cpu" in device_list) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_script.py b/test/test_cpu/test_script.py index 01bbba644..aa25d7f61 100644 --- a/test/test_cpu/test_script.py +++ b/test/test_cpu/test_script.py @@ -1,11 +1,9 @@ import os -import sys -import unittest 
-sys.path.insert(0, "../..") +import pytest -class TestScript(unittest.TestCase): +class TestScript: def test_default(self): os.system( """ @@ -15,7 +13,3 @@ def test_default(self): --deployment_device fake --output_dir ./tmp_script_test""" ) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_torch_backend.py b/test/test_cpu/test_torch_backend.py index d1e9bd293..e27914d9b 100644 --- a/test/test_cpu/test_torch_backend.py +++ b/test/test_cpu/test_torch_backend.py @@ -1,11 +1,6 @@ import shutil -import sys -import unittest import pytest - -sys.path.insert(0, "../..") - import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -13,56 +8,22 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel +from ..helpers import model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - -class TestAutoRoundTorchBackend(unittest.TestCase): +class TestAutoRoundTorchBackend: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "facebook/opt-125m" self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_torch_4bits_asym(self): + def test_torch_4bits_asym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False @@ -74,7 +35,7 @@ def test_torch_4bits_asym(self): sym=sym, iters=0, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") @@ -85,7 +46,7 @@ def test_torch_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) @@ -96,14 +57,14 @@ def test_torch_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, 
tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) - def test_torch_4bits_sym(self): + def test_torch_4bits_sym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 32, True @@ -115,7 +76,7 @@ def test_torch_4bits_sym(self): sym=sym, iters=0, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model @@ -126,13 +87,9 @@ def test_torch_4bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000) print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28) torch.cuda.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cpu/test_utils.py b/test/test_cpu/test_utils.py index e70a4b7b4..3dec97010 100644 --- a/test/test_cpu/test_utils.py +++ b/test/test_cpu/test_utils.py @@ -1,7 +1,5 @@ -import sys from unittest.mock import patch -sys.path.insert(0, "../..") import auto_round.utils.device as auto_round_utils diff --git a/test/test_cpu/test_woq_linear.py b/test/test_cpu/test_woq_linear.py index e077c7a21..8f5bedc2c 100644 --- a/test/test_cpu/test_woq_linear.py +++ b/test/test_cpu/test_woq_linear.py @@ -1,9 +1,6 @@ -import sys - import pytest import torch -sys.path.insert(0, "../..") from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear diff --git a/test/test_cuda/__init__.py b/test/test_cuda/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_cuda/_test_helpers.py b/test/test_cuda/_test_helpers.py deleted file mode 100644 index b4b8a5955..000000000 --- a/test/test_cuda/_test_helpers.py +++ /dev/null @@ -1,32 +0,0 @@ -def model_infer(model, tokenizer, apply_chat_template=False): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - if apply_chat_template: - texts = [] - for prompt in prompts: - messages = [{"role": "user", "content": prompt}] - text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - texts.append(text) - prompts = texts - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] diff --git a/test/test_cuda/test_2_3bits.py b/test/test_cuda/test_2_3bits.py index 
2ea407f20..f12bf240c 100644 --- a/test/test_cuda/test_2_3bits.py +++ b/test/test_cuda/test_2_3bits.py @@ -1,10 +1,8 @@ import copy import re import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from lm_eval.utils import make_table # pylint: disable=E0401 @@ -14,6 +12,8 @@ from auto_round.eval.evaluation import simple_evaluate, simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_greater_than_050, require_greater_than_051 +from ..helpers import model_infer + def get_accuracy(data): match = re.search(r"\|acc\s+\|[↑↓]\s+\|\s+([\d.]+)\|", data) @@ -25,43 +25,17 @@ def get_accuracy(data): return 0.0 -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.save_dir = "./saved" self.tasks = "lambada_openai" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] - @require_greater_than_051 def test_3bits_autoround(self): model_name = "/models/opt-125m" @@ -77,7 +51,7 @@ def test_3bits_autoround(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_dir) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.3) ## 0.3130 @@ -145,7 +119,3 @@ def test_2bits_autoround(self): accuracy = get_accuracy(res) assert accuracy > 0.17 shutil.rmtree("./saved", ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_alg_ext.py b/test/test_cuda/test_alg_ext.py index c83d6f3b4..499213c74 100644 --- a/test/test_cuda/test_alg_ext.py +++ b/test/test_cuda/test_alg_ext.py @@ -1,9 +1,7 @@ import shutil import sys -import unittest - -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -11,15 +9,15 @@ from auto_round.eval.evaluation import simple_evaluate_user_model -class TestAlgExt(unittest.TestCase): +class TestAlgExt: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/models/opt-125m" self.save_folder = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -68,7 +66,3 @@ def test_all_support_dtype(self): model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True ) ar.quantize() - 
- -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_auto_round_format.py b/test/test_cuda/test_auto_round_format.py index 55fc1690f..cbc6868f1 100644 --- a/test/test_cuda/test_auto_round_format.py +++ b/test/test_cuda/test_auto_round_format.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer @@ -18,69 +16,24 @@ require_package_version_ut, ) +from ..helpers import model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "facebook/opt-125m" - self.llm_dataloader = LLMDataLoader() self.save_folder = "./saved" - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - ##texts = [] - # for prompt in prompts: - # messages = [ - # {"role": "user", "content": prompt} - # ] - # text = tokenizer.apply_chat_template( - # messages, - # tokenize=False, - # add_generation_prompt=True - # ) - # texts.append(text) - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_greater_than_050 @require_package_version_ut("transformers", "<4.57.0") - def test_autoround_asym(self): + def test_autoround_asym(self, dataloader): for bits in [2, 3, 4, 8]: model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -93,7 +46,7 @@ def test_autoround_asym(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder @@ -132,7 +85,7 @@ def test_mixed_precision(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.32) @@ -161,7 +114,7 @@ def test_awq_backend(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.18) @@ -172,7 
+125,7 @@ def test_awq_backend(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) shutil.rmtree(self.save_folder, ignore_errors=True) @require_greater_than_050 @@ -184,12 +137,12 @@ def test_tritonv2_bf16(self): ) tokenizer = AutoTokenizer.from_pretrained(model_name) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) torch.cuda.empty_cache() @require_ipex - def test_autoround_gptq_sym_format(self): + def test_autoround_gptq_sym_format(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -201,7 +154,7 @@ def test_autoround_gptq_sym_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" @@ -244,7 +197,7 @@ def test_autoround_gptq_sym_format(self): @require_awq @require_ipex @require_package_version_ut("transformers", "<4.57.0") - def test_autoround_awq_sym_format(self): + def test_autoround_awq_sym_format(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -256,7 +209,7 @@ def test_autoround_awq_sym_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" @@ -283,7 +236,7 @@ def test_autoround_awq_sym_format(self): shutil.rmtree("./saved", ignore_errors=True) @require_greater_than_050 - def test_autoround_sym(self): + def test_autoround_sym(self, dataloader): for bits in [2, 3, 4, 8]: model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -296,7 +249,7 @@ def test_autoround_sym(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" @@ -325,8 +278,4 @@ def test_load_gptq_model_3bits(self): quantization_config=quantization_config, ) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.model_infer(model, tokenizer) - - -if __name__ == "__main__": - unittest.main() + model_infer(model, tokenizer) diff --git a/test/test_cuda/test_auto_scheme.py b/test/test_cuda/test_auto_scheme.py index 681e3b29b..b6f5d8066 100644 --- a/test/test_cuda/test_auto_scheme.py +++ b/test/test_cuda/test_auto_scheme.py @@ -1,10 +1,9 @@ import copy import re import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest + from auto_round import AutoRound, AutoRoundConfig, AutoScheme from auto_round.auto_scheme.utils import compute_avg_bits_for_model from auto_round.eval.evaluation import simple_evaluate @@ -12,14 +11,14 @@ from auto_round.utils import get_module -class TestAutoScheme(unittest.TestCase): +class TestAutoScheme: @classmethod - def setUpClass(self): + def setup_class(self): self.save_dir = "./saved" self.tasks = "lambada_openai" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -79,7 +78,7 @@ def test_shared_layers(self): from auto_round.auto_scheme.utils import parse_shared_layers res = parse_shared_layers(model, 
shared_layers) - self.assertEqual(len(res), 24) + assert len(res) == 24 assert [ "model.decoder.layers.2.self_attn.out_proj", "model.decoder.layers.2.self_attn.q_proj", @@ -101,7 +100,7 @@ def test_shared_layers(self): else: bits.append(module.bits) bits = set(bits) - self.assertEqual(len(bits), 1) + assert len(bits) == 1 print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 @@ -187,7 +186,7 @@ def test_patch_scheme(self): model, layer_config = ar.quantize() for n, m in model.named_modules(): if hasattr(m, "group_size"): - self.assertEqual(m.group_size, 32) + assert m.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 @@ -199,13 +198,13 @@ def test_layer_config(self): user_layer_config = {"model.decoder.layers.10.fc1": {"bits": 8, "group_size": 32, "sym": False}} ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["bits"], 8) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["sym"], False) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["group_size"], 32) + assert layer_config["model.decoder.layers.10.fc1"]["bits"] == 8 + assert layer_config["model.decoder.layers.10.fc1"]["sym"] == False + assert layer_config["model.decoder.layers.10.fc1"]["group_size"] == 32 layer = get_module(model, "model.decoder.layers.10.fc1") - self.assertEqual(layer.bits, 8) - self.assertEqual(layer.sym, False) - self.assertEqual(layer.group_size, 32) + assert layer.bits == 8 + assert layer.sym == False + assert layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 @@ -216,13 +215,13 @@ def test_layer_config(self): user_layer_config = {"model.decoder.layers.10.fc1": {"bits": 8, "group_size": 32, "sym": False}} ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["bits"], 8) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["sym"], False) - self.assertEqual(layer_config["model.decoder.layers.10.fc1"]["group_size"], 32) + assert layer_config["model.decoder.layers.10.fc1"]["bits"] == 8 + assert layer_config["model.decoder.layers.10.fc1"]["sym"] == False + assert layer_config["model.decoder.layers.10.fc1"]["group_size"] == 32 layer = get_module(model, "model.decoder.layers.10.fc1") - self.assertEqual(layer.orig_layer.bits, 8) - self.assertEqual(layer.orig_layer.sym, False) - self.assertEqual(layer.orig_layer.group_size, 32) + assert layer.orig_layer.bits == 8 + assert layer.orig_layer.sym == False + assert layer.orig_layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 @@ -265,7 +264,3 @@ def test_enable_torch_compile(self): print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.10) shutil.rmtree(self.save_dir, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_calib_dataset.py b/test/test_cuda/test_calib_dataset.py index b66f60127..6a36c21b1 100644 --- a/test/test_cuda/test_calib_dataset.py +++ b/test/test_cuda/test_calib_dataset.py @@ -1,20 +1,17 @@ +import json import os import 
shutil -import sys -import unittest - -sys.path.insert(0, "../..") -import json +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -class TestLocalCalibDataset(unittest.TestCase): +class TestLocalCalibDataset: @classmethod - def setUpClass(self): + def setup_class(self): json_data = [{"text": "awefdsfsddfd"}, {"text": "fdfdfsdfdfdfd"}, {"text": "dfdsfsdfdfdfdf"}] os.makedirs("./saved", exist_ok=True) self.json_file = "./saved/tmp.json" @@ -40,7 +37,3 @@ def test_combine_dataset(self): self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset ) autoround.quantize() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_conv1d.py b/test/test_cuda/test_conv1d.py index e617bf55e..c5384a384 100644 --- a/test/test_cuda/test_conv1d.py +++ b/test/test_cuda/test_conv1d.py @@ -1,40 +1,29 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch -from _test_helpers import model_infer from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound from auto_round.testing_utils import require_gptqmodel +from ..helpers import model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestQuantizationConv1d(unittest.TestCase): +class TestQuantizationConv1d: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "MBZUAI/LaMini-GPT-124M" self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel - def test_quant(self): + def test_quant(self, dataloader): self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) bits, group_size, sym = 4, 128, True from auto_round import AutoRoundConfig @@ -47,7 +36,7 @@ def test_quant(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() @@ -55,7 +44,3 @@ def test_quant(self): model = AutoModelForCausalLM.from_pretrained("./saved", device_map="cuda", trust_remote_code=True) model_infer(model, self.tokenizer) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_diffusion.py b/test/test_cuda/test_diffusion.py index 9a5a8bfd3..147a34d47 100644 --- a/test/test_cuda/test_diffusion.py +++ b/test/test_cuda/test_diffusion.py @@ -2,13 +2,9 @@ import os import re import shutil -import sys -import unittest +import pytest import requests - -sys.path.insert(0, "../..") - from diffusers import AutoPipelineForText2Image from PIL import Image @@ -16,13 +12,13 @@ from auto_round.testing_utils import require_gptqmodel, require_optimum, require_vlm_env -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/dataset/FLUX.1-dev" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("runs", ignore_errors=True) @require_optimum @@ -77,7 +73,3 @@ def test_diffusion_model_checker(self): self.assertTrue(is_diffusion_model("/models/stable-diffusion-2-1")) self.assertTrue(is_diffusion_model("/models/stable-diffusion-xl-base-1.0")) 
self.assertFalse(is_diffusion_model("/models/Qwen3-8B")) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py index c489b37b2..e6f78ba90 100644 --- a/test/test_cuda/test_exllamav2_backend.py +++ b/test/test_cuda/test_exllamav2_backend.py @@ -1,12 +1,6 @@ import shutil -import sys -import unittest import pytest - -sys.path.insert(0, "../..") - - import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -14,62 +8,28 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel, require_package_version_ut - -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +from ..helpers import model_infer -class TestAutoRoundexllamaBackend(unittest.TestCase): +class TestAutoRoundexllamaBackend: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/models/opt-125m" self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel - def test_gptqmodel_exllmav2_4bits_asym(self): + def test_gptqmodel_exllmav2_4bits_asym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, dataset=self.llm_dataloader + model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, dataset=dataloader ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") @@ -80,7 +40,7 @@ def test_gptqmodel_exllmav2_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) @@ -91,7 +51,7 @@ def test_gptqmodel_exllmav2_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, 
tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) @@ -100,7 +60,7 @@ def test_gptqmodel_exllmav2_4bits_asym(self): @require_autogptq @require_package_version_ut("torch", "<2.6.0") - def test_gptq_exllamav2_4bits_sym(self): + def test_gptq_exllamav2_4bits_sym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -112,7 +72,7 @@ def test_gptq_exllamav2_4bits_sym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model @@ -123,7 +83,7 @@ def test_gptq_exllamav2_4bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) @@ -158,13 +118,9 @@ def test_gptq_exllamav2_4bits_sym_group_size(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=64, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.15) torch.cuda.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_export.py b/test/test_cuda/test_export.py index 6bb2612e8..3e1171162 100644 --- a/test/test_cuda/test_export.py +++ b/test/test_cuda/test_export.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoConfig, AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer @@ -12,29 +10,19 @@ from auto_round.testing_utils import require_awq, require_optimum, require_package_version_ut -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "facebook/opt-125m" self.save_dir = "./saved" - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_optimum - def test_autogptq_format(self): + def test_autogptq_format(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False @@ -46,7 +34,7 @@ def test_autogptq_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -65,7 +53,7 @@ def test_autogptq_format(self): 
shutil.rmtree("./saved", ignore_errors=True) @require_optimum - def test_autogptq_format_fp_layers(self): + def test_autogptq_format_fp_layers(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) layer_config = {} @@ -82,7 +70,7 @@ def test_autogptq_format_fp_layers(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -102,7 +90,7 @@ def test_autogptq_format_fp_layers(self): # "there there there there there there") shutil.rmtree("./saved", ignore_errors=True) - def test_autogptq_format_qsave_fp_layers(self): + def test_autogptq_format_qsave_fp_layers(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) layer_config = {} @@ -119,7 +107,7 @@ def test_autogptq_format_qsave_fp_layers(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "./saved" @@ -153,7 +141,7 @@ def test_autogptq_format_qsave_fp_layers(self): ##print(res) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_format(self): + def test_autoround_format(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -165,7 +153,7 @@ def test_autoround_format(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -186,7 +174,7 @@ def test_autoround_format(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") - def test_autoawq_format(self): + def test_autoawq_format(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False @@ -198,7 +186,7 @@ def test_autoawq_format(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -220,7 +208,7 @@ def test_autoawq_format(self): @require_optimum @require_awq @require_package_version_ut("transformers", "<4.57.0") - def test_autoawq_format_fp_qsave_layers(self): + def test_autoawq_format_fp_qsave_layers(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) layer_config = { "model.decoder.layers.0.self_attn.k_proj": {"bits": 16}, @@ -236,7 +224,7 @@ def test_autoawq_format_fp_qsave_layers(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "./saved/test_export" @@ -261,7 +249,7 @@ def test_autoawq_format_fp_qsave_layers(self): shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_3bit_asym_torch_format(self): + def test_autoround_3bit_asym_torch_format(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, 
group_size, sym = 3, 128, False @@ -273,7 +261,7 @@ def test_autoround_3bit_asym_torch_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -290,7 +278,7 @@ def test_autoround_3bit_asym_torch_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_3bit_sym_torch_format(self): + def test_autoround_3bit_sym_torch_format(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 3, 128, True @@ -302,7 +290,7 @@ def test_autoround_3bit_sym_torch_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -322,7 +310,7 @@ def test_autoround_3bit_sym_torch_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_awq_lmhead_export(self): + def test_awq_lmhead_export(self, dataloader): bits, sym, group_size = 4, False, 128 model_name = "/models/phi-2" layer_config = { @@ -336,7 +324,7 @@ def test_awq_lmhead_export(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_awq") @@ -354,7 +342,7 @@ def test_awq_lmhead_export(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_gptq_lmhead_export(self): + def test_gptq_lmhead_export(self, dataloader): bits, sym, group_size = 4, True, 128 model_name = "/models/phi-2" layer_config = { @@ -368,7 +356,7 @@ def test_gptq_lmhead_export(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -384,7 +372,3 @@ def test_gptq_lmhead_export(self): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0]) print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_fp8_input.py b/test/test_cuda/test_fp8_input.py index 5258fe183..90a177ef3 100644 --- a/test/test_cuda/test_fp8_input.py +++ b/test/test_cuda/test_fp8_input.py @@ -1,9 +1,7 @@ import os import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer @@ -12,13 +10,13 @@ from auto_round.eval.evaluation import simple_evaluate -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.save_dir = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_dir, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -128,7 +126,3 @@ def test_diff_datatype(self): ar = AutoRound(model=model_name, iters=iters, scheme=scheme) ar.quantize_and_save(output_dir=self.save_dir) shutil.rmtree(self.save_dir, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git 
a/test/test_cuda/test_get_block_name.py b/test/test_cuda/test_get_block_name.py index cc9297653..52f251cb7 100644 --- a/test/test_cuda/test_get_block_name.py +++ b/test/test_cuda/test_get_block_name.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from diffusers import AutoPipelineForText2Image @@ -20,13 +18,13 @@ from auto_round.utils import get_block_names, is_pure_text_model -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): pass @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("runs", ignore_errors=True) def check_block_names(self, block_names, prefixs=[], n_layers=[]): @@ -199,7 +197,3 @@ def test_flux(self): block_names = get_block_names(model, quant_vision=True) self.check_block_names(block_names, ["transformer_blocks", "single_transformer_blocks"], [19, 38]) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_gguf.py b/test/test_cuda/test_gguf.py index 312e561cf..b8ee88d0b 100644 --- a/test/test_cuda/test_gguf.py +++ b/test/test_cuda/test_gguf.py @@ -1,9 +1,8 @@ import os import shutil import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer @@ -12,23 +11,14 @@ from auto_round.testing_utils import require_gguf -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gguf - def test_gguf_format(self): + def test_gguf_format(self, dataloader): model_name = "Qwen/Qwen2.5-0.5B-Instruct" bits, group_size, sym = 4, 32, False autoround = AutoRound( @@ -39,7 +29,7 @@ def test_gguf_format(self): iters=2, seqlen=2, nsamples=2, - dataset=LLMDataLoader(), + dataset=dataloader, ) autoround.quantize() quantized_model_path = "./saved" @@ -71,7 +61,7 @@ def test_gguf_format(self): shutil.rmtree("./saved", ignore_errors=True) @require_gguf - def test_q2_k_export(self): + def test_q2_k_export(self, dataloader): bits, group_size, sym = 2, 16, False model_name = "Qwen/Qwen2.5-1.5B-Instruct" autoround = AutoRound( @@ -81,7 +71,7 @@ def test_q2_k_export(self): sym=sym, iters=1, seqlen=1, - dataset=LLMDataLoader(), + dataset=dataloader, data_type="int_asym_dq", ) autoround.quantize() @@ -252,7 +242,3 @@ def test_vlm_gguf(self): # file_size = os.path.getsize(os.path.join(quantized_model_path, "mmproj-model.gguf")) / 1024**2 # self.assertAlmostEqual(file_size, 3326.18, delta=5.0) # shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_main_func.py b/test/test_cuda/test_main_func.py index 571fc10f5..20dc7bdc8 100644 --- a/test/test_cuda/test_main_func.py +++ b/test/test_cuda/test_main_func.py @@ -1,10 +1,8 @@ import copy import re import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from lm_eval.utils import make_table # pylint: disable=E0401 @@ -26,14 +24,14 @@ def get_accuracy(data): return 0.0 -class TestMainFunc(unittest.TestCase): +class TestMainFunc: @classmethod - def 
setUpClass(self): + def setup_class(self): self.save_dir = "./saved" self.tasks = "lambada_openai" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -216,7 +214,3 @@ def test_low_cpu_mem_usage(self): ) autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_marlin_backend.py b/test/test_cuda/test_marlin_backend.py index 26d3ddca2..b920d9478 100644 --- a/test/test_cuda/test_marlin_backend.py +++ b/test/test_cuda/test_marlin_backend.py @@ -1,29 +1,18 @@ import shutil -import sys -import unittest import pytest - -sys.path.insert(0, "../..") import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate_user_model +from ..helpers import model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +class TestAutoRoundMarlinBackend: -class TestAutoRoundMarlinBackend(unittest.TestCase): - - def test_marlin_group_size(self): + def test_marlin_group_size(self, dataloader): for group_size in [-1, 64]: print(f"{group_size}!!!!!!!!!!!!!!!!!") model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) @@ -37,7 +26,7 @@ def test_marlin_group_size(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -48,7 +37,7 @@ def test_marlin_group_size(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.14) @@ -66,7 +55,7 @@ def test_marlin_group_size(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") @@ -77,49 +66,22 @@ def test_marlin_group_size(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.14) @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/models/opt-125m" self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids 
= [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_marlin_4bits_sym_with_zp_m_1(self): + def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -131,7 +93,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -142,7 +104,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) @@ -153,7 +115,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) @@ -172,7 +134,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self): # sym=sym, # iters=1, # seqlen=2, - # dataset=self.llm_dataloader, + # dataset=dataloader, # ) # quantized_model_path = self.save_folder # autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") @@ -186,7 +148,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self): # ) # # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - # self.model_infer(model, tokenizer) + # model_infer(model, tokenizer) # result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) # self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.27) @@ -200,13 +162,9 @@ def test_marlin_4bits_sym_with_zp_m_1(self): # ) # # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - # self.model_infer(model, tokenizer) + # model_infer(model, tokenizer) # result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) # self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.27) # torch.cuda.empty_cache() # shutil.rmtree("./saved", ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_mix_bits.py b/test/test_cuda/test_mix_bits.py index 4f7d39d8c..b9b7dde5c 100644 --- a/test/test_cuda/test_mix_bits.py +++ b/test/test_cuda/test_mix_bits.py @@ -1,15 +1,11 @@ import json import os import shutil -import sys -import unittest - -from parameterized import parameterized - -sys.path.insert(0, 
"../..") from pathlib import Path +import pytest import torch +from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound @@ -20,31 +16,21 @@ ) -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/models/opt-125m" self.save_dir = "./saved" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel - def test_mixed_gptqmodel(self): + def test_mixed_gptqmodel(self, dataloader): scheme = "W4A16" layer_config = { "k_proj": {"bits": 8}, # part name @@ -59,7 +45,7 @@ def test_mixed_gptqmodel(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -73,7 +59,7 @@ def test_mixed_gptqmodel(self): print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_gptqmodel_convert_to_ar(self): + def test_mixed_gptqmodel_convert_to_ar(self, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 4}, # set lm_head quant @@ -86,7 +72,7 @@ def test_mixed_gptqmodel_convert_to_ar(self): iters=2, seqlen=2, layer_config=layer_config, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") @@ -101,7 +87,7 @@ def test_mixed_gptqmodel_convert_to_ar(self): print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_autoround_format(self): + def test_mixed_autoround_format(self, dataloader): layer_config = { "k_proj": {"bits": 8}, "q_proj": {"bits": 3}, @@ -113,7 +99,7 @@ def test_mixed_autoround_format(self): scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "self.save_dir" @@ -129,7 +115,7 @@ def test_mixed_autoround_format(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") - def test_fallback_regex_for_awq_format(self): + def test_fallback_regex_for_awq_format(self, dataloader): model_name = "facebook/opt-125m" layer_config = { "lm_head": {"bits": 16}, @@ -140,7 +126,7 @@ def test_fallback_regex_for_awq_format(self): scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "self.save_dir" @@ -155,7 +141,7 @@ def test_fallback_regex_for_awq_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_ar_format_part_name_hf_loading(self): + def test_mixed_ar_format_part_name_hf_loading(self, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 16}, # full name @@ -166,7 +152,7 @@ def test_mixed_ar_format_part_name_hf_loading(self): scheme="W4A16", 
iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = "self.save_dir" @@ -216,7 +202,7 @@ def test_mixed_ar_format_part_name_hf_loading(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_MXFP_autoround_format_loading(self): + def test_mixed_MXFP_autoround_format_loading(self, dataloader): layer_config = { "k_proj": {"bits": 8, "act_bits": 8}, "lm_head": {"bits": 16, "act_bits": 16}, @@ -227,7 +213,7 @@ def test_mixed_MXFP_autoround_format_loading(self): scheme="MXFP4", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -245,7 +231,7 @@ def test_mixed_MXFP_autoround_format_loading(self): self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.32) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_autoround_format_vllm(self): + def test_mixed_autoround_format_vllm(self, dataloader): layer_config = { "self_attn": {"bits": 8}, "lm_head": {"bits": 16}, @@ -256,7 +242,7 @@ def test_mixed_autoround_format_vllm(self): scheme="W4A16", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) autoround.quantize() @@ -285,7 +271,7 @@ def test_mixed_autoround_format_vllm(self): print(f"{prompt}: {generated_text}") shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_llmcompressor_format_vllm(self): + def test_mixed_llmcompressor_format_vllm(self, dataloader): layer_config = { "self_attn": {"bits": 16, "act_bits": 16}, "lm_head": {"bits": 16, "act_bits": 16}, @@ -296,7 +282,7 @@ def test_mixed_llmcompressor_format_vllm(self): scheme="NVFP4", iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -323,7 +309,3 @@ def test_mixed_llmcompressor_format_vllm(self): print(f"{prompt}: {generated_text}") assert "!!!" 
not in generated_text shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_multiple_card.py b/test/test_cuda/test_multiple_card.py index 5dac584fe..2f29f7a37 100644 --- a/test/test_cuda/test_multiple_card.py +++ b/test/test_cuda/test_multiple_card.py @@ -1,11 +1,7 @@ import re import shutil -import sys -import unittest - -sys.path.insert(0, "../..") - +import pytest import torch from lm_eval.utils import make_table # pylint: disable=E0401 from transformers import AutoModelForCausalLM, AutoTokenizer @@ -27,14 +23,14 @@ def get_accuracy(data): # import os # os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.save_dir = "./saved" self.tasks = "lambada_openai" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_dir, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -361,24 +357,20 @@ def test_mllm_device_map(self): device_map = "0,1" ar = AutoRoundMLLM(model_name, device_map=device_map) - self.assertEqual(ar.device, "cuda:0") - self.assertEqual(ar.device_map, device_map) + assert ar.device == "cuda:0" + assert ar.device_map == device_map device_map = 1 ar = AutoRoundMLLM(ar.model, ar.tokenizer, processor=ar.processor, device_map=device_map) - self.assertEqual(ar.device, "cuda:1") - self.assertEqual(ar.device_map, device_map) + assert ar.device == "cuda:1" + assert ar.device_map == device_map device_map = "auto" ar = AutoRoundMLLM(ar.model, ar.tokenizer, processor=ar.processor, device_map=device_map) - self.assertEqual(ar.device, "cuda") - self.assertEqual(ar.device_map, device_map) + assert ar.device == "cuda" + assert ar.device_map == device_map device_map = {"model.language_model.layers": 0, "model.visual.blocks": 1} ar = AutoRoundMLLM(ar.model, ar.tokenizer, processor=ar.processor, device_map=device_map) - self.assertEqual(ar.model.model.language_model.layers[0].self_attn.q_proj.tuning_device, "cuda:0") - self.assertEqual(ar.model.model.visual.blocks[0].mlp.fc1.tuning_device, "cuda:1") - - -if __name__ == "__main__": - unittest.main() + assert ar.model.model.language_model.layers[0].self_attn.q_proj.tuning_device == "cuda:0" + assert ar.model.model.visual.blocks[0].mlp.fc1.tuning_device == "cuda:1" diff --git a/test/test_cuda/test_multiple_card_calib.py b/test/test_cuda/test_multiple_card_calib.py index 490193532..410855c33 100644 --- a/test/test_cuda/test_multiple_card_calib.py +++ b/test/test_cuda/test_multiple_card_calib.py @@ -2,9 +2,8 @@ import re import shutil import sys -import unittest -sys.path.insert(0, "../..") +import pytest from auto_round.testing_utils import multi_card @@ -19,14 +18,14 @@ def get_accuracy(data): return 0.0 -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.save_dir = "./saved" self.tasks = "lambada_openai" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -40,7 +39,3 @@ def test_multiple_card_calib(self): ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_mxfp_nvfp.py b/test/test_cuda/test_mxfp_nvfp.py index 48dd27d9b..357afb0f3 100644 --- a/test/test_cuda/test_mxfp_nvfp.py +++ 
b/test/test_cuda/test_mxfp_nvfp.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer @@ -12,28 +10,18 @@ from auto_round.testing_utils import require_awq, require_optimum -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "facebook/opt-125m" self.save_dir = "./saved" - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_fp8input_mxfp4_llmcompressor_format(self): + def test_fp8input_mxfp4_llmcompressor_format(self, dataloader): model_name = "/models/Qwen3-0.6B-FP8" scheme = "mxfp4" ar = AutoRound( @@ -41,7 +29,7 @@ def test_fp8input_mxfp4_llmcompressor_format(self): iters=2, seqlen=2, scheme=scheme, - dataset=self.llm_dataloader, + dataset=dataloader, ) compressed_model, _ = ar.quantize_and_save(output_dir=self.save_dir, format="llm_compressor") tmp_layer = compressed_model.model.layers[3].self_attn.q_proj @@ -59,14 +47,14 @@ def test_fp8input_mxfp4_llmcompressor_format(self): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree(self.save_dir, ignore_errors=True) - def test_nvfp4_llmcompressor_format(self): + def test_nvfp4_llmcompressor_format(self, dataloader): scheme = "nvfp4" autoround = AutoRound( self.model_name, scheme=scheme, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") @@ -110,7 +98,7 @@ def test_nvfp4_llmcompressor_format(self): # if "France" in prompt: # assert "Paris" in generated_text - def test_nvfp4_moe_actmax_rtn(self): + def test_nvfp4_moe_actmax_rtn(self, dataloader): model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" scheme = "nvfp4" autoround = AutoRound( @@ -119,13 +107,13 @@ def test_nvfp4_moe_actmax_rtn(self): iters=0, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = self.save_dir autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") - def test_nvfp4_moe_actmax_ar(self): + def test_nvfp4_moe_actmax_ar(self, dataloader): model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" scheme = "nvfp4" autoround = AutoRound( @@ -134,13 +122,13 @@ def test_nvfp4_moe_actmax_ar(self): iters=1, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) autoround.quantize() quantized_model_path = self.save_dir autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") - def test_qwen_moe_quant_infer(self): + def test_qwen_moe_quant_infer(self, dataloader): model_name = "/models/Qwen1.5-MoE-A2.7B" layer_config = { "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, @@ -152,7 +140,7 @@ def test_qwen_moe_quant_infer(self): iters=1, seqlen=2, nsamples=2, - dataset=self.llm_dataloader, + dataset=dataloader, layer_config=layer_config, ) quantized_model_path = self.save_dir @@ -165,7 +153,3 @@ def test_qwen_moe_quant_infer(self): 
print(result["results"]["piqa"]["acc,none"]) self.assertGreater(result["results"]["piqa"]["acc,none"], 0.7) shutil.rmtree(quantized_model_path, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_qbits.py b/test/test_cuda/test_qbits.py index d73d474d6..0ce3597db 100644 --- a/test/test_cuda/test_qbits.py +++ b/test/test_cuda/test_qbits.py @@ -1,48 +1,22 @@ import shutil -import sys -import unittest - -sys.path.insert(0, "../..") +import pytest from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundConfig from auto_round.testing_utils import require_gptqmodel, require_itrex +from ..helpers import model_infer + -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/models/opt-125m" self.save_folder = "./saved" - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("runs", ignore_errors=True) ## require torch 2.6 @@ -58,7 +32,7 @@ def test_load_gptq_model_8bits(self): quantization_config=quantization_config, ) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) @require_itrex def test_load_gptq_model_2bits(self): @@ -72,7 +46,7 @@ def test_load_gptq_model_2bits(self): quantization_config=quantization_config, ) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) @require_itrex def test_mixed_precision(self): diff --git a/test/test_cuda/test_scheme.py b/test/test_cuda/test_scheme.py index 1c603c7ed..06c5b27e0 100644 --- a/test/test_cuda/test_scheme.py +++ b/test/test_cuda/test_scheme.py @@ -1,22 +1,19 @@ import shutil -import sys -import unittest -from auto_round.schemes import QuantizationScheme - -sys.path.insert(0, "../..") +import pytest from auto_round import AutoRound +from auto_round.schemes import QuantizationScheme -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/models/opt-125m" self.save_folder = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -24,59 +21,59 @@ def tearDownClass(self): def test_gguf(self): ar = AutoRound("/models/Qwen3-0.6B", scheme="W2A16", nsamples=1, iters=1) ar.quantize_and_save(self.save_folder, format="gguf:q4_k_m") - self.assertEqual(ar.bits, 4) + assert ar.bits == 4 shutil.rmtree(self.save_folder, ignore_errors=True) def 
test_w4a16(self): ar = AutoRound(self.model_name, scheme="W4A16", nsamples=1, iters=1) - self.assertEqual(ar.bits, 4) + assert ar.bits == 4 ar.quantize() def test_w2a16(self): ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=1) - self.assertEqual(ar.bits, 2) + assert ar.bits == 2 ar.quantize() def test_mxfp4(self): ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=1) - self.assertEqual(ar.bits, 4) - self.assertEqual(ar.act_bits, 4) - self.assertEqual(ar.data_type, "mx_fp") - self.assertEqual(ar.act_data_type, "mx_fp_rceil") + assert ar.bits == 4 + assert ar.act_bits == 4 + assert ar.data_type == "mx_fp" + assert ar.act_data_type == "mx_fp_rceil" ar.quantize() def test_fp8_static(self): ar = AutoRound(self.model_name, scheme="FP8_STATIC", nsamples=1, iters=1) - self.assertEqual(ar.bits, 8) - self.assertEqual(ar.act_bits, 8) - self.assertEqual(ar.data_type, "fp") - self.assertEqual(ar.act_data_type, "fp") - self.assertEqual(ar.group_size, -1) - self.assertEqual(ar.act_dynamic, False) + assert ar.bits == 8 + assert ar.act_bits == 8 + assert ar.data_type == "fp" + assert ar.act_data_type == "fp" + assert ar.group_size == -1 + assert ar.act_dynamic == False ar.quantize() ## RTN tests def test_w2a16_rtn(self): ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=0) - self.assertEqual(ar.bits, 2) + assert ar.bits == 2 ar.quantize() def test_mxfp4_rtn(self): ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=0) - self.assertEqual(ar.bits, 4) - self.assertEqual(ar.act_bits, 4) - self.assertEqual(ar.data_type, "mx_fp") - self.assertEqual(ar.act_data_type, "mx_fp_rceil") + assert ar.bits == 4 + assert ar.act_bits == 4 + assert ar.data_type == "mx_fp" + assert ar.act_data_type == "mx_fp_rceil" ar.quantize() def test_fp8_static_rtn(self): ar = AutoRound(self.model_name, scheme="FP8_STATIC", nsamples=1, iters=0) - self.assertEqual(ar.bits, 8) - self.assertEqual(ar.act_bits, 8) - self.assertEqual(ar.data_type, "fp") - self.assertEqual(ar.act_data_type, "fp") - self.assertEqual(ar.group_size, -1) - self.assertEqual(ar.act_dynamic, False) + assert ar.bits == 8 + assert ar.act_bits == 8 + assert ar.data_type == "fp" + assert ar.act_data_type == "fp" + assert ar.group_size == -1 + assert ar.act_dynamic == False ar.quantize() def test_scheme_in_layer_config(self): @@ -90,14 +87,10 @@ def test_scheme_in_layer_config(self): ar.quantize() for n, m in ar.model.named_modules(): if n == "model.decoder.layers.2.self_attn.q_proj": - self.assertEqual(m.bits, 2) + assert m.bits == 2 if n == "model.decoder.layers.2.self_attn.k_proj": - self.assertEqual(m.bits, 2) + assert m.bits == 2 if n == "model.decoder.layers.3.self_attn.v_proj": - self.assertEqual(m.bits, 8) + assert m.bits == 8 if n == "model.decoder.layers.4.self_attn.k_proj": - self.assertEqual(m.group_size, 64) - - -if __name__ == "__main__": - unittest.main() + assert m.group_size == 64 diff --git a/test/test_cuda/test_support_vlms.py b/test/test_cuda/test_support_vlms.py index 5a2759021..15c86363b 100644 --- a/test/test_cuda/test_support_vlms.py +++ b/test/test_cuda/test_support_vlms.py @@ -1,10 +1,8 @@ import os import shutil import sys -import unittest - -sys.path.insert(0, "../..") +import pytest import requests from PIL import Image @@ -12,15 +10,15 @@ from auto_round.testing_utils import require_gptqmodel, require_package_version_ut, require_vlm_env -class TestSupportVLMS(unittest.TestCase): +class TestSupportVLMS: @classmethod - def setUpClass(self): + def setup_class(self): self.save_dir = 
os.path.join(os.path.dirname(__file__), "ut_saved") self.python_path = sys.executable self.device = 0 @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_dir, ignore_errors=True) @require_gptqmodel @@ -192,7 +190,3 @@ def test_granite_vision(self): f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}" ) self.assertFalse(res > 0 or res == -1, msg="granite-vision-3.2-2b tuning fail") - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_torch_backend.py b/test/test_cuda/test_torch_backend.py index 3f7cb4141..495da24e3 100644 --- a/test/test_cuda/test_torch_backend.py +++ b/test/test_cuda/test_torch_backend.py @@ -1,12 +1,6 @@ import shutil -import sys -import unittest import pytest - -sys.path.insert(0, "../..") - - import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -14,56 +8,22 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel +from ..helpers import model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - -class TestAutoRoundTorchBackend(unittest.TestCase): +class TestAutoRoundTorchBackend: @classmethod - def setUpClass(self): + def setup_class(self): self.model_name = "/models/opt-125m" self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_torch_4bits_asym(self): + def test_torch_4bits_asym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False @@ -75,7 +35,7 @@ def test_torch_4bits_asym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") @@ -86,7 +46,7 @@ def test_torch_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) @@ -97,14 +57,14 @@ def test_torch_4bits_asym(self): 
) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) - def test_torch_4bits_sym(self): + def test_torch_4bits_sym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -116,7 +76,7 @@ def test_torch_4bits_sym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model @@ -127,13 +87,9 @@ def test_torch_4bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28) torch.cuda.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_transformers.py b/test/test_cuda/test_transformers.py index 6f953339d..0e43a7e70 100644 --- a/test/test_cuda/test_transformers.py +++ b/test/test_cuda/test_transformers.py @@ -14,8 +14,8 @@ import gc import os import tempfile -import unittest +import pytest from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from transformers.testing_utils import ( require_accelerate, @@ -34,7 +34,7 @@ # @slow @require_torch_gpu @require_accelerate -class AutoRoundTest(unittest.TestCase): +class AutoRoundTest: model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc" input_text = "There is a girl who likes adventure," EXPECTED_OUTPUTS = set() @@ -53,7 +53,7 @@ class AutoRoundTest(unittest.TestCase): # called only once for all test in this class @classmethod - def setUpClass(cls): + def setup_class(cls): """ Setup quantized model """ @@ -203,7 +203,3 @@ def test_mixed_bits(self): text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) tokenizer.decode(model.generate(**inputs, max_new_tokens=5)[0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_triton_backend.py b/test/test_cuda/test_triton_backend.py index 7cbc8719d..38958014b 100644 --- a/test/test_cuda/test_triton_backend.py +++ b/test/test_cuda/test_triton_backend.py @@ -1,8 +1,6 @@ import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -10,56 +8,22 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_greater_than_050 +from ..helpers import model_infer -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRoundTritonBackend(unittest.TestCase): +class TestAutoRoundTritonBackend: @classmethod - def 
setUpClass(self): + def setup_class(self): self.model_name = "/models/opt-125m" self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() - - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_greater_than_050 - def test_tritonv2_4bits_asym(self): + def test_tritonv2_4bits_asym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False @@ -71,7 +35,7 @@ def test_tritonv2_4bits_asym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") @@ -82,7 +46,7 @@ def test_tritonv2_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.34) @@ -93,7 +57,7 @@ def test_tritonv2_4bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.34) @@ -115,7 +79,7 @@ def test_tritonv2_2bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.19) @@ -126,7 +90,7 @@ def test_tritonv2_2bits_asym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.19) @@ -134,7 +98,7 @@ def test_tritonv2_2bits_asym(self): shutil.rmtree("./saved", ignore_errors=True) @require_greater_than_050 - def 
test_tritonv2_4bits_sym(self): + def test_tritonv2_4bits_sym(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True @@ -146,7 +110,7 @@ def test_tritonv2_4bits_sym(self): sym=sym, iters=1, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = self.save_folder autoround.quantize_and_save(output_dir=quantized_model_path) @@ -157,7 +121,7 @@ def test_tritonv2_4bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.26) @@ -168,7 +132,7 @@ def test_tritonv2_4bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.26) @@ -191,7 +155,7 @@ def test_tritonv2_8bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) @@ -202,7 +166,7 @@ def test_tritonv2_8bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) @@ -230,7 +194,7 @@ def test_tritonv2_2bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.18) @@ -241,13 +205,9 @@ def test_tritonv2_2bits_sym(self): ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.18) torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_cuda/test_vlms.py b/test/test_cuda/test_vlms.py index d06c48ff5..bfc7cf52c 100644 --- a/test/test_cuda/test_vlms.py +++ b/test/test_cuda/test_vlms.py @@ -2,26 +2,22 @@ import os import re import shutil -import sys -import unittest +import pytest import requests - -sys.path.insert(0, "../..") - from PIL import Image from auto_round import AutoRoundConfig from auto_round.testing_utils import 
require_gptqmodel, require_optimum, require_vlm_env -class TestAutoRound(unittest.TestCase): +class TestAutoRound: @classmethod - def setUpClass(self): + def setup_class(self): self.save_dir = "./saved" @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree(self.save_dir, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @@ -155,7 +151,3 @@ def test_mllm_detect(self): self.assertFalse(is_mllm_model(model_name)) model, _ = llm_load_model(model_name) self.assertFalse(is_mllm_model(model)) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_hpu/__init__.py b/test/test_hpu/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_hpu/test_auto_round.py b/test/test_hpu/test_auto_round.py index 2bb7983e5..eb6066982 100644 --- a/test/test_hpu/test_auto_round.py +++ b/test/test_hpu/test_auto_round.py @@ -1,9 +1,10 @@ import pytest import torch -from _test_helpers import is_pytest_mode_compile, is_pytest_mode_lazy from auto_round.utils import is_hpex_available +from ..helpers import is_pytest_mode_compile, is_pytest_mode_lazy + def run_opt_125m_on_hpu(): from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/test/test_hpu/test_inference.py b/test/test_hpu/test_inference.py index e0a0ef321..95c680c2d 100644 --- a/test/test_hpu/test_inference.py +++ b/test/test_hpu/test_inference.py @@ -1,23 +1,12 @@ import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - def is_hpex_available(): try: import habana_frameworks.torch.core as htcore # pylint: disable=E0401 @@ -28,16 +17,15 @@ def is_hpex_available(): # TODO: This test case is temporarily commented out since it not tested for a long time. We need to add it back and change it into pytest format. 
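To make the TODO above actionable, here is a minimal, hypothetical pytest-style sketch of how the commented-out case below could be restored. It assumes this module's `is_hpex_available()` helper and the shared `dataloader` fixture from `test/fixtures.py`, and only mirrors the AutoRound call pattern already used elsewhere in these tests.

```python
import pytest
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round import AutoRound


@pytest.mark.skipif(not is_hpex_available(), reason="HPEX is required to run this case on HPU")
def test_w4a16_quantize_on_hpu(dataloader):
    # Small settings (iters=2, seqlen=2) keep the run fast, matching the original case below.
    model_name = "facebook/opt-125m"
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    autoround = AutoRound(
        model,
        tokenizer,
        bits=4,
        group_size=128,
        sym=True,
        iters=2,
        seqlen=2,
        dataset=dataloader,
    )
    autoround.quantize()
```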
-# class TestAutoRound(unittest.TestCase): +# class TestAutoRound: # @classmethod -# def setUpClass(self): +# def setup_class(self): # model_name = "facebook/opt-125m" # self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) # self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) -# self.llm_dataloader = LLMDataLoader() # @classmethod -# def tearDownClass(self): +# def teardown_class(self): # shutil.rmtree("./saved", ignore_errors=True) # shutil.rmtree("runs", ignore_errors=True) @@ -57,7 +45,7 @@ def is_hpex_available(): # sym=sym, # iters=2, # seqlen=2, -# dataset=self.llm_dataloader, +# dataset=dataloader, # ) # autoround.quantize() # quantized_model_path = "./saved" @@ -86,7 +74,7 @@ def is_hpex_available(): # sym=sym, # iters=2, # seqlen=2, -# dataset=self.llm_dataloader, +# dataset=dataloader, # ) # autoround.quantize() # quantized_model_path = "./saved" diff --git a/test/test_xpu/__init__.py b/test/test_xpu/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py index 8052a8af0..b9894cecf 100644 --- a/test/test_xpu/test_autoround.py +++ b/test/test_xpu/test_autoround.py @@ -1,9 +1,7 @@ import copy import shutil -import sys -import unittest -sys.path.insert(0, "../..") +import pytest import torch import transformers from transformers import AutoModelForCausalLM, AutoTokenizer @@ -11,23 +9,13 @@ from auto_round import AutoRound, AutoRoundConfig -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(3): - yield torch.ones([1, 10], dtype=torch.long) - - -class TestAutoRoundXPU(unittest.TestCase): +class TestAutoRoundXPU: @classmethod - def setUpClass(self): + def setup_class(self): - self.llm_dataloader = LLMDataLoader() @classmethod - def tearDownClass(self): + def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) pass @@ -48,7 +36,7 @@ def test_gptq_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path) @@ -80,7 +68,7 @@ def test_awq_format(self): sym=sym, iters=2, seqlen=2, - dataset=self.llm_dataloader, + dataset=dataloader, ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") @@ -97,7 +85,3 @@ def test_awq_format(self): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) print(res) assert "!!!" not in res - - -if __name__ == "__main__": - unittest.main() From 25694c0ea722cbd920085b149ffb5e10c75b0773 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 18 Dec 2025 04:11:18 -0500 Subject: [PATCH 02/24] add readme Signed-off-by: He, Xin3 --- test/README.md | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/test/README.md b/test/README.md index e69de29bb..f5bc6ddba 100644 --- a/test/README.md +++ b/test/README.md @@ -0,0 +1,46 @@ +# Unit Test (UT) Guide + +This project uses `pytest` for unit testing. All test cases are under the `test/` directory. Below is a simple guide for new users to write and run UTs: + +## 1. Environment Setup +- Recommended Python 3.8 or above. +- Install dependencies: + ```sh + pip install -r ../requirements.txt + pip install pytest + ``` + +## 2. 
Test Structure +- Place your test files in the `test/` directory, and name them starting with `test_`. +- You can refer to existing `test_*.py` files. +- Common fixtures (such as `tiny_opt_model`, `opt_model`, `opt_tokenizer`, `dataloader`) and helper functions (such as `model_infer`) are defined in `confest.py` and `helpers.py` and can be imported directly. +- Example: + ```python + # test_example.py + from ..helper import model_infer + + def test_model_infer(tiny_opt_model, opt_tokenizer): + result = model_infer(tiny_opt_model, opt_tokenizer, input_text="hello world") + assert result is not None + ``` + +## 3. Running Tests +- In the `test/` directory, run: + ```sh + pytest + ``` +- You can specify a single file or test case: + ```sh + pytest test_xxx.py + pytest -k "test_func_name" + ``` + +## 4. Debugging Tips +- `confest.py` adds the parent directory to `sys.path`, so you can debug without installing the local package. +- You can directly import project source code in your test cases. + +## 5. Reference +- Fixtures are defined in `confest.py` and `fixtures.py` +- Helper functions are in `helpers.py` + +If you have any questions, feel free to open an issue. From 0f20e5ffc028cd60125358fc3b2a44e68223728d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 18 Dec 2025 09:17:55 +0000 Subject: [PATCH 03/24] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- test/helpers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/helpers.py b/test/helpers.py index 97870eba6..907507e45 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -2,7 +2,6 @@ import pytest - # Automatic choose local path or model name. opt_name_or_path = "/tf_dataset/auto_round/models/facebook/opt-125m" if not os.path.exists(opt_name_or_path): From 28ccfae32d852449754339c8de83b3b5cca27e11 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 18 Dec 2025 04:22:01 -0500 Subject: [PATCH 04/24] add get_model_path func Signed-off-by: n1ck-guo --- test/helpers.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/helpers.py b/test/helpers.py index 907507e45..e7c40ffe1 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -20,6 +20,18 @@ gptj_name_or_path = "hf-internal-testing/tiny-random-GPTJForCausalLM" +def get_model_path(model_name: str) -> str: + ut_path = f"/tf_dataset/auto_round/models/{model_name}" + local_path = f"/models/{model_name.split('/')[-1]}" + + if os.path.exists(ut_path): + return ut_path + elif os.path.exists(local_path): + return local_path + else: + return model_name + + # HPU mode checking def is_pytest_mode_compile(): return pytest.mode == "compile" From b9a177d01711d3cc91a40e9bc05ea0ce32740743 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 18 Dec 2025 04:39:00 -0500 Subject: [PATCH 05/24] add more fixtures Signed-off-by: He, Xin3 --- test/fixtures.py | 84 +++++++++++++++++++++++++++++++++++------------- test/helpers.py | 47 ++++++++++++++++----------- 2 files changed, 90 insertions(+), 41 deletions(-) diff --git a/test/fixtures.py b/test/fixtures.py index 615e579a8..a96010060 100644 --- a/test/fixtures.py +++ b/test/fixtures.py @@ -4,7 +4,13 @@ import torch import transformers -from .helpers import opt_name_or_path +from .helpers import ( + opt_name_or_path, + qwen_name_or_path, + lamini_name_or_path, + gptj_name_or_path, + get_tiny_model, +) class DataLoader: @@ -16,45 +22,77 @@ def __iter__(self): yield torch.ones([1, 10], dtype=torch.long) +# Create tiny model 
path fixtures for testing @pytest.fixture(scope="session") def tiny_opt_model_path(): - tiny_opt_model_path = "./tmp_tiny_opt_model_path" - model = transformers.AutoModelForCausalLM.from_pretrained(opt_name_or_path, dtype="auto", trust_remote_code=True) - model.config.num_hidden_layers = 3 - setattr(model.model.decoder, "layers", model.model.decoder.layers[:3]) - tokenizer = transformers.AutoTokenizer.from_pretrained(opt_name_or_path, trust_remote_code=True) - model.save_pretrained(tiny_opt_model_path) - tokenizer.save_pretrained(tiny_opt_model_path) - print("[Fixture]: built tiny model path for testing in session") - yield tiny_opt_model_path - shutil.rmtree(tiny_opt_model_path) + model_name_or_path = opt_name_or_path + tiny_model_path = "./tmp_tiny_opt_model_path" + model = get_tiny_model(model_name_or_path, num_layers=3) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) + model.save_pretrained(tiny_model_path) + tokenizer.save_pretrained(tiny_model_path) + print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") + yield tiny_model_path + shutil.rmtree(tiny_model_path) -@pytest.fixture(scope="function") -def tiny_opt_model(): - model = transformers.AutoModelForCausalLM.from_pretrained(opt_name_or_path, dtype="auto", trust_remote_code=True) - model.config.num_hidden_layers = 3 - setattr(model.model.decoder, "layers", model.model.decoder.layers[:3]) - return model +@pytest.fixture(scope="session") +def tiny_qwen_model_path(): + model_name_or_path = qwen_name_or_path + tiny_model_path = "./tmp_tiny_qwen_model_path" + model = get_tiny_model(model_name_or_path, num_layers=3) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) + model.save_pretrained(tiny_model_path) + tokenizer.save_pretrained(tiny_model_path) + print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") + yield tiny_model_path + shutil.rmtree(tiny_model_path) +@pytest.fixture(scope="session") +def tiny_lamini_model_path(): + model_name_or_path = lamini_name_or_path + tiny_model_path = "./tmp_tiny_lamini_model_path" + model = get_tiny_model(model_name_or_path, num_layers=3) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) + model.save_pretrained(tiny_model_path) + tokenizer.save_pretrained(tiny_model_path) + print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_gptj_model_path(): + model_name_or_path = gptj_name_or_path + tiny_model_path = "./tmp_tiny_gptj_model_path" + model = get_tiny_model(model_name_or_path, num_layers=3) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) + model.save_pretrained(tiny_model_path) + tokenizer.save_pretrained(tiny_model_path) + print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +# Create objective fixtures for testing @pytest.fixture(scope="function") def tiny_opt_model(): - model = transformers.AutoModelForCausalLM.from_pretrained(opt_name_or_path, dtype="auto", trust_remote_code=True) - model.config.num_hidden_layers = 3 - setattr(model.model.decoder, "layers", model.model.decoder.layers[:3]) - return model + model_name_or_path = opt_name_or_path + return get_tiny_model(model_name_or_path, num_layers=3) 
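For reference, a hypothetical test consumes these fixtures simply by naming them as parameters and letting pytest inject them; the sketch below combines the `tiny_opt_model`, `opt_tokenizer`, and `dataloader` fixtures with the AutoRound call pattern used throughout this test suite.

```python
from auto_round import AutoRound


def test_tiny_opt_quantize_sketch(tiny_opt_model, opt_tokenizer, dataloader):
    # pytest resolves all three arguments from the fixtures defined in this file.
    autoround = AutoRound(
        tiny_opt_model,
        opt_tokenizer,
        bits=4,
        group_size=128,
        sym=True,
        iters=2,
        seqlen=2,
        dataset=dataloader,
    )
    autoround.quantize()
```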
@pytest.fixture(scope="function") def opt_model(): - model = transformers.AutoModelForCausalLM.from_pretrained(opt_name_or_path, dtype="auto", trust_remote_code=True) + model_name_or_path = opt_name_or_path + model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="auto", trust_remote_code=True) return model @pytest.fixture(scope="session") def opt_tokenizer(): - tokenizer = transformers.AutoTokenizer.from_pretrained(opt_name_or_path, trust_remote_code=True) + model_name_or_path = opt_name_or_path + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) return tokenizer diff --git a/test/helpers.py b/test/helpers.py index e7c40ffe1..2a78551fc 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -1,25 +1,10 @@ import os - import pytest - -# Automatic choose local path or model name. -opt_name_or_path = "/tf_dataset/auto_round/models/facebook/opt-125m" -if not os.path.exists(opt_name_or_path): - opt_name_or_path = "facebook/opt-125m" - -qwen_name_or_path = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" -if not os.path.exists(qwen_name_or_path): - qwen_name_or_path = "Qwen/Qwen3-0.6B" - -lamini_name_or_path = "/tf_dataset/auto_round/models/MBZUAI/LaMini-GPT-124M" -if not os.path.exists(lamini_name_or_path): - lamini_name_or_path = "MBZUAI/LaMini-GPT-124M" - -gptj_name_or_path = "/tf_dataset/auto_round/models/hf-internal-testing/tiny-random-GPTJForCausalLM" -if not os.path.exists(gptj_name_or_path): - gptj_name_or_path = "hf-internal-testing/tiny-random-GPTJForCausalLM" +import torch +import transformers +# Automatic choose local path or model name. def get_model_path(model_name: str) -> str: ut_path = f"/tf_dataset/auto_round/models/{model_name}" local_path = f"/models/{model_name.split('/')[-1]}" @@ -31,6 +16,32 @@ def get_model_path(model_name: str) -> str: else: return model_name +opt_name_or_path = get_model_path("facebook/opt-125m") +qwen_name_or_path = get_model_path("Qwen/Qwen3-0.6B") +lamini_name_or_path = get_model_path("MBZUAI/LaMini-GPT-124M") +gptj_name_or_path = get_model_path("hf-internal-testing/tiny-random-GPTJForCausalLM") + + +# Slice model into tiny model for speedup +def get_tiny_model(model_name_or_path, num_layers=3): + model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="auto", trust_remote_code=True) + + if hasattr(model.config, "num_hidden_layers"): + model.config.num_hidden_layers = num_layers + + def slice_layers(module): + for name, child in module.named_children(): + if isinstance(child, torch.nn.ModuleList) and len(child) > num_layers: + new_layers = torch.nn.ModuleList(child[:num_layers]) + setattr(module, name, new_layers) + return True + if slice_layers(child): + return True + return False + + slice_layers(model) + return model + # HPU mode checking def is_pytest_mode_compile(): From 07b741e2034b7d625af96c03396caeebf9f42b8f Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 18 Dec 2025 20:30:29 -0500 Subject: [PATCH 06/24] fix bug Signed-off-by: He, Xin3 --- test/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index ebe377e48..bdaf69b8f 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -6,10 +6,10 @@ from .fixtures import ( dataloader, - model, + opt_model, tiny_opt_model, tiny_opt_model_path, - tokenizer, + opt_tokenizer, ) from .helpers import model_infer From 03917948b05e0537badcb5e70a918cdef5180086 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 19 Dec 2025 01:31:39 +0000 Subject: [PATCH 07/24] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- test/conftest.py | 2 +- test/fixtures.py | 6 +++--- test/helpers.py | 4 +++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index bdaf69b8f..4b0f4709f 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -7,9 +7,9 @@ from .fixtures import ( dataloader, opt_model, + opt_tokenizer, tiny_opt_model, tiny_opt_model_path, - opt_tokenizer, ) from .helpers import model_infer diff --git a/test/fixtures.py b/test/fixtures.py index a96010060..9f46196c2 100644 --- a/test/fixtures.py +++ b/test/fixtures.py @@ -5,11 +5,11 @@ import transformers from .helpers import ( + get_tiny_model, + gptj_name_or_path, + lamini_name_or_path, opt_name_or_path, qwen_name_or_path, - lamini_name_or_path, - gptj_name_or_path, - get_tiny_model, ) diff --git a/test/helpers.py b/test/helpers.py index 2a78551fc..a82e1ed28 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -1,4 +1,5 @@ import os + import pytest import torch import transformers @@ -16,6 +17,7 @@ def get_model_path(model_name: str) -> str: else: return model_name + opt_name_or_path = get_model_path("facebook/opt-125m") qwen_name_or_path = get_model_path("Qwen/Qwen3-0.6B") lamini_name_or_path = get_model_path("MBZUAI/LaMini-GPT-124M") @@ -28,7 +30,7 @@ def get_tiny_model(model_name_or_path, num_layers=3): if hasattr(model.config, "num_hidden_layers"): model.config.num_hidden_layers = num_layers - + def slice_layers(module): for name, child in module.named_children(): if isinstance(child, torch.nn.ModuleList) and len(child) > num_layers: From d35b14834e5cea1ebd943081d11fd34169b5f27a Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Thu, 18 Dec 2025 21:35:24 -0500 Subject: [PATCH 08/24] fix few bugs Signed-off-by: n1ck-guo --- test/conftest.py | 3 +++ test/fixtures.py | 13 +++++++--- test/helpers.py | 2 ++ test/test_cuda/test_2_3bits.py | 46 ++++++++++++++++------------------ test/test_cuda/test_alg_ext.py | 42 +++++++++++++++++++------------ 5 files changed, 62 insertions(+), 44 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index 4b0f4709f..109b504e8 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -8,8 +8,11 @@ dataloader, opt_model, opt_tokenizer, + tiny_gptj_model_path, + tiny_lamini_model_path, tiny_opt_model, tiny_opt_model_path, + tiny_qwen_model_path, ) from .helpers import model_infer diff --git a/test/fixtures.py b/test/fixtures.py index 9f46196c2..005f321d0 100644 --- a/test/fixtures.py +++ b/test/fixtures.py @@ -1,3 +1,4 @@ +import os import shutil import pytest @@ -26,7 +27,8 @@ def __iter__(self): @pytest.fixture(scope="session") def tiny_opt_model_path(): model_name_or_path = opt_name_or_path - tiny_model_path = "./tmp_tiny_opt_model_path" + test_path = os.path.dirname(__file__) + tiny_model_path = os.path.join(test_path, "tmp_tiny_opt_model_path") model = get_tiny_model(model_name_or_path, num_layers=3) tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) model.save_pretrained(tiny_model_path) @@ -39,7 +41,8 @@ def tiny_opt_model_path(): @pytest.fixture(scope="session") def tiny_qwen_model_path(): model_name_or_path = qwen_name_or_path - tiny_model_path = "./tmp_tiny_qwen_model_path" + test_path = os.path.dirname(__file__) + tiny_model_path = os.path.join(test_path, "tmp_tiny_qwen_model_path") model = 
get_tiny_model(model_name_or_path, num_layers=3) tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) model.save_pretrained(tiny_model_path) @@ -52,7 +55,8 @@ def tiny_qwen_model_path(): @pytest.fixture(scope="session") def tiny_lamini_model_path(): model_name_or_path = lamini_name_or_path - tiny_model_path = "./tmp_tiny_lamini_model_path" + test_path = os.path.dirname(__file__) + tiny_model_path = os.path.join(test_path, "tmp_tiny_lamini_model_path") model = get_tiny_model(model_name_or_path, num_layers=3) tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) model.save_pretrained(tiny_model_path) @@ -65,7 +69,8 @@ def tiny_lamini_model_path(): @pytest.fixture(scope="session") def tiny_gptj_model_path(): model_name_or_path = gptj_name_or_path - tiny_model_path = "./tmp_tiny_gptj_model_path" + test_path = os.path.dirname(__file__) + tiny_model_path = os.path.join(test_path, "tmp_tiny_gptj_model_path") model = get_tiny_model(model_name_or_path, num_layers=3) tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) model.save_pretrained(tiny_model_path) diff --git a/test/helpers.py b/test/helpers.py index a82e1ed28..d67f85599 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -42,6 +42,8 @@ def slice_layers(module): return False slice_layers(model) + if hasattr(model.config, "layer_types"): + model.config.layer_types = model.config.layer_types[:num_layers] return model diff --git a/test/test_cuda/test_2_3bits.py b/test/test_cuda/test_2_3bits.py index f12bf240c..1b305f494 100644 --- a/test/test_cuda/test_2_3bits.py +++ b/test/test_cuda/test_2_3bits.py @@ -12,7 +12,7 @@ from auto_round.eval.evaluation import simple_evaluate, simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_greater_than_050, require_greater_than_051 -from ..helpers import model_infer +from ..helpers import get_model_path, model_infer def get_accuracy(data): @@ -26,22 +26,26 @@ def get_accuracy(data): class TestAutoRound: - @classmethod - def setup_class(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" + save_dir = "./saved" + tasks = "lambada_openai" - @classmethod - def teardown_class(self): + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_greater_than_051 def test_3bits_autoround(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) - autoround = AutoRound(model, tokenizer, bits=3) + model_name = get_model_path("facebook/opt-125m") + autoround = AutoRound(model_name, bits=3) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model @@ -54,15 +58,13 @@ def test_3bits_autoround(self): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.3) ## 
0.3130 + assert result["results"]["lambada_openai"]["acc,none"] > 0.3 @require_greater_than_051 def test_3bits_asym_autoround(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + model_name = get_model_path("facebook/opt-125m") bits, sym = 3, False - autoround = AutoRound(model, tokenizer, bits=bits, sym=sym) + autoround = AutoRound(model_name, bits=bits, sym=sym) autoround.quantize_and_save(self.save_dir, format="auto_round", inplace=False) model_args = f"pretrained={self.save_dir}" res = simple_evaluate( @@ -80,10 +82,8 @@ def test_3bits_asym_autoround(self): @require_greater_than_050 def test_norm_bias_tuning(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) - autoround = AutoRound(model, tokenizer, bits=2, group_size=64, enable_norm_bias_tuning=True) + model_name = get_model_path("facebook/opt-125m") + autoround = AutoRound(model_name, bits=2, group_size=64, enable_norm_bias_tuning=True) autoround.quantize() ##test auto_round format @@ -97,10 +97,8 @@ def test_norm_bias_tuning(self): @require_greater_than_050 def test_2bits_autoround(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) - autoround = AutoRound(model, tokenizer, bits=2, group_size=64) + model_name = get_model_path("facebook/opt-125m") + autoround = AutoRound(model_name, bits=2, group_size=64) autoround.quantize() ##test auto_round format diff --git a/test/test_cuda/test_alg_ext.py b/test/test_cuda/test_alg_ext.py index 499213c74..e13bfac4a 100644 --- a/test/test_cuda/test_alg_ext.py +++ b/test/test_cuda/test_alg_ext.py @@ -8,21 +8,27 @@ from auto_round import AutoRound, AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate_user_model +from ..helpers import get_model_path + class TestAlgExt: + save_folder = "./saved" - @classmethod - def setup_class(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - @classmethod - def teardown_class(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) def test_2bits(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") ar = AutoRound(model=model_name, bits=2, group_size=64, enable_alg_ext=True) ar.quantize_and_save(self.save_folder) model = AutoModelForCausalLM.from_pretrained( @@ -34,35 +40,39 @@ def test_2bits(self): result = simple_evaluate_user_model(model, tokenizer, batch_size=64, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) # wo alg ext 0.2078, with 0.2371 - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.22) + assert result["results"]["lambada_openai"]["acc,none"] > 0.22 shutil.rmtree(self.save_folder, ignore_errors=True) - def test_cli(self): + def test_cli(self, tiny_opt_model_path): 
import os - model_name = "/models/opt-125m" python_path = sys.executable res = os.system( - f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {model_name} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits" + f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits --nsampes 1 --seqlen 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {model_name} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile" + f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile --nsampes 1 --seqlen 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" - def test_all_support_dtype(self): + def test_all_support_dtype(self, tiny_qwen_model_path): from auto_round.auto_scheme import AutoScheme - model_name = "/models/Qwen3-0.6B" for scheme in ["MXFP4", "NVFP4", "W2A16G64", "gguf:q2_k_s,gguf:q4_k_s"]: avg_bits = 2 if scheme == "W2A16G64" else 4 scheme = AutoScheme(options=scheme, avg_bits=avg_bits, ignore_scale_zp_bits=True) ar = AutoRound( - model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True + tiny_qwen_model_path, + scheme=scheme, + iters=1, + nsamples=1, + seqlen=32, + enable_alg_ext=True, + enable_torch_compile=True, ) ar.quantize() From b15adc8dbf4e82090cd39cc1a5406a84388e964f Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Fri, 19 Dec 2025 02:34:33 -0500 Subject: [PATCH 09/24] use get_model_path and remove self.assertTrue/False Signed-off-by: He, Xin3 --- test/README.md | 2 +- test/conftest.py | 12 +- test/fixtures.py | 63 ++++---- test/helpers.py | 24 ++- test/test_ark/test_model.py | 16 +- test/test_cpu/requirements.txt | 1 - test/test_cpu/test_act_quantization.py | 18 +-- test/test_cpu/test_alg_ext.py | 14 +- test/test_cpu/test_auto_scheme.py | 4 +- test/test_cpu/test_autoopt.py | 4 +- test/test_cpu/test_autoround.py | 149 ++++++++---------- test/test_cpu/test_autoround_acc.py | 2 +- .../test_autoround_export_to_itrex.py | 28 ++-- test/test_cpu/test_block_names.py | 24 ++- test/test_cpu/test_calib_dataset.py | 6 +- test/test_cpu/test_cli_usage.py | 6 +- test/test_cpu/test_export.py | 29 ++-- test/test_cpu/test_generation.py | 4 +- test/test_cpu/test_gguf_format.py | 44 +++--- test/test_cpu/test_gpt_oss.py | 4 +- test/test_cpu/test_llmc_integration.py | 2 +- test/test_cpu/test_llmcompressor.py | 16 +- test/test_cpu/test_load_awq_gptq.py | 8 +- test/test_cpu/test_mix_bits.py | 7 +- test/test_cpu/test_mllm.py | 14 +- test/test_cpu/test_model_scope.py | 6 +- test/test_cpu/test_moe_model.py | 6 +- test/test_cpu/test_mxfp_nvfp.py | 22 +-- test/test_cpu/test_mxfp_save_load.py | 4 +- test/test_cpu/test_scheme.py | 20 +-- test/test_cpu/test_torch_backend.py | 6 +- test/test_cuda/requirements.txt | 1 - test/test_cuda/test_auto_round_format.py | 4 +- test/test_cuda/test_auto_scheme.py | 14 +- test/test_cuda/test_diffusion.py | 8 +- test/test_cuda/test_exllamav2_backend.py | 8 +- test/test_cuda/test_fp8_input.py | 8 +- test/test_cuda/test_get_block_name.py | 2 +- 
test/test_cuda/test_gguf.py | 26 +-- test/test_cuda/test_main_func.py | 6 +- test/test_cuda/test_marlin_backend.py | 12 +- test/test_cuda/test_mix_bits.py | 3 +- test/test_cuda/test_mxfp_nvfp.py | 2 +- test/test_cuda/test_scheme.py | 4 +- test/test_cuda/test_support_vlms.py | 10 +- test/test_cuda/test_torch_backend.py | 6 +- test/test_cuda/test_transformers.py | 12 +- test/test_cuda/test_triton_backend.py | 20 +-- test/test_cuda/test_vlms.py | 18 +-- test/test_hpu/test_auto_round.py | 6 +- test/test_xpu/test_autoround.py | 12 +- 51 files changed, 380 insertions(+), 367 deletions(-) diff --git a/test/README.md b/test/README.md index f5bc6ddba..9ccca0017 100644 --- a/test/README.md +++ b/test/README.md @@ -17,7 +17,7 @@ This project uses `pytest` for unit testing. All test cases are under the `test/ - Example: ```python # test_example.py - from ..helper import model_infer + from ..helpers import model_infer def test_model_infer(tiny_opt_model, opt_tokenizer): result = model_infer(tiny_opt_model, opt_tokenizer, input_text="hello world") diff --git a/test/conftest.py b/test/conftest.py index 109b504e8..d21100824 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -4,17 +4,7 @@ import pytest -from .fixtures import ( - dataloader, - opt_model, - opt_tokenizer, - tiny_gptj_model_path, - tiny_lamini_model_path, - tiny_opt_model, - tiny_opt_model_path, - tiny_qwen_model_path, -) -from .helpers import model_infer +from .fixtures import * # Easy debugging without installing auto-round. sys.path.insert(0, "..") diff --git a/test/fixtures.py b/test/fixtures.py index 005f321d0..87d0a5f75 100644 --- a/test/fixtures.py +++ b/test/fixtures.py @@ -10,7 +10,9 @@ gptj_name_or_path, lamini_name_or_path, opt_name_or_path, + phi2_name_or_path, qwen_name_or_path, + save_tiny_model, ) @@ -27,13 +29,8 @@ def __iter__(self): @pytest.fixture(scope="session") def tiny_opt_model_path(): model_name_or_path = opt_name_or_path - test_path = os.path.dirname(__file__) - tiny_model_path = os.path.join(test_path, "tmp_tiny_opt_model_path") - model = get_tiny_model(model_name_or_path, num_layers=3) - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) - model.save_pretrained(tiny_model_path) - tokenizer.save_pretrained(tiny_model_path) - print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") + tiny_model_path = "./tmp_tiny_opt_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -41,13 +38,8 @@ def tiny_opt_model_path(): @pytest.fixture(scope="session") def tiny_qwen_model_path(): model_name_or_path = qwen_name_or_path - test_path = os.path.dirname(__file__) - tiny_model_path = os.path.join(test_path, "tmp_tiny_qwen_model_path") - model = get_tiny_model(model_name_or_path, num_layers=3) - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) - model.save_pretrained(tiny_model_path) - tokenizer.save_pretrained(tiny_model_path) - print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") + tiny_model_path = "./tmp_tiny_qwen_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -55,13 +47,8 @@ def tiny_qwen_model_path(): @pytest.fixture(scope="session") def tiny_lamini_model_path(): model_name_or_path = lamini_name_or_path - test_path = os.path.dirname(__file__) - tiny_model_path = os.path.join(test_path, 
"tmp_tiny_lamini_model_path") - model = get_tiny_model(model_name_or_path, num_layers=3) - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) - model.save_pretrained(tiny_model_path) - tokenizer.save_pretrained(tiny_model_path) - print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") + tiny_model_path = "./tmp_tiny_lamini_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -69,13 +56,17 @@ def tiny_lamini_model_path(): @pytest.fixture(scope="session") def tiny_gptj_model_path(): model_name_or_path = gptj_name_or_path - test_path = os.path.dirname(__file__) - tiny_model_path = os.path.join(test_path, "tmp_tiny_gptj_model_path") - model = get_tiny_model(model_name_or_path, num_layers=3) - tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) - model.save_pretrained(tiny_model_path) - tokenizer.save_pretrained(tiny_model_path) - print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") + tiny_model_path = "./tmp_tiny_gptj_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_phi2_model_path(): + model_name_or_path = phi2_name_or_path + tiny_model_path = "./tmp_tiny_phi2_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -84,7 +75,7 @@ def tiny_gptj_model_path(): @pytest.fixture(scope="function") def tiny_opt_model(): model_name_or_path = opt_name_or_path - return get_tiny_model(model_name_or_path, num_layers=3) + return get_tiny_model(model_name_or_path, num_layers=2) @pytest.fixture(scope="function") @@ -101,6 +92,20 @@ def opt_tokenizer(): return tokenizer +@pytest.fixture(scope="function") +def model(): + model_name_or_path = opt_name_or_path + model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="auto", trust_remote_code=True) + return model + + +@pytest.fixture(scope="session") +def tokenizer(): + model_name_or_path = opt_name_or_path + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) + return tokenizer + + @pytest.fixture(scope="session") def dataloader(): return DataLoader() diff --git a/test/helpers.py b/test/helpers.py index d67f85599..77c219452 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -22,14 +22,11 @@ def get_model_path(model_name: str) -> str: qwen_name_or_path = get_model_path("Qwen/Qwen3-0.6B") lamini_name_or_path = get_model_path("MBZUAI/LaMini-GPT-124M") gptj_name_or_path = get_model_path("hf-internal-testing/tiny-random-GPTJForCausalLM") +phi2_name_or_path = get_model_path("microsoft/phi-2") # Slice model into tiny model for speedup -def get_tiny_model(model_name_or_path, num_layers=3): - model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="auto", trust_remote_code=True) - - if hasattr(model.config, "num_hidden_layers"): - model.config.num_hidden_layers = num_layers +def get_tiny_model(model_name_or_path, num_layers=2): def slice_layers(module): for name, child in module.named_children(): @@ -41,12 +38,29 @@ def slice_layers(module): return True return False + model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="auto", trust_remote_code=True) slice_layers(model) + + 
if hasattr(model.config, "num_hidden_layers"): + model.config.num_hidden_layers = num_layers if hasattr(model.config, "layer_types"): model.config.layer_types = model.config.layer_types[:num_layers] + return model +# for fixture usage only +def save_tiny_model(model_name_or_path, tiny_model_path): + model = get_tiny_model(model_name_or_path, num_layers=2) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) + test_path = os.path.dirname(__file__) + tiny_model_path = os.path.join(test_path, tiny_model_path) + model.save_pretrained(tiny_model_path) + tokenizer.save_pretrained(tiny_model_path) + print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") + return tiny_model_path + + # HPU mode checking def is_pytest_mode_compile(): return pytest.mode == "compile" diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py index 622f4a6dd..b8dfdca5c 100644 --- a/test/test_ark/test_model.py +++ b/test/test_ark/test_model.py @@ -26,11 +26,11 @@ def setup_and_teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_torch_4bits_sym_cpu(self, model, tokenizer, dataloader): + def test_torch_4bits_sym_cpu(self, opt_model, opt_tokenizer, dataloader): bits, group_size, sym = 4, 32, True autoround = AutoRound( - model, - tokenizer, + opt_model, + opt_tokenizer, bits=bits, group_size=group_size, sym=sym, @@ -50,15 +50,15 @@ def test_torch_4bits_sym_cpu(self, model, tokenizer, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28) + assert result["results"]["lambada_openai"]["acc,none"] > 0.28 shutil.rmtree("./saved", ignore_errors=True) - def test_torch_4bits_sym_xpu(self, model, tokenizer, dataloader): + def test_torch_4bits_sym_xpu(self, opt_model, opt_tokenizer, dataloader): bits, group_size, sym = 4, 32, True autoround = AutoRound( - model, - tokenizer, + opt_model, + opt_tokenizer, bits=bits, group_size=group_size, sym=sym, @@ -78,6 +78,6 @@ def test_torch_4bits_sym_xpu(self, model, tokenizer, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28) + assert result["results"]["lambada_openai"]["acc,none"] > 0.28 torch.xpu.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) diff --git a/test/test_cpu/requirements.txt b/test/test_cpu/requirements.txt index 219189829..a54cc4e4e 100644 --- a/test/test_cpu/requirements.txt +++ b/test/test_cpu/requirements.txt @@ -3,7 +3,6 @@ modelscope gguf sentencepiece torchvision -parameterized pillow numba llmcompressor @ git+https://github.com/vllm-project/llm-compressor.git@main diff --git a/test/test_cpu/test_act_quantization.py b/test/test_cpu/test_act_quantization.py index 0483c027d..cd41c0985 100644 --- a/test/test_cpu/test_act_quantization.py +++ b/test/test_cpu/test_act_quantization.py @@ -24,11 +24,11 @@ def setup_and_teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_mx_fp4(self, tiny_opt_model, tokenizer, dataloader): + def test_mx_fp4(self, tiny_opt_model, opt_tokenizer, dataloader): bits, 
group_size, sym = 4, 128, True autoround = AutoRound( tiny_opt_model, - tokenizer, + opt_tokenizer, bits=bits, group_size=group_size, sym=sym, @@ -40,11 +40,11 @@ def test_mx_fp4(self, tiny_opt_model, tokenizer, dataloader): ) autoround.quantize() - def test_wint4fp8_dynamic(self, tiny_opt_model, tokenizer, dataloader): + def test_wint4fp8_dynamic(self, tiny_opt_model, opt_tokenizer, dataloader): bits, group_size = 4, 128 autoround = AutoRound( tiny_opt_model, - tokenizer, + opt_tokenizer, bits=bits, group_size=group_size, iters=2, @@ -56,11 +56,11 @@ def test_wint4fp8_dynamic(self, tiny_opt_model, tokenizer, dataloader): ) autoround.quantize() - def test_wint4fp8_static(self, tiny_opt_model, tokenizer, dataloader): + def test_wint4fp8_static(self, tiny_opt_model, opt_tokenizer, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( tiny_opt_model, - tokenizer, + opt_tokenizer, bits=bits, group_size=group_size, sym=sym, @@ -75,12 +75,12 @@ def test_wint4fp8_static(self, tiny_opt_model, tokenizer, dataloader): autoround.quantize() @pytest.mark.parametrize("act_group_size", [-1, 128]) - def test_wfp8afp8_static(self, act_group_size, tiny_opt_model, tokenizer, dataloader): + def test_wfp8afp8_static(self, act_group_size, tiny_opt_model, opt_tokenizer, dataloader): from auto_round.wrapper import WrapperWALayer autoround = AutoRound( tiny_opt_model, - tokenizer, + opt_tokenizer, group_size=128, act_group_size=act_group_size, iters=2, @@ -92,7 +92,7 @@ def test_wfp8afp8_static(self, act_group_size, tiny_opt_model, tokenizer, datalo ) autoround.quantize() - k_proj = autoround.model.model.decoder.layers[2].self_attn.k_proj + k_proj = autoround.model.model.decoder.layers[1].self_attn.k_proj assert isinstance(k_proj, WrapperWALayer), "k_proj should be WrapperWALayer" if act_group_size == -1: assert k_proj.orig_layer.act_scale.shape[0] == 20, "act_scale shape[0] should be 20" diff --git a/test/test_cpu/test_alg_ext.py b/test/test_cpu/test_alg_ext.py index 504b7d0f8..0bfdfba47 100644 --- a/test/test_cpu/test_alg_ext.py +++ b/test/test_cpu/test_alg_ext.py @@ -1,30 +1,30 @@ from auto_round import AutoRound -from ..helpers import opt_name_or_path, qwen_name_or_path +from ..helpers import qwen_name_or_path class TestAlgExt: - def test_alg_ext(self): - model_name = opt_name_or_path + def test_alg_ext(self, tiny_opt_model_path, tiny_qwen_model_path): + model_name = tiny_opt_model_path ar = AutoRound(model_name, scheme="W2A16", iters=1, nsamples=1, enable_alg_ext=True) ar.quantize() - model_name = qwen_name_or_path + model_name = tiny_qwen_model_path ar = AutoRound(model_name, scheme="gguf:q4_k_s", iters=1, nsamples=1, enable_alg_ext=True) ar.quantize() from auto_round.auto_scheme import AutoScheme scheme = AutoScheme(options=["mxfp4", "mxfp8"], avg_bits=5.5, ignore_scale_zp_bits=True) - model_name = qwen_name_or_path + model_name = tiny_qwen_model_path ar = AutoRound(model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True) ar.quantize() def test_alg_ext_import(self): from auto_round.alg_ext import wrapper_autoround - def test_all_support_dtype(self): - model_name = opt_name_or_path + def test_all_support_dtype(self, tiny_opt_model_path): + model_name = tiny_opt_model_path for scheme in ["MXFP4", "NVFP4", "W2A16G64"]: ar = AutoRound( model_name, scheme=scheme, iters=1, nsamples=1, enable_alg_ext=True, enable_torch_compile=True diff --git a/test/test_cpu/test_auto_scheme.py b/test/test_cpu/test_auto_scheme.py index b6c20826e..b38e84dc6 100644 --- 
a/test/test_cpu/test_auto_scheme.py +++ b/test/test_cpu/test_auto_scheme.py @@ -44,11 +44,11 @@ def test_layer_config(self, tiny_opt_model_path): ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() assert layer_config["model.decoder.layers.1.fc1"]["bits"] == 8 - assert layer_config["model.decoder.layers.1.fc1"]["sym"] == False + assert not layer_config["model.decoder.layers.1.fc1"]["sym"] assert layer_config["model.decoder.layers.1.fc1"]["group_size"] == 32 layer = get_module(model, "model.decoder.layers.1.fc1") assert layer.bits == 8 - assert layer.sym == False + assert not layer.sym assert layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) diff --git a/test/test_cpu/test_autoopt.py b/test/test_cpu/test_autoopt.py index 472711155..c14e04c0e 100644 --- a/test/test_cpu/test_autoopt.py +++ b/test/test_cpu/test_autoopt.py @@ -24,7 +24,7 @@ def setup_and_teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_Adam(self, tiny_opt_model, tokenizer, dataloader): + def test_Adam(self, tiny_opt_model, opt_tokenizer, dataloader): bits, group_size, sym = 4, 128, False from auto_round.utils import get_block_names @@ -32,7 +32,7 @@ def test_Adam(self, tiny_opt_model, tokenizer, dataloader): bits, group_size, sym, batch_size = 4, 128, False, 20 adamround = AutoRoundAdam( tiny_opt_model, - tokenizer, + opt_tokenizer, bits=bits, group_size=group_size, sym=sym, diff --git a/test/test_cpu/test_autoround.py b/test/test_cpu/test_autoround.py index 1f1f85f55..d0049765e 100644 --- a/test/test_cpu/test_autoround.py +++ b/test/test_cpu/test_autoround.py @@ -3,20 +3,19 @@ import pytest import torch -from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.utils import get_module -from ..helpers import model_infer +from ..helpers import get_model_path, model_infer, opt_name_or_path, qwen_name_or_path class TestAutoRound: @classmethod def setup_class(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) self.save_folder = "./saved" @@ -26,18 +25,16 @@ def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_bits_setting(self): + def test_bits_setting(self, tiny_opt_model_path): layer_config = {"model.decoder.layers.0.self_attn.k_proj": {"data_type": "mx_fp8", "group_size": 32}} - autoround = AutoRound( - "/tf_dataset/auto_round/models/facebook/opt-125m", iters=2, seqlen=2, nsamples=1, layer_config=layer_config - ) + autoround = AutoRound(tiny_opt_model_path, iters=2, seqlen=2, nsamples=1, layer_config=layer_config) autoround.quantize() module = get_module(autoround.model, "model.decoder.layers.0.self_attn.k_proj") if module.bits != 8: raise ValueError(f"Expected bits to be 8, but got {module.bits}") - def test_layer_config(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_layer_config(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path layer_config = {"self_attn": {"bits": 4, "data_type": 
"nv_fp", "act_bits": 16, "group_size": 16}} autoround = AutoRound( model_name, @@ -52,8 +49,8 @@ def test_layer_config(self, dataloader): autoround.quantize_and_save(self.save_folder, inplace=False, format="fake") shutil.rmtree(self.save_folder) - def test_remove_whole_block(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_remove_whole_block(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path layer_config = { "model.decoder.layers.0.self_attn.k_proj": {"bits": 32}, "model.decoder.layers.0.self_attn.v_proj": {"bits": 32}, @@ -75,11 +72,10 @@ def test_remove_whole_block(self, dataloader): ) autoround.quantize() - def test_consecutive_quant(self, dataloader): + def test_consecutive_quant(self, tiny_opt_model_path, tiny_phi2_model_path, dataloader): bits, group_size, sym = 4, -1, False autoround = AutoRound( - self.model, - self.tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, @@ -89,15 +85,8 @@ def test_consecutive_quant(self, dataloader): ) autoround.quantize() - model = AutoModelForCausalLM.from_pretrained( - "/tf_dataset/auto_round/models/microsoft/phi-2", torch_dtype="auto", trust_remote_code=True - ) - tokenizer = AutoTokenizer.from_pretrained( - "/tf_dataset/auto_round/models/microsoft/phi-2", trust_remote_code=True - ) autoround = AutoRound( - model, - tokenizer, + tiny_phi2_model_path, bits=bits, group_size=group_size, sym=sym, @@ -108,7 +97,7 @@ def test_consecutive_quant(self, dataloader): autoround.quantize() def test_mx_fp4(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path bits, group_size, sym = 4, 32, False autoround = AutoRound( model_name, @@ -127,10 +116,10 @@ def test_mx_fp4(self, dataloader): model, self.tokenizer, batch_size="auto:8", tasks="lambada_openai", limit=32 ) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.3) # 0.375 + assert result["results"]["lambada_openai"]["acc,none"] > 0.3 # 0.375 def test_nv_fp4(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path bits, group_size, sym = 4, 16, False autoround = AutoRound( model_name, @@ -147,10 +136,10 @@ def test_nv_fp4(self, dataloader): model, self.tokenizer, batch_size="auto:8", tasks="lambada_openai", limit=32 ) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 - def test_w4g1(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_w4g1(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, -1, True autoround = AutoRound( model_name, @@ -163,9 +152,9 @@ def test_w4g1(self, dataloader): ) autoround.quantize() - @parameterized.expand([(2,), (3,), (4,)]) + @pytest.mark.parametrize("bits", [2, 3, 4]) def test_g128(self, bits, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path group_size, sym = 128, True autoround = AutoRound( model_name, @@ -182,7 +171,7 @@ def test_g128(self, bits, dataloader): model, self.tokenizer, batch_size="auto:8", tasks="lambada_openai", limit=32 ) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.3) + assert 
result["results"]["lambada_openai"]["acc,none"] > 0.3 def test_disable_quanted_input(self, dataloader): bits, group_size, sym = 4, -1, True @@ -199,9 +188,9 @@ def test_disable_quanted_input(self, dataloader): ) autoround.quantize() - def test_enable_norm_bias_tuning_qwen3(self, dataloader): + def test_enable_norm_bias_tuning_qwen3(self, tiny_qwen_model_path, dataloader): bits, group_size, sym = 4, 128, True - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = tiny_qwen_model_path model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) autoround = AutoRound( @@ -249,8 +238,8 @@ def test_disable_minmax_tuning(self, dataloader): autoround.quantize() # - def test_signround(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_signround(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, -1, False autoround = AutoRound( model_name, @@ -283,8 +272,8 @@ def test_lm_head_layer_config_way(self, dataloader): ) autoround.quantize() - def test_wa_quant(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_wa_quant(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym, act_bits = 4, 128, False, 4 autoround = AutoRound( model_name, @@ -298,9 +287,9 @@ def test_wa_quant(self, dataloader): ) autoround.quantize() - def test_auto_device_map(self, dataloader): + def test_auto_device_map(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, False - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="auto", trust_remote_code=True, device_map="auto" ) @@ -316,7 +305,7 @@ def test_auto_device_map(self, dataloader): ) autoround.quantize() - def test_device_map_dict(self, dataloader): + def test_device_map_dict(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, False device_map = {".*": "cpu"} autoround = AutoRound( @@ -333,7 +322,7 @@ def test_device_map_dict(self, dataloader): autoround.quantize() # test model_name - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path autoround = AutoRound( model_name, self.tokenizer, @@ -347,9 +336,9 @@ def test_device_map_dict(self, dataloader): ) autoround.quantize() - def test_fp32(self, dataloader): + def test_fp32(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, False - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.float32, trust_remote_code=True, device_map="auto" ) @@ -380,8 +369,8 @@ def test_tensor_reshape(self, dataloader): ) autoround.quantize() - def test_rtn(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_rtn(self, tiny_opt_model_path): + model_name = tiny_opt_model_path model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -399,9 +388,9 @@ def test_rtn(self): model_infer(model, tokenizer) shutil.rmtree(self.save_folder) - def test_embed_quant(self, dataloader): + def test_embed_quant(self, tiny_opt_model_path, dataloader): bits, group_size, 
sym = 4, 128, True - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path layer_config = { "model.decoder.embed_tokens": {"bits": 4}, } @@ -418,9 +407,9 @@ def test_embed_quant(self, dataloader): ) autoround.quantize() - def test_fallback_layers(self, dataloader): + def test_fallback_layers(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, True - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.float32, trust_remote_code=True, device_map="auto" ) @@ -462,17 +451,17 @@ def test_not_convert_modules(self): from auto_round_extension.ipex.qlinear_ipex_awq import QuantLinear - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct-AWQ" + model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct-AWQ") quantization_config = AutoRoundConfig() model = Qwen2VLForConditionalGeneration.from_pretrained( model_name, quantization_config=quantization_config, device_map="cpu", torch_dtype=torch.float16 ) - self.assertTrue(isinstance(model.visual.blocks[0].attn.qkv, torch.nn.Linear)) - self.assertFalse(isinstance(model.visual.merger.mlp[0], QuantLinear)) + assert isinstance(model.visual.blocks[0].attn.qkv, torch.nn.Linear) + assert not isinstance(model.visual.merger.mlp[0], QuantLinear) if hasattr(model.model, "language_model"): - self.assertTrue(isinstance(model.model.language_model.layers[0].self_attn.v_proj, QuantLinear)) + assert isinstance(model.model.language_model.layers[0].self_attn.v_proj, QuantLinear) else: - self.assertTrue(isinstance(model.model.layers[0].self_attn.v_proj, QuantLinear)) + assert isinstance(model.model.layers[0].self_attn.v_proj, QuantLinear) processor = AutoProcessor.from_pretrained(model_name, size=None) image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" @@ -507,8 +496,8 @@ def test_not_convert_modules(self): ) print(output_text) - def test_fallback_layers_regex_awq(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_fallback_layers_regex_awq(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -543,8 +532,8 @@ def test_fallback_layers_regex_awq(self, dataloader): print(res) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_fallback_layers_regex_gptq(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_fallback_layers_regex_gptq(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -579,8 +568,8 @@ def test_fallback_layers_regex_gptq(self, dataloader): print(res) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_fallback_layers_regex_round(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_fallback_layers_regex_round(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", 
trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -615,13 +604,13 @@ def test_fallback_layers_regex_round(self, dataloader): print(res) shutil.rmtree(self.save_folder, ignore_errors=True) - def test_fallback_layers_regex_exception(self, dataloader): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_fallback_layers_regex_exception(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path bits, group_size, sym = 4, 128, True model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) layer_config = {"model.decoder.layers.12.self_attn.k_proj": {"bits": 16}} - with self.assertRaises(ValueError): + with pytest.raises(ValueError): autoround = AutoRound( model, tokenizer=tokenizer, @@ -664,8 +653,8 @@ def test_dequant_fp8_weight(self): assert dequant_weight.shape[0] == 32 assert dequant_weight.shape.numel() == 32 * 5760 * 1440 - def test_mixed_bit_setting(self): - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + def test_mixed_bit_setting(self, tiny_opt_model_path): + model_name = tiny_opt_model_path layer_config = {"model.decoder.layers.7.fc1": {"bits": 8, "act_bits": 8}} ar = AutoRound(model_name, data_type="mx_fp4", act_bits=4, iters=0, layer_config=layer_config) ar.quantize() @@ -676,21 +665,21 @@ def test_mixed_bit_setting(self): ): raise ValueError("mixed bits is not correct") - def test_invalid_layer_config(self): - with self.assertRaises(ValueError): + def test_invalid_layer_config(self, tiny_opt_model_path): + with pytest.raises(ValueError): layer_config = {"model.decoder.layers.2.self_attnx": {"bits": 2}} ar = AutoRound( - "/tf_dataset/auto_round/models/facebook/opt-125m", + tiny_opt_model_path, scheme="W3A16", nsamples=1, iters=1, layer_config=layer_config, ) ar.quantize() - with self.assertRaises(ValueError): + with pytest.raises(ValueError): layer_config = {"model.decoder.layers.2.self_attn": {"bit": 2}} # should be bits ar = AutoRound( - "/tf_dataset/auto_round/models/facebook/opt-125m", + tiny_opt_model_path, scheme="W3A16", nsamples=1, iters=1, @@ -699,7 +688,7 @@ def test_invalid_layer_config(self): ar.quantize() def test_quant_lm_head(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-8B" + model_name = get_model_path("Qwen/Qwen3-8B") ar = AutoRound(model_name, quant_lm_head=True, iters=0, seqlen=8, nsamples=1, disable_opt_rtn=True) ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu") @@ -722,7 +711,7 @@ def test_quant_lm_head(self): assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 def test_quant_lm_head_layer_config(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-8B" + model_name = get_model_path("Qwen/Qwen3-8B") layer_config = {"lm_head": {"bits": 4}} ar = AutoRound( model_name, @@ -739,21 +728,21 @@ def test_quant_lm_head_layer_config(self): assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 def test_compressor(self): - model_name = "Qwen/Qwen2-VL-2B-Instruct" + model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") ar = AutoRound(model_name, enable_adam=True) assert ar.optimizer == torch.optim.AdamW - self.assertTrue(ar.mllm) + assert ar.mllm # test old api from auto_round import AutoRoundMLLM ar = AutoRoundMLLM(model_name) - self.assertTrue(ar.mllm) + assert 
ar.mllm def test_attention_mask_in_dataset(self): from transformers import AutoTokenizer - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = qwen_name_or_path # model_name = "/models/Qwen3-0.6B" tokenizer = AutoTokenizer.from_pretrained(model_name) text = ["haha", "hello world"] @@ -771,7 +760,7 @@ def test_attention_mask_in_dataset(self): def test_attention_mask_via_tokenize_in_dataset(self): from transformers import AutoTokenizer - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = qwen_name_or_path # model_name = "/models/Qwen3-0.6B" tokenizer = AutoTokenizer.from_pretrained(model_name) text = ["haha", "hello world"] @@ -788,9 +777,9 @@ def test_attention_mask_via_tokenize_in_dataset(self): ar = AutoRound(model_name, iters=1, dataset=data, seqlen=8) ar.quantize() - def test_low_cpu_mem_usage(self, dataloader): + def test_low_cpu_mem_usage(self, tiny_opt_model_path, dataloader): bits, group_size = 4, 32 - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = tiny_opt_model_path model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) quantized_model_path = self.save_folder @@ -809,7 +798,7 @@ def test_low_cpu_mem_usage(self, dataloader): shutil.rmtree(quantized_model_path, ignore_errors=True) def test_create_adam(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B" + model_name = qwen_name_or_path from auto_round import AutoRound ar = AutoRound(model=model_name, enable_adam=True) diff --git a/test/test_cpu/test_autoround_acc.py b/test/test_cpu/test_autoround_acc.py index 721a5c8ed..876d4a452 100644 --- a/test/test_cpu/test_autoround_acc.py +++ b/test/test_cpu/test_autoround_acc.py @@ -59,7 +59,7 @@ def test_default_acc(self, dataloader): out1 = model_tmp(inp) assert out0[0].equal(out1[0]) - self.assertTrue(isclose(float(out0[0][0][0][0]), -0.021002087742090225, rel_tol=5e-04)) + assert isclose(float(out0[0][0][0][0]), -0.021002087742090225, rel_tol=5e-04) def test_3bits_asym_autoround(self, tiny_opt_model_path): model_name = tiny_opt_model_path diff --git a/test/test_cpu/test_autoround_export_to_itrex.py b/test/test_cpu/test_autoround_export_to_itrex.py index d4cc2a73c..19f196270 100644 --- a/test/test_cpu/test_autoround_export_to_itrex.py +++ b/test/test_cpu/test_autoround_export_to_itrex.py @@ -8,7 +8,7 @@ from auto_round import AutoRound -from ..helper import gptj_name_or_path +from ..helpers import get_model_path, gptj_name_or_path class SimpleDataLoader: @@ -52,11 +52,11 @@ def test_autoround_int_quant(self): out2 = model(self.lm_input) out3 = q_model(self.lm_input) out4 = compressed_model(self.lm_input) - self.assertTrue(torch.all(torch.isclose(out1[0], out2[0], atol=1e-1))) - self.assertFalse(torch.all(out1[0] == out2[0])) - self.assertTrue(torch.all(out2[0] == out3[0])) - self.assertTrue(torch.all(torch.isclose(out3[0], out4[0], atol=1e-3))) - self.assertTrue("transformer.h.0.attn.k_proj.qzeros" in compressed_model.state_dict().keys()) + assert torch.all(torch.isclose(out1[0], out2[0], atol=1e-1)) + assert not torch.all(out1[0] == out2[0]) + assert torch.all(out2[0] == out3[0]) + assert torch.all(torch.isclose(out3[0], out4[0], atol=1e-3)) + assert "transformer.h.0.attn.k_proj.qzeros" in compressed_model.state_dict().keys() model = copy.deepcopy(self.gptj) out6 = model(self.lm_input) @@ -66,13 +66,13 @@ def test_autoround_int_quant(self): compressed_model = 
compressed_model.to(torch.float32) out4 = q_model(self.lm_input) out5 = compressed_model(self.lm_input) - self.assertTrue(torch.all(out1[0] == out6[0])) - self.assertTrue(torch.all(torch.isclose(out4[0], out5[0], atol=5e-3))) + assert torch.all(out1[0] == out6[0]) + assert torch.all(torch.isclose(out4[0], out5[0], atol=5e-3)) def test_config(self): from auto_round.export.export_to_itrex import QuantConfig - config = QuantConfig.from_pretrained("/tf_dataset/auto_round/models/TheBloke/Llama-2-7B-Chat-GPTQ") + config = QuantConfig.from_pretrained(get_model_path("TheBloke/Llama-2-7B-Chat-GPTQ")) config.save_pretrained("quantization_config_dir") loaded_config = QuantConfig.from_pretrained("quantization_config_dir") assert config.group_size == loaded_config.group_size @@ -94,8 +94,8 @@ def test_xpu_export(self): out3 = q_model(self.lm_input) out4 = compressed_model_xpu(self.lm_input) out5 = compressed_model_cpu(self.lm_input) - self.assertTrue(torch.all(torch.isclose(out1[0], out2[0], atol=1e-1))) - self.assertFalse(torch.all(out1[0] == out2[0])) - self.assertTrue(torch.all(out2[0] == out3[0])) - self.assertTrue(torch.all(torch.isclose(out3[0], out4[0], atol=1e-3))) - self.assertTrue(torch.all(torch.isclose(out4[0], out5[0], atol=1e-5))) + assert torch.all(torch.isclose(out1[0], out2[0], atol=1e-1)) + assert not torch.all(out1[0] == out2[0]) + assert torch.all(out2[0] == out3[0]) + assert torch.all(torch.isclose(out3[0], out4[0], atol=1e-3)) + assert torch.all(torch.isclose(out4[0], out5[0], atol=1e-5)) diff --git a/test/test_cpu/test_block_names.py b/test/test_cpu/test_block_names.py index 8d5f935d9..5d0423fa2 100644 --- a/test/test_cpu/test_block_names.py +++ b/test/test_cpu/test_block_names.py @@ -2,15 +2,13 @@ import shutil import pytest - -sys.path.insert(0, ".") import torch import torch.nn as nn from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -from ..helper import lamini_name_or_path +from ..helpers import get_model_path, lamini_name_or_path # ================= simple multimodal model ================= @@ -118,7 +116,7 @@ def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_moe_quant(self, dataloader): + def test_moe_quant(self): input_size = 10 hidden_size = 10 num_groups = 2 @@ -185,24 +183,24 @@ def test_mm_block_name(self): from auto_round.utils import get_block_names - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct" + model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True, device_map="auto") block_name = get_block_names(model, quant_vision=True) - self.assertTrue(len(block_name) == 2) - self.assertTrue(all(["visual.merger.mlp" not in n for n in block_name])) + assert len(block_name) == 2 + assert all(["visual.merger.mlp" not in n for n in block_name]) block_name = get_block_names(model, quant_vision=False) - self.assertTrue(len(block_name) == 1) - self.assertTrue(block_name == get_block_names(model)) + assert len(block_name) == 1 + assert block_name == get_block_names(model) def test_moe(self): from auto_round.utils import get_block_names - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen1.5-MoE-A2.7B" + model_name = get_model_path("Qwen/Qwen1.5-MoE-A2.7B") # config = AutoConfig.from_pretrained(model_name) model = AutoModelForCausalLM.from_pretrained(model_name) block_name = get_block_names(model) block_name_2 = get_block_names(model, 
quant_vision=True) - self.assertTrue(block_name == block_name_2) - self.assertTrue(len(block_name_2) == 1) - self.assertTrue("model.layers.23" == block_name_2[0][-1]) + assert block_name == block_name_2 + assert len(block_name_2) == 1 + assert "model.layers.23" == block_name_2[0][-1] diff --git a/test/test_cpu/test_calib_dataset.py b/test/test_cpu/test_calib_dataset.py index fc95966b6..cb276147e 100644 --- a/test/test_cpu/test_calib_dataset.py +++ b/test/test_cpu/test_calib_dataset.py @@ -8,6 +8,8 @@ from auto_round import AutoRound +from ..helpers import get_model_path, opt_name_or_path + class TestLocalCalibDataset: @classmethod @@ -26,7 +28,7 @@ def setup_class(self): json.dump(item, jsonl_file, ensure_ascii=False) jsonl_file.write("\n") - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -59,7 +61,7 @@ def test_jsonl(self): autoround.quantize() def test_apply_chat_template(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) dataset = "NeelNanda/pile-10k:apply_chat_template:system_prompt=''" diff --git a/test/test_cpu/test_cli_usage.py b/test/test_cpu/test_cli_usage.py index ffc04d8f1..e71b2854a 100644 --- a/test/test_cpu/test_cli_usage.py +++ b/test/test_cpu/test_cli_usage.py @@ -17,7 +17,7 @@ def teardown_class(self): shutil.rmtree("../../saved", ignore_errors=True) shutil.rmtree("../../tmp_autoround", ignore_errors=True) - def test_auto_round_cmd(self): + def test_auto_round_cmd(self, tiny_opt_model_path): python_path = sys.executable # Test llm script @@ -26,13 +26,13 @@ def test_auto_round_cmd(self): assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --model '/tf_dataset/auto_round/models/facebook/opt-125m' --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa" + f"cd ../.. && {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --model '/tf_dataset/auto_round/models/facebook/opt-125m' --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" + f"cd ../.. 
&& {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index 866a7d396..36ddee546 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -3,11 +3,12 @@ import pytest import torch -from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound +from ..helpers import get_model_path, opt_name_or_path + def _get_folder_size(path: str) -> float: """Return folder size in GB.""" @@ -23,7 +24,7 @@ def _get_folder_size(path: str) -> float: class TestAutoRound: @classmethod def setup_class(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + self.model_name = opt_name_or_path self.save_dir = "./saved" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -198,7 +199,7 @@ def test_autoround_3bit_sym_format(self, dataloader): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - @parameterized.expand([(None,), ("fp8",), ("float16")]) + @pytest.mark.parametrize("static_kv_dtype", [None, "fp8", "float16"]) def test_static_afp8_export(self, static_kv_dtype): import os @@ -224,8 +225,8 @@ def test_static_afp8_export(self, static_kv_dtype): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") - self.assertIn("model.decoder.layers.8.self_attn.k_proj.input_scale", f.keys()) - self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) + assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() + assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape == torch.Size([1]) assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.float8_e4m3fn if static_kv_dtype is None: @@ -257,8 +258,8 @@ def test_static_afp8_export(self, static_kv_dtype): assert output is not None, "Output should not be None" if static_kv_dtype == "fp8": - self.assertIn("model.decoder.layers.8.self_attn.k_scale", f.keys()) - self.assertIn("model.decoder.layers.8.self_attn.v_scale", f.keys()) + assert "model.decoder.layers.8.self_attn.k_scale" in f.keys() + assert "model.decoder.layers.8.self_attn.v_scale" in f.keys() assert f.get_tensor("model.decoder.layers.5.self_attn.v_scale").shape == torch.Size([1]) assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").shape == torch.Size([1]) assert f.get_tensor("model.decoder.layers.5.self_attn.k_scale").dtype == torch.float32 @@ -283,8 +284,8 @@ def test_static_afp8_export(self, static_kv_dtype): autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") - self.assertIn("model.decoder.layers.8.self_attn.k_proj.input_scale", f.keys()) - self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) + assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() +
assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape == torch.Size([1]) assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.float8_e4m3fn shutil.rmtree(quantized_model_path, ignore_errors=True) @@ -308,14 +309,14 @@ def test_static_fp8_attn(self): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") f = safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") - self.assertIn("model.decoder.layers.8.self_attn.k_proj.input_scale", f.keys()) - self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) + assert "model.decoder.layers.8.self_attn.k_proj.input_scale" in f.keys() + assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys() assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape == torch.Size([1]) assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.float8_e4m3fn check_attrs = ["k_scale", "v_scale", "q_scale"] for attr in check_attrs: weight_name = f"model.decoder.layers.8.self_attn.{attr}" - self.assertIn(weight_name, f.keys()) + assert weight_name in f.keys() assert f.get_tensor(weight_name).shape == torch.Size([1]) assert f.get_tensor(weight_name).dtype == torch.float32 @@ -323,7 +324,7 @@ def test_static_fp8_attn(self): def test_awq_lmhead_export(self, dataloader): bits, sym, group_size = 4, False, 128 - model_name = "/tf_dataset/auto_round/models/microsoft/phi-2" + model_name = get_model_path("microsoft/phi-2") layer_config = { "lm_head": {"bits": 4}, # set lm_head quant "layer": {"bits": 16}, @@ -358,7 +359,7 @@ def test_awq_lmhead_export(self, dataloader): def test_gptq_lmhead_export(self, dataloader): bits, sym, group_size = 4, True, 128 # Note that, to save UT tuning time, the local model is intentionally kept lightweight, using only 2 hidden layers. 
- model_name = "/tf_dataset/auto_round/models/microsoft/phi-2" + model_name = get_model_path("microsoft/phi-2") layer_config = { "lm_head": {"bits": 4}, # set lm_head quant "layer": {"bits": 16}, diff --git a/test/test_cpu/test_generation.py b/test/test_cpu/test_generation.py index 4c72db93c..e1e9dc3f1 100644 --- a/test/test_cpu/test_generation.py +++ b/test/test_cpu/test_generation.py @@ -7,11 +7,13 @@ from auto_round import AutoRound +from ..helpers import opt_name_or_path + class TestAutoRoundFormatGeneration: @classmethod def setup_class(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + self.model_name = opt_name_or_path self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) self.save_folder = "./saved" diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index 393e11dba..c34c4f096 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -8,12 +8,14 @@ from auto_round import AutoRound +from ..helpers import get_model_path + class TestGGUF: @classmethod def setup_class(self): - self.model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" + self.model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @classmethod @@ -63,7 +65,7 @@ def test_q4_0(self): # from auto_round.eval.evaluation import simple_evaluate_user_model # result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="openbookqa", eval_model_dtype="bf16") # # 0.246 - # self.assertGreater(result['results']['openbookqa']['acc,none'], 0.23) + # assert result['results']['openbookqa']['acc,none'] > 0.23 shutil.rmtree("./saved", ignore_errors=True) # def test_q4_1(self): @@ -83,7 +85,7 @@ def test_q4_0(self): # # from auto_round.eval.evaluation import simple_evaluate_user_model # # result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="openbookqa", eval_model_dtype="bf16") # # # 0.23 - # # self.assertGreater(result['results']['openbookqa']['acc,none'], 0.22) + # # assert result['results']['openbookqa']['acc,none'] > 0.22 # shutil.rmtree("./saved", ignore_errors=True) def test_func(self): @@ -100,8 +102,8 @@ def test_func(self): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_1") - self.assertTrue(autoround.group_size == 32) - self.assertFalse(autoround.sym) + assert autoround.group_size == 32 + assert not autoround.sym gguf_file = os.listdir("saved")[0] model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") text = "There is a girl who likes adventure," @@ -129,7 +131,7 @@ def test_func(self): # gguf_file = os.listdir("saved")[0] # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") # result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="lambada_openai", eval_model_dtype="bf16") - # self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.5) + # assert result['results']['lambada_openai']['acc,none'] > 0.5 shutil.rmtree("./saved", ignore_errors=True) # @@ -182,7 +184,7 @@ def test_func(self): # shutil.rmtree("./saved", ignore_errors=True) def test_gguf_baseline(self): - model_name = 
"/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) autoround = AutoRound( model, @@ -229,7 +231,7 @@ def test_gguf_baseline(self): # shutil.rmtree("./saved", ignore_errors=True) def test_q4_k_m(self, dataloader): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) layer_config = { @@ -276,7 +278,7 @@ def test_q4_k_m(self, dataloader): shutil.rmtree("./saved", ignore_errors=True) def test_all_format(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") python_path = sys.executable # for gguf_format in ["gguf:q4_0", "gguf:q4_1", "gguf:q4_k_m", "gguf:q6_k"]: for gguf_format in ["gguf:q4_k_m"]: @@ -306,7 +308,7 @@ def test_all_format(self): shutil.rmtree("../../tmp_autoround", ignore_errors=True) def test_vlm_gguf(self): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct" + model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") from auto_round import AutoRoundMLLM from auto_round.utils import mllm_load_model @@ -321,13 +323,13 @@ def test_vlm_gguf(self): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - self.assertTrue("mmproj-model.gguf" in os.listdir("./saved")) + assert "mmproj-model.gguf" in os.listdir("./saved") for file_name in os.listdir(quantized_model_path): file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2 if file_name == "mmproj-model.gguf": - self.assertAlmostEqual(file_size, 2537, delta=5.0) + assert abs(file_size - 2537) < 5.0 else: - self.assertAlmostEqual(file_size, 892, delta=5.0) + assert abs(file_size - 892) < 5.0 shutil.rmtree("./saved", ignore_errors=True) def test_qtype_setting(self): @@ -338,7 +340,7 @@ def test_qtype_setting(self): from auto_round.compressors.utils import set_layer_config from auto_round.export.export_to_gguf.config import ModelType - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) ar.formats = ["gguf:q4_0"] ar.layer_config, _, _ = set_layer_config( @@ -354,8 +356,8 @@ def test_qtype_setting(self): enable_gguf_official_mixed=True, is_mllm=ar.mllm, ) - self.assertTrue(ar.layer_config["model.embed_tokens"]["bits"] == 8) - self.assertTrue("lm_head" not in ar.layer_config) + assert ar.layer_config["model.embed_tokens"]["bits"] == 8 + assert "lm_head" not in ar.layer_config model_name = "Qwen/Qwen3-0.6B" ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) @@ -373,8 +375,8 @@ def test_qtype_setting(self): enable_gguf_official_mixed=True, is_mllm=ar.mllm, ) - self.assertTrue(ar.layer_config["model.embed_tokens"]["bits"] == 4) - self.assertTrue(ar.layer_config["lm_head"]["bits"] == 6 and ar.layer_config["lm_head"]["super_bits"] == 8) + assert ar.layer_config["model.embed_tokens"]["bits"] == 4 + assert ar.layer_config["lm_head"]["bits"] == 6 and ar.layer_config["lm_head"]["super_bits"] == 8 layer_config = { "model.embed_tokens": {"bits": 6, "super_bits": 8}, @@ -395,8 +397,8 @@ 
def test_qtype_setting(self): enable_gguf_official_mixed=True, is_mllm=ar.mllm, ) - self.assertTrue(ar.layer_config["lm_head"]["bits"] == 4) - self.assertTrue( - ar.layer_config["model.embed_tokens"]["bits"] == 6 + assert ( + ar.layer_config["lm_head"]["bits"] == 4 + and ar.layer_config["model.embed_tokens"]["bits"] == 6 and ar.layer_config["model.embed_tokens"]["super_bits"] == 8 ) diff --git a/test/test_cpu/test_gpt_oss.py b/test/test_cpu/test_gpt_oss.py index ccc997eba..b82c04c31 100644 --- a/test/test_cpu/test_gpt_oss.py +++ b/test/test_cpu/test_gpt_oss.py @@ -4,11 +4,13 @@ from auto_round import AutoRound +from ..helpers import get_model_path + @pytest.fixture def setup_gpt_oss(): """Fixture to set up the GPT-OSS model and tokenizer.""" - model_name = "/tf_dataset/auto_round/models/unsloth/gpt-oss-20b-BF16" + model_name = get_model_path("unsloth/gpt-oss-20b-BF16") tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) config.num_hidden_layers = 1 # Reduce layers for testing diff --git a/test/test_cpu/test_llmc_integration.py b/test/test_cpu/test_llmc_integration.py index 6dba09cfa..cea412327 100644 --- a/test/test_cpu/test_llmc_integration.py +++ b/test/test_cpu/test_llmc_integration.py @@ -85,7 +85,7 @@ def test_oneshot_application(recipe, tmp_path): assert weight_args.num_bits == 4 # Check a specific layer is quantized - targeted_linear_layer = model_loaded.model.layers[2].self_attn.q_proj + targeted_linear_layer = model_loaded.model.layers[1].self_attn.q_proj assert hasattr(targeted_linear_layer, "quantization_scheme") # Check lm-head is not quantized diff --git a/test/test_cpu/test_llmcompressor.py b/test/test_cpu/test_llmcompressor.py index ebe531f75..614701943 100644 --- a/test/test_cpu/test_llmcompressor.py +++ b/test/test_cpu/test_llmcompressor.py @@ -7,11 +7,13 @@ from auto_round import AutoRound +from ..helpers import get_model_path, opt_name_or_path + class TestLLMC: @classmethod def setup_class(self): - self.model_name = "/tf_dataset/auto_round/models/stas/tiny-random-llama-2" + self.model_name = get_model_path("stas/tiny-random-llama-2") self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -39,7 +41,7 @@ def test_llmcompressor_w8a8(self): def test_llmcompressor_fp8(self): ## quantize the model - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path autoround = AutoRound( model_name, scheme="FP8_STATIC", @@ -56,14 +58,14 @@ def test_llmcompressor_fp8(self): import json config = json.load(open("./saved/config.json")) - self.assertIn("group_0", config["quantization_config"]["config_groups"]) + assert "group_0" in config["quantization_config"]["config_groups"] assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8 assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "channel" assert config["quantization_config"]["quant_method"] == "compressed-tensors" def test_autoround_llmcompressor_fp8(self): ## quantize the model - model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + model_name = opt_name_or_path autoround = AutoRound( model_name, scheme="FP8_STATIC", @@ -77,10 +79,8 @@ def test_autoround_llmcompressor_fp8(self): import json config = json.load(open("./saved/config.json")) - 
self.assertIn("group_0", config["quantization_config"]["config_groups"]) + assert "group_0" in config["quantization_config"]["config_groups"] assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8 assert config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"] == "tensor" - self.assertEqual( - config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["strategy"], "tensor" - ) + assert config["quantization_config"]["config_groups"]["group_0"]["input_activations"]["strategy"] == "tensor" assert config["quantization_config"]["quant_method"] == "compressed-tensors" diff --git a/test/test_cpu/test_load_awq_gptq.py b/test/test_cpu/test_load_awq_gptq.py index e78266182..6dc295b4e 100644 --- a/test/test_cpu/test_load_awq_gptq.py +++ b/test/test_cpu/test_load_awq_gptq.py @@ -3,7 +3,7 @@ import pytest from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer -from ..helpers import model_infer +from ..helpers import get_model_path, model_infer class TestAutoRound: @@ -13,9 +13,9 @@ def teardown_class(self): shutil.rmtree("runs", ignore_errors=True) def test_load_gptq_no_dummy_gidx_model(self): - model_name = "/tf_dataset/auto_round/models/ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1" + model_name = get_model_path("ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1") quantization_config = AutoRoundConfig() - with self.assertRaises(NotImplementedError) as cm: + with pytest.raises(NotImplementedError): model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="auto", @@ -25,7 +25,7 @@ def test_load_gptq_no_dummy_gidx_model(self): ) def test_load_awq(self): - model_name = "/tf_dataset/auto_round/models/casperhansen/opt-125m-awq" + model_name = get_model_path("casperhansen/opt-125m-awq") quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( model_name, diff --git a/test/test_cpu/test_mix_bits.py b/test/test_cpu/test_mix_bits.py index 71354feb9..6cc390637 100644 --- a/test/test_cpu/test_mix_bits.py +++ b/test/test_cpu/test_mix_bits.py @@ -5,12 +5,13 @@ import pytest import torch -from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound from auto_round.testing_utils import require_gptqmodel +from ..helpers import opt_name_or_path + def _get_folder_size(path: str) -> float: """Return folder size in GB.""" @@ -26,7 +27,7 @@ def _get_folder_size(path: str) -> float: class TestAutoRound: @classmethod def setup_class(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + self.model_name = opt_name_or_path self.save_dir = ".saved/" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -233,5 +234,5 @@ def test_mixed_MXFP_autoround_format_loading(self, dataloader): result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.14) + assert result["results"]["lambada_openai"]["acc,none"] > 0.14 shutil.rmtree(quantized_model_path, ignore_errors=True) diff --git a/test/test_cpu/test_mllm.py b/test/test_cpu/test_mllm.py index 25f2a209a..5e4842d4c 100644 --- a/test/test_cpu/test_mllm.py +++ b/test/test_cpu/test_mllm.py 
@@ -5,6 +5,8 @@ from auto_round import AutoRoundMLLM +from ..helpers import get_model_path, opt_name_or_path + class FakeDataLoader: def __init__(self): @@ -26,7 +28,7 @@ def __iter__(self): class TestAutoRoundMLLM: @classmethod def setup_class(self): - self.model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct" + self.model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") self.dataset = FakeDataLoader() @classmethod @@ -137,11 +139,9 @@ def test_pure_text_model_check(self): model = Qwen2VLForConditionalGeneration.from_pretrained( self.model_name, trust_remote_code=True, device_map="auto" ) - self.assertFalse(is_pure_text_model(model)) - model = AutoModelForCausalLM.from_pretrained( - "/tf_dataset/auto_round/models/facebook/opt-125m", trust_remote_code=True - ) - self.assertTrue(is_pure_text_model(model)) + assert not is_pure_text_model(model) + model = AutoModelForCausalLM.from_pretrained(opt_name_or_path, trust_remote_code=True) + assert is_pure_text_model(model) def test_str_input(self): tokenizer = AutoTokenizer.from_pretrained(self.model_name) @@ -210,7 +210,7 @@ def test_str_input(self): def test_qwen2_5(self): from auto_round.utils import mllm_load_model - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-VL-3B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-VL-3B-Instruct") model, processor, tokenizer, image_processor = mllm_load_model(model_name) autoround = AutoRoundMLLM( model, diff --git a/test/test_cpu/test_model_scope.py b/test/test_cpu/test_model_scope.py index 0097b3584..cf48eeaab 100644 --- a/test/test_cpu/test_model_scope.py +++ b/test/test_cpu/test_model_scope.py @@ -7,6 +7,8 @@ from auto_round import AutoRound +from ..helpers import get_model_path + class TestModelScope: @classmethod @@ -29,14 +31,14 @@ def teardown_class(self): return super().teardown_class() def test_llm(self): - model_name = "Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") autoround = AutoRound( model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset ) autoround.quantize_and_save() def test_mllm(self): - model_name = "Qwen/Qwen2-VL-2B-Instruct" + model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") autoround = AutoRound( model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset, batch_size=2 ) diff --git a/test/test_cpu/test_moe_model.py b/test/test_cpu/test_moe_model.py index c88571346..62bac4efc 100644 --- a/test/test_cpu/test_moe_model.py +++ b/test/test_cpu/test_moe_model.py @@ -6,11 +6,13 @@ from auto_round import AutoRound +from ..helpers import get_model_path + @pytest.fixture def setup_gpt_oss(): """Fixture to set up the GPT-OSS model and tokenizer.""" - model_name = "/tf_dataset/auto_round/models/unsloth/gpt-oss-20b-BF16" + model_name = get_model_path("unsloth/gpt-oss-20b-BF16") tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) config.num_hidden_layers = 1 # Reduce layers for testing @@ -22,7 +24,7 @@ def setup_gpt_oss(): @pytest.fixture def setup_llama4(): """Fixture to set up the llama4 model and tokenizer.""" - model_name = "/tf_dataset/auto_round/models/meta-llama/Llama-4-Scout-17B-16E-Instruct" + model_name = get_model_path("meta-llama/Llama-4-Scout-17B-16E-Instruct") tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) 
config.vision_config.num_hidden_layers = 2 # Reduce layers for testing diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index 1144d00d6..695371061 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -3,11 +3,12 @@ import pytest import torch -from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound +from ..helpers import get_model_path, opt_name_or_path + def _get_folder_size(path: str) -> float: """Return folder size in GB.""" @@ -23,7 +24,7 @@ def _get_folder_size(path: str) -> float: class TestAutoRoundFP: @classmethod def setup_class(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + self.model_name = opt_name_or_path self.save_dir = "./saved" self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto") self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -34,7 +35,7 @@ def teardown_class(self): shutil.rmtree("runs", ignore_errors=True) def test_nvfp4_moe_actmax_rtn(self, dataloader): - model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" + model_name = get_model_path("deepseek-ai/DeepSeek-V2-Lite") layer_config = { "self_attn": {"bits": 16, "act_bits": 16}, "mlp.shared_experts": {"bits": 16, "act_bits": 16}, @@ -61,7 +62,7 @@ def test_nvfp4_moe_actmax_rtn(self, dataloader): shutil.rmtree(self.save_dir, ignore_errors=True) def test_nvfp4_moe_actmax_ar(self, dataloader): - model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" + model_name = get_model_path("deepseek-ai/DeepSeek-V2-Lite") layer_config = { "q_proj": {"bits": 16, "act_bits": 16}, "mlp.shared_experts": {"bits": 16, "act_bits": 16}, @@ -95,11 +96,11 @@ def test_nvfp4_moe_actmax_ar(self, dataloader): result = simple_evaluate_user_model(model, tokenizer, batch_size=4, tasks="piqa", limit=4) print(result["results"]["piqa"]["acc,none"]) - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.7) + assert result["results"]["piqa"]["acc,none"] > 0.7 shutil.rmtree(self.save_dir, ignore_errors=True) def test_mxfp4_moe_ar(self, dataloader): - model_name = "/tf_dataset/auto_round/models/deepseek-ai/DeepSeek-V2-Lite" + model_name = get_model_path("deepseek-ai/DeepSeek-V2-Lite") layer_config = { "q_proj": {"bits": 16, "act_bits": 16, "data_type": "float"}, "mlp.shared_experts": {"bits": 16, "act_bits": 16, "data_type": "float"}, @@ -332,7 +333,7 @@ def test_nvfp4_autoround_save_quantized(self, dataloader): shutil.rmtree(quantized_model_path, ignore_errors=True) def test_qwen_moe_quant_infer(self, dataloader): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen1.5-MoE-A2.7B" + model_name = get_model_path("Qwen/Qwen1.5-MoE-A2.7B") layer_config = { "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, } @@ -354,10 +355,11 @@ def test_qwen_moe_quant_infer(self, dataloader): result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa", limit=10) print(result["results"]["piqa"]["acc,none"]) - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.60) + assert result["results"]["piqa"]["acc,none"] > 0.60 shutil.rmtree(quantized_model_path, ignore_errors=True) - @parameterized.expand( + @pytest.mark.parametrize( + "scheme, static_kv_dtype, static_attention_dtype", [ # scheme, static_kv_dtype, static_attention_dtype ("MXFP4", None, "fp8"), @@ -366,7 +368,7 @@ def test_qwen_moe_quant_infer(self, dataloader): ("MXFP8", 
"fp8", None), ("NVFP4", None, "fp8"), ("NVFP4", "fp8", None), - ] + ], ) def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype, dataloader): model_name = self.model_name diff --git a/test/test_cpu/test_mxfp_save_load.py b/test/test_cpu/test_mxfp_save_load.py index aca5c7592..bf3e9853b 100644 --- a/test/test_cpu/test_mxfp_save_load.py +++ b/test/test_cpu/test_mxfp_save_load.py @@ -14,6 +14,8 @@ from auto_round.inference.backend import MX_TENSOR_DATA_TYPES from auto_round.testing_utils import has_module +from ..helpers import get_model_path + testing_scheme_name_lst = [ AutoRoundFormat.MXFP8.value, AutoRoundFormat.MXFP4.value, @@ -35,7 +37,7 @@ def test_e2e_quant_and_load(scheme_name, weight_data_type, act_data_type): # Use a temporary directory for saving the quantized model with tempfile.TemporaryDirectory() as temp_dir: - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") config = AutoConfig.from_pretrained(model_name) config.num_hidden_layers = 2 # Use a smaller model for testing # Fix configuration validation issues diff --git a/test/test_cpu/test_scheme.py b/test/test_cpu/test_scheme.py index 71f02dc96..9bd236765 100644 --- a/test/test_cpu/test_scheme.py +++ b/test/test_cpu/test_scheme.py @@ -6,11 +6,13 @@ from auto_round import AutoRound from auto_round.schemes import QuantizationScheme +from ..helpers import get_model_path, opt_name_or_path, qwen_name_or_path + class TestAutoRound: @classmethod def setup_class(self): - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + self.model_name = opt_name_or_path self.save_folder = "./saved" @classmethod @@ -20,7 +22,7 @@ def teardown_class(self): def test_gguf(self, dataloader): ar = AutoRound( - "/tf_dataset/auto_round/models/Qwen/Qwen3-0.6B", + qwen_name_or_path, scheme="W2A16", nsamples=1, iters=1, @@ -52,9 +54,7 @@ def test_mxfp4(self, dataloader): def test_vllm(self): from auto_round import AutoRoundMLLM - ar = AutoRoundMLLM( - "/tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct", scheme="W2A16", nsamples=1, iters=1, seqlen=2 - ) + ar = AutoRoundMLLM(get_model_path("Qwen/Qwen2-VL-2B-Instruct", scheme="W2A16"), nsamples=1, iters=1, seqlen=2) assert ar.bits == 2 assert ar.act_bits == 16 @@ -73,7 +73,7 @@ def test_all_scheme(self, dataloader): for scheme in preset_schemes: model_name = self.model_name if "gguf" in scheme.lower(): - model_name = "/tf_dataset/auto_round/models/Qwen/Qwen2.5-1.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") print(f"scheme={scheme}") ar = AutoRound(model_name, scheme=scheme, nsamples=1, iters=1, seqlen=2, dataset=dataloader) ar.quantize_and_save(self.save_folder) @@ -86,7 +86,7 @@ def test_scheme_in_layer_config(self, dataloader): "model.decoder.layers.4.self_attn.k_proj": QuantizationScheme.from_dict({"group_size": 64}), } ar = AutoRound( - "/tf_dataset/auto_round/models/facebook/opt-125m", + opt_name_or_path, scheme="W3A16", nsamples=1, iters=1, @@ -110,9 +110,9 @@ def test_parse_available_devices(self): from auto_round.utils.device import parse_available_devices device_list = parse_available_devices("auto") - self.assertTrue(len(device_list) == 1 and "cpu" in device_list) + assert len(device_list) == 1 and "cpu" in device_list device_list = parse_available_devices("a:cuda:0,b:cuda:1,c:cpu") - self.assertTrue(len(device_list) == 3) + assert len(device_list) == 3 assert device_list == ["cuda:0", "cuda:1", "cpu"] device_list = parse_available_devices("0,1") - 
self.assertTrue(len(device_list) == 1 and "cpu" in device_list) + assert len(device_list) == 1 and "cpu" in device_list diff --git a/test/test_cpu/test_torch_backend.py b/test/test_cpu/test_torch_backend.py index e27914d9b..81e009c06 100644 --- a/test/test_cpu/test_torch_backend.py +++ b/test/test_cpu/test_torch_backend.py @@ -49,7 +49,7 @@ def test_torch_4bits_asym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -60,7 +60,7 @@ def test_torch_4bits_asym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai", limit=10) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ -90,6 +90,6 @@ def test_torch_4bits_sym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000) print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28) + assert result["results"]["lambada_openai"]["acc,none"] > 0.28 torch.cuda.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) diff --git a/test/test_cuda/requirements.txt b/test/test_cuda/requirements.txt index e7dd4e0d8..071eb233e 100644 --- a/test/test_cuda/requirements.txt +++ b/test/test_cuda/requirements.txt @@ -6,7 +6,6 @@ intel-extension-for-pytorch lm-eval>=0.4.9.1 optimum pandas -parameterized pillow torchvision numba diff --git a/test/test_cuda/test_auto_round_format.py b/test/test_cuda/test_auto_round_format.py index cbc6868f1..7299eae0c 100644 --- a/test/test_cuda/test_auto_round_format.py +++ b/test/test_cuda/test_auto_round_format.py @@ -88,7 +88,7 @@ def test_mixed_precision(self): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.32) + assert result["results"]["lambada_openai"]["acc,none"] > 0.32 @require_awq @require_package_version_ut("transformers", "<4.57.0") @@ -117,7 +117,7 @@ def test_awq_backend(self): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.18) + assert result["results"]["lambada_openai"]["acc,none"] > 0.18 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( diff --git a/test/test_cuda/test_auto_scheme.py b/test/test_cuda/test_auto_scheme.py index b6f5d8066..1a53c2425 100644 --- a/test/test_cuda/test_auto_scheme.py +++ b/test/test_cuda/test_auto_scheme.py @@ -199,11 +199,11 @@ def test_layer_config(self): ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() assert 
layer_config["model.decoder.layers.10.fc1"]["bits"] == 8 - assert layer_config["model.decoder.layers.10.fc1"]["sym"] == False + assert not layer_config["model.decoder.layers.10.fc1"]["sym"] assert layer_config["model.decoder.layers.10.fc1"]["group_size"] == 32 layer = get_module(model, "model.decoder.layers.10.fc1") assert layer.bits == 8 - assert layer.sym == False + assert not layer.sym assert layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) @@ -216,11 +216,11 @@ def test_layer_config(self): ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() assert layer_config["model.decoder.layers.10.fc1"]["bits"] == 8 - assert layer_config["model.decoder.layers.10.fc1"]["sym"] == False + assert not layer_config["model.decoder.layers.10.fc1"]["sym"] assert layer_config["model.decoder.layers.10.fc1"]["group_size"] == 32 layer = get_module(model, "model.decoder.layers.10.fc1") assert layer.orig_layer.bits == 8 - assert layer.orig_layer.sym == False + assert not layer.orig_layer.sym assert layer.orig_layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) @@ -232,7 +232,7 @@ def test_lm_head_and_mix_dtype(self): scheme = AutoScheme(avg_bits=target_bits, options=("MXFP4", "MXFP8")) ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, quant_lm_head=True) model, layer_config = ar.quantize() - self.assertLessEqual(layer_config["lm_head"]["bits"], 8) + assert layer_config["lm_head"]["bits"] <= 8 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 @@ -245,7 +245,7 @@ def test_auto_scheme_export(self): model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.25) + assert result["results"]["lambada_openai"]["acc,none"] > 0.25 shutil.rmtree(self.save_dir, ignore_errors=True) model_name = "/models/Qwen3-0.6B" @@ -262,5 +262,5 @@ def test_enable_torch_compile(self): model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.10) + assert result["results"]["lambada_openai"]["acc,none"] > 0.10 shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cuda/test_diffusion.py b/test/test_cuda/test_diffusion.py index 147a34d47..27d72908d 100644 --- a/test/test_cuda/test_diffusion.py +++ b/test/test_cuda/test_diffusion.py @@ -69,7 +69,7 @@ def test_diffusion_rtn(self): def test_diffusion_model_checker(self): from auto_round.utils import is_diffusion_model - self.assertTrue(is_diffusion_model("/dataset/FLUX.1-dev")) - self.assertTrue(is_diffusion_model("/models/stable-diffusion-2-1")) - self.assertTrue(is_diffusion_model("/models/stable-diffusion-xl-base-1.0")) - self.assertFalse(is_diffusion_model("/models/Qwen3-8B")) + assert is_diffusion_model("/dataset/FLUX.1-dev") + assert is_diffusion_model("/models/stable-diffusion-2-1") + assert is_diffusion_model("/models/stable-diffusion-xl-base-1.0") + assert not is_diffusion_model("/models/Qwen3-8B") diff --git a/test/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py index 
e6f78ba90..b7f271f20 100644 --- a/test/test_cuda/test_exllamav2_backend.py +++ b/test/test_cuda/test_exllamav2_backend.py @@ -43,7 +43,7 @@ def test_gptqmodel_exllmav2_4bits_asym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -54,7 +54,7 @@ def test_gptqmodel_exllmav2_4bits_asym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ -86,7 +86,7 @@ def test_gptq_exllamav2_4bits_sym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) + assert result["results"]["lambada_openai"]["acc,none"] > 0.27 torch.cuda.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) @@ -121,6 +121,6 @@ def test_gptq_exllamav2_4bits_sym_group_size(self): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=64, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.15) + assert result["results"]["lambada_openai"]["acc,none"] > 0.15 torch.cuda.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) diff --git a/test/test_cuda/test_fp8_input.py b/test/test_cuda/test_fp8_input.py index 90a177ef3..777abaf04 100644 --- a/test/test_cuda/test_fp8_input.py +++ b/test/test_cuda/test_fp8_input.py @@ -55,7 +55,7 @@ def test_small_model_rtn(self): model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.25) + assert result["results"]["lambada_openai"]["acc,none"] > 0.25 shutil.rmtree(self.save_dir, ignore_errors=True) @@ -66,7 +66,7 @@ def test_small_model_iters1(self): model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.25) + assert result["results"]["lambada_openai"]["acc,none"] > 0.25 shutil.rmtree(self.save_dir, ignore_errors=True) @@ -77,7 +77,7 @@ def test_medium_model_rtn(self): model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.55) + assert result["results"]["lambada_openai"]["acc,none"] > 0.55 shutil.rmtree(self.save_dir, ignore_errors=True) @@ -89,7 +89,7 @@ def 
test_medium_model_rtn_with_lm_head(self): model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.55) + assert result["results"]["lambada_openai"]["acc,none"] > 0.55 shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cuda/test_get_block_name.py b/test/test_cuda/test_get_block_name.py index 52f251cb7..829ac1e46 100644 --- a/test/test_cuda/test_get_block_name.py +++ b/test/test_cuda/test_get_block_name.py @@ -193,7 +193,7 @@ def test_flux(self): block_names = get_block_names(model) self.check_block_names(block_names, ["transformer_blocks", "single_transformer_blocks"], [19, 38]) - self.assertTrue(any(["context_embedder" not in n for n in block_names])) + assert any(["context_embedder" not in n for n in block_names]) block_names = get_block_names(model, quant_vision=True) self.check_block_names(block_names, ["transformer_blocks", "single_transformer_blocks"], [19, 38]) diff --git a/test/test_cuda/test_gguf.py b/test/test_cuda/test_gguf.py index b8ee88d0b..a7076667c 100644 --- a/test/test_cuda/test_gguf.py +++ b/test/test_cuda/test_gguf.py @@ -50,7 +50,7 @@ def test_gguf_format(self, dataloader): f"--output_dir {save_dir} --nsample 2 --format gguf:q4_0 --device 0" ) print(save_dir) - self.assertFalse(res > 0 or res == -1, msg="qwen2 tuning fail") + assert not (res > 0 or res == -1), "qwen2 tuning fail" from llama_cpp import Llama @@ -88,7 +88,7 @@ def test_q2_k_export(self, dataloader): from auto_round.eval.evaluation import simple_evaluate_user_model result = simple_evaluate_user_model(model, autoround.tokenizer, batch_size=16, tasks="piqa") - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.45) + assert result["results"]["piqa"]["acc,none"] > 0.45 shutil.rmtree(quantized_model_path, ignore_errors=True) @@ -122,7 +122,7 @@ def test_q4_0(self): from auto_round.eval.evaluation import simple_evaluate_user_model result = simple_evaluate_user_model(model, autoround.tokenizer, batch_size=16, tasks="piqa") - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.54) + assert result["results"]["piqa"]["acc,none"] > 0.54 shutil.rmtree(quantized_model_path, ignore_errors=True) @require_gguf @@ -143,7 +143,7 @@ def test_q4_1(self): from auto_round.eval.evaluation import simple_evaluate_user_model result = simple_evaluate_user_model(model, autoround.tokenizer, batch_size=16, tasks="piqa") - self.assertGreater(result["results"]["piqa"]["acc,none"], 0.54) + assert result["results"]["piqa"]["acc,none"] > 0.54 shutil.rmtree("./saved", ignore_errors=True) @require_gguf @@ -187,11 +187,11 @@ def test_vlm_gguf(self): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - self.assertTrue("mmproj-model.gguf" in os.listdir("./saved")) + assert "mmproj-model.gguf" in os.listdir("./saved") file_size = os.path.getsize("./saved/Qwen2.5-VL-7B-Instruct-Q4_0.gguf") / 1024**2 - self.assertAlmostEqual(file_size, 4242, delta=5.0) + assert abs(file_size - 4242) < 5.0 file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2 - self.assertAlmostEqual(file_size, 2580, delta=5.0) + assert abs(file_size - 2580) < 5.0 shutil.rmtree("./saved", ignore_errors=True) model_name = "/models/gemma-3-12b-it" @@ -208,11 +208,11 @@ def test_vlm_gguf(self): ) quantized_model_path = "./saved" 
autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m") - self.assertTrue("mmproj-model.gguf" in os.listdir("./saved")) + assert "mmproj-model.gguf" in os.listdir("./saved") file_size = os.path.getsize("./saved/gemma-3-12B-it-Q4_K_M.gguf") / 1024**2 - self.assertAlmostEqual(file_size, 6568, delta=5.0) + assert abs(file_size - 6568) < 5.0 file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2 - self.assertAlmostEqual(file_size, 1599, delta=5.0) + assert abs(file_size - 1599) < 5.0 shutil.rmtree(quantized_model_path, ignore_errors=True) # @require_gguf @@ -233,12 +233,12 @@ def test_vlm_gguf(self): # quantized_model_path = "/dataset/Llam-4-test" # shutil.rmtree(quantized_model_path, ignore_errors=True) # autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - # self.assertTrue("mmproj-model.gguf" in os.listdir(quantized_model_path)) + # assert "mmproj-model.gguf" in os.listdir(quantized_model_path) # file_size = ( # os.path.getsize(os.path.join(quantized_model_path, "Llama-4-Scout-17B-16E-Instruct-16x17B-Q4_0.gguf")) # / 1024**2 # ) - # self.assertAlmostEqual(file_size, 58093.62, delta=1.0) + # assert abs(file_size - 58093.62) < 1.0 # file_size = os.path.getsize(os.path.join(quantized_model_path, "mmproj-model.gguf")) / 1024**2 - # self.assertAlmostEqual(file_size, 3326.18, delta=5.0) + # assert abs(file_size - 3326.18) < 5.0 # shutil.rmtree(quantized_model_path, ignore_errors=True) diff --git a/test/test_cuda/test_main_func.py b/test/test_cuda/test_main_func.py index 20dc7bdc8..3243963fe 100644 --- a/test/test_cuda/test_main_func.py +++ b/test/test_cuda/test_main_func.py @@ -81,7 +81,7 @@ def test_backend_awq(self): assert accuracy > 0.35 shutil.rmtree("./saved", ignore_errors=True) - @unittest.skipIf(torch.cuda.is_available() is False, "Skipping because no cuda") + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @require_gptqmodel def test_fp_layers(self): model_name = "/models/opt-125m" @@ -105,7 +105,7 @@ def test_fp_layers(self): assert accuracy > 0.35 shutil.rmtree("./saved", ignore_errors=True) - @unittest.skipIf(torch.cuda.is_available() is False, "Skipping because no cuda") + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_fp_layers_awq(self): @@ -130,7 +130,7 @@ def test_fp_layers_awq(self): assert accuracy > 0.35 shutil.rmtree("./saved", ignore_errors=True) - @unittest.skipIf(torch.cuda.is_available() is False, "Skipping because no cuda") + @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") def test_undivided_group_size_tuning(self): model_name = "/models/opt-125m" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") diff --git a/test/test_cuda/test_marlin_backend.py b/test/test_cuda/test_marlin_backend.py index b920d9478..334cb2697 100644 --- a/test/test_cuda/test_marlin_backend.py +++ b/test/test_cuda/test_marlin_backend.py @@ -40,7 +40,7 @@ def test_marlin_group_size(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.14) + assert result["results"]["lambada_openai"]["acc,none"] > 0.14 for group_size in [32, 128]: print(f"{group_size}!!!!!!!!!!!!!!!!!") @@ -69,7 +69,7 @@ def 
test_marlin_group_size(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.14) + assert result["results"]["lambada_openai"]["acc,none"] > 0.14 @classmethod def setup_class(self): @@ -107,7 +107,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) + assert result["results"]["lambada_openai"]["acc,none"] > 0.27 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -118,7 +118,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) + assert result["results"]["lambada_openai"]["acc,none"] > 0.27 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ -151,7 +151,7 @@ def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): # model_infer(model, tokenizer) # result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - # self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.27) + # assert result['results']['lambada_openai']['acc,none'] > 0.27 # torch.cuda.empty_cache() # # model = AutoModelForCausalLM.from_pretrained( @@ -165,6 +165,6 @@ def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): # model_infer(model, tokenizer) # result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - # self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.27) + # assert result['results']['lambada_openai']['acc,none'] > 0.27 # torch.cuda.empty_cache() # shutil.rmtree("./saved", ignore_errors=True) diff --git a/test/test_cuda/test_mix_bits.py b/test/test_cuda/test_mix_bits.py index b9b7dde5c..958b8ba8e 100644 --- a/test/test_cuda/test_mix_bits.py +++ b/test/test_cuda/test_mix_bits.py @@ -5,7 +5,6 @@ import pytest import torch -from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound @@ -228,7 +227,7 @@ def test_mixed_MXFP_autoround_format_loading(self, dataloader): result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.32) + assert result["results"]["lambada_openai"]["acc,none"] > 0.32 shutil.rmtree(quantized_model_path, ignore_errors=True) def test_mixed_autoround_format_vllm(self, dataloader): diff --git a/test/test_cuda/test_mxfp_nvfp.py b/test/test_cuda/test_mxfp_nvfp.py index 357afb0f3..64436f9b6 100644 --- a/test/test_cuda/test_mxfp_nvfp.py +++ b/test/test_cuda/test_mxfp_nvfp.py @@ -151,5 +151,5 @@ def test_qwen_moe_quant_infer(self, dataloader): result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa") print(result["results"]["piqa"]["acc,none"]) - 
self.assertGreater(result["results"]["piqa"]["acc,none"], 0.7) + assert result["results"]["piqa"]["acc,none"] > 0.7 shutil.rmtree(quantized_model_path, ignore_errors=True) diff --git a/test/test_cuda/test_scheme.py b/test/test_cuda/test_scheme.py index 06c5b27e0..f2ec15bfa 100644 --- a/test/test_cuda/test_scheme.py +++ b/test/test_cuda/test_scheme.py @@ -49,7 +49,7 @@ def test_fp8_static(self): assert ar.data_type == "fp" assert ar.act_data_type == "fp" assert ar.group_size == -1 - assert ar.act_dynamic == False + assert not ar.act_dynamic ar.quantize() ## RTN tests @@ -73,7 +73,7 @@ def test_fp8_static_rtn(self): assert ar.data_type == "fp" assert ar.act_data_type == "fp" assert ar.group_size == -1 - assert ar.act_dynamic == False + assert not ar.act_dynamic ar.quantize() def test_scheme_in_layer_config(self): diff --git a/test/test_cuda/test_support_vlms.py b/test/test_cuda/test_support_vlms.py index 15c86363b..9efd53564 100644 --- a/test/test_cuda/test_support_vlms.py +++ b/test/test_cuda/test_support_vlms.py @@ -29,7 +29,7 @@ def test_qwen2(self): f"cd ../.. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}" ) - self.assertFalse(res > 0 or res == -1, msg="qwen2 tuning fail") + assert not (res > 0 or res == -1), "qwen2 tuning fail" # test infer quantized_model_path = os.path.join(self.save_dir, "Qwen2-VL-2B-Instruct-w4g128") @@ -84,7 +84,7 @@ def test_phi3(self): f"cd ../.. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}" ) - self.assertFalse(res > 0 or res == -1, msg="Phi-3.5 tuning fail") + assert not (res > 0 or res == -1), "Phi-3.5 tuning fail" ## test infer from transformers import AutoModelForCausalLM, AutoProcessor @@ -134,7 +134,7 @@ def test_phi3_vision_awq(self): f"--nsample 64 --seqlen 32 " f"--format auto_awq --output_dir {self.save_dir} --device {self.device}" ) - self.assertFalse(res > 0 or res == -1, msg="Phi-3.5 tuning fail") + assert not (res > 0 or res == -1), "Phi-3.5 tuning fail" ## test infer from transformers import AutoModelForCausalLM, AutoProcessor @@ -180,7 +180,7 @@ def test_glm(self): f"cd ../.. && {self.python_path} -m auto_round " f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}" ) - self.assertFalse(res > 0 or res == -1, msg="glm-4v-9b tuning fail") + assert not (res > 0 or res == -1), "glm-4v-9b tuning fail" def test_granite_vision(self): model_path = "/models/granite-vision-3.2-2b" @@ -189,4 +189,4 @@ def test_granite_vision(self): f"cd ../.. 
&& {self.python_path} -m auto_round " f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}" ) - self.assertFalse(res > 0 or res == -1, msg="granite-vision-3.2-2b tuning fail") + assert not (res > 0 or res == -1), "granite-vision-3.2-2b tuning fail" diff --git a/test/test_cuda/test_torch_backend.py b/test/test_cuda/test_torch_backend.py index 495da24e3..5244725e8 100644 --- a/test/test_cuda/test_torch_backend.py +++ b/test/test_cuda/test_torch_backend.py @@ -49,7 +49,7 @@ def test_torch_4bits_asym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -60,7 +60,7 @@ def test_torch_4bits_asym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.35) + assert result["results"]["lambada_openai"]["acc,none"] > 0.35 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ -90,6 +90,6 @@ def test_torch_4bits_sym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.28) + assert result["results"]["lambada_openai"]["acc,none"] > 0.28 torch.cuda.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) diff --git a/test/test_cuda/test_transformers.py b/test/test_cuda/test_transformers.py index 0e43a7e70..f6e5b4497 100644 --- a/test/test_cuda/test_transformers.py +++ b/test/test_cuda/test_transformers.py @@ -74,12 +74,12 @@ def test_quantized_model(self): """ input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) output = self.quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) - self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + assert self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS def test_raise_if_non_quantized(self): model_id = "facebook/opt-125m" quantization_config = AutoRoundConfig(bits=4) - with self.assertRaises(ValueError): + with pytest.raises(ValueError): _ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config) def test_quantized_model_bf16(self): @@ -96,7 +96,7 @@ def test_quantized_model_bf16(self): ) output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) - self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + assert self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS @require_intel_extension_for_pytorch def test_quantized_model_on_cpu(self): @@ -108,7 +108,7 @@ def test_quantized_model_on_cpu(self): quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto") output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) - self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + assert 
self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS def test_save_pretrained(self): """ @@ -131,7 +131,7 @@ def test_save_pretrained(self): input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) output = model.generate(**input_ids, max_new_tokens=40, do_sample=False) - self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + assert self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS @require_torch_multi_gpu def test_quantized_model_multi_gpu(self): @@ -144,7 +144,7 @@ def test_quantized_model_multi_gpu(self): ) input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(quantized_model.device) output = quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) - self.assertIn(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUTS) + assert self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS def test_convert_from_gptq(self): """ diff --git a/test/test_cuda/test_triton_backend.py b/test/test_cuda/test_triton_backend.py index 38958014b..ac5436f47 100644 --- a/test/test_cuda/test_triton_backend.py +++ b/test/test_cuda/test_triton_backend.py @@ -49,7 +49,7 @@ def test_tritonv2_4bits_asym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.34) + assert result["results"]["lambada_openai"]["acc,none"] > 0.34 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -60,7 +60,7 @@ def test_tritonv2_4bits_asym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.34) + assert result["results"]["lambada_openai"]["acc,none"] > 0.34 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ -82,7 +82,7 @@ def test_tritonv2_2bits_asym(self): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.19) + assert result["results"]["lambada_openai"]["acc,none"] > 0.19 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -93,7 +93,7 @@ def test_tritonv2_2bits_asym(self): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.19) + assert result["results"]["lambada_openai"]["acc,none"] > 0.19 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ -124,7 +124,7 @@ def test_tritonv2_4bits_sym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.26) + assert result["results"]["lambada_openai"]["acc,none"] > 0.26 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -135,7 +135,7 @@ def 
test_tritonv2_4bits_sym(self, dataloader): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.26) + assert result["results"]["lambada_openai"]["acc,none"] > 0.26 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ -158,7 +158,7 @@ def test_tritonv2_8bits_sym(self): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) + assert result["results"]["lambada_openai"]["acc,none"] > 0.27 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -169,7 +169,7 @@ def test_tritonv2_8bits_sym(self): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) + assert result["results"]["lambada_openai"]["acc,none"] > 0.27 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) @@ -197,7 +197,7 @@ def test_tritonv2_2bits_sym(self): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.18) + assert result["results"]["lambada_openai"]["acc,none"] > 0.18 torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( @@ -208,6 +208,6 @@ def test_tritonv2_2bits_sym(self): model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.18) + assert result["results"]["lambada_openai"]["acc,none"] > 0.18 torch.cuda.empty_cache() shutil.rmtree("./saved", ignore_errors=True) diff --git a/test/test_cuda/test_vlms.py b/test/test_cuda/test_vlms.py index bfc7cf52c..c8a4adb53 100644 --- a/test/test_cuda/test_vlms.py +++ b/test/test_cuda/test_vlms.py @@ -119,12 +119,12 @@ def test_mm_block_name(self): model = MllamaForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True, device_map="auto") block_name = get_block_names(model, quant_vision=True) - self.assertTrue(len(block_name) == 3) - self.assertTrue(any(["vision_model.global_transformer.layers.0" not in n for n in block_name])) - self.assertTrue(any(["vision_model.transformer.layers.0" not in n for n in block_name])) + assert len(block_name) == 3 + assert any(["vision_model.global_transformer.layers.0" not in n for n in block_name]) + assert any(["vision_model.transformer.layers.0" not in n for n in block_name]) block_name = get_block_names(model, quant_vision=False) - self.assertTrue(len(block_name) == 1) - self.assertTrue(get_block_names(model) == block_name) + assert len(block_name) == 1 + assert get_block_names(model) == block_name def test_mllm_detect(self): from auto_round.utils import is_mllm_model, llm_load_model, mllm_load_model @@ -140,14 +140,14 @@ def test_mllm_detect(self): "/models/InternVL3-1B", "/models/pixtral-12b", ]: - self.assertTrue(is_mllm_model(model_name)) + assert is_mllm_model(model_name) try: model, 
_, _, _ = mllm_load_model(model_name) except: continue - self.assertTrue(is_mllm_model(model)) + assert is_mllm_model(model) for model_name in ["/models/glm-4-9b-chat", "/models/Qwen2.5-1.5B-Instruct/"]: - self.assertFalse(is_mllm_model(model_name)) + assert not is_mllm_model(model_name) model, _ = llm_load_model(model_name) - self.assertFalse(is_mllm_model(model)) + assert not is_mllm_model(model) diff --git a/test/test_hpu/test_auto_round.py b/test/test_hpu/test_auto_round.py index eb6066982..d2e33dd03 100644 --- a/test/test_hpu/test_auto_round.py +++ b/test/test_hpu/test_auto_round.py @@ -3,7 +3,7 @@ from auto_round.utils import is_hpex_available -from ..helpers import is_pytest_mode_compile, is_pytest_mode_lazy +from ..helpers import get_model_path, is_pytest_mode_compile, is_pytest_mode_lazy def run_opt_125m_on_hpu(): @@ -11,7 +11,7 @@ def run_opt_125m_on_hpu(): from auto_round import AutoRound - model_name = "facebook/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -56,7 +56,7 @@ def test_w4a8(data_type): from auto_round import AutoRound - model_name = "facebook/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="auto", diff --git a/test/test_xpu/test_autoround.py b/test/test_xpu/test_autoround.py index b9894cecf..d857e3bdc 100644 --- a/test/test_xpu/test_autoround.py +++ b/test/test_xpu/test_autoround.py @@ -8,11 +8,13 @@ from auto_round import AutoRound, AutoRoundConfig +from ..helpers import get_model_path + class TestAutoRoundXPU: @classmethod def setup_class(self): - + pass @classmethod def teardown_class(self): @@ -20,8 +22,8 @@ def teardown_class(self): shutil.rmtree("runs", ignore_errors=True) pass - def test_gptq_format(self): - model_name = "facebook/opt-125m" + def test_gptq_format(self, dataloader): + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="auto", trust_remote_code=True, device_map="auto" ) @@ -53,8 +55,8 @@ def test_gptq_format(self): print(res) assert "!!!" 
not in res

-    def test_awq_format(self):
-        model_name = "facebook/opt-125m"
+    def test_awq_format(self, dataloader):
+        model_name = get_model_path("facebook/opt-125m")
         model = AutoModelForCausalLM.from_pretrained(
             model_name, torch_dtype="auto", trust_remote_code=True, device_map="xpu"
         )

From 9d26d04ec8d27be5275ebe8649b79c3ef05c4da2 Mon Sep 17 00:00:00 2001
From: n1ck-guo
Date: Fri, 19 Dec 2025 03:07:20 -0500
Subject: [PATCH 10/24] update cuda ut

Signed-off-by: n1ck-guo
---
 test/helpers.py | 6 +-
 test/test_cuda/test_alg_ext.py | 4 +-
 test/test_cuda/test_asym.py | 130 +++++++++++------------
 test/test_cuda/test_auto_round_format.py | 90 +++++++---------
 test/test_cuda/test_auto_scheme.py | 116 ++++++++++----------
 test/test_cuda/test_calib_dataset.py | 24 +----
 test/test_cuda/test_conv1d.py | 26 +++--
 test/test_cuda/test_diffusion.py | 25 +++--
 test/test_cuda/test_exllamav2_backend.py | 63 ++++++-----
 test/test_cuda/test_export.py | 88 ++++++++-------
 test/test_cuda/test_fp8_input.py | 63 ++++++-----
 test/test_cuda/test_scheme.py | 4 +-
 12 files changed, 318 insertions(+), 321 deletions(-)

diff --git a/test/helpers.py b/test/helpers.py
index d67f85599..a46a9a58b 100644
--- a/test/helpers.py
+++ b/test/helpers.py
@@ -25,8 +25,10 @@ def get_model_path(model_name: str) -> str:
 # Slice model into tiny model for speedup
-def get_tiny_model(model_name_or_path, num_layers=3):
-    model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="auto", trust_remote_code=True)
+def get_tiny_model(model_name_or_path, num_layers=3, **kwargs):
+    kwargs["dtype"] = "auto" if "dtype" not in kwargs else kwargs["dtype"]
+    kwargs["trust_remote_code"] = True if "trust_remote_code" not in kwargs else kwargs["trust_remote_code"]
+    model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, **kwargs)
     if hasattr(model.config, "num_hidden_layers"):
         model.config.num_hidden_layers = num_layers
diff --git a/test/test_cuda/test_alg_ext.py b/test/test_cuda/test_alg_ext.py
index e13bfac4a..6b04847ed 100644
--- a/test/test_cuda/test_alg_ext.py
+++ b/test/test_cuda/test_alg_ext.py
@@ -49,13 +49,13 @@ def test_cli(self, tiny_opt_model_path):
         python_path = sys.executable
         res = os.system(
-            f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits --nsampes 1 --seqlen 32"
+            f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits --nsamples 1 --seqlen 32"
         )
         if res > 0 or res == -1:
             assert False, "cmd line test fail, please have a check"
         res = os.system(
-            f"cd ../.. 
&& CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile --nsamples 1 --seqlen 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" diff --git a/test/test_cuda/test_asym.py b/test/test_cuda/test_asym.py index c41c0d5d8..1eda6f146 100644 --- a/test/test_cuda/test_asym.py +++ b/test/test_cuda/test_asym.py @@ -3,16 +3,16 @@ import sys import unittest -sys.path.insert(0, "../..") - +import pytest import torch -from _test_helpers import model_infer from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.utils import get_module +from ..helpers import model_infer + class LLMDataLoader: def __init__(self): @@ -23,140 +23,138 @@ def __iter__(self): yield torch.ones([1, 10], dtype=torch.long) -class TestAutoRoundAsym(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model_name = "/models/opt-125m" - # self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" - self.save_folder = "./saved" +class TestAutoRoundAsym: + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield - @classmethod - def tearDownClass(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_asym_group_size(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_group_size(self, tiny_opt_model_path): for group_size in [32, 64, 128]: bits, sym = 4, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1) - ar.quantize_and_save(format="auto_round", output_dir=self.save_folder) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1 + ) + ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) # TODO when ark is ready, uncomment the following lines to do inference test # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) - def test_asym_bits(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_bits(self, tiny_opt_model_path): for bits in [2, 3, 8]: group_size, sym = 128, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1) - ar.quantize_and_save(format="auto_round", output_dir=self.save_folder) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1 + ) + ar.quantize_and_save(format="auto_round", 
output_dir=self.save_dir) # TODO when ark is ready, uncomment the following lines to do inference test # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) # use parameters later - def test_asym_format(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_format(self, tiny_opt_model_path): for format in ["auto_round", "auto_round:auto_gptq", "auto_round:gptqmodel"]: bits, group_size, sym = 4, 128, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=0, seqlen=2, nsamples=1 + ) # TODO when ark is ready, uncomment the following lines to do inference test - ar.quantize_and_save(format=format, output_dir=self.save_folder) + ar.quantize_and_save(format=format, output_dir=self.save_dir) # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) - def test_asym_group_size_with_tuning(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_group_size_with_tuning(self, tiny_opt_model_path): for group_size in [32, 64, 128]: bits, sym = 4, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1) - ar.quantize_and_save(format="auto_round", output_dir=self.save_folder) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1 + ) + ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) # TODO when ark is ready, uncomment the following lines to do inference test # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) - def test_asym_bits_with_tuning(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_bits_with_tuning(self, tiny_opt_model_path): for bits in [2, 3, 8]: group_size, sym = 128, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1) - ar.quantize_and_save(format="auto_round", output_dir=self.save_folder) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1 + ) + ar.quantize_and_save(format="auto_round", output_dir=self.save_dir) # TODO when ark is ready, uncomment the following lines to do inference test # model = 
AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) # use parameters later - def test_asym_format_with_tuning(self): - model_name = self.model_name - model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_asym_format_with_tuning(self, tiny_opt_model_path): for format in ["auto_round", "auto_round:auto_gptq", "auto_round:gptqmodel"]: bits, group_size, sym = 4, 128, False - ar = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1) + ar = AutoRound( + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, nsamples=1 + ) # TODO when ark is ready, uncomment the following lines to do inference test - ar.quantize_and_save(format=format, output_dir=self.save_folder) + ar.quantize_and_save(format=format, output_dir=self.save_dir) # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype="auto", # device_map="auto", # ) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) - shutil.rmtree(self.save_folder) + shutil.rmtree(self.save_dir) diff --git a/test/test_cuda/test_auto_round_format.py b/test/test_cuda/test_auto_round_format.py index cbc6868f1..6ec5edff9 100644 --- a/test/test_cuda/test_auto_round_format.py +++ b/test/test_cuda/test_auto_round_format.py @@ -16,31 +16,33 @@ require_package_version_ut, ) -from ..helpers import model_infer +from ..helpers import get_model_path, get_tiny_model, model_infer class TestAutoRound: - @classmethod - def setup_class(self): - self.model_name = "facebook/opt-125m" + save_dir = "./saved" - self.save_folder = "./saved" + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - @classmethod - def teardown_class(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_greater_than_050 @require_package_version_ut("transformers", "<4.57.0") - def test_autoround_asym(self, dataloader): + def test_autoround_asym(self, tiny_opt_model_path, dataloader): for bits in [2, 3, 4, 8]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + # model_name = get_model_path("facebook/opt-125m") bits, group_size, sym = bits, 128, False autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, @@ -48,7 +50,7 @@ def test_autoround_asym(self, dataloader): seqlen=2, dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") @@ -61,12 +63,11 @@ def test_autoround_asym(self, dataloader): res = 
tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) print(res) assert "!!!" not in res - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_autogptq def test_mixed_precision(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_name = get_model_path("facebook/opt-125m") layer_config = {} layer_config["model.decoder.layers.0.self_attn.k_proj"] = {"bits": 8} @@ -76,15 +77,15 @@ def test_mixed_precision(self): } ## 3bits when using asym will have some issue layer_config["model.decoder.layers.6.self_attn.out_proj"] = {"bits": 2, "group_size": 32} bits, group_size, sym = 4, 128, True - autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config) - quantized_model_path = self.save_folder + autoround = AutoRound(model_name, bits=bits, group_size=group_size, sym=sym, layer_config=layer_config) + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") quantization_config = AutoRoundConfig(backend="auto") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -93,27 +94,25 @@ def test_mixed_precision(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_awq_backend(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_name = get_model_path("facebook/opt-125m") bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + model_name, bits=bits, group_size=group_size, iters=1, nsamples=1, sym=sym, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:auto_awq") quantization_config = AutoRoundConfig(backend="auto") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -121,18 +120,18 @@ def test_awq_backend(self): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = 
AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_greater_than_050 def test_tritonv2_bf16(self): - model_name = "OPEA/Meta-Llama-3.1-8B-Instruct-int4-sym-inc" + model_name = get_model_path("OPEA/Meta-Llama-3.1-8B-Instruct-int4-sym-inc") quantization_config = AutoRoundConfig(backend="tritonv2") - model = AutoModelForCausalLM.from_pretrained( + model = get_tiny_model( model_name, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) @@ -142,13 +141,10 @@ def test_tritonv2_bf16(self): torch.cuda.empty_cache() @require_ipex - def test_autoround_gptq_sym_format(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoround_gptq_sym_format(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, @@ -197,13 +193,10 @@ def test_autoround_gptq_sym_format(self, dataloader): @require_awq @require_ipex @require_package_version_ut("transformers", "<4.57.0") - def test_autoround_awq_sym_format(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoround_awq_sym_format(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, @@ -236,14 +229,11 @@ def test_autoround_awq_sym_format(self, dataloader): shutil.rmtree("./saved", ignore_errors=True) @require_greater_than_050 - def test_autoround_sym(self, dataloader): + def test_autoround_sym(self, tiny_opt_model_path, dataloader): for bits in [2, 3, 4, 8]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = bits, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, @@ -264,11 +254,11 @@ def test_autoround_sym(self, dataloader): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) print(res) assert "!!!" 
not in res - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_greater_than_050 def test_load_gptq_model_3bits(self): - model_name = "LucasSantiago257/gemma-2b-2bits-gptq" + model_name = get_model_path("LucasSantiago257/gemma-2b-2bits-gptq") quantization_config = AutoRoundConfig() model = AutoModelForCausalLM.from_pretrained( model_name, diff --git a/test/test_cuda/test_auto_scheme.py b/test/test_cuda/test_auto_scheme.py index b6f5d8066..b8c40c470 100644 --- a/test/test_cuda/test_auto_scheme.py +++ b/test/test_cuda/test_auto_scheme.py @@ -3,6 +3,7 @@ import shutil import pytest +import transformers from auto_round import AutoRound, AutoRoundConfig, AutoScheme from auto_round.auto_scheme.utils import compute_avg_bits_for_model @@ -10,63 +11,68 @@ from auto_round.testing_utils import multi_card from auto_round.utils import get_module +from ..helpers import get_model_path, get_tiny_model + class TestAutoScheme: - @classmethod - def setup_class(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - @classmethod - def teardown_class(self): + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_gguf_k_0(self): - model_name = "/models/Qwen3-0.6B" + def test_gguf_k_0(self, tiny_qwen_model_path): target_bits = 5.5 scheme = AutoScheme(avg_bits=target_bits, options=("GGUF:Q4_K_M", "GGUF:Q8_0")) - ar = AutoRound(model=model_name, scheme=scheme, iters=1, enable_alg_ext=True) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=1, enable_alg_ext=True) ar.quantize_and_save(self.save_dir, format="gguf:q2_k_s") shutil.rmtree(self.save_dir, ignore_errors=True) - def test_gguf_k_1(self): - model_name = "/models/Qwen3-0.6B" + def test_gguf_k_1(self, tiny_qwen_model_path): target_bits = 3.5 scheme = AutoScheme(avg_bits=target_bits, options=("GGUF:Q2_K_S", "GGUF:Q4_1")) - ar = AutoRound(model=model_name, scheme=scheme, iters=1, enable_alg_ext=True) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=1, enable_alg_ext=True) ar.quantize_and_save(self.save_dir, format="gguf:q2_k_s") shutil.rmtree(self.save_dir, ignore_errors=True) # - def test_embedding_fallback(self): - model_name = "/models/Qwen3-0.6B" + def test_embedding_fallback(self, tiny_qwen_model_path): target_bits = 5.0 scheme = AutoScheme(avg_bits=target_bits, options=("GGUF:Q4_K_M", "GGUF:Q8_0")) - ar = AutoRound(model=model_name, scheme=scheme, iters=1, enable_alg_ext=True) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=1, enable_alg_ext=True) ar.quantize_and_save(self.save_dir, format="gguf:q2_k_s") shutil.rmtree(self.save_dir, ignore_errors=True) - def test_gguf_export(self): - model_name = "/models/Qwen3-0.6B" + def test_gguf_export(self, tiny_qwen_model_path): target_bits = 3 scheme = AutoScheme(avg_bits=target_bits, options=("GGUF:Q2_K_S", "GGUF:Q4_K_M"), ignore_scale_zp_bits=True) - ar = AutoRound(model=model_name, scheme=scheme, iters=0) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0) ar.quantize_and_save(self.save_dir, format="gguf:q2_k_s") shutil.rmtree(self.save_dir, ignore_errors=True) def test_gguf(self): - 
model_name = "/models/Qwen3-8B" + model_name = get_model_path("qwen/Qwen3-8B") + model = get_tiny_model(model_name) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) target_bits = 3 scheme = AutoScheme(avg_bits=target_bits, options=("GGUF:Q2_K_S", "GGUF:Q4_K_M"), ignore_scale_zp_bits=True) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, disable_opt_rtn=True) + ar = AutoRound(model=model, tokenizer=tokenizer, scheme=scheme, iters=0, nsamples=1, disable_opt_rtn=True) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model, ignore_scale_zp_bits=True) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 def test_shared_layers(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") from transformers import AutoModelForCausalLM, AutoTokenizer model = AutoModelForCausalLM.from_pretrained(model_name) @@ -106,62 +112,55 @@ def test_shared_layers(self): # @multi_card - def test_multi_card(self): - model_name = "/models/Qwen3-0.6B" + def test_multi_card(self, tiny_qwen_model_path): target_bits = 4.5 for device_map in ["auto", "0,1", "0", None]: scheme = AutoScheme(avg_bits=target_bits, options=("NVFP4")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, device_map=device_map) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1, device_map=device_map) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 @multi_card - def test_multi_card_1(self): - model_name = "/models/Qwen3-0.6B" + def test_multi_card_1(self, tiny_qwen_model_path): target_bits = 4.5 from transformers import AutoModelForCausalLM, AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(model_name) - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto") scheme = AutoScheme(avg_bits=target_bits, options=("NVFP4")) - ar = AutoRound(model=model, tokenizer=tokenizer, scheme=scheme, iters=0, nsamples=1) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - def test_non_low_gpu_mem_usage(self): - model_name = "/models/Qwen3-0.6B" + def test_non_low_gpu_mem_usage(self, tiny_qwen_model_path): target_bits = 4.5 # for device_map in ["auto", "0,1", "0", None]: scheme = AutoScheme(avg_bits=target_bits, options=("NVFP4"), low_gpu_mem_usage=False, device_map="auto") - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 @multi_card - def test_dict_device_map(self): - model_name = "/models/Qwen3-8B" + def test_dict_device_map(self, tiny_qwen_model_path): target_bits = 8.25 device_map = {"up_proj": 0, "down_proj": 1} scheme = AutoScheme(avg_bits=target_bits, options=("MXFP8")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, device_map=device_map) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1, device_map=device_map) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert 
target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - def test_min_target_bits(self): - model_name = "/models/opt-125m" + def test_min_target_bits(self, tiny_opt_model_path): target_bits = 4.644 scheme = AutoScheme(avg_bits=target_bits, options=("MXFP4", "W8A16")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) + ar = AutoRound(model=tiny_opt_model_path, scheme=scheme, iters=0, nsamples=1) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) @@ -169,20 +168,19 @@ def test_min_target_bits(self): # def test_max_target_bits(self): - model_name = "/models/opt-125m" target_bits = 8.025 + model_path = get_model_path("facebook/opt-125m") scheme = AutoScheme(avg_bits=target_bits, options=("MXFP4", "W8A16")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) + ar = AutoRound(model=model_path, scheme=scheme, iters=0, nsamples=1) model, layer_config = ar.quantize() avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - def test_patch_scheme(self): - model_name = "/models/opt-125m" + def test_patch_scheme(self, tiny_opt_model_path): target_bits = 5 scheme = AutoScheme(avg_bits=target_bits, options=("MXFP4", "W8A16")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, group_size=32) + ar = AutoRound(model=tiny_opt_model_path, scheme=scheme, iters=0, nsamples=1, group_size=32) model, layer_config = ar.quantize() for n, m in model.named_modules(): if hasattr(m, "group_size"): @@ -193,74 +191,74 @@ def test_patch_scheme(self): def test_layer_config(self): target_bits = 3.0 - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "BF16")) user_layer_config = {"model.decoder.layers.10.fc1": {"bits": 8, "group_size": 32, "sym": False}} ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() assert layer_config["model.decoder.layers.10.fc1"]["bits"] == 8 - assert layer_config["model.decoder.layers.10.fc1"]["sym"] == False + assert layer_config["model.decoder.layers.10.fc1"]["sym"] is False assert layer_config["model.decoder.layers.10.fc1"]["group_size"] == 32 layer = get_module(model, "model.decoder.layers.10.fc1") assert layer.bits == 8 - assert layer.sym == False + assert layer.sym is False assert layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 target_bits = 5.5 - model_name = "/models/opt-125m" scheme = AutoScheme(avg_bits=target_bits, options=("mxfp4", "mxfp8")) user_layer_config = {"model.decoder.layers.10.fc1": {"bits": 8, "group_size": 32, "sym": False}} ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() assert layer_config["model.decoder.layers.10.fc1"]["bits"] == 8 - assert layer_config["model.decoder.layers.10.fc1"]["sym"] == False + assert layer_config["model.decoder.layers.10.fc1"]["sym"] is False assert layer_config["model.decoder.layers.10.fc1"]["group_size"] == 32 layer = get_module(model, "model.decoder.layers.10.fc1") assert layer.orig_layer.bits == 8 - assert layer.orig_layer.sym == False + assert layer.orig_layer.sym is False assert layer.orig_layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= 
target_bits + 1e-3 def test_lm_head_and_mix_dtype(self): - model_name = "/models/Qwen3-8B" + model_name = get_model_path("qwen/Qwen3-8B") + model = get_tiny_model(model_name) + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) target_bits = 6 scheme = AutoScheme(avg_bits=target_bits, options=("MXFP4", "MXFP8")) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, quant_lm_head=True) + ar = AutoRound(model=model, tokenizer=tokenizer, scheme=scheme, iters=0, nsamples=1, quant_lm_head=True) model, layer_config = ar.quantize() - self.assertLessEqual(layer_config["lm_head"]["bits"], 8) + assert layer_config["lm_head"]["bits"] <= 8 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - def test_auto_scheme_export(self): - model_name = "/models/opt-125m" + def test_auto_scheme_export(self, tiny_qwen_model_path): + model_name = get_model_path("facebook/opt-125m") scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "W8A16", "BF16")) ar = AutoRound(model=model_name, scheme=scheme) ar.quantize_and_save(self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.25) + assert result["results"]["lambada_openai"]["acc,none"] > 0.25 shutil.rmtree(self.save_dir, ignore_errors=True) - model_name = "/models/Qwen3-0.6B" scheme = AutoScheme(avg_bits=3, options=("gguf:q2_k_s,gguf:q4_k_s"), nsamples=1, ignore_scale_zp_bits=True) - ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1) + ar = AutoRound(model=tiny_qwen_model_path, scheme=scheme, iters=0, nsamples=1) ar.quantize_and_save(self.save_dir) shutil.rmtree(self.save_dir, ignore_errors=True) def test_enable_torch_compile(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") scheme = AutoScheme(avg_bits=2, options=("W2A16"), ignore_scale_zp_bits=True) ar = AutoRound(model=model_name, scheme=scheme, enable_torch_compile=True) ar.quantize_and_save(self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") print(result["results"]["lambada_openai"]["acc,none"]) - self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.10) + assert result["results"]["lambada_openai"]["acc,none"] > 0.10 shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cuda/test_calib_dataset.py b/test/test_cuda/test_calib_dataset.py index 6a36c21b1..bdee2ebeb 100644 --- a/test/test_cuda/test_calib_dataset.py +++ b/test/test_cuda/test_calib_dataset.py @@ -10,30 +10,10 @@ class TestLocalCalibDataset: - @classmethod - def setup_class(self): - json_data = [{"text": "awefdsfsddfd"}, {"text": "fdfdfsdfdfdfd"}, {"text": "dfdsfsdfdfdfdf"}] - os.makedirs("./saved", exist_ok=True) - self.json_file = "./saved/tmp.json" - with open(self.json_file, "w") as json_file: - json.dump(json_data, json_file, indent=4) - - jsonl_data = [{"text": "哈哈,開心點"}, {"text": "hello world"}] - os.makedirs("./saved", exist_ok=True) - self.jsonl_file = "./saved/tmp.jsonl" - with open(self.jsonl_file, "w") as jsonl_file: - for item in jsonl_data: - json.dump(item, jsonl_file, ensure_ascii=False) - jsonl_file.write("\n") - - model_name = "facebook/opt-125m" - self.model = 
AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - - def test_combine_dataset(self): + def test_combine_dataset(self, tiny_opt_model_path): dataset = "NeelNanda/pile-10k" + ",BAAI/CCI3-HQ" + ",madao33/new-title-chinese" bits, group_size, sym = 4, 128, True autoround = AutoRound( - self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=128, dataset=dataset ) autoround.quantize() diff --git a/test/test_cuda/test_conv1d.py b/test/test_cuda/test_conv1d.py index c5384a384..11f80a1b2 100644 --- a/test/test_cuda/test_conv1d.py +++ b/test/test_cuda/test_conv1d.py @@ -8,14 +8,22 @@ from auto_round import AutoRound from auto_round.testing_utils import require_gptqmodel -from ..helpers import model_infer +from ..helpers import get_model_path, get_tiny_model, model_infer class TestQuantizationConv1d: - @classmethod - def setup_class(self): - self.model_name = "MBZUAI/LaMini-GPT-124M" - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) @classmethod def teardown_class(self): @@ -24,13 +32,15 @@ def teardown_class(self): @require_gptqmodel def test_quant(self, dataloader): - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model_name = get_model_path("MBZUAI/LaMini-GPT-124M") + model = get_tiny_model(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, True from auto_round import AutoRoundConfig autoround = AutoRound( - self.model, - self.tokenizer, + model, + tokenizer, bits=bits, group_size=group_size, sym=sym, diff --git a/test/test_cuda/test_diffusion.py b/test/test_cuda/test_diffusion.py index 147a34d47..a3a90d14e 100644 --- a/test/test_cuda/test_diffusion.py +++ b/test/test_cuda/test_diffusion.py @@ -13,12 +13,19 @@ class TestAutoRound: - @classmethod - def setup_class(self): - self.model_name = "/dataset/FLUX.1-dev" + model_name = "/dataset/FLUX.1-dev" - @classmethod - def teardown_class(self): + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_optimum @@ -69,7 +76,7 @@ def test_diffusion_rtn(self): def test_diffusion_model_checker(self): from auto_round.utils import is_diffusion_model - self.assertTrue(is_diffusion_model("/dataset/FLUX.1-dev")) - self.assertTrue(is_diffusion_model("/models/stable-diffusion-2-1")) - self.assertTrue(is_diffusion_model("/models/stable-diffusion-xl-base-1.0")) - self.assertFalse(is_diffusion_model("/models/Qwen3-8B")) + assert 
is_diffusion_model("/dataset/FLUX.1-dev") + assert is_diffusion_model("/models/stable-diffusion-2-1") + assert is_diffusion_model("/models/stable-diffusion-xl-base-1.0") + assert is_diffusion_model("/models/Qwen3-8B") is False diff --git a/test/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py index e6f78ba90..fb08acb29 100644 --- a/test/test_cuda/test_exllamav2_backend.py +++ b/test/test_cuda/test_exllamav2_backend.py @@ -8,38 +8,41 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel, require_package_version_ut -from ..helpers import model_infer +from ..helpers import get_model_path, model_infer class TestAutoRoundexllamaBackend: + save_dir = "./saved" - @classmethod - def setup_class(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") - @classmethod - def teardown_class(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel def test_gptqmodel_exllmav2_4bits_asym(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_path = get_model_path("facebook/opt-125m") bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, dataset=dataloader + model_path, bits=bits, group_size=group_size, sym=sym, iters=1, seqlen=2, dataset=dataloader ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") quantization_config = AutoRoundConfig(backend="gptqmodel:exllamav2") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -47,10 +50,10 @@ def test_gptqmodel_exllmav2_4bits_asym(self, dataloader): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -61,12 +64,10 @@ def test_gptqmodel_exllmav2_4bits_asym(self, dataloader): @require_autogptq 
@require_package_version_ut("torch", "<2.6.0") def test_gptq_exllamav2_4bits_sym(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_path = get_model_path("facebook/opt-125m") bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, group_size=group_size, sym=sym, @@ -74,53 +75,51 @@ def test_gptq_exllamav2_4bits_sym(self, dataloader): seqlen=2, dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model quantization_config = AutoRoundConfig(backend="gptq:exllamav2") ## or exllamav2 model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.27) torch.cuda.empty_cache() - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_autogptq @require_package_version_ut("torch", "<2.6.0") def test_gptq_exllamav2_4bits_sym_group_size(self): + model_path = get_model_path("facebook/opt-125m") for group_size in [-1, 32, 64, 128, 256, 1024]: ## 384, 768 has accuracy issue print(f"!!!!!!!!!!!!!!!!!{group_size}!!!!!!!!!!!!!!!!!") - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, group_size, True autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, iters=1, nsamples=1, group_size=group_size, sym=sym, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save( output_dir=quantized_model_path, format="auto_round" ) ##will convert to gptq model quantization_config = AutoRoundConfig(backend="gptq:exllamav2") ## or exllamav2 model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=64, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) self.assertGreater(result["results"]["lambada_openai"]["acc,none"], 0.15) torch.cuda.empty_cache() - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cuda/test_export.py b/test/test_cuda/test_export.py index 3e1171162..114760ac3 100644 --- a/test/test_cuda/test_export.py +++ b/test/test_cuda/test_export.py @@ -9,26 +9,31 @@ from auto_round import AutoRound 
from auto_round.testing_utils import require_awq, require_optimum, require_package_version_ut +from ..helpers import get_model_path, get_tiny_model + class TestAutoRound: - @classmethod - def setup_class(self): - self.model_name = "facebook/opt-125m" - self.save_dir = "./saved" + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield - @classmethod - def teardown_class(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_optimum def test_autogptq_format(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_path = get_model_path("facebook/opt-125m") bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, group_size=group_size, sym=sym, @@ -53,10 +58,10 @@ def test_autogptq_format(self, dataloader): shutil.rmtree("./saved", ignore_errors=True) @require_optimum - def test_autogptq_format_fp_layers(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autogptq_format_fp_layers(self, tiny_opt_model_path, dataloader): layer_config = {} + model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path) + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path) for n, m in model.named_modules(): if "q_proj" in n: layer_config[n] = {"bits": 16} @@ -91,8 +96,9 @@ def test_autogptq_format_fp_layers(self, dataloader): shutil.rmtree("./saved", ignore_errors=True) def test_autogptq_format_qsave_fp_layers(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_path = get_model_path("facebook/opt-125m") + model = AutoModelForCausalLM.from_pretrained(model_path) + layer_config = {} for n, m in model.named_modules(): if "q_proj" in n: @@ -100,8 +106,7 @@ def test_autogptq_format_qsave_fp_layers(self, dataloader): bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, group_size=group_size, sym=sym, @@ -141,13 +146,10 @@ def test_autogptq_format_qsave_fp_layers(self, dataloader): ##print(res) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_format(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoround_format(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 4, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, @@ -175,12 +177,10 @@ def test_autoround_format(self, dataloader): @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_autoawq_format(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - 
tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_path = get_model_path("facebook/opt-125m") bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, group_size=group_size, sym=sym, @@ -209,16 +209,14 @@ def test_autoawq_format(self, dataloader): @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_autoawq_format_fp_qsave_layers(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) + model_path = get_model_path("facebook/opt-125m") layer_config = { "model.decoder.layers.0.self_attn.k_proj": {"bits": 16}, "model.decoder.layers.9.self_attn.v_proj": {"bits": 16}, } - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( - model, - tokenizer, + model_path, bits=bits, group_size=group_size, sym=sym, @@ -249,13 +247,10 @@ def test_autoawq_format_fp_qsave_layers(self, dataloader): shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_3bit_asym_torch_format(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoround_3bit_asym_torch_format(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 3, 128, False autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, @@ -278,13 +273,10 @@ def test_autoround_3bit_asym_torch_format(self, dataloader): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_autoround_3bit_sym_torch_format(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + def test_autoround_3bit_sym_torch_format(self, tiny_opt_model_path, dataloader): bits, group_size, sym = 3, 128, True autoround = AutoRound( - model, - tokenizer, + tiny_opt_model_path, bits=bits, group_size=group_size, sym=sym, @@ -312,12 +304,15 @@ def test_autoround_3bit_sym_torch_format(self, dataloader): def test_awq_lmhead_export(self, dataloader): bits, sym, group_size = 4, False, 128 - model_name = "/models/phi-2" + model_name = get_model_path("microsoft/phi-2") + tiny_model = get_tiny_model(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) layer_config = { "lm_head": {"bits": 4}, # set lm_head quant } autoround = AutoRound( - model=model_name, + model=tiny_model, + tokenizer=tokenizer, bits=bits, group_size=group_size, sym=sym, @@ -342,14 +337,17 @@ def test_awq_lmhead_export(self, dataloader): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_gptq_lmhead_export(self, dataloader): + def test_gptq_lmhead_export(self, tiny_qwen_model_path, dataloader): bits, sym, group_size = 4, True, 128 - model_name = "/models/phi-2" + model_name = get_model_path("microsoft/phi-2") + tiny_model = get_tiny_model(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) layer_config = { "lm_head": {"bits": 4}, # set lm_head quant } autoround = AutoRound( - model=model_name, + model=tiny_model, + tokenizer=tokenizer, bits=bits, group_size=group_size, sym=sym, diff --git 
a/test/test_cuda/test_fp8_input.py b/test/test_cuda/test_fp8_input.py index 90a177ef3..4b597f378 100644 --- a/test/test_cuda/test_fp8_input.py +++ b/test/test_cuda/test_fp8_input.py @@ -8,21 +8,36 @@ from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate +from auto_round.utils import llm_load_model + +from ..helpers import get_model_path, get_tiny_model class TestAutoRound: - @classmethod - def setup_class(self): - self.save_dir = "./saved" + save_dir = "./saved" - @classmethod - def teardown_class(self): - shutil.rmtree(self.save_dir, ignore_errors=True) + def tiny_fp8_model(self): + model_name = get_model_path("qwen/Qwen3-0.6B-FP8") + model, tokenizer = llm_load_model(model_name) + model.model.layers = model.model.layers[:3] + return model, tokenizer + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) def test_small_model_rtn_generation(self): - model_name = "/models/Qwen3-0.6B-FP8" - ar = AutoRound(model=model_name, iters=0) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) ar.quantize_and_save(output_dir=self.save_dir) model = AutoModelForCausalLM.from_pretrained(self.save_dir, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.save_dir) @@ -32,8 +47,8 @@ def test_small_model_rtn_generation(self): shutil.rmtree(self.save_dir, ignore_errors=True) def test_gguf_imatrix(self): - model_name = "/models/Qwen3-0.6B-FP8" - ar = AutoRound(model=model_name, iters=0) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) ar.quantize_and_save(format="gguf:q2_k_s", output_dir=self.save_dir) # from llama_cpp import Llama # @@ -49,8 +64,8 @@ def test_gguf_imatrix(self): # print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) def test_small_model_rtn(self): - model_name = "/models/Qwen3-0.6B-FP8" - ar = AutoRound(model=model_name, iters=0) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) _, folder = ar.quantize_and_save(output_dir=self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") @@ -60,8 +75,8 @@ def test_small_model_rtn(self): shutil.rmtree(self.save_dir, ignore_errors=True) def test_small_model_iters1(self): - model_name = "/models/Qwen3-0.6B-FP8" - ar = AutoRound(model=model_name, iters=1) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=1) _, folder = ar.quantize_and_save(output_dir=self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") @@ -71,8 +86,8 @@ def test_small_model_iters1(self): shutil.rmtree(self.save_dir, ignore_errors=True) def test_medium_model_rtn(self): - model_name = "/models/Qwen3-8B-FP8" - ar = AutoRound(model=model_name, iters=0) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) _, folder = ar.quantize_and_save(output_dir=self.save_dir) model_args = 
f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") @@ -82,9 +97,9 @@ def test_medium_model_rtn(self): shutil.rmtree(self.save_dir, ignore_errors=True) def test_medium_model_rtn_with_lm_head(self): - model_name = "/models/Qwen3-8B-FP8" + model, tokenizer = self.tiny_fp8_model() layer_config = {"lm_head": {"bits": 4}} - ar = AutoRound(model=model_name, iters=0, layer_config=layer_config) + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0, layer_config=layer_config) _, folder = ar.quantize_and_save(output_dir=self.save_dir) model_args = f"pretrained={self.save_dir}" result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto") @@ -96,9 +111,8 @@ def test_medium_model_rtn_with_lm_head(self): def test_fp8_model_gguf(self): from llama_cpp import Llama - model_name = "Qwen/Qwen3-0.6B-FP8" - - ar = AutoRound(model=model_name, iters=0) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=0) ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q4_0") for file in os.listdir(self.save_dir): if file.endswith(".gguf"): @@ -108,7 +122,8 @@ def test_fp8_model_gguf(self): print(output) shutil.rmtree(self.save_dir, ignore_errors=True) - ar = AutoRound(model=model_name, iters=1) + model, tokenizer = self.tiny_fp8_model() + ar = AutoRound(model=model, tokenizer=tokenizer, iters=1) ar.quantize_and_save(output_dir=self.save_dir, format="gguf:q3_k_s") for file in os.listdir(self.save_dir): if file.endswith(".gguf"): @@ -119,10 +134,10 @@ def test_fp8_model_gguf(self): shutil.rmtree(self.save_dir, ignore_errors=True) def test_diff_datatype(self): - model_name = "/models/Qwen3-0.6B-FP8" for scheme in ["NVFP4", "MXFP4"]: + model, tokenizer = self.tiny_fp8_model() for iters in [0, 1]: print(f"Testing scheme: {scheme}, iters: {iters}") - ar = AutoRound(model=model_name, iters=iters, scheme=scheme) + ar = AutoRound(model=model, tokenizer=tokenizer, iters=iters, scheme=scheme) ar.quantize_and_save(output_dir=self.save_dir) shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cuda/test_scheme.py b/test/test_cuda/test_scheme.py index 06c5b27e0..d6fe43374 100644 --- a/test/test_cuda/test_scheme.py +++ b/test/test_cuda/test_scheme.py @@ -49,7 +49,7 @@ def test_fp8_static(self): assert ar.data_type == "fp" assert ar.act_data_type == "fp" assert ar.group_size == -1 - assert ar.act_dynamic == False + assert ar.act_dynamic is False ar.quantize() ## RTN tests @@ -73,7 +73,7 @@ def test_fp8_static_rtn(self): assert ar.data_type == "fp" assert ar.act_data_type == "fp" assert ar.group_size == -1 - assert ar.act_dynamic == False + assert ar.act_dynamic is False ar.quantize() def test_scheme_in_layer_config(self): From 390f997a6e33ba90eaa7b71f239eae3c69be7993 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Fri, 19 Dec 2025 03:09:14 -0500 Subject: [PATCH 11/24] update cuda ut Signed-off-by: n1ck-guo --- test/test_cpu/test_auto_scheme.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_cpu/test_auto_scheme.py b/test/test_cpu/test_auto_scheme.py index b6c20826e..56996dd64 100644 --- a/test/test_cpu/test_auto_scheme.py +++ b/test/test_cpu/test_auto_scheme.py @@ -44,11 +44,11 @@ def test_layer_config(self, tiny_opt_model_path): ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() assert 
layer_config["model.decoder.layers.1.fc1"]["bits"] == 8 - assert layer_config["model.decoder.layers.1.fc1"]["sym"] == False + assert layer_config["model.decoder.layers.1.fc1"]["sym"] is False assert layer_config["model.decoder.layers.1.fc1"]["group_size"] == 32 layer = get_module(model, "model.decoder.layers.1.fc1") assert layer.bits == 8 - assert layer.sym == False + assert layer.sym is False assert layer.group_size == 32 avg_bits, _ = compute_avg_bits_for_model(model) print(avg_bits) From 65ac22d52bdc4f8d73366a5953323f2296a9b919 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Sun, 21 Dec 2025 09:18:50 -0500 Subject: [PATCH 12/24] replace model with tiny model and fix bug Signed-off-by: He, Xin3 --- test/fixtures.py | 58 +++++++++----- test/helpers.py | 102 ++++++++++++++++++++++- test/test_cpu/test_asym.py | 5 +- test/test_cpu/test_auto_scheme.py | 4 +- test/test_cpu/test_autoround.py | 20 ++--- test/test_cpu/test_block_names.py | 4 +- test/test_cpu/test_cli_usage.py | 6 +- test/test_cpu/test_gguf_format.py | 129 +----------------------------- test/test_cpu/test_mllm.py | 26 +++--- test/test_cpu/test_model_scope.py | 12 +-- test/test_cpu/test_mxfp_nvfp.py | 99 ++++++++++------------- test/test_cpu/test_scheme.py | 2 +- 12 files changed, 216 insertions(+), 251 deletions(-) diff --git a/test/fixtures.py b/test/fixtures.py index 87d0a5f75..c4e2ea198 100644 --- a/test/fixtures.py +++ b/test/fixtures.py @@ -6,25 +6,20 @@ import transformers from .helpers import ( + DataLoader, + deepseek_v2_name_or_path, get_tiny_model, gptj_name_or_path, lamini_name_or_path, opt_name_or_path, phi2_name_or_path, + qwen_moe_name_or_path, qwen_name_or_path, + qwen_vl_name_or_path, save_tiny_model, ) -class DataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) - - # Create tiny model path fixtures for testing @pytest.fixture(scope="session") def tiny_opt_model_path(): @@ -35,15 +30,6 @@ def tiny_opt_model_path(): shutil.rmtree(tiny_model_path) -@pytest.fixture(scope="session") -def tiny_qwen_model_path(): - model_name_or_path = qwen_name_or_path - tiny_model_path = "./tmp_tiny_qwen_model_path" - tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) - yield tiny_model_path - shutil.rmtree(tiny_model_path) - - @pytest.fixture(scope="session") def tiny_lamini_model_path(): model_name_or_path = lamini_name_or_path @@ -71,6 +57,42 @@ def tiny_phi2_model_path(): shutil.rmtree(tiny_model_path) +@pytest.fixture(scope="session") +def tiny_deepseek_v2_model_path(): + model_name_or_path = deepseek_v2_name_or_path + tiny_model_path = "./tmp_tiny_deepseek_v2_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_qwen_model_path(): + model_name_or_path = qwen_name_or_path + tiny_model_path = "./tmp_tiny_qwen_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_qwen_moe_model_path(): + model_name_or_path = qwen_moe_name_or_path + tiny_model_path = "./tmp_tiny_qwen_moe_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + +@pytest.fixture(scope="session") +def tiny_qwen_vl_model_path(): + model_name_or_path = qwen_vl_name_or_path + 
tiny_model_path = "./tmp_tiny_qwen_vl_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + # Create objective fixtures for testing @pytest.fixture(scope="function") def tiny_opt_model(): diff --git a/test/helpers.py b/test/helpers.py index 5f6c9c360..be086497c 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -4,6 +4,8 @@ import torch import transformers +from auto_round.utils import llm_load_model + # Automatic choose local path or model name. def get_model_path(model_name: str) -> str: @@ -23,6 +25,9 @@ def get_model_path(model_name: str) -> str: lamini_name_or_path = get_model_path("MBZUAI/LaMini-GPT-124M") gptj_name_or_path = get_model_path("hf-internal-testing/tiny-random-GPTJForCausalLM") phi2_name_or_path = get_model_path("microsoft/phi-2") +deepseek_v2_name_or_path = get_model_path("deepseek-ai/DeepSeek-V2-Lite") +qwen_moe_name_or_path = get_model_path("Qwen/Qwen1.5-MoE-A2.7B") +qwen_vl_name_or_path = get_model_path("Qwen/Qwen2-VL-2B-Instruct") # Slice model into tiny model for speedup @@ -44,7 +49,7 @@ def slice_layers(module): return True return False - model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, dtype="auto", trust_remote_code=True) + model, tokenizer = llm_load_model(model_name_or_path) slice_layers(model) if hasattr(model.config, "num_hidden_layers"): @@ -56,11 +61,11 @@ def slice_layers(module): # for fixture usage only -def save_tiny_model(model_name_or_path, tiny_model_path): - model = get_tiny_model(model_name_or_path, num_layers=2) +def save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2): + model = get_tiny_model(model_name_or_path, num_layers=num_layers) tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) test_path = os.path.dirname(__file__) - tiny_model_path = os.path.join(test_path, tiny_model_path) + tiny_model_path = os.path.join(test_path, tiny_model_path.removeprefix("./")) model.save_pretrained(tiny_model_path) tokenizer.save_pretrained(tiny_model_path) print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") @@ -109,3 +114,92 @@ def model_infer(model, tokenizer, apply_chat_template=False): print(f"Generated: {decoded_outputs[i]}") print("-" * 50) return decoded_outputs[0] + + +# Dummy dataloader for testing +class DataLoader: + def __init__(self): + self.batch_size = 1 + + def __iter__(self): + for i in range(2): + yield torch.ones([1, 10], dtype=torch.long) + + +fixed_input = torch.tensor([[10, 20, 30, 40, 50]], dtype=torch.long) + + +def get_output(model_name_or_path): + """Get model output for fixed input.""" + model, tokenizer = llm_load_model(model_name_or_path) + outputs = model(fixed_input)[0] + return outputs.detach().cpu() + + +def is_model_outputs_similar(model_path_1, model_path_2, metric="cosine_similarity", threshold=0.98, k=5, verbose=True): + """ + Compare outputs from two models using specified metric and return pass/fail. 
+ + Args: + model_path_1: Path to first model + model_path_2: Path to second model + metric: Metric to use - "mse", "cosine_similarity"/"cos_sim", or "topk" + threshold: Threshold value for pass/fail + k: K value for top-k metric (only used when metric="topk") + verbose: Whether to print detailed results + + Returns: + bool: True if metric passes threshold, False otherwise + """ + if verbose: + print(f"\n{'='*70}") + print("Comparing Model Outputs") + print(f"{'='*70}") + print(f"Model 1: {model_path_1}") + print(f"Model 2: {model_path_2}") + print(f"Metric: {metric} | Threshold: {threshold}" + (f" | K: {k}" if "top" in metric.lower() else "")) + print(f"{'='*70}\n") + + output_1 = get_output(model_path_1) + output_2 = get_output(model_path_2) + metric = metric.lower().replace("-", "_") + + # Calculate metric and check threshold + if metric == "mse": + value = torch.mean((output_1.float() - output_2.float()) ** 2).item() + passed = value <= threshold + if verbose: + print(f"MSE: {value:.6f} | Threshold: <= {threshold} | {'✓ PASS' if passed else '✗ FAIL'}\n") + + elif metric in ["cosine_similarity", "cos_sim", "cosine"]: + out1 = output_1.float().flatten() + out2 = output_2.float().flatten() + value = torch.nn.functional.cosine_similarity(out1.unsqueeze(0), out2.unsqueeze(0)).item() + passed = value >= threshold + if verbose: + print(f"Cosine Similarity: {value:.6f} | Threshold: >= {threshold} | {'✓ PASS' if passed else '✗ FAIL'}\n") + + elif metric in ["topk", "top_k"]: + _, topk_1 = torch.topk(output_1, k=min(k, output_1.size(-1)), dim=-1) + _, topk_2 = torch.topk(output_2, k=min(k, output_2.size(-1)), dim=-1) + + total_agreement = 0 + total_positions = topk_1.numel() // topk_1.size(-1) + + for i in range(topk_1.size(0)): + for j in range(topk_1.size(1)): + set1 = set(topk_1[i, j].tolist()) + set2 = set(topk_2[i, j].tolist()) + total_agreement += len(set1 & set2) / k + + value = total_agreement / total_positions + passed = value >= threshold + if verbose: + print( + f"Top-{k} Agreement: {value:.4%} | Threshold: >= {threshold:.4%} | {'✓ PASS' if passed else '✗ FAIL'}\n" + ) + + else: + raise ValueError(f"Unknown metric: {metric}. 
Choose from: 'mse', 'cosine_similarity', 'topk'") + + return passed diff --git a/test/test_cpu/test_asym.py b/test/test_cpu/test_asym.py index 842b208ed..32a0151b3 100644 --- a/test/test_cpu/test_asym.py +++ b/test/test_cpu/test_asym.py @@ -6,13 +6,14 @@ sys.path.insert(0, "../..") import torch -from _test_helpers import model_infer from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer from auto_round import AutoRound from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.utils import get_module +from ..helpers import get_model_path, model_infer + class LLMDataLoader: def __init__(self): @@ -27,7 +28,7 @@ class TestAutoRoundAsym(unittest.TestCase): @classmethod def setUpClass(self): # self.model_name = "/models/opt-125m" - self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m" + self.model_name = get_model_path("facebook/opt-125m") self.save_folder = "./saved" @classmethod diff --git a/test/test_cpu/test_auto_scheme.py b/test/test_cpu/test_auto_scheme.py index 56996dd64..9d549076f 100644 --- a/test/test_cpu/test_auto_scheme.py +++ b/test/test_cpu/test_auto_scheme.py @@ -37,9 +37,9 @@ def test_layer_config(self, tiny_opt_model_path): from auto_round.auto_scheme.utils import compute_avg_bits_for_model from auto_round.utils import get_module - target_bits = 3.0 + target_bits = 3.5 model_name = tiny_opt_model_path - scheme = AutoScheme(avg_bits=3, options=("W2A16", "W4A16", "BF16")) + scheme = AutoScheme(avg_bits=target_bits, options=("W2A16", "W4A16", "BF16")) user_layer_config = {"model.decoder.layers.1.fc1": {"bits": 8, "group_size": 32, "sym": False}} ar = AutoRound(model=model_name, scheme=scheme, iters=0, nsamples=1, layer_config=user_layer_config) model, layer_config = ar.quantize() diff --git a/test/test_cpu/test_autoround.py b/test/test_cpu/test_autoround.py index d0049765e..3d5d60c24 100644 --- a/test/test_cpu/test_autoround.py +++ b/test/test_cpu/test_autoround.py @@ -624,16 +624,6 @@ def test_fallback_layers_regex_exception(self, tiny_opt_model_path, dataloader): ) autoround.quantize() - # def test_fp8_model_input_rtn_generation(self): - # model_name = "Qwen/Qwen3-0.6B-FP8" - # ar = AutoRound(model=model_name, iters=0) - # ar.quantize_and_save(output_dir=self.save_folder) - # model = AutoModelForCausalLM.from_pretrained(self.save_folder, torch_dtype="auto", trust_remote_code=True) - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - # text = "There is a girl who likes adventure," - # inputs = tokenizer(text, return_tensors="pt").to(model.device) - # print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) - def test_dequant_fp8_weight(self): from auto_round.utils import dequant_block_fp8_weight @@ -655,13 +645,13 @@ def test_dequant_fp8_weight(self): def test_mixed_bit_setting(self, tiny_opt_model_path): model_name = tiny_opt_model_path - layer_config = {"model.decoder.layers.7.fc1": {"bits": 8, "act_bits": 8}} + layer_config = {"model.decoder.layers.1.fc1": {"bits": 8, "act_bits": 8}} ar = AutoRound(model_name, data_type="mx_fp4", act_bits=4, iters=0, layer_config=layer_config) ar.quantize() layer_config = ar.layer_config if ( - layer_config["model.decoder.layers.7.fc1"]["bits"] != 8 - or layer_config["model.decoder.layers.7.fc1"]["act_bits"] != 8 + layer_config["model.decoder.layers.1.fc1"]["bits"] != 8 + or layer_config["model.decoder.layers.1.fc1"]["act_bits"] != 8 ): raise ValueError("mixed bits is not correct") @@ -727,8 +717,8 @@ def test_quant_lm_head_layer_config(self): assert 
"lm_head" in model.config.quantization_config.extra_config assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 - def test_compressor(self): - model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") + def test_compressor(self, tiny_qwen_vl_model_path): + model_name = tiny_qwen_vl_model_path ar = AutoRound(model_name, enable_adam=True) assert ar.optimizer == torch.optim.AdamW assert ar.mllm diff --git a/test/test_cpu/test_block_names.py b/test/test_cpu/test_block_names.py index 5d0423fa2..47c554317 100644 --- a/test/test_cpu/test_block_names.py +++ b/test/test_cpu/test_block_names.py @@ -178,12 +178,12 @@ def test_block_name_quant(self, dataloader): assert quant_config.block_name_to_quantize is not None shutil.rmtree("./saved", ignore_errors=True) - def test_mm_block_name(self): + def test_mm_block_name(self, tiny_qwen_vl_model_path): from transformers import Qwen2VLForConditionalGeneration from auto_round.utils import get_block_names - model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") + model_name = tiny_qwen_vl_model_path model = Qwen2VLForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True, device_map="auto") block_name = get_block_names(model, quant_vision=True) assert len(block_name) == 2 diff --git a/test/test_cpu/test_cli_usage.py b/test/test_cpu/test_cli_usage.py index e71b2854a..b848c22df 100644 --- a/test/test_cpu/test_cli_usage.py +++ b/test/test_cpu/test_cli_usage.py @@ -2,7 +2,7 @@ import shutil import sys -import pytest +from ..helpers import get_model_path class TestAutoRoundCmd: @@ -56,13 +56,13 @@ def test_auto_round_cmd(self, tiny_opt_model_path): assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --mllm --model /tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" + f"cd ../.. && {python_path} -m auto_round --mllm --model {get_model_path('Qwen/Qwen2-VL-2B-Instruct')} --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model /tf_dataset/auto_round/models/Qwen/Qwen2-VL-2B-Instruct --seqlen 32 --format auto_round" + f"cd ../.. && {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model {get_model_path('Qwen/Qwen2-VL-2B-Instruct')} --seqlen 32 --format auto_round" " --quant_nontext_module --output_dir ./saved " ) if res > 0 or res == -1: diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index c34c4f096..169b07825 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -8,7 +8,7 @@ from auto_round import AutoRound -from ..helpers import get_model_path +from ..helpers import get_model_path, get_tiny_model class TestGGUF: @@ -26,7 +26,7 @@ def teardown_class(self): def test_basic_usage(self): python_path = sys.executable res = os.system( - f"cd ../.. && {python_path} -m auto_round --model /tf_dataset/auto_round/models/benzart/gemma-2b-it-fine-tuning-for-code-test " + f"cd ../.. 
&& {python_path} -m auto_round --model {get_model_path('benzart/gemma-2b-it-fine-tuning-for-code-test')} " f" --bs 16 --iters 0 --nsamples 1 --format gguf:q4_k_m" ) if res > 0 or res == -1: @@ -62,39 +62,12 @@ def test_q4_0(self): inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - # from auto_round.eval.evaluation import simple_evaluate_user_model - # result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="openbookqa", eval_model_dtype="bf16") - # # 0.246 - # assert result['results']['openbookqa']['acc,none'] > 0.23 shutil.rmtree("./saved", ignore_errors=True) - # def test_q4_1(self): - # bits, group_size, sym = 4, 32, False - # autoround = AutoRound( - # self.model, self.tokenizer, bits=bits, group_size=group_size, sym=sym, iters=1, data_type="int", nsamples=1 - # ) - # quantized_model_path = "./saved" - # - # autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q4_1") - # gguf_file = os.listdir(quantized_model_path)[0] - # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - # text = "There is a girl who likes adventure," - # inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - # print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - # - # # from auto_round.eval.evaluation import simple_evaluate_user_model - # # result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="openbookqa", eval_model_dtype="bf16") - # # # 0.23 - # # assert result['results']['openbookqa']['acc,none'] > 0.22 - # shutil.rmtree("./saved", ignore_errors=True) - def test_func(self): bits, group_size, sym = 4, 128, True autoround = AutoRound( self.model_name, - # bits=bits, - # group_size=group_size, - # sym=sym, iters=1, nsamples=1, seqlen=10, @@ -111,78 +84,6 @@ def test_func(self): print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) shutil.rmtree("./saved", ignore_errors=True) - # model_name = "Qwen/Qwen2.5-1.5B-Instruct" - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - # autoround = AutoRound( - # model, - # self.tokenizer, - # bits=3, - # group_size=16, - # sym=True, - # iters=1, - # nsamples=1, - # data_type="int_sym_dq", - # super_group_size=16, - # super_bits=6, - # ) - quantized_model_path = "./saved" - # autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_k_s") - # from auto_round.eval.evaluation import simple_evaluate_user_model - # gguf_file = os.listdir("saved")[0] - # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - # result = simple_evaluate_user_model(model, self.tokenizer, batch_size=16, tasks="lambada_openai", eval_model_dtype="bf16") - # assert result['results']['lambada_openai']['acc,none'] > 0.5 - shutil.rmtree("./saved", ignore_errors=True) - - # - # def test_q5_k(self): - # model_name = "Qwen/Qwen2.5-1.5B-Instruct" - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - # autoround = AutoRound( - # model, - # self.tokenizer, - # bits=5, - # group_size=32, - # sym=False, - # iters=1, - # nsamples=1, - # data_type="int_asym_dq", - # super_group_size=8, - # super_bits=6, - # ) - # quantized_model_path = "./saved" - # autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, 
format="gguf:q*_k_s") - # gguf_file = os.listdir("saved")[0] - # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - # text = "There is a girl who likes adventure," - # inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - # print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - # shutil.rmtree("./saved", ignore_errors=True) - - # def test_q6_k(self): - # model_name = "Qwen/Qwen2.5-1.5B-Instruct" - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - # autoround = AutoRound( - # model, - # self.tokenizer, - # bits=6, - # group_size=16, - # sym=True, - # iters=1, - # nsamples=1, - # data_type="int_sym_dq", - # super_group_size=16, - # super_bits=8, - # ) - # quantized_model_path = "./saved" - # autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q*_k") - # gguf_file = os.listdir("saved")[0] - # model = AutoModelForCausalLM.from_pretrained(quantized_model_path, gguf_file=gguf_file, device_map="auto") - # text = "There is a girl who likes adventure," - # inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - # print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - # shutil.rmtree("./saved", ignore_errors=True) - def test_gguf_baseline(self): model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) @@ -207,28 +108,6 @@ def test_gguf_baseline(self): inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) shutil.rmtree("./saved", ignore_errors=True) - # - # model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - # autoround = AutoRound( - # model, - # self.tokenizer, - # bits=5, - # group_size=32, - # sym=True, - # iters=0, - # nsamples=8, - # data_type="int_asym_dq", - # super_group_size=8, - # super_bits=6, - # disable_opt_rtn=True, - # ) - # quantized_model_path = "./saved" - # autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="gguf:q5_k_s,fake") - # model = AutoModelForCausalLM.from_pretrained(quantized_model_path + "/fake", device_map="auto") - # text = "There is a girl who likes adventure," - # inputs = self.tokenizer(text, return_tensors="pt").to(model.device) - # print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - # shutil.rmtree("./saved", ignore_errors=True) def test_q4_k_m(self, dataloader): model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") @@ -307,8 +186,8 @@ def test_all_format(self): assert False, "cmd line test fail, please have a check" shutil.rmtree("../../tmp_autoround", ignore_errors=True) - def test_vlm_gguf(self): - model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") + def test_vlm_gguf(self, tiny_qwen_vl_model_path): + model_name = tiny_qwen_vl_model_path from auto_round import AutoRoundMLLM from auto_round.utils import mllm_load_model diff --git a/test/test_cpu/test_mllm.py b/test/test_cpu/test_mllm.py index 5e4842d4c..ec5c1487e 100644 --- a/test/test_cpu/test_mllm.py +++ b/test/test_cpu/test_mllm.py @@ -36,12 +36,10 @@ def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - return super().teardown_class() - - def test_tune(self): + def test_tune(self, tiny_qwen_vl_model_path): bits, 
group_size = 4, 128 autoround = AutoRoundMLLM( - model=self.model_name, + model=tiny_qwen_vl_model_path, bits=bits, group_size=group_size, nsamples=1, @@ -54,11 +52,11 @@ def test_tune(self): autoround.save_quantized("./saved/", format="auto_gptq", inplace=False) autoround.save_quantized("./saved/", format="auto_round", inplace=False) - def test_quant_vision(self): ## bug need to fix - tokenizer = AutoTokenizer.from_pretrained(self.model_name) - processor = AutoProcessor.from_pretrained(self.model_name, trust_remote_code=True) + def test_quant_vision(self, tiny_qwen_vl_model_path): ## bug need to fix + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_vl_model_path) + processor = AutoProcessor.from_pretrained(tiny_qwen_vl_model_path, trust_remote_code=True) model = Qwen2VLForConditionalGeneration.from_pretrained( - self.model_name, trust_remote_code=True, device_map="auto" + tiny_qwen_vl_model_path, trust_remote_code=True, device_map="auto" ) bits, group_size = 4, 128 autoround = AutoRoundMLLM( @@ -109,11 +107,11 @@ class Myclass: ) assert len(dataset.questions) == 512 - def test_diff_dataset(self): - tokenizer = AutoTokenizer.from_pretrained(self.model_name) - processor = AutoProcessor.from_pretrained(self.model_name, trust_remote_code=True) + def test_diff_dataset(self, tiny_qwen_vl_model_path): + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_vl_model_path) + processor = AutoProcessor.from_pretrained(tiny_qwen_vl_model_path, trust_remote_code=True) model = Qwen2VLForConditionalGeneration.from_pretrained( - self.model_name, trust_remote_code=True, device_map="auto" + tiny_qwen_vl_model_path, trust_remote_code=True, device_map="auto" ) bits, group_size = 4, 128 dataset = ["dataset test", "list test"] @@ -131,13 +129,13 @@ def test_diff_dataset(self): ) autoround.quantize() - def test_pure_text_model_check(self): + def test_pure_text_model_check(self, tiny_qwen_vl_model_path): from transformers import AutoModelForCausalLM from auto_round.utils import is_pure_text_model model = Qwen2VLForConditionalGeneration.from_pretrained( - self.model_name, trust_remote_code=True, device_map="auto" + tiny_qwen_vl_model_path, trust_remote_code=True, device_map="auto" ) assert not is_pure_text_model(model) model = AutoModelForCausalLM.from_pretrained(opt_name_or_path, trust_remote_code=True) diff --git a/test/test_cpu/test_model_scope.py b/test/test_cpu/test_model_scope.py index cf48eeaab..7edcab156 100644 --- a/test/test_cpu/test_model_scope.py +++ b/test/test_cpu/test_model_scope.py @@ -28,18 +28,14 @@ def teardown_class(self): if os.path.exists(self.cache_path): shutil.rmtree(self.cache_path, ignore_errors=True) - return super().teardown_class() - - def test_llm(self): + def test_llm(self, dataloader): model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") - autoround = AutoRound( - model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset - ) + autoround = AutoRound(model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=dataloader) autoround.quantize_and_save() - def test_mllm(self): + def test_mllm(self, dataloader): model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") autoround = AutoRound( - model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=self.dataset, batch_size=2 + model_name, platform="model_scope", scheme="w4a16", iters=0, seqlen=2, dataset=dataloader, batch_size=2 ) autoround.quantize_and_save(self.saved_path) diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py 
index 695371061..c06fea969 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -7,7 +7,7 @@ from auto_round import AutoRound -from ..helpers import get_model_path, opt_name_or_path +from ..helpers import is_model_outputs_similar def _get_folder_size(path: str) -> float: @@ -24,18 +24,15 @@ def _get_folder_size(path: str) -> float: class TestAutoRoundFP: @classmethod def setup_class(self): - self.model_name = opt_name_or_path self.save_dir = "./saved" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto") - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @classmethod def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_nvfp4_moe_actmax_rtn(self, dataloader): - model_name = get_model_path("deepseek-ai/DeepSeek-V2-Lite") + def test_nvfp4_moe_actmax_rtn(self, tiny_deepseek_v2_model_path, dataloader): + model_name = tiny_deepseek_v2_model_path layer_config = { "self_attn": {"bits": 16, "act_bits": 16}, "mlp.shared_experts": {"bits": 16, "act_bits": 16}, @@ -61,8 +58,8 @@ def test_nvfp4_moe_actmax_rtn(self, dataloader): ), "Illegal NVFP4 quantization for lm_head layer" shutil.rmtree(self.save_dir, ignore_errors=True) - def test_nvfp4_moe_actmax_ar(self, dataloader): - model_name = get_model_path("deepseek-ai/DeepSeek-V2-Lite") + def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader): + model_name = tiny_deepseek_v2_model_path layer_config = { "q_proj": {"bits": 16, "act_bits": 16}, "mlp.shared_experts": {"bits": 16, "act_bits": 16}, @@ -90,17 +87,11 @@ def test_nvfp4_moe_actmax_ar(self, dataloader): and lm_head.weight_scale.dtype is torch.float8_e4m3fn ), "Illegal NVFP4 packing for lm_head layer" quantized_model_path = self.save_dir - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) - from auto_round.eval.evaluation import simple_evaluate_user_model - - result = simple_evaluate_user_model(model, tokenizer, batch_size=4, tasks="piqa", limit=4) - print(result["results"]["piqa"]["acc,none"]) - assert result["results"]["piqa"]["acc,none"] > 0.7 + assert is_model_outputs_similar(model_name, quantized_model_path) shutil.rmtree(self.save_dir, ignore_errors=True) - def test_mxfp4_moe_ar(self, dataloader): - model_name = get_model_path("deepseek-ai/DeepSeek-V2-Lite") + def test_mxfp4_moe_ar(self, tiny_deepseek_v2_model_path, dataloader): + model_name = tiny_deepseek_v2_model_path layer_config = { "q_proj": {"bits": 16, "act_bits": 16, "data_type": "float"}, "mlp.shared_experts": {"bits": 16, "act_bits": 16, "data_type": "float"}, @@ -127,8 +118,8 @@ def test_mxfp4_moe_ar(self, dataloader): ), "Illegal MXFP4 packing for lm_head layer" shutil.rmtree(self.save_dir, ignore_errors=True) - def test_mxfp4_llmcompressor_format(self, dataloader): - model_name = self.model_name + def test_mxfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "MXFP4" @@ -146,8 +137,8 @@ def test_mxfp4_llmcompressor_format(self, dataloader): compressed_model = autoround.save_quantized( output_dir=quantized_model_path, inplace=True, format="llm_compressor" ) - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj + tmp_layer = 
compressed_model.model.decoder.layers[1].self_attn.q_proj + skip_layer = compressed_model.model.decoder.layers[1].self_attn.k_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_packed") @@ -167,8 +158,8 @@ def test_mxfp4_llmcompressor_format(self, dataloader): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_rtn_mxfp4_llmcompressor_format(self, dataloader): - model_name = self.model_name + def test_rtn_mxfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "MXFP4" @@ -186,8 +177,8 @@ def test_rtn_mxfp4_llmcompressor_format(self, dataloader): compressed_model = autoround.save_quantized( output_dir=quantized_model_path, inplace=True, format="llm_compressor" ) - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj - skip_layer = compressed_model.model.decoder.layers[3].self_attn.k_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj + skip_layer = compressed_model.model.decoder.layers[1].self_attn.k_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_packed") @@ -207,8 +198,8 @@ def test_rtn_mxfp4_llmcompressor_format(self, dataloader): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mxfp8_llmcompressor_format(self, dataloader): - model_name = self.model_name + def test_mxfp8_llmcompressor_format(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "MXFP8" @@ -221,7 +212,7 @@ def test_mxfp8_llmcompressor_format(self, dataloader): ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight") @@ -238,14 +229,14 @@ def test_mxfp8_llmcompressor_format(self, dataloader): and quantization_config["config_groups"]["group_0"]["weights"]["num_bits"] == 8 ), f"Invalid MXFP8 quantization configuration: {quantization_config}" folder_size_gb = _get_folder_size(quantized_model_path) - # Original opt-125m is < 0.5GB -> quantized mxfp8 model should be smaller but not empty + # Original tiny_opt_model_path-125m is < 0.1GB -> quantized mxfp8 model should be smaller but not empty assert ( - 0.15 < folder_size_gb < 0.2 - ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.2 GB)" + 0.05 < folder_size_gb < 0.1 + ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.05~0.1 GB)" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_nvfp4_llmcompressor_format(self, dataloader): - model_name = self.model_name + def test_nvfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "NVFP4" @@ -258,7 +249,7 @@ def test_nvfp4_llmcompressor_format(self, dataloader): ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + tmp_layer = 
compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_global_scale") @@ -275,14 +266,14 @@ def test_nvfp4_llmcompressor_format(self, dataloader): and quantization_config["config_groups"]["group_0"]["input_activations"]["num_bits"] == 4 ), f"Invalid NVFP4 quantization configuration: {quantization_config}" folder_size_gb = _get_folder_size(quantized_model_path) - # Original opt-125m is < 0.5GB -> quantized nvfp4 model should be smaller but not empty + # Original opt-125m is < 0.1GB -> quantized nvfp4 model should be smaller but not empty assert ( - 0.1 < folder_size_gb < 0.15 - ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.1~0.15 GB)" + 0.05 < folder_size_gb < 0.1 + ), f"Quantized model folder size {folder_size_gb:.2f} GB is outside the expected range (0.05~0.1 GB)" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_nvfp4_autoround_format(self, dataloader): - model_name = self.model_name + def test_nvfp4_autoround_format(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "NVFP4" @@ -295,7 +286,7 @@ def test_nvfp4_autoround_format(self, dataloader): ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_global_scale") @@ -306,8 +297,8 @@ def test_nvfp4_autoround_format(self, dataloader): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_nvfp4_autoround_save_quantized(self, dataloader): - model_name = self.model_name + def test_nvfp4_autoround_save_quantized(self, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path from transformers import AutoConfig scheme = "NVFP4" @@ -321,7 +312,7 @@ def test_nvfp4_autoround_save_quantized(self, dataloader): quantized_model_path = self.save_dir autoround.quantize() compressed_model = autoround.save_quantized(output_dir=quantized_model_path, format="auto_round") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_global_scale") @@ -332,10 +323,10 @@ def test_nvfp4_autoround_save_quantized(self, dataloader): ), "Illegal NVFP4 packing name or data_type or shape" shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_qwen_moe_quant_infer(self, dataloader): - model_name = get_model_path("Qwen/Qwen1.5-MoE-A2.7B") + def test_qwen_moe_quant_infer(self, tiny_qwen_moe_model_path, dataloader): + model_name = tiny_qwen_moe_model_path layer_config = { - "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, + "layers.0": {"bits": 16, "act_bits": 16}, } scheme = "nvfp4" autoround = AutoRound( @@ -349,14 +340,8 @@ def test_qwen_moe_quant_infer(self, dataloader): ) quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, inplace=True, format="auto_round") - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, torch_dtype="auto", device_map="cpu") - tokenizer = AutoTokenizer.from_pretrained(quantized_model_path) - from 
auto_round.eval.evaluation import simple_evaluate_user_model - - result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="piqa", limit=10) - print(result["results"]["piqa"]["acc,none"]) - assert result["results"]["piqa"]["acc,none"] > 0.60 - shutil.rmtree(quantized_model_path, ignore_errors=True) + assert is_model_outputs_similar(model_name, quantized_model_path) + shutil.rmtree(self.save_dir, ignore_errors=True) @pytest.mark.parametrize( "scheme, static_kv_dtype, static_attention_dtype", @@ -370,9 +355,9 @@ def test_qwen_moe_quant_infer(self, dataloader): ("NVFP4", "fp8", None), ], ) - def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype, dataloader): - model_name = self.model_name - from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype, tiny_opt_model_path, dataloader): + model_name = tiny_opt_model_path + from transformers import AutoConfig from transformers.models.opt.modeling_opt import OPTForCausalLM config = AutoConfig.from_pretrained(model_name) diff --git a/test/test_cpu/test_scheme.py b/test/test_cpu/test_scheme.py index 9bd236765..890b4bee4 100644 --- a/test/test_cpu/test_scheme.py +++ b/test/test_cpu/test_scheme.py @@ -54,7 +54,7 @@ def test_mxfp4(self, dataloader): def test_vllm(self): from auto_round import AutoRoundMLLM - ar = AutoRoundMLLM(get_model_path("Qwen/Qwen2-VL-2B-Instruct", scheme="W2A16"), nsamples=1, iters=1, seqlen=2) + ar = AutoRoundMLLM(get_model_path("Qwen/Qwen2-VL-2B-Instruct"), scheme="W2A16", nsamples=1, iters=1, seqlen=2) assert ar.bits == 2 assert ar.act_bits == 16 From 3764e88515d73a35c7e78b529edc5485cbb9f355 Mon Sep 17 00:00:00 2001 From: sys-lpot-val Date: Sun, 21 Dec 2025 22:57:16 -0800 Subject: [PATCH 13/24] support mllm and untied tiny model Signed-off-by: sys-lpot-val Signed-off-by: He, Xin3 --- auto_round/utils/model.py | 5 +++ test/fixtures.py | 21 +++++++++- test/helpers.py | 43 ++++++++++++++------ test/test_cpu/test_autoround.py | 8 ++-- test/test_cpu/test_cli_usage.py | 6 +-- test/test_cpu/test_gguf_format.py | 63 +++++++++++++++--------------- test/test_cpu/test_mllm.py | 5 +-- test/test_cpu/test_scheme.py | 41 +++++++++---------- test/test_cpu/test_script.py | 15 ------- test/test_cuda/test_auto_scheme.py | 4 +- 10 files changed, 119 insertions(+), 92 deletions(-) delete mode 100644 test/test_cpu/test_script.py diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py index f4bb15575..38f984663 100644 --- a/auto_round/utils/model.py +++ b/auto_round/utils/model.py @@ -1047,6 +1047,11 @@ def set_module(model, key, new_module): setattr(module, name_list[-1], new_module) +# For getting and setting attribution, such as 'lm_head.weight' +get_attr = get_module +set_attr = set_module + + def get_layer_features(layer): """Extracts input and output feature dimensions for supported layers.""" from auto_round.utils import deepspeed_exists diff --git a/test/fixtures.py b/test/fixtures.py index c4e2ea198..e64f9d25b 100644 --- a/test/fixtures.py +++ b/test/fixtures.py @@ -8,6 +8,7 @@ from .helpers import ( DataLoader, deepseek_v2_name_or_path, + gemma_name_or_path, get_tiny_model, gptj_name_or_path, lamini_name_or_path, @@ -66,6 +67,15 @@ def tiny_deepseek_v2_model_path(): shutil.rmtree(tiny_model_path) +@pytest.fixture(scope="session") +def tiny_gemma_model_path(): + model_name_or_path = gemma_name_or_path + tiny_model_path = "./tmp_tiny_gemma_model_path" + tiny_model_path = 
save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + @pytest.fixture(scope="session") def tiny_qwen_model_path(): model_name_or_path = qwen_name_or_path @@ -75,6 +85,15 @@ def tiny_qwen_model_path(): shutil.rmtree(tiny_model_path) +@pytest.fixture(scope="session") +def tiny_untied_qwen_model_path(): + model_name_or_path = qwen_name_or_path + tiny_model_path = "./tmp_tiny_untied_qwen_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, force_untie=True) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + @pytest.fixture(scope="session") def tiny_qwen_moe_model_path(): model_name_or_path = qwen_moe_name_or_path @@ -88,7 +107,7 @@ def tiny_qwen_moe_model_path(): def tiny_qwen_vl_model_path(): model_name_or_path = qwen_vl_name_or_path tiny_model_path = "./tmp_tiny_qwen_vl_model_path" - tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2, is_mllm=True) yield tiny_model_path shutil.rmtree(tiny_model_path) diff --git a/test/helpers.py b/test/helpers.py index be086497c..9f29c9b1d 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -1,10 +1,11 @@ +import copy import os import pytest import torch import transformers -from auto_round.utils import llm_load_model +from auto_round.utils import get_attr, llm_load_model, mllm_load_model, set_attr # Automatic choose local path or model name. @@ -28,18 +29,15 @@ def get_model_path(model_name: str) -> str: deepseek_v2_name_or_path = get_model_path("deepseek-ai/DeepSeek-V2-Lite") qwen_moe_name_or_path = get_model_path("Qwen/Qwen1.5-MoE-A2.7B") qwen_vl_name_or_path = get_model_path("Qwen/Qwen2-VL-2B-Instruct") +gemma_name_or_path = get_model_path("benzart/gemma-2b-it-fine-tuning-for-code-test") # Slice model into tiny model for speedup -def get_tiny_model(model_name_or_path, num_layers=3, **kwargs): - kwargs["dtype"] = "auto" if "auto" not in kwargs else kwargs["dtype"] - kwargs["trust_remote_code"] = True if "trust_remote_code" not in kwargs else kwargs["trust_remote_code"] - model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, **kwargs) - - if hasattr(model.config, "num_hidden_layers"): - model.config.num_hidden_layers = num_layers +def get_tiny_model(model_name_or_path, num_layers=2, is_mllm=False, **kwargs): + """Generate a tiny model by slicing layers from the original model.""" def slice_layers(module): + """slice layers in the model.""" for name, child in module.named_children(): if isinstance(child, torch.nn.ModuleList) and len(child) > num_layers: new_layers = torch.nn.ModuleList(child[:num_layers]) @@ -49,7 +47,12 @@ def slice_layers(module): return True return False - model, tokenizer = llm_load_model(model_name_or_path) + kwargs["dtype"] = "auto" if "auto" not in kwargs else kwargs["dtype"] + kwargs["trust_remote_code"] = True if "trust_remote_code" not in kwargs else kwargs["trust_remote_code"] + if is_mllm: + model, processor, tokenizer, image_processor = mllm_load_model(model_name_or_path, **kwargs) + else: + model, tokenizer = llm_load_model(model_name_or_path, **kwargs) slice_layers(model) if hasattr(model.config, "num_hidden_layers"): @@ -61,13 +64,25 @@ def slice_layers(module): # for fixture usage only -def save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2): - model = get_tiny_model(model_name_or_path, num_layers=num_layers) +def save_tiny_model(model_name_or_path, 
tiny_model_path, num_layers=2, is_mllm=False, force_untie=False, **kwargs): + """Generate a tiny model and save to the specified path.""" + model = get_tiny_model(model_name_or_path, num_layers=num_layers, is_mllm=is_mllm, **kwargs) + if force_untie: + if getattr(getattr(model, "config", None), "tie_word_embeddings", False): + model.config.tie_word_embeddings = False + for key in model._tied_weights_keys: + weight = get_attr(model, key) + set_attr(model, key, copy.deepcopy(weight)) tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True) test_path = os.path.dirname(__file__) tiny_model_path = os.path.join(test_path, tiny_model_path.removeprefix("./")) model.save_pretrained(tiny_model_path) tokenizer.save_pretrained(tiny_model_path) + if is_mllm: + processor = transformers.AutoProcessor.from_pretrained(model_name_or_path, trust_remote_code=True) + processor.save_pretrained(tiny_model_path) + image_processor = transformers.AutoImageProcessor.from_pretrained(model_name_or_path, trust_remote_code=True) + image_processor.save_pretrained(tiny_model_path) print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") return tiny_model_path @@ -83,6 +98,7 @@ def is_pytest_mode_lazy(): # General model inference code def model_infer(model, tokenizer, apply_chat_template=False): + """Run model inference and print generated outputs.""" prompts = [ "Hello,my name is", # "The president of the United States is", @@ -131,7 +147,10 @@ def __iter__(self): def get_output(model_name_or_path): """Get model output for fixed input.""" - model, tokenizer = llm_load_model(model_name_or_path) + try: + model, tokenizer = llm_load_model(model_name_or_path) + except: + model, processor, tokenizer, image_processor = mllm_load_model(model_name_or_path) outputs = model(fixed_input)[0] return outputs.detach().cpu() diff --git a/test/test_cpu/test_autoround.py b/test/test_cpu/test_autoround.py index 3d5d60c24..aa7aeca5e 100644 --- a/test/test_cpu/test_autoround.py +++ b/test/test_cpu/test_autoround.py @@ -677,8 +677,8 @@ def test_invalid_layer_config(self, tiny_opt_model_path): ) ar.quantize() - def test_quant_lm_head(self): - model_name = get_model_path("Qwen/Qwen3-8B") + def test_quant_lm_head(self, tiny_untied_qwen_model_path): + model_name = tiny_untied_qwen_model_path ar = AutoRound(model_name, quant_lm_head=True, iters=0, seqlen=8, nsamples=1, disable_opt_rtn=True) ar.quantize_and_save(output_dir=self.save_folder, format="auto_round") model = AutoModelForCausalLM.from_pretrained(self.save_folder, device_map="cpu") @@ -700,8 +700,8 @@ def test_quant_lm_head(self): assert "lm_head" in model.config.quantization_config.extra_config assert model.config.quantization_config.extra_config["lm_head"]["bits"] == 4 - def test_quant_lm_head_layer_config(self): - model_name = get_model_path("Qwen/Qwen3-8B") + def test_quant_lm_head_layer_config(self, tiny_untied_qwen_model_path): + model_name = tiny_untied_qwen_model_path layer_config = {"lm_head": {"bits": 4}} ar = AutoRound( model_name, diff --git a/test/test_cpu/test_cli_usage.py b/test/test_cpu/test_cli_usage.py index b848c22df..82466dc82 100644 --- a/test/test_cpu/test_cli_usage.py +++ b/test/test_cpu/test_cli_usage.py @@ -17,7 +17,7 @@ def teardown_class(self): shutil.rmtree("../../saved", ignore_errors=True) shutil.rmtree("../../tmp_autoround", ignore_errors=True) - def test_auto_round_cmd(self, tiny_opt_model_path): + def test_auto_round_cmd(self, tiny_opt_model_path, tiny_qwen_vl_model_path): python_path = 
sys.executable # Test llm script @@ -56,13 +56,13 @@ def test_auto_round_cmd(self, tiny_opt_model_path): assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --mllm --model {get_model_path('Qwen/Qwen2-VL-2B-Instruct')} --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" + f"cd ../.. && {python_path} -m auto_round --mllm --model {tiny_qwen_vl_model_path} --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model {get_model_path('Qwen/Qwen2-VL-2B-Instruct')} --seqlen 32 --format auto_round" + f"cd ../.. && {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model {tiny_qwen_vl_model_path} --seqlen 32 --format auto_round" " --quant_nontext_module --output_dir ./saved " ) if res > 0 or res == -1: diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index 169b07825..81bb0667c 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -4,29 +4,29 @@ import pytest import torch +import transformers from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -from ..helpers import get_model_path, get_tiny_model +from ..helpers import get_tiny_model, qwen_name_or_path class TestGGUF: @classmethod def setup_class(self): - self.model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained(qwen_name_or_path, trust_remote_code=True) @classmethod def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_basic_usage(self): + def test_basic_usage(self, tiny_gemma_model_path, tiny_qwen_model_path): python_path = sys.executable res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {get_model_path('benzart/gemma-2b-it-fine-tuning-for-code-test')} " + f"cd ../.. && {python_path} -m auto_round --model {tiny_gemma_model_path} " f" --bs 16 --iters 0 --nsamples 1 --format gguf:q4_k_m" ) if res > 0 or res == -1: @@ -34,17 +34,17 @@ def test_basic_usage(self): shutil.rmtree("./saved", ignore_errors=True) res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {self.model_name}" + f"cd ../.. 
&& {python_path} -m auto_round --model {tiny_qwen_model_path}" f" --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" shutil.rmtree("./saved", ignore_errors=True) - def test_q4_0(self): + def test_q4_0(self, tiny_qwen_model_path): bits, group_size, sym = 4, 32, True autoround = AutoRound( - self.model_name, + tiny_qwen_model_path, bits=bits, group_size=group_size, sym=sym, @@ -61,13 +61,12 @@ def test_q4_0(self): text = "There is a girl who likes adventure," inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) - shutil.rmtree("./saved", ignore_errors=True) - def test_func(self): + def test_func(self, tiny_qwen_model_path): bits, group_size, sym = 4, 128, True autoround = AutoRound( - self.model_name, + tiny_qwen_model_path, iters=1, nsamples=1, seqlen=10, @@ -84,8 +83,8 @@ def test_func(self): print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_gguf_baseline(self): - model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") + def test_gguf_baseline(self, tiny_qwen_model_path): + model_name = tiny_qwen_model_path model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) autoround = AutoRound( model, @@ -103,16 +102,16 @@ def test_gguf_baseline(self): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="fake") + model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto") text = "There is a girl who likes adventure," inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_q4_k_m(self, dataloader): - model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + def test_q4_k_m(self, tiny_qwen_model_path, dataloader): + model = get_tiny_model(qwen_name_or_path, num_layers=4) + tokenizer = transformers.AutoTokenizer.from_pretrained(qwen_name_or_path, trust_remote_code=True) layer_config = { "lm_head": { "bits": 4, @@ -123,8 +122,8 @@ def test_q4_k_m(self, dataloader): "super_group_size": 8, }, "model.embed_tokens": {"bits": 6, "group_size": 32, "super_bits": 6, "super_group_size": 8}, - "model.layers.12.mlp.gate_proj": {"bits": 3}, - "model.layers.10.mlp.gate_proj": {"bits": 8}, + "model.layers.3.mlp.gate_proj": {"bits": 3}, + "model.layers.1.mlp.gate_proj": {"bits": 8}, } autoround = AutoRound( model, @@ -138,26 +137,26 @@ def test_q4_k_m(self, dataloader): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") - assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["super_group_size"] == 16 - assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["data_type"] == "int_sym_dq" - assert autoround.layer_config["model.layers.7.self_attn.v_proj"]["data_type"] == "int_asym_dq" + assert autoround.layer_config["model.layers.2.self_attn.v_proj"]["super_group_size"] == 16 + assert autoround.layer_config["model.layers.2.self_attn.v_proj"]["data_type"] == "int_sym_dq" + assert 
autoround.layer_config["model.layers.0.self_attn.v_proj"]["data_type"] == "int_asym_dq" assert autoround.model.model.layers[0].self_attn.v_proj.bits == 6 - assert autoround.model.model.layers[12].self_attn.v_proj.bits == 4 + assert autoround.model.model.layers[3].self_attn.v_proj.bits == 4 assert autoround.model.model.embed_tokens.bits == 6 assert autoround.model.model.embed_tokens.group_size == 16 - assert autoround.model.model.layers[12].mlp.gate_proj.bits == 3 - assert autoround.model.model.layers[10].mlp.gate_proj.bits == 8 - assert autoround.layer_config["model.layers.10.mlp.gate_proj"]["mostly"] == "gguf:q8_0" + assert autoround.model.model.layers[3].mlp.gate_proj.bits == 3 + assert autoround.model.model.layers[1].mlp.gate_proj.bits == 8 + assert autoround.layer_config["model.layers.1.mlp.gate_proj"]["mostly"] == "gguf:q8_0" shutil.rmtree("./saved", ignore_errors=True) - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(tiny_qwen_model_path, torch_dtype="auto", trust_remote_code=True) autoround = AutoRound(model, tokenizer, iters=0, nsamples=1, seqlen=128, disable_opt_rtn=False) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") shutil.rmtree("./saved", ignore_errors=True) - def test_all_format(self): - model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") + def test_all_format(self, tiny_qwen_model_path): + model_name = tiny_qwen_model_path python_path = sys.executable # for gguf_format in ["gguf:q4_0", "gguf:q4_1", "gguf:q4_k_m", "gguf:q6_k"]: for gguf_format in ["gguf:q4_k_m"]: @@ -211,7 +210,7 @@ def test_vlm_gguf(self, tiny_qwen_vl_model_path): assert abs(file_size - 892) < 5.0 shutil.rmtree("./saved", ignore_errors=True) - def test_qtype_setting(self): + def test_qtype_setting(self, tiny_qwen_model_path): # Qwen2.5-0.5B-Instruct no output, token_embed q6_k fallbakc to q8_0 336M # Qwen3-0.6B output q6_k, token_embed q4_0 448M # Qwen3-8B output q6_k, token_embed q4_0 4.5G @@ -219,7 +218,7 @@ def test_qtype_setting(self): from auto_round.compressors.utils import set_layer_config from auto_round.export.export_to_gguf.config import ModelType - model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") + model_name = tiny_qwen_model_path ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) ar.formats = ["gguf:q4_0"] ar.layer_config, _, _ = set_layer_config( @@ -238,7 +237,7 @@ def test_qtype_setting(self): assert ar.layer_config["model.embed_tokens"]["bits"] == 8 assert "lm_head" not in ar.layer_config - model_name = "Qwen/Qwen3-0.6B" + model_name = tiny_qwen_model_path ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) ar.formats = ["gguf:q4_0"] ar.layer_config, _, _ = set_layer_config( diff --git a/test/test_cpu/test_mllm.py b/test/test_cpu/test_mllm.py index ec5c1487e..0f8f7219c 100644 --- a/test/test_cpu/test_mllm.py +++ b/test/test_cpu/test_mllm.py @@ -205,10 +205,10 @@ def test_str_input(self): ) print(output_text[0]) - def test_qwen2_5(self): + def test_qwen2_5(self, tiny_qwen_vl_model_path): from auto_round.utils import mllm_load_model - model_name = get_model_path("Qwen/Qwen2.5-VL-3B-Instruct") + model_name = tiny_qwen_vl_model_path model, processor, tokenizer, image_processor = mllm_load_model(model_name) autoround = AutoRoundMLLM( model, @@ -258,4 +258,3 @@ def test_qwen2_5(self): output_text = processor.batch_decode( generated_ids_trimmed, skip_special_tokens=True, 
clean_up_tokenization_spaces=False ) - print(output_text) diff --git a/test/test_cpu/test_scheme.py b/test/test_cpu/test_scheme.py index 890b4bee4..7a60a9ccd 100644 --- a/test/test_cpu/test_scheme.py +++ b/test/test_cpu/test_scheme.py @@ -1,18 +1,16 @@ import shutil -import pytest -import torch +import transformers from auto_round import AutoRound from auto_round.schemes import QuantizationScheme -from ..helpers import get_model_path, opt_name_or_path, qwen_name_or_path +from ..helpers import get_model_path, get_tiny_model, opt_name_or_path, qwen_name_or_path class TestAutoRound: @classmethod def setup_class(self): - self.model_name = opt_name_or_path self.save_folder = "./saved" @classmethod @@ -20,9 +18,9 @@ def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_gguf(self, dataloader): + def test_gguf(self, tiny_qwen_model_path, dataloader): ar = AutoRound( - qwen_name_or_path, + tiny_qwen_model_path, scheme="W2A16", nsamples=1, iters=1, @@ -33,60 +31,63 @@ def test_gguf(self, dataloader): assert ar.bits == 4 shutil.rmtree(self.save_folder, ignore_errors=True) - def test_w4a16(self, dataloader): - ar = AutoRound(self.model_name, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + def test_w4a16(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1, seqlen=2, dataset=dataloader) assert ar.bits == 4 ar.quantize() - def test_w2a16_rtn(self, dataloader): - ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=dataloader) + def test_w2a16_rtn(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0, seqlen=2, dataset=dataloader) assert ar.bits == 2 ar.quantize() - def test_mxfp4(self, dataloader): - ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + def test_mxfp4(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) assert ar.bits == 4 assert ar.act_bits == 4 assert ar.data_type == "mx_fp" assert ar.act_data_type == "mx_fp_rceil" ar.quantize() - def test_vllm(self): + def test_vllm(self, tiny_qwen_vl_model_path): from auto_round import AutoRoundMLLM - ar = AutoRoundMLLM(get_model_path("Qwen/Qwen2-VL-2B-Instruct"), scheme="W2A16", nsamples=1, iters=1, seqlen=2) + ar = AutoRoundMLLM(tiny_qwen_vl_model_path, scheme="W2A16", nsamples=1, iters=1, seqlen=2) assert ar.bits == 2 assert ar.act_bits == 16 - def test_nvfp4(self, dataloader): - ar = AutoRound(self.model_name, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) + def test_nvfp4(self, tiny_opt_model_path, dataloader): + ar = AutoRound(tiny_opt_model_path, scheme="NVFP4", nsamples=1, iters=1, seqlen=2, dataset=dataloader) assert ar.bits == 4 assert ar.act_bits == 4 assert ar.data_type == "nv_fp" assert ar.act_data_type == "nv_fp4_with_static_gs" ar.quantize() - def test_all_scheme(self, dataloader): + def test_all_scheme(self, tiny_opt_model_path, tiny_qwen_model_path, dataloader): import copy preset_schemes = ["W8A16", "MXFP8", "FPW8A16", "FP8_STATIC", "GGUF:Q2_K_S", "GGUF:Q4_K_M"] for scheme in preset_schemes: - model_name = self.model_name + model_name = tiny_opt_model_path if "gguf" in scheme.lower(): - model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") + model_name = tiny_qwen_model_path print(f"scheme={scheme}") ar = 
AutoRound(model_name, scheme=scheme, nsamples=1, iters=1, seqlen=2, dataset=dataloader) ar.quantize_and_save(self.save_folder) shutil.rmtree(self.save_folder, ignore_errors=True) def test_scheme_in_layer_config(self, dataloader): + model = get_tiny_model(opt_name_or_path, num_layers=5) + tokenizer = transformers.AutoTokenizer.from_pretrained(opt_name_or_path, trust_remote_code=True) layer_config = { "model.decoder.layers.2.self_attn": {"bits": 2}, "model.decoder.layers.3.self_attn.v_proj": "W8A16", "model.decoder.layers.4.self_attn.k_proj": QuantizationScheme.from_dict({"group_size": 64}), } ar = AutoRound( - opt_name_or_path, + model, + tokenizer, scheme="W3A16", nsamples=1, iters=1, diff --git a/test/test_cpu/test_script.py b/test/test_cpu/test_script.py deleted file mode 100644 index aa25d7f61..000000000 --- a/test/test_cpu/test_script.py +++ /dev/null @@ -1,15 +0,0 @@ -import os - -import pytest - - -class TestScript: - def test_default(self): - os.system( - """ - cd ../.. && - python -m auto_round - --iters 2 - --deployment_device fake - --output_dir ./tmp_script_test""" - ) diff --git a/test/test_cuda/test_auto_scheme.py b/test/test_cuda/test_auto_scheme.py index b8c40c470..259bc4450 100644 --- a/test/test_cuda/test_auto_scheme.py +++ b/test/test_cuda/test_auto_scheme.py @@ -223,8 +223,8 @@ def test_layer_config(self): print(avg_bits) assert target_bits - 0.1 < avg_bits <= target_bits + 1e-3 - def test_lm_head_and_mix_dtype(self): - model_name = get_model_path("qwen/Qwen3-8B") + def test_lm_head_and_mix_dtype(self, tiny_untied_qwen_model_path): + model_name = tiny_untied_qwen_model_path model = get_tiny_model(model_name) tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) target_bits = 6 From 418022fa5ae87f03e1f3cae7643403d49fbf4740 Mon Sep 17 00:00:00 2001 From: "Sun, Xuehao" Date: Mon, 22 Dec 2025 15:47:24 +0800 Subject: [PATCH 14/24] fix ut path Signed-off-by: Sun, Xuehao --- .azure-pipelines/scripts/ut/run_ut.sh | 5 ++--- .azure-pipelines/scripts/ut/run_ut_cuda.sh | 24 ++++++++++------------ .azure-pipelines/scripts/ut/run_ut_hpu.sh | 7 +++---- 3 files changed, 16 insertions(+), 20 deletions(-) diff --git a/.azure-pipelines/scripts/ut/run_ut.sh b/.azure-pipelines/scripts/ut/run_ut.sh index dcf1a7170..e7d3d9e00 100644 --- a/.azure-pipelines/scripts/ut/run_ut.sh +++ b/.azure-pipelines/scripts/ut/run_ut.sh @@ -19,8 +19,7 @@ cd /auto-round && uv pip install . echo "##[endgroup]" uv pip list -cd /auto-round/test/test_cpu || exit 1 -find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + +cd /auto-round/test || exit 1 export LD_LIBRARY_PATH=${HOME}/.venv/lib/:$LD_LIBRARY_PATH export FORCE_BF16=1 @@ -32,7 +31,7 @@ mkdir -p ${LOG_DIR} ut_log_name=${LOG_DIR}/ut.log # Split test files into 5 parts -find . 
-name "test*.py" | sort > all_tests.txt +find ./test_cpu -name "test*.py" | sort > all_tests.txt total_lines=$(wc -l < all_tests.txt) NUM_CHUNKS=5 q=$(( total_lines / NUM_CHUNKS )) diff --git a/.azure-pipelines/scripts/ut/run_ut_cuda.sh b/.azure-pipelines/scripts/ut/run_ut_cuda.sh index 18a9bb00d..0f111d3fa 100644 --- a/.azure-pipelines/scripts/ut/run_ut_cuda.sh +++ b/.azure-pipelines/scripts/ut/run_ut_cuda.sh @@ -27,16 +27,14 @@ function create_conda_env() { # install AutoRound cd ${REPO_PATH} - pip uninstall auto-round -y + uv pip install torch==2.8.0 torchvision uv pip install -r requirements.txt - sed -i '/^torch==/d;/^transformers==/d;/^lm-eval==/d' requirements.txt if [ -d "/proc/driver/nvidia" ]; then export PATH=/usr/local/cuda/bin${PATH:+:${PATH}} export LD_LIBRARY_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/nvidia/nvjitlink/lib:$LD_LIBRARY_PATH fi uv pip install --no-build-isolation . uv pip install pytest-cov pytest-html cmake==4.0.2 - uv pip install torch==2.8.0 torchvision } function print_test_results_table() { @@ -92,7 +90,7 @@ function run_unit_test() { # install unit test dependencies create_conda_env - cd ${REPO_PATH}/test/test_cuda + cd ${REPO_PATH}/test rm -rf .coverage* *.xml *.html uv pip install -v git+https://github.com/casper-hansen/AutoAWQ.git --no-build-isolation @@ -100,15 +98,15 @@ function run_unit_test() { uv pip install -r https://raw.githubusercontent.com/ModelCloud/GPTQModel/refs/heads/main/requirements.txt CMAKE_ARGS="-DGGML_CUDA=on -DLLAVA_BUILD=off" uv pip install llama-cpp-python uv pip install 'git+https://github.com/ggml-org/llama.cpp.git#subdirectory=gguf-py' - uv pip install -r requirements.txt - uv pip install -r requirements_diffusion.txt + uv pip install -r test_cuda/requirements.txt + uv pip install -r test_cuda/requirements_diffusion.txt pip list > ${LOG_DIR}/ut_pip_list.txt export COVERAGE_RCFILE=${REPO_PATH}/.azure-pipelines/scripts/ut/.coverage local auto_round_path=$(python -c 'import auto_round; print(auto_round.__path__[0])') # run unit tests individually with separate logs - for test_file in $(find . -name "test_*.py" ! -name "test_*vlms.py" ! -name "test_llmc*.py" | sort); do + for test_file in $(find ./test_cuda -name "test_*.py" ! -name "test_*vlms.py" ! -name "test_llmc*.py" | sort); do local test_basename=$(basename ${test_file} .py) local ut_log_name=${LOG_DIR}/unittest_cuda_${test_basename}.log echo "Running ${test_file}..." @@ -128,7 +126,7 @@ function run_unit_test() { function run_unit_test_vlm() { # install unit test dependencies create_conda_env - cd ${REPO_PATH}/test/test_cuda + cd ${REPO_PATH}/test rm -rf .coverage* *.xml *.html uv pip install git+https://github.com/haotian-liu/LLaVA.git@v1.2.2 --no-deps @@ -138,14 +136,14 @@ function run_unit_test_vlm() { uv pip install git+https://github.com/deepseek-ai/DeepSeek-VL2.git timm attrdict --no-deps uv pip install -v git+https://github.com/casper-hansen/AutoAWQ.git@v0.2.0 --no-build-isolation uv pip install flash-attn==2.7.4.post1 --no-build-isolation - uv pip install -r requirements_vlm.txt + uv pip install -r test_cuda/requirements_vlm.txt pip list > ${LOG_DIR}/vlm_ut_pip_list.txt export COVERAGE_RCFILE=${REPO_PATH}/.azure-pipelines/scripts/ut/.coverage local auto_round_path=$(python -c 'import auto_round; print(auto_round.__path__[0])') # run VLM unit tests individually with separate logs - for test_file in $(find . 
-name "test*vlms.py"); do + for test_file in $(find ./test_cuda -name "test*vlms.py"); do local test_basename=$(basename ${test_file} .py) local ut_log_name=${LOG_DIR}/unittest_cuda_vlm_${test_basename}.log echo "Running ${test_file}..." @@ -166,17 +164,17 @@ function run_unit_test_llmc() { # install unit test dependencies create_conda_env - cd ${REPO_PATH}/test/test_cuda + cd ${REPO_PATH}/test rm -rf .coverage* *.xml *.html - uv pip install -r requirements_llmc.txt + uv pip install -r test_cuda/requirements_llmc.txt pip list > ${LOG_DIR}/llmc_ut_pip_list.txt export COVERAGE_RCFILE=${REPO_PATH}/.azure-pipelines/scripts/ut/.coverage local auto_round_path=$(python -c 'import auto_round; print(auto_round.__path__[0])') # run unit tests individually with separate logs - for test_file in $(find . -name "test_llmc*.py" | sort); do + for test_file in $(find ./test_cuda -name "test_llmc*.py" | sort); do local test_basename=$(basename ${test_file} .py) local ut_log_name=${LOG_DIR}/unittest_cuda_llmc_${test_basename}.log echo "Running ${test_file}..." diff --git a/.azure-pipelines/scripts/ut/run_ut_hpu.sh b/.azure-pipelines/scripts/ut/run_ut_hpu.sh index 3c3bb6991..b370edfb5 100644 --- a/.azure-pipelines/scripts/ut/run_ut_hpu.sh +++ b/.azure-pipelines/scripts/ut/run_ut_hpu.sh @@ -7,8 +7,7 @@ export TQDM_MININTERVAL=60 pip install pytest-cov pytest-html pip list -cd /auto-round/test/test_hpu || exit 1 -find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + +cd /auto-round/test || exit 1 export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH export FORCE_BF16=1 @@ -19,8 +18,8 @@ LOG_DIR=/auto-round/log_dir mkdir -p ${LOG_DIR} ut_log_name=${LOG_DIR}/ut.log -find . -name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_lazy.sh -find . 
-name "test*.py" | sed "s,\.\/,python -m pytest --mode compile --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_compile.sh +find ./test_hpu -name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_lazy.sh +find ./test_hpu -name "test*.py" | sed "s,\.\/,python -m pytest --mode compile --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run_compile.sh cat run_lazy.sh bash run_lazy.sh 2>&1 | tee ${ut_log_name} From 71ec4c220842d751cc72257e5d7dd6bd84458673 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Mon, 22 Dec 2025 11:15:25 -0500 Subject: [PATCH 15/24] fix UT failures Signed-off-by: He, Xin3 --- auto_round/compressors/mllm/compressor.py | 3 +++ test/fixtures.py | 10 ++++++++++ test/helpers.py | 8 +++++++- test/test_cpu/test_gguf_format.py | 8 +++++--- test/test_cpu/test_mllm.py | 4 ++-- 5 files changed, 27 insertions(+), 6 deletions(-) diff --git a/auto_round/compressors/mllm/compressor.py b/auto_round/compressors/mllm/compressor.py index c6808eeb0..e690bc7a9 100644 --- a/auto_round/compressors/mllm/compressor.py +++ b/auto_round/compressors/mllm/compressor.py @@ -205,6 +205,9 @@ def __init__( if hasattr(model, "name_or_path") and any([name in model.name_or_path for name in MISTRAL_3_2_MODELS]): template = "mistral3_2" if iters > 0: + # TODO: Remove after fixing https://github.com/huggingface/transformers/issues/43005 + model.config.model_type = model.config.to_dict()["model_type"] + self.template = template if template is not None else model.config.model_type if not isinstance(dataset, torch.utils.data.DataLoader): self.template = get_template( diff --git a/test/fixtures.py b/test/fixtures.py index e64f9d25b..86bc36e48 100644 --- a/test/fixtures.py +++ b/test/fixtures.py @@ -14,6 +14,7 @@ lamini_name_or_path, opt_name_or_path, phi2_name_or_path, + qwen_2_5_vl_name_or_path, qwen_moe_name_or_path, qwen_name_or_path, qwen_vl_name_or_path, @@ -112,6 +113,15 @@ def tiny_qwen_vl_model_path(): shutil.rmtree(tiny_model_path) +@pytest.fixture(scope="session") +def tiny_qwen_2_5_vl_model_path(): + model_name_or_path = qwen_2_5_vl_name_or_path + tiny_model_path = "./tmp_tiny_qwen_2_5_vl_model_path" + tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2, is_mllm=True) + yield tiny_model_path + shutil.rmtree(tiny_model_path) + + # Create objective fixtures for testing @pytest.fixture(scope="function") def tiny_opt_model(): diff --git a/test/helpers.py b/test/helpers.py index 9f29c9b1d..b239bb451 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -29,6 +29,7 @@ def get_model_path(model_name: str) -> str: deepseek_v2_name_or_path = get_model_path("deepseek-ai/DeepSeek-V2-Lite") qwen_moe_name_or_path = get_model_path("Qwen/Qwen1.5-MoE-A2.7B") qwen_vl_name_or_path = get_model_path("Qwen/Qwen2-VL-2B-Instruct") +qwen_2_5_vl_name_or_path = get_model_path("Qwen/Qwen2.5-VL-3B-Instruct") gemma_name_or_path = get_model_path("benzart/gemma-2b-it-fine-tuning-for-code-test") @@ -51,6 +52,11 @@ def slice_layers(module): kwargs["trust_remote_code"] = True if "trust_remote_code" not in kwargs else kwargs["trust_remote_code"] if is_mllm: model, processor, tokenizer, image_processor = 
mllm_load_model(model_name_or_path, **kwargs) + if hasattr(model.config, "vision_config"): + if hasattr(model.config.vision_config, "num_hidden_layers"): # mistral, etc. + model.config.num_hidden_layers = num_layers + elif hasattr(model.config.vision_config, "depth"): # qwen vl + model.config.vision_config.depth = num_layers else: model, tokenizer = llm_load_model(model_name_or_path, **kwargs) slice_layers(model) @@ -80,8 +86,8 @@ def save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2, is_mllm=F tokenizer.save_pretrained(tiny_model_path) if is_mllm: processor = transformers.AutoProcessor.from_pretrained(model_name_or_path, trust_remote_code=True) - processor.save_pretrained(tiny_model_path) image_processor = transformers.AutoImageProcessor.from_pretrained(model_name_or_path, trust_remote_code=True) + processor.save_pretrained(tiny_model_path) image_processor.save_pretrained(tiny_model_path) print(f"[Fixture]: built tiny model path:{tiny_model_path} for testing in session") return tiny_model_path diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index 81bb0667c..b7f25541c 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -9,7 +9,7 @@ from auto_round import AutoRound -from ..helpers import get_tiny_model, qwen_name_or_path +from ..helpers import get_tiny_model, qwen_name_or_path, qwen_vl_name_or_path class TestGGUF: @@ -185,8 +185,10 @@ def test_all_format(self, tiny_qwen_model_path): assert False, "cmd line test fail, please have a check" shutil.rmtree("../../tmp_autoround", ignore_errors=True) - def test_vlm_gguf(self, tiny_qwen_vl_model_path): - model_name = tiny_qwen_vl_model_path + def test_vlm_gguf(self): + # TODO: Using two-layers tiny model will return ValueError: + # Can not map tensor 'model.layers.10.input_layernorm.weight' + model_name = qwen_vl_name_or_path from auto_round import AutoRoundMLLM from auto_round.utils import mllm_load_model diff --git a/test/test_cpu/test_mllm.py b/test/test_cpu/test_mllm.py index 0f8f7219c..2eb1d3e2f 100644 --- a/test/test_cpu/test_mllm.py +++ b/test/test_cpu/test_mllm.py @@ -205,10 +205,10 @@ def test_str_input(self): ) print(output_text[0]) - def test_qwen2_5(self, tiny_qwen_vl_model_path): + def test_qwen2_5(self, tiny_qwen_2_5_vl_model_path): from auto_round.utils import mllm_load_model - model_name = tiny_qwen_vl_model_path + model_name = tiny_qwen_2_5_vl_model_path model, processor, tokenizer, image_processor = mllm_load_model(model_name) autoround = AutoRoundMLLM( model, From f1700bd901797ead09cd8f802ad88075a1067e6d Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 23 Dec 2025 00:25:35 -0500 Subject: [PATCH 16/24] update cuda ut Signed-off-by: n1ck-guo --- test/helpers.py | 4 + test/test_cuda/test_gguf.py | 114 ++++++++------------- test/test_cuda/test_main_func.py | 54 +++++----- test/test_cuda/test_marlin_backend.py | 56 +++++----- test/test_cuda/test_mix_bits.py | 53 +++++----- test/test_cuda/test_multiple_card.py | 72 ++++++------- test/test_cuda/test_multiple_card_calib.py | 18 ++-- test/test_cuda/test_mxfp_and_nvfp_quant.py | 9 +- test/test_cuda/test_mxfp_nvfp.py | 40 +++++--- test/test_cuda/test_qbits.py | 43 +++++--- test/test_cuda/test_scheme.py | 62 ++++++----- test/test_cuda/test_torch_backend.py | 45 ++++---- 12 files changed, 299 insertions(+), 271 deletions(-) diff --git a/test/helpers.py b/test/helpers.py index b239bb451..f30e632f7 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -13,6 +13,9 @@ def get_model_path(model_name: str) -> 
str: ut_path = f"/tf_dataset/auto_round/models/{model_name}" local_path = f"/models/{model_name.split('/')[-1]}" + if "DeepSeek-V2-Lite" in model_name and os.path.exists("/data0/deepseek-ai/DeepSeek-V2-Lite"): + return "/data0/deepseek-ai/DeepSeek-V2-Lite" + if os.path.exists(ut_path): return ut_path elif os.path.exists(local_path): @@ -36,6 +39,7 @@ def get_model_path(model_name: str) -> str: # Slice model into tiny model for speedup def get_tiny_model(model_name_or_path, num_layers=2, is_mllm=False, **kwargs): """Generate a tiny model by slicing layers from the original model.""" + model_name_or_path = get_model_path(model_name_or_path) def slice_layers(module): """slice layers in the model.""" diff --git a/test/test_cuda/test_gguf.py b/test/test_cuda/test_gguf.py index a7076667c..7a3a0cd89 100644 --- a/test/test_cuda/test_gguf.py +++ b/test/test_cuda/test_gguf.py @@ -10,19 +10,30 @@ from auto_round import AutoRound from auto_round.testing_utils import require_gguf +from ..helpers import get_model_path, get_tiny_model, save_tiny_model + class TestAutoRound: - @classmethod - def teardown_class(self): + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gguf - def test_gguf_format(self, dataloader): - model_name = "Qwen/Qwen2.5-0.5B-Instruct" + def test_gguf_format(self, tiny_qwen_model_path, dataloader): bits, group_size, sym = 4, 32, False autoround = AutoRound( - model_name, + tiny_qwen_model_path, bits=bits, group_size=group_size, sym=sym, @@ -44,9 +55,8 @@ def test_gguf_format(self, dataloader): shutil.rmtree("./saved", ignore_errors=True) save_dir = os.path.join(os.path.dirname(__file__), "saved") - model_path = "Qwen/Qwen2.5-0.5B-Instruct" res = os.system( - f"cd ../.. && {sys.executable} -m auto_round --model {model_path} --iter 2 " + f"cd ../.. 
&& {sys.executable} -m auto_round --model {tiny_qwen_model_path} --iter 2 " f"--output_dir {save_dir} --nsample 2 --format gguf:q4_0 --device 0" ) print(save_dir) @@ -54,8 +64,8 @@ def test_gguf_format(self, dataloader): from llama_cpp import Llama - gguf_file = os.listdir("saved/Qwen2.5-0.5B-Instruct-gguf")[0] - llm = Llama(f"saved/Qwen2.5-0.5B-Instruct-gguf/{gguf_file}", n_gpu_layers=-1) + gguf_file = os.listdir("saved/tmp_tiny_qwen_model_path-gguf")[0] + llm = Llama(f"saved/tmp_tiny_qwen_model_path-gguf/{gguf_file}", n_gpu_layers=-1) output = llm("There is a girl who likes adventure,", max_tokens=32) print(output) shutil.rmtree("./saved", ignore_errors=True) @@ -63,9 +73,12 @@ def test_gguf_format(self, dataloader): @require_gguf def test_q2_k_export(self, dataloader): bits, group_size, sym = 2, 16, False - model_name = "Qwen/Qwen2.5-1.5B-Instruct" + model_path = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") + model = get_tiny_model(model_path) + tokenizer = AutoTokenizer.from_pretrained(model_path) autoround = AutoRound( - model_name, + model, + tokenizer, bits=bits, group_size=group_size, sym=sym, @@ -84,20 +97,13 @@ def test_q2_k_export(self, dataloader): inputs = autoround.tokenizer(text, return_tensors="pt").to(model.device) result = autoround.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0]) print(result) - - from auto_round.eval.evaluation import simple_evaluate_user_model - - result = simple_evaluate_user_model(model, autoround.tokenizer, batch_size=16, tasks="piqa") - assert result["results"]["piqa"]["acc,none"] > 0.45 - shutil.rmtree(quantized_model_path, ignore_errors=True) @require_gguf - def test_basic_usage(self): - model_name = "Qwen/Qwen2.5-0.5B-Instruct" + def test_basic_usage(self, tiny_qwen_model_path): python_path = sys.executable res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {model_name} --eval_task_by_task" + f"cd ../.. 
&& {python_path} -m auto_round --model {tiny_qwen_model_path} --eval_task_by_task" f" --tasks piqa,openbookqa --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0 --eval_model_dtype bf16" ) if res > 0 or res == -1: @@ -106,7 +112,7 @@ def test_basic_usage(self): @require_gguf def test_q4_0(self): - model_name = "Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") bits, group_size, sym = 4, 32, True autoround = AutoRound(model_name, bits=bits, group_size=group_size, sym=sym, iters=1, data_type="int") autoround.quantize() @@ -127,7 +133,7 @@ def test_q4_0(self): @require_gguf def test_q4_1(self): - model_name = "Qwen/Qwen2.5-0.5B-Instruct" + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") bits, group_size, sym = 4, 32, False autoround = AutoRound(model=model_name, bits=bits, group_size=group_size, sym=sym, iters=1, data_type="int") autoround.quantize() @@ -148,31 +154,23 @@ def test_q4_1(self): @require_gguf def test_all_format(self): - from auto_round.export.export_to_gguf.config import GGUF_CONFIG + for model_name in ["qwen/Qwen3-8B", "meta-llama/Llama-3.1-8B-Instruct", "meta-llama/Llama-3.2-3B"]: + for gguf_format in ["gguf:q5_0", "gguf:q5_1", "gguf:q3_k_m", "q5_k_m", "q6_k", "q8_0"]: + model_path = get_model_path(model_name) + tiny_model_path = "tmp_tiny_model" + tiny_model_path = save_tiny_model(model_path, tiny_model_path, num_layers=2) + ar = AutoRound(tiny_model_path, scheme=gguf_format, iters=0, nsampels=1, seqlen=16) + ar.quantize_and_save(output_dir=self.save_dir, format=gguf_format) - python_path = sys.executable - for model_name in ["/models/Qwen3-8B/", "/models/Llama-3.2-3B/", "/models/Meta-Llama-3.1-8B-Instruct"]: - for gguf_format in GGUF_CONFIG.keys(): - print(model_name, gguf_format) - res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {model_name} " - f" --bs 16 --iters 1 --nsamples 1 --format fake,{gguf_format}" - ) - if res > 0 or res == -1: - assert False, "cmd line test fail, please have a check" - shutil.rmtree("../../tmp_autoround", ignore_errors=True) - - res = os.system( - f"cd ../.. 
&& {python_path} -m auto_round --model {model_name} " - f" --bs 16 --iters 0 --nsamples 1 --format {gguf_format}" - ) - if res > 0 or res == -1: - assert False, "cmd line test fail, please have a check" - shutil.rmtree("../../tmp_autoround", ignore_errors=True) + ar = AutoRound(tiny_model_path, scheme=gguf_format, iters=1, nsampels=1, seqlen=16) + ar.quantize_and_save(output_dir=self.save_dir, format=gguf_format) + + shutil.rmtree(tiny_model_path, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_gguf def test_vlm_gguf(self): - model_name = "/models/Qwen2.5-VL-7B-Instruct" + model_name = "/models/Qwen2-VL-2B-Instruct" from auto_round import AutoRoundMLLM from auto_round.utils import mllm_load_model @@ -188,7 +186,7 @@ def test_vlm_gguf(self): quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") assert "mmproj-model.gguf" in os.listdir("./saved") - file_size = os.path.getsize("./saved/Qwen2.5-VL-7B-Instruct-Q4_0.gguf") / 1024**2 + file_size = os.path.getsize("./saved/Qwen2-VL-2B-Instruct-Q4_0.gguf") / 1024**2 assert abs(file_size - 4242) < 5.0 file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2 assert abs(file_size - 2580) < 5.0 @@ -214,31 +212,3 @@ def test_vlm_gguf(self): file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2 assert abs(file_size - 1599) < 5.0 shutil.rmtree(quantized_model_path, ignore_errors=True) - - # @require_gguf - # def test_llama_4(self): - # model_name = "/dataset/Llama-4-Scout-17B-16E-Instruct/" - # from auto_round import AutoRoundMLLM - # from auto_round.utils import mllm_load_model - - # model, processor, tokenizer, image_processor = mllm_load_model(model_name, use_auto_mapping=False) - # autoround = AutoRoundMLLM( - # model, - # tokenizer=tokenizer, - # processor=processor, - # image_processor=image_processor, - # device="auto", - # iters=0, - # ) - # quantized_model_path = "/dataset/Llam-4-test" - # shutil.rmtree(quantized_model_path, ignore_errors=True) - # autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0") - # assert "mmproj-model.gguf" in os.listdir(quantized_model_path) - # file_size = ( - # os.path.getsize(os.path.join(quantized_model_path, "Llama-4-Scout-17B-16E-Instruct-16x17B-Q4_0.gguf")) - # / 1024**2 - # ) - # assert abs(file_size - 58093.62) < 1.0 - # file_size = os.path.getsize(os.path.join(quantized_model_path, "mmproj-model.gguf")) / 1024**2 - # assert abs(file_size - 3326.18) < 5.0 - # shutil.rmtree(quantized_model_path, ignore_errors=True) diff --git a/test/test_cuda/test_main_func.py b/test/test_cuda/test_main_func.py index 3243963fe..ac8b8b91e 100644 --- a/test/test_cuda/test_main_func.py +++ b/test/test_cuda/test_main_func.py @@ -13,6 +13,8 @@ from auto_round.eval.evaluation import simple_evaluate from auto_round.testing_utils import require_awq, require_gptqmodel, require_optimum, require_package_version_ut +from ..helpers import get_model_path + def get_accuracy(data): match = re.search(r"\|acc\s+\|[↑↓]\s+\|\s+([\d.]+)\|", data) @@ -25,20 +27,26 @@ def get_accuracy(data): class TestMainFunc: - @classmethod - def setup_class(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" + save_dir = "./saved" + tasks = "lambada_openai" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield - @classmethod - def 
teardown_class(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel @require_optimum def test_backend(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=4, group_size=128) @@ -66,7 +74,7 @@ def test_backend(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_backend_awq(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=4, group_size=128) @@ -84,7 +92,7 @@ def test_backend_awq(self): @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @require_gptqmodel def test_fp_layers(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) from auto_round.compressors.utils import get_fp_layer_names @@ -109,7 +117,7 @@ def test_fp_layers(self): @require_awq @require_package_version_ut("transformers", "<4.57.0") def test_fp_layers_awq(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) from auto_round.compressors.utils import get_fp_layer_names @@ -131,17 +139,16 @@ def test_fp_layers_awq(self): shutil.rmtree("./saved", ignore_errors=True) @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") - def test_undivided_group_size_tuning(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_undivided_group_size_tuning(self, tiny_opt_model_path): + model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path, torch_dtype=torch.float16, device_map="auto") + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path) autoround = AutoRound(model, tokenizer, bits=4, group_size=127, nsamples=2, iters=2) autoround.quantize() @require_gptqmodel def test_adam(self): - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRoundAdam(model, tokenizer, bits=4, group_size=128) @@ -162,7 +169,7 @@ def test_autoround_asym(self): ##need to install false except ImportError as e: print("skip autoround asym test, as autoround is not installed from source") return - model_name = "/models/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) autoround = AutoRound(model, tokenizer, bits=4, group_size=128, 
sym=False) @@ -177,12 +184,12 @@ def test_autoround_asym(self): ##need to install false assert accuracy > 0.35 shutil.rmtree("./saved", ignore_errors=True) - def test_attention_mask_lm_head(self): + def test_attention_mask_lm_head(self, tiny_qwen_moe_model_path): from transformers import AutoTokenizer - model_name = "/models/Qwen3-8B" + # model_name = "/models/Qwen3-8B" # model_name = "/models/Qwen3-0.6B" - tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_moe_model_path) text = ["haha", "hello world"] res = tokenizer(text, return_tensors="pt", max_length=8, padding="max_length", truncation=True) res.data.pop("attention_mask") @@ -194,14 +201,13 @@ def test_attention_mask_lm_head(self): data.append(res.data) from auto_round import AutoRound - ar = AutoRound(model_name, iters=1, dataset=data, seqlen=8, quant_lm_head=True) + ar = AutoRound(tiny_qwen_moe_model_path, iters=1, dataset=data, seqlen=8, quant_lm_head=True) ar.quantize() - def test_low_cpu_mem_usage(self): + def test_low_cpu_mem_usage(self, tiny_opt_model_path): bits, group_size = 4, 32 - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path, torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path, trust_remote_code=True) quantized_model_path = "./saved" autoround = AutoRound( model, diff --git a/test/test_cuda/test_marlin_backend.py b/test/test_cuda/test_marlin_backend.py index 334cb2697..8d7594086 100644 --- a/test/test_cuda/test_marlin_backend.py +++ b/test/test_cuda/test_marlin_backend.py @@ -11,6 +11,20 @@ class TestAutoRoundMarlinBackend: + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) def test_marlin_group_size(self, dataloader): for group_size in [-1, 64]: @@ -28,15 +42,15 @@ def test_marlin_group_size(self, dataloader): seqlen=2, dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") quantization_config = AutoRoundConfig(backend="marlin") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -57,30 +71,20 @@ def test_marlin_group_size(self, dataloader): seqlen=2, dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") quantization_config = 
AutoRoundConfig(backend="marlin") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) assert result["results"]["lambada_openai"]["acc,none"] > 0.14 - @classmethod - def setup_class(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" - - @classmethod - def teardown_class(self): - shutil.rmtree("./saved", ignore_errors=True) - shutil.rmtree("runs", ignore_errors=True) - def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -95,15 +99,15 @@ def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): seqlen=2, dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_gptq") quantization_config = AutoRoundConfig(backend="marlin") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -111,10 +115,10 @@ def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_dir, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -136,18 +140,18 @@ def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): # seqlen=2, # dataset=dataloader, # ) - # quantized_model_path = self.save_folder + # quantized_model_path = self.save_dir # autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") # # quantization_config = AutoRoundConfig(backend="marlin") # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype=torch.float16, # device_map="auto", # quantization_config=quantization_config # ) # - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) # result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) @@ -155,13 +159,13 @@ 
def test_marlin_4bits_sym_with_zp_m_1(self, dataloader): # torch.cuda.empty_cache() # # model = AutoModelForCausalLM.from_pretrained( - # self.save_folder, + # self.save_dir, # torch_dtype=torch.bfloat16, # device_map="auto", # quantization_config=quantization_config # ) # - # tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + # tokenizer = AutoTokenizer.from_pretrained(self.save_dir) # model_infer(model, tokenizer) # result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") # print(result['results']['lambada_openai']['acc,none']) diff --git a/test/test_cuda/test_mix_bits.py b/test/test_cuda/test_mix_bits.py index 958b8ba8e..6988709d5 100644 --- a/test/test_cuda/test_mix_bits.py +++ b/test/test_cuda/test_mix_bits.py @@ -14,22 +14,27 @@ require_package_version_ut, ) +from ..helpers import get_model_path + class TestAutoRound: - @classmethod - def setup_class(self): - self.model_name = "/models/opt-125m" - self.save_dir = "./saved" - self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield - @classmethod - def teardown_class(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @require_gptqmodel - def test_mixed_gptqmodel(self, dataloader): + def test_mixed_gptqmodel(self, tiny_opt_model_path, dataloader): scheme = "W4A16" layer_config = { "k_proj": {"bits": 8}, # part name @@ -39,7 +44,7 @@ def test_mixed_gptqmodel(self, dataloader): "model.decoder.layers.0.self_attn.q_proj": {"bits": 8}, # full name } autoround = AutoRound( - model=self.model_name, + model=tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, @@ -58,7 +63,7 @@ def test_mixed_gptqmodel(self, dataloader): print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_gptqmodel_convert_to_ar(self, dataloader): + def test_mixed_gptqmodel_convert_to_ar(self, tiny_opt_model_path, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 4}, # set lm_head quant @@ -66,7 +71,7 @@ def test_mixed_gptqmodel_convert_to_ar(self, dataloader): "model.decoder.layers.0.self_attn.q_proj": {"bits": 8}, # full name } autoround = AutoRound( - model=self.model_name, + model=tiny_opt_model_path, scheme="W4A16", iters=2, seqlen=2, @@ -86,7 +91,7 @@ def test_mixed_gptqmodel_convert_to_ar(self, dataloader): print(res) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_autoround_format(self, dataloader): + def test_mixed_autoround_format(self, tiny_opt_model_path, dataloader): layer_config = { "k_proj": {"bits": 8}, "q_proj": {"bits": 3}, @@ -94,7 +99,7 @@ def test_mixed_autoround_format(self, dataloader): "fc1": {"bits": 16}, } autoround = AutoRound( - model=self.model_name, + model=tiny_opt_model_path, scheme="W4A16", iters=2, seqlen=2, @@ -114,14 +119,13 @@ def test_mixed_autoround_format(self, dataloader): @require_awq @require_package_version_ut("transformers", "<4.57.0") - def test_fallback_regex_for_awq_format(self, dataloader): - model_name = "facebook/opt-125m" + def 
test_fallback_regex_for_awq_format(self, tiny_opt_model_path, dataloader): layer_config = { "lm_head": {"bits": 16}, "fc1": {"bits": 16}, } autoround = AutoRound( - model=model_name, + model=tiny_opt_model_path, scheme="W4A16", iters=2, seqlen=2, @@ -140,14 +144,14 @@ def test_fallback_regex_for_awq_format(self, dataloader): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_ar_format_part_name_hf_loading(self, dataloader): + def test_mixed_ar_format_part_name_hf_loading(self, tiny_opt_model_path, dataloader): layer_config = { "k_proj": {"bits": 8}, # part name "lm_head": {"bits": 16}, # full name ".*fc1.*": {"bits": 16}, # standard regex } autoround = AutoRound( - model=self.model_name, + model=tiny_opt_model_path, scheme="W4A16", iters=2, seqlen=2, @@ -207,8 +211,9 @@ def test_mixed_MXFP_autoround_format_loading(self, dataloader): "lm_head": {"bits": 16, "act_bits": 16}, "fc1": {"bits": 8, "act_bits": 8}, } + model_path = get_model_path("facebook/opt-125m") autoround = AutoRound( - self.model_name, + model_path, scheme="MXFP4", iters=2, seqlen=2, @@ -230,13 +235,13 @@ def test_mixed_MXFP_autoround_format_loading(self, dataloader): assert result["results"]["lambada_openai"]["acc,none"] > 0.32 shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_autoround_format_vllm(self, dataloader): + def test_mixed_autoround_format_vllm(self, tiny_opt_model_path, dataloader): layer_config = { "self_attn": {"bits": 8}, "lm_head": {"bits": 16}, } autoround = AutoRound( - self.model, + tiny_opt_model_path, self.tokenizer, scheme="W4A16", iters=2, @@ -270,14 +275,14 @@ def test_mixed_autoround_format_vllm(self, dataloader): print(f"{prompt}: {generated_text}") shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_mixed_llmcompressor_format_vllm(self, dataloader): + def test_mixed_llmcompressor_format_vllm(self, tiny_opt_model_path, dataloader): layer_config = { "self_attn": {"bits": 16, "act_bits": 16}, "lm_head": {"bits": 16, "act_bits": 16}, "fc1": {"bits": 16, "act_bits": 16}, } autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme="NVFP4", iters=2, seqlen=2, diff --git a/test/test_cuda/test_multiple_card.py b/test/test_cuda/test_multiple_card.py index 2f29f7a37..e09975a19 100644 --- a/test/test_cuda/test_multiple_card.py +++ b/test/test_cuda/test_multiple_card.py @@ -10,6 +10,8 @@ from auto_round.eval.evaluation import simple_evaluate from auto_round.testing_utils import multi_card, require_gptqmodel, require_greater_than_050 +from ..helpers import get_model_path, get_tiny_model + def get_accuracy(data): match = re.search(r"\|acc\s+\|[↑↓]\s+\|\s+([\d.]+)\|", data) @@ -24,14 +26,20 @@ def get_accuracy(data): # import os # os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" class TestAutoRound: - @classmethod - def setup_class(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" - - @classmethod - def teardown_class(self): - shutil.rmtree(self.save_dir, ignore_errors=True) + save_dir = "./saved" + tasks = "lambada_openai" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) @multi_card @@ -53,10 
+61,9 @@ def test_device_map_str(self): shutil.rmtree("./saved", ignore_errors=True) @multi_card - def test_layer_norm(self): - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_layer_norm(self, tiny_opt_model_path): + model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path, torch_dtype=torch.float16) + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path) device_map = {"norm": "cuda:1"} autoround = AutoRound( model, tokenizer, iters=2, device_map=device_map, nsamples=7, seqlen=32, enable_norm_bias_tuning=True @@ -64,10 +71,9 @@ def test_layer_norm(self): autoround.quantize() @multi_card - def test_rms_norm(self): - model_name = "/models/Qwen2-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_rms_norm(self, tiny_qwen_model_path): + model = AutoModelForCausalLM.from_pretrained(tiny_qwen_model_path, torch_dtype=torch.float16) + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_model_path) device_map = {"norm": "cuda:1"} autoround = AutoRound( model, tokenizer, iters=2, device_map=device_map, nsamples=7, seqlen=32, enable_norm_bias_tuning=True @@ -75,10 +81,9 @@ def test_rms_norm(self): autoround.quantize() @multi_card - def test_act_quantization(self): - model_name = "/models/Qwen2-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_act_quantization(self, tiny_qwen_model_path): + model = AutoModelForCausalLM.from_pretrained(tiny_qwen_model_path, torch_dtype=torch.float16) + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_model_path) device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "1"} autoround = AutoRound( model, tokenizer, iters=2, device_map=device_map, nsamples=7, seqlen=32, act_bits=4, act_dynamic=False @@ -87,9 +92,9 @@ def test_act_quantization(self): @multi_card def test_lm_head(self): - model_name = "/models/Qwen2.5-7B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) - tokenizer = AutoTokenizer.from_pretrained(model_name) + model_path = get_model_path("qwen/Qwen2.5-7B-Instruct") + model = get_tiny_model(model_path) + tokenizer = AutoTokenizer.from_pretrained(model_path) device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "1", "lm_head": 1} layer_config = {"lm_head": {"bits": 4}} autoround = AutoRound( @@ -105,10 +110,9 @@ def test_lm_head(self): autoround.quantize() @multi_card - def test_device_map(self): - model_name = "/models/Qwen2-0.5B-Instruct" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) - tokenizer = AutoTokenizer.from_pretrained(model_name) + def test_device_map(self, tiny_qwen_model_path): + model = AutoModelForCausalLM.from_pretrained(tiny_qwen_model_path, torch_dtype=torch.float16) + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_model_path) device_map = {".*q_proj": "0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "cpu"} autoround = AutoRound(model, tokenizer, iters=2, device_map=device_map, nsamples=7, seqlen=32) autoround.quantize() @@ -206,12 +210,11 @@ def test_device_map(self): torch.cuda.empty_cache() @multi_card - def test_device_map_dict(self): + def test_device_map_dict(self, tiny_opt_model_path): device_map = {".*q_proj": 
"0", ".*k_proj": "cuda:1", "v_proj": 1, ".*up_proj": "1"} bits, group_size, sym = 4, 128, False - model_name = "/models/opt-125m" - model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") - tokenizer = AutoTokenizer.from_pretrained(model_name) + model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path, torch_dtype=torch.float16, device_map="auto") + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path) autoround = AutoRound( model, tokenizer, @@ -225,9 +228,8 @@ def test_device_map_dict(self): autoround.quantize() # test model_name - model_name = "/models/opt-125m" autoround = AutoRound( - model_name, + tiny_opt_model_path, tokenizer, bits=bits, group_size=group_size, @@ -240,7 +242,7 @@ def test_device_map_dict(self): # test rtn autoround = AutoRound( - model_name, + tiny_opt_model_path, tokenizer, bits=bits, group_size=group_size, @@ -352,7 +354,7 @@ def test_device_map_for_triton(self): @multi_card def test_mllm_device_map(self): - model_name = "/models/Qwen2-VL-2B-Instruct/" + model_name = get_model_path("qwen/Qwen2-VL-2B-Instruct/") from auto_round import AutoRoundMLLM device_map = "0,1" diff --git a/test/test_cuda/test_multiple_card_calib.py b/test/test_cuda/test_multiple_card_calib.py index 410855c33..e82d0b330 100644 --- a/test/test_cuda/test_multiple_card_calib.py +++ b/test/test_cuda/test_multiple_card_calib.py @@ -19,13 +19,19 @@ def get_accuracy(data): class TestAutoRound: - @classmethod - def setup_class(self): - self.save_dir = "./saved" - self.tasks = "lambada_openai" + save_dir = "./saved" + tasks = "lambada_openai" - @classmethod - def teardown_class(self): + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) diff --git a/test/test_cuda/test_mxfp_and_nvfp_quant.py b/test/test_cuda/test_mxfp_and_nvfp_quant.py index 0dc43b093..808fa4a28 100644 --- a/test/test_cuda/test_mxfp_and_nvfp_quant.py +++ b/test/test_cuda/test_mxfp_and_nvfp_quant.py @@ -12,6 +12,8 @@ from auto_round.export.export_to_autoround import qlinear_fp as ar_qlinear_fp from auto_round.testing_utils import has_module +from ..helpers import get_model_path + testing_schemes = [AutoRoundFormat.MXFP8.value, AutoRoundFormat.MXFP4.value, AutoRoundFormat.NVFP4.value] QMODULE_MAPPING = { AutoRoundFormat.MXFP8.value: ar_qmodules.MXFP8QuantLinear, @@ -22,15 +24,14 @@ @pytest.mark.parametrize("scheme", testing_schemes) @torch.inference_mode() -def test_e2e_quant_and_infer(scheme): +def test_e2e_quant_and_infer(scheme, tiny_qwen_model_path): # Use a temporary directory for saving the quantized model with tempfile.TemporaryDirectory() as temp_dir: - model_name = "Qwen/Qwen2.5-0.5B-Instruct" # Load the tokenizer and model - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(tiny_qwen_model_path, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained( - model_name, + tiny_qwen_model_path, device_map="cpu", torch_dtype="auto", trust_remote_code=True, diff --git a/test/test_cuda/test_mxfp_nvfp.py b/test/test_cuda/test_mxfp_nvfp.py index 38116f3be..41c996b95 100644 --- a/test/test_cuda/test_mxfp_nvfp.py +++ 
b/test/test_cuda/test_mxfp_nvfp.py @@ -9,20 +9,27 @@ from auto_round import AutoRound from auto_round.testing_utils import require_awq, require_optimum +from ..helpers import get_model_path, get_tiny_model + class TestAutoRound: - @classmethod - def setup_class(self): - self.model_name = "facebook/opt-125m" - self.save_dir = "./saved" + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield - @classmethod - def teardown_class(self): + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) def test_fp8input_mxfp4_llmcompressor_format(self, dataloader): - model_name = "/models/Qwen3-0.6B-FP8" + model_name = get_model_path("qwen/Qwen3-0.6B-FP8") scheme = "mxfp4" ar = AutoRound( model=model_name, @@ -47,10 +54,10 @@ def test_fp8input_mxfp4_llmcompressor_format(self, dataloader): ), f"Invalid MXFP4 quantization configuration: {quantization_config}" shutil.rmtree(self.save_dir, ignore_errors=True) - def test_nvfp4_llmcompressor_format(self, dataloader): + def test_nvfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): scheme = "nvfp4" autoround = AutoRound( - self.model_name, + tiny_opt_model_path, scheme=scheme, iters=2, seqlen=2, @@ -58,7 +65,7 @@ def test_nvfp4_llmcompressor_format(self, dataloader): ) quantized_model_path = self.save_dir compressed_model, _ = autoround.quantize_and_save(output_dir=quantized_model_path, format="llm_compressor") - tmp_layer = compressed_model.model.decoder.layers[3].self_attn.q_proj + tmp_layer = compressed_model.model.decoder.layers[1].self_attn.q_proj assert ( hasattr(tmp_layer, "weight_scale") and hasattr(tmp_layer, "weight_global_scale") @@ -98,11 +105,11 @@ def test_nvfp4_llmcompressor_format(self, dataloader): # if "France" in prompt: # assert "Paris" in generated_text - def test_nvfp4_moe_actmax_rtn(self, dataloader): - model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" + def test_nvfp4_moe_actmax_rtn(self, tiny_deepseek_v2_model_path, dataloader): + # model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" scheme = "nvfp4" autoround = AutoRound( - model_name, + tiny_deepseek_v2_model_path, scheme=scheme, iters=0, seqlen=2, @@ -113,11 +120,10 @@ def test_nvfp4_moe_actmax_rtn(self, dataloader): quantized_model_path = self.save_dir autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") - def test_nvfp4_moe_actmax_ar(self, dataloader): - model_name = "/data0/deepseek-ai/DeepSeek-V2-Lite" + def test_nvfp4_moe_actmax_ar(self, tiny_deepseek_v2_model_path, dataloader): scheme = "nvfp4" autoround = AutoRound( - model_name, + tiny_deepseek_v2_model_path, scheme=scheme, iters=1, seqlen=2, @@ -129,7 +135,7 @@ def test_nvfp4_moe_actmax_ar(self, dataloader): autoround.save_quantized(output_dir=quantized_model_path, inplace=False, format="auto_round") def test_qwen_moe_quant_infer(self, dataloader): - model_name = "/models/Qwen1.5-MoE-A2.7B" + model_name = get_model_path("qwen/Qwen1.5-MoE-A2.7B") layer_config = { "layers\.(?:[3-9]|1[0-9]|2[0-3])": {"bits": 16, "act_bits": 16}, } diff --git a/test/test_cuda/test_qbits.py b/test/test_cuda/test_qbits.py index 0ce3597db..37e119b2c 100644 --- a/test/test_cuda/test_qbits.py +++ b/test/test_cuda/test_qbits.py @@ -6,17 +6,23 @@ from auto_round import AutoRound, 
AutoRoundConfig from auto_round.testing_utils import require_gptqmodel, require_itrex -from ..helpers import model_infer +from ..helpers import get_model_path, model_infer class TestAutoRound: - @classmethod - def setup_class(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" + save_dir = "./saved" - @classmethod - def teardown_class(self): + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) ## require torch 2.6 @@ -50,8 +56,9 @@ def test_load_gptq_model_2bits(self): @require_itrex def test_mixed_precision(self): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_path = get_model_path("facebook/opt-125m") + model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) layer_config = {} layer_config["model.decoder.layers.0.self_attn.k_proj"] = {"bits": 8} @@ -64,27 +71,29 @@ def test_mixed_precision(self): autoround = AutoRound( model, tokenizer, bits=bits, group_size=group_size, iters=1, nsamples=1, sym=sym, layer_config=layer_config ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") model = AutoModelForCausalLM.from_pretrained( - self.save_folder, + self.save_dir, torch_dtype=torch.float16, device_map="cpu", ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) text = "There is a girl who likes adventure," inputs = tokenizer(text, return_tensors="pt").to(model.device) res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) print(res) assert "!!!" not in res - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) @require_gptqmodel - def test_autoround_sym(self): + def test_autoround_sym(self, tiny_opt_model_path): for bits in [4]: - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained( + tiny_opt_model_path, torch_dtype="auto", trust_remote_code=True + ) + tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path, trust_remote_code=True) bits, group_size, sym = bits, 128, True autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=2, seqlen=2) quantized_model_path = "./saved" @@ -100,4 +109,4 @@ def test_autoround_sym(self): res = tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]) print(res) assert "!!!" 
not in res - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) diff --git a/test/test_cuda/test_scheme.py b/test/test_cuda/test_scheme.py index d6fe43374..2ed5527bd 100644 --- a/test/test_cuda/test_scheme.py +++ b/test/test_cuda/test_scheme.py @@ -5,45 +5,52 @@ from auto_round import AutoRound from auto_round.schemes import QuantizationScheme +from ..helpers import get_model_path + class TestAutoRound: - @classmethod - def setup_class(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" - - @classmethod - def teardown_class(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + save_dir = "./saved" + + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) # Tuning tests - def test_gguf(self): - ar = AutoRound("/models/Qwen3-0.6B", scheme="W2A16", nsamples=1, iters=1) - ar.quantize_and_save(self.save_folder, format="gguf:q4_k_m") + def test_gguf(self, tiny_qwen_model_path): + ar = AutoRound(tiny_qwen_model_path, scheme="W2A16", nsamples=1, iters=1) + ar.quantize_and_save(self.save_dir, format="gguf:q4_k_m") assert ar.bits == 4 - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) - def test_w4a16(self): - ar = AutoRound(self.model_name, scheme="W4A16", nsamples=1, iters=1) + def test_w4a16(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="W4A16", nsamples=1, iters=1) assert ar.bits == 4 ar.quantize() - def test_w2a16(self): - ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=1) + def test_w2a16(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=1) assert ar.bits == 2 ar.quantize() - def test_mxfp4(self): - ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=1) + def test_mxfp4(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1) assert ar.bits == 4 assert ar.act_bits == 4 assert ar.data_type == "mx_fp" assert ar.act_data_type == "mx_fp_rceil" ar.quantize() - def test_fp8_static(self): - ar = AutoRound(self.model_name, scheme="FP8_STATIC", nsamples=1, iters=1) + def test_fp8_static(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="FP8_STATIC", nsamples=1, iters=1) assert ar.bits == 8 assert ar.act_bits == 8 assert ar.data_type == "fp" @@ -53,21 +60,21 @@ def test_fp8_static(self): ar.quantize() ## RTN tests - def test_w2a16_rtn(self): - ar = AutoRound(self.model_name, scheme="W2A16", nsamples=1, iters=0) + def test_w2a16_rtn(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="W2A16", nsamples=1, iters=0) assert ar.bits == 2 ar.quantize() - def test_mxfp4_rtn(self): - ar = AutoRound(self.model_name, scheme="MXFP4", nsamples=1, iters=0) + def test_mxfp4_rtn(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=0) assert ar.bits == 4 assert ar.act_bits == 4 assert ar.data_type == "mx_fp" assert ar.act_data_type == "mx_fp_rceil" ar.quantize() - def test_fp8_static_rtn(self): - ar = AutoRound(self.model_name, scheme="FP8_STATIC", nsamples=1, iters=0) + def 
test_fp8_static_rtn(self, tiny_opt_model_path): + ar = AutoRound(tiny_opt_model_path, scheme="FP8_STATIC", nsamples=1, iters=0) assert ar.bits == 8 assert ar.act_bits == 8 assert ar.data_type == "fp" @@ -77,12 +84,13 @@ def test_fp8_static_rtn(self): ar.quantize() def test_scheme_in_layer_config(self): + model_path = get_model_path("facebook/opt-125m") layer_config = { "model.decoder.layers.2.self_attn": {"bits": 2}, "model.decoder.layers.3.self_attn.v_proj": "W8A16", "model.decoder.layers.4.self_attn.k_proj": QuantizationScheme.from_dict({"group_size": 64}), } - ar = AutoRound(self.model_name, scheme="W3A16", nsamples=1, iters=1, layer_config=layer_config) + ar = AutoRound(model_path, scheme="W3A16", nsamples=1, iters=1, layer_config=layer_config) ar.quantize() for n, m in ar.model.named_modules(): diff --git a/test/test_cuda/test_torch_backend.py b/test/test_cuda/test_torch_backend.py index 5244725e8..a7eb30552 100644 --- a/test/test_cuda/test_torch_backend.py +++ b/test/test_cuda/test_torch_backend.py @@ -8,24 +8,30 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel -from ..helpers import model_infer +from ..helpers import get_model_path, model_infer class TestAutoRoundTorchBackend: - @classmethod - def setup_class(self): - self.model_name = "/models/opt-125m" - self.save_folder = "./saved" + save_dir = "./saved" - @classmethod - def teardown_class(self): - shutil.rmtree(self.save_folder, ignore_errors=True) + @pytest.fixture(autouse=True, scope="class") + def setup_and_teardown_class(self): + # ===== SETUP (setup_class) ===== + print("[Setup] Running before any test in class") + + # Yield to hand control to the test methods + yield + + # ===== TEARDOWN (teardown_class) ===== + print("[Teardown] Running after all tests in class") + shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) def test_torch_4bits_asym(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_path = get_model_path("facebook/opt-125m") + model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) bits, group_size, sym = 4, 128, False autoround = AutoRound( model, @@ -37,7 +43,7 @@ def test_torch_4bits_asym(self, dataloader): seqlen=2, dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") quantization_config = AutoRoundConfig(backend="torch") @@ -45,7 +51,7 @@ def test_torch_4bits_asym(self, dataloader): quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -53,10 +59,10 @@ def test_torch_4bits_asym(self, dataloader): torch.cuda.empty_cache() model = AutoModelForCausalLM.from_pretrained( - self.save_folder, torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config + self.save_dir, 
torch_dtype=torch.bfloat16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) @@ -65,8 +71,9 @@ def test_torch_4bits_asym(self, dataloader): shutil.rmtree("./saved", ignore_errors=True) def test_torch_4bits_sym(self, dataloader): - model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) + model_path = get_model_path("facebook/opt-125m") + model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) bits, group_size, sym = 4, 128, True autoround = AutoRound( model, @@ -78,7 +85,7 @@ def test_torch_4bits_sym(self, dataloader): seqlen=2, dataset=dataloader, ) - quantized_model_path = self.save_folder + quantized_model_path = self.save_dir autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model quantization_config = AutoRoundConfig(backend="torch") @@ -86,10 +93,10 @@ def test_torch_4bits_sym(self, dataloader): quantized_model_path, torch_dtype=torch.float16, device_map="auto", quantization_config=quantization_config ) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) + tokenizer = AutoTokenizer.from_pretrained(self.save_dir) model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai") print(result["results"]["lambada_openai"]["acc,none"]) assert result["results"]["lambada_openai"]["acc,none"] > 0.28 torch.cuda.empty_cache() - shutil.rmtree(self.save_folder, ignore_errors=True) + shutil.rmtree(self.save_dir, ignore_errors=True) From cb5acf61fed19c1be9aa40063773b21f2bb7aef7 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Tue, 23 Dec 2025 00:48:05 -0500 Subject: [PATCH 17/24] add ./tmp as workspace and remove duplicate UTs Signed-off-by: He, Xin3 --- test/fixtures.py | 29 +++++++----- test/helpers.py | 10 +++-- test/test_cpu/test_gguf_format.py | 67 ++++++++++++++-------------- test/test_cpu/test_gpt_oss.py | 74 ------------------------------- test/test_cpu/test_moe_model.py | 59 +++++++++++++++++++----- 5 files changed, 104 insertions(+), 135 deletions(-) delete mode 100644 test/test_cpu/test_gpt_oss.py diff --git a/test/fixtures.py b/test/fixtures.py index 86bc36e48..c76040322 100644 --- a/test/fixtures.py +++ b/test/fixtures.py @@ -26,7 +26,7 @@ @pytest.fixture(scope="session") def tiny_opt_model_path(): model_name_or_path = opt_name_or_path - tiny_model_path = "./tmp_tiny_opt_model_path" + tiny_model_path = "./tmp/tiny_opt_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -35,7 +35,7 @@ def tiny_opt_model_path(): @pytest.fixture(scope="session") def tiny_lamini_model_path(): model_name_or_path = lamini_name_or_path - tiny_model_path = "./tmp_tiny_lamini_model_path" + tiny_model_path = "./tmp/tiny_lamini_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -44,7 +44,7 @@ def tiny_lamini_model_path(): @pytest.fixture(scope="session") def 
tiny_gptj_model_path(): model_name_or_path = gptj_name_or_path - tiny_model_path = "./tmp_tiny_gptj_model_path" + tiny_model_path = "./tmp/tiny_gptj_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -53,7 +53,7 @@ def tiny_gptj_model_path(): @pytest.fixture(scope="session") def tiny_phi2_model_path(): model_name_or_path = phi2_name_or_path - tiny_model_path = "./tmp_tiny_phi2_model_path" + tiny_model_path = "./tmp/tiny_phi2_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -62,7 +62,7 @@ def tiny_phi2_model_path(): @pytest.fixture(scope="session") def tiny_deepseek_v2_model_path(): model_name_or_path = deepseek_v2_name_or_path - tiny_model_path = "./tmp_tiny_deepseek_v2_model_path" + tiny_model_path = "./tmp/tiny_deepseek_v2_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -71,7 +71,7 @@ def tiny_deepseek_v2_model_path(): @pytest.fixture(scope="session") def tiny_gemma_model_path(): model_name_or_path = gemma_name_or_path - tiny_model_path = "./tmp_tiny_gemma_model_path" + tiny_model_path = "./tmp/tiny_gemma_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -80,7 +80,7 @@ def tiny_gemma_model_path(): @pytest.fixture(scope="session") def tiny_qwen_model_path(): model_name_or_path = qwen_name_or_path - tiny_model_path = "./tmp_tiny_qwen_model_path" + tiny_model_path = "./tmp/tiny_qwen_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -89,7 +89,7 @@ def tiny_qwen_model_path(): @pytest.fixture(scope="session") def tiny_untied_qwen_model_path(): model_name_or_path = qwen_name_or_path - tiny_model_path = "./tmp_tiny_untied_qwen_model_path" + tiny_model_path = "./tmp/tiny_untied_qwen_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, force_untie=True) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -98,7 +98,7 @@ def tiny_untied_qwen_model_path(): @pytest.fixture(scope="session") def tiny_qwen_moe_model_path(): model_name_or_path = qwen_moe_name_or_path - tiny_model_path = "./tmp_tiny_qwen_moe_model_path" + tiny_model_path = "./tmp/tiny_qwen_moe_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -107,7 +107,7 @@ def tiny_qwen_moe_model_path(): @pytest.fixture(scope="session") def tiny_qwen_vl_model_path(): model_name_or_path = qwen_vl_name_or_path - tiny_model_path = "./tmp_tiny_qwen_vl_model_path" + tiny_model_path = "./tmp/tiny_qwen_vl_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2, is_mllm=True) yield tiny_model_path shutil.rmtree(tiny_model_path) @@ -116,12 +116,19 @@ def tiny_qwen_vl_model_path(): @pytest.fixture(scope="session") def tiny_qwen_2_5_vl_model_path(): model_name_or_path = qwen_2_5_vl_name_or_path - tiny_model_path = "./tmp_tiny_qwen_2_5_vl_model_path" + tiny_model_path = "./tmp/tiny_qwen_2_5_vl_model_path" tiny_model_path = save_tiny_model(model_name_or_path, tiny_model_path, num_layers=2, is_mllm=True) yield tiny_model_path shutil.rmtree(tiny_model_path) +@pytest.fixture(autouse=True, scope="session") +def 
clean_tmp_model_folder(): + yield + shutil.rmtree("./tmp", ignore_errors=True) # unittest default workspace + shutil.rmtree("./tmp_autoround", ignore_errors=True) # autoround default workspace + + # Create objective fixtures for testing @pytest.fixture(scope="function") def tiny_opt_model(): diff --git a/test/helpers.py b/test/helpers.py index f30e632f7..89b832c6d 100644 --- a/test/helpers.py +++ b/test/helpers.py @@ -43,14 +43,15 @@ def get_tiny_model(model_name_or_path, num_layers=2, is_mllm=False, **kwargs): def slice_layers(module): """slice layers in the model.""" + sliced = False for name, child in module.named_children(): if isinstance(child, torch.nn.ModuleList) and len(child) > num_layers: new_layers = torch.nn.ModuleList(child[:num_layers]) setattr(module, name, new_layers) - return True - if slice_layers(child): - return True - return False + sliced = True + elif slice_layers(child): + sliced = True + return sliced kwargs["dtype"] = "auto" if "auto" not in kwargs else kwargs["dtype"] kwargs["trust_remote_code"] = True if "trust_remote_code" not in kwargs else kwargs["trust_remote_code"] @@ -63,6 +64,7 @@ def slice_layers(module): model.config.vision_config.depth = num_layers else: model, tokenizer = llm_load_model(model_name_or_path, **kwargs) + slice_layers(model) if hasattr(model.config, "num_hidden_layers"): diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index b7f25541c..366819234 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -4,29 +4,29 @@ import pytest import torch -import transformers from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound -from ..helpers import get_tiny_model, qwen_name_or_path, qwen_vl_name_or_path +from ..helpers import get_model_path, get_tiny_model class TestGGUF: @classmethod def setup_class(self): - self.tokenizer = AutoTokenizer.from_pretrained(qwen_name_or_path, trust_remote_code=True) + self.model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @classmethod def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_basic_usage(self, tiny_gemma_model_path, tiny_qwen_model_path): + def test_basic_usage(self): python_path = sys.executable res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {tiny_gemma_model_path} " + f"cd ../.. && {python_path} -m auto_round --model {get_model_path('benzart/gemma-2b-it-fine-tuning-for-code-test')} " f" --bs 16 --iters 0 --nsamples 1 --format gguf:q4_k_m" ) if res > 0 or res == -1: @@ -34,17 +34,17 @@ def test_basic_usage(self, tiny_gemma_model_path, tiny_qwen_model_path): shutil.rmtree("./saved", ignore_errors=True) res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {tiny_qwen_model_path}" + f"cd ../.. 
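The slice_layers change in test/helpers.py above turns the helper from one that stopped after the first oversized torch.nn.ModuleList into one that trims every stack it finds before get_tiny_model calls it on the loaded model. A small self-contained sketch of that behaviour on a toy module (num_layers is passed explicitly here, unlike the enclosing-scope variable used in helpers.py):

    import torch

    def slice_layers(module, num_layers=2):
        # Trim every ModuleList longer than num_layers, recursing into children.
        sliced = False
        for name, child in module.named_children():
            if isinstance(child, torch.nn.ModuleList) and len(child) > num_layers:
                setattr(module, name, torch.nn.ModuleList(child[:num_layers]))
                sliced = True
            elif slice_layers(child, num_layers):
                sliced = True
        return sliced

    class Toy(torch.nn.Module):
        def __init__(self):
            super().__init__()
            # Two separate stacks, e.g. a text tower and a vision tower.
            self.text = torch.nn.ModuleList(torch.nn.Linear(4, 4) for _ in range(6))
            self.vision = torch.nn.ModuleList(torch.nn.Linear(4, 4) for _ in range(5))

    toy = Toy()
    slice_layers(toy)
    assert len(toy.text) == 2 and len(toy.vision) == 2  # both stacks trimmed, not just the first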
&& {python_path} -m auto_round --model {self.model_name}" f" --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" shutil.rmtree("./saved", ignore_errors=True) - def test_q4_0(self, tiny_qwen_model_path): + def test_q4_0(self): bits, group_size, sym = 4, 32, True autoround = AutoRound( - tiny_qwen_model_path, + self.model_name, bits=bits, group_size=group_size, sym=sym, @@ -61,12 +61,13 @@ def test_q4_0(self, tiny_qwen_model_path): text = "There is a girl who likes adventure," inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) + shutil.rmtree("./saved", ignore_errors=True) - def test_func(self, tiny_qwen_model_path): + def test_func(self): bits, group_size, sym = 4, 128, True autoround = AutoRound( - tiny_qwen_model_path, + self.model_name, iters=1, nsamples=1, seqlen=10, @@ -83,8 +84,8 @@ def test_func(self, tiny_qwen_model_path): print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_gguf_baseline(self, tiny_qwen_model_path): - model_name = tiny_qwen_model_path + def test_gguf_baseline(self): + model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) autoround = AutoRound( model, @@ -102,16 +103,16 @@ def test_gguf_baseline(self, tiny_qwen_model_path): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, inplace=False, format="fake") - model = AutoModelForCausalLM.from_pretrained(quantized_model_path, device_map="auto") text = "There is a girl who likes adventure," inputs = self.tokenizer(text, return_tensors="pt").to(model.device) print(self.tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0])) shutil.rmtree("./saved", ignore_errors=True) - def test_q4_k_m(self, tiny_qwen_model_path, dataloader): - model = get_tiny_model(qwen_name_or_path, num_layers=4) - tokenizer = transformers.AutoTokenizer.from_pretrained(qwen_name_or_path, trust_remote_code=True) + def test_q4_k_m(self, dataloader): + model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") + model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) layer_config = { "lm_head": { "bits": 4, @@ -122,8 +123,8 @@ def test_q4_k_m(self, tiny_qwen_model_path, dataloader): "super_group_size": 8, }, "model.embed_tokens": {"bits": 6, "group_size": 32, "super_bits": 6, "super_group_size": 8}, - "model.layers.3.mlp.gate_proj": {"bits": 3}, - "model.layers.1.mlp.gate_proj": {"bits": 8}, + "model.layers.12.mlp.gate_proj": {"bits": 3}, + "model.layers.10.mlp.gate_proj": {"bits": 8}, } autoround = AutoRound( model, @@ -137,26 +138,26 @@ def test_q4_k_m(self, tiny_qwen_model_path, dataloader): ) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") - assert autoround.layer_config["model.layers.2.self_attn.v_proj"]["super_group_size"] == 16 - assert autoround.layer_config["model.layers.2.self_attn.v_proj"]["data_type"] == "int_sym_dq" - assert autoround.layer_config["model.layers.0.self_attn.v_proj"]["data_type"] == "int_asym_dq" + assert autoround.layer_config["model.layers.11.self_attn.v_proj"]["super_group_size"] == 16 + assert 
autoround.layer_config["model.layers.11.self_attn.v_proj"]["data_type"] == "int_sym_dq" + assert autoround.layer_config["model.layers.7.self_attn.v_proj"]["data_type"] == "int_asym_dq" assert autoround.model.model.layers[0].self_attn.v_proj.bits == 6 - assert autoround.model.model.layers[3].self_attn.v_proj.bits == 4 + assert autoround.model.model.layers[12].self_attn.v_proj.bits == 4 assert autoround.model.model.embed_tokens.bits == 6 assert autoround.model.model.embed_tokens.group_size == 16 - assert autoround.model.model.layers[3].mlp.gate_proj.bits == 3 - assert autoround.model.model.layers[1].mlp.gate_proj.bits == 8 - assert autoround.layer_config["model.layers.1.mlp.gate_proj"]["mostly"] == "gguf:q8_0" + assert autoround.model.model.layers[12].mlp.gate_proj.bits == 3 + assert autoround.model.model.layers[10].mlp.gate_proj.bits == 8 + assert autoround.layer_config["model.layers.10.mlp.gate_proj"]["mostly"] == "gguf:q8_0" shutil.rmtree("./saved", ignore_errors=True) - model = AutoModelForCausalLM.from_pretrained(tiny_qwen_model_path, torch_dtype="auto", trust_remote_code=True) + model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) autoround = AutoRound(model, tokenizer, iters=0, nsamples=1, seqlen=128, disable_opt_rtn=False) quantized_model_path = "./saved" autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") shutil.rmtree("./saved", ignore_errors=True) - def test_all_format(self, tiny_qwen_model_path): - model_name = tiny_qwen_model_path + def test_all_format(self): + model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") python_path = sys.executable # for gguf_format in ["gguf:q4_0", "gguf:q4_1", "gguf:q4_k_m", "gguf:q6_k"]: for gguf_format in ["gguf:q4_k_m"]: @@ -186,9 +187,7 @@ def test_all_format(self, tiny_qwen_model_path): shutil.rmtree("../../tmp_autoround", ignore_errors=True) def test_vlm_gguf(self): - # TODO: Using two-layers tiny model will return ValueError: - # Can not map tensor 'model.layers.10.input_layernorm.weight' - model_name = qwen_vl_name_or_path + model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct") from auto_round import AutoRoundMLLM from auto_round.utils import mllm_load_model @@ -212,7 +211,7 @@ def test_vlm_gguf(self): assert abs(file_size - 892) < 5.0 shutil.rmtree("./saved", ignore_errors=True) - def test_qtype_setting(self, tiny_qwen_model_path): + def test_qtype_setting(self): # Qwen2.5-0.5B-Instruct no output, token_embed q6_k fallbakc to q8_0 336M # Qwen3-0.6B output q6_k, token_embed q4_0 448M # Qwen3-8B output q6_k, token_embed q4_0 4.5G @@ -220,7 +219,7 @@ def test_qtype_setting(self, tiny_qwen_model_path): from auto_round.compressors.utils import set_layer_config from auto_round.export.export_to_gguf.config import ModelType - model_name = tiny_qwen_model_path + model_name = get_model_path("Qwen/Qwen2.5-0.5B-Instruct") ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) ar.formats = ["gguf:q4_0"] ar.layer_config, _, _ = set_layer_config( @@ -239,7 +238,7 @@ def test_qtype_setting(self, tiny_qwen_model_path): assert ar.layer_config["model.embed_tokens"]["bits"] == 8 assert "lm_head" not in ar.layer_config - model_name = tiny_qwen_model_path + model_name = "Qwen/Qwen3-0.6B" ar = AutoRound(model=model_name, scheme="gguf:q4_0", iters=0) ar.formats = ["gguf:q4_0"] ar.layer_config, _, _ = set_layer_config( diff --git a/test/test_cpu/test_gpt_oss.py b/test/test_cpu/test_gpt_oss.py deleted file mode 100644 index b82c04c31..000000000 --- 
a/test/test_cpu/test_gpt_oss.py +++ /dev/null @@ -1,74 +0,0 @@ -import pytest -from transformers import AutoConfig, AutoTokenizer -from transformers.models.gpt_oss.modeling_gpt_oss import GptOssForCausalLM - -from auto_round import AutoRound - -from ..helpers import get_model_path - - -@pytest.fixture -def setup_gpt_oss(): - """Fixture to set up the GPT-OSS model and tokenizer.""" - model_name = get_model_path("unsloth/gpt-oss-20b-BF16") - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) - config.num_hidden_layers = 1 # Reduce layers for testing - model = GptOssForCausalLM(config) - output_dir = "/tmp/test_quantized_gpt_oss" - return model, tokenizer, output_dir, config - - -def quantize_model(model, tokenizer, output_dir, scheme, iters=0): - """Helper function to quantize the model with the given scheme.""" - autoround = AutoRound( - model, - tokenizer, - scheme=scheme, - nsamples=2, - iters=iters, - fp_layers="self_attn,router,lm_head,mlp.gate", - ) - quantized_model, save_folder = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) - return quantized_model - - -def count_modules_by_type(model, target_module_name_or_class): - """Helper function to count modules of a specific type in the model.""" - cnt = 0 - for name, module in model.named_modules(): - if isinstance(target_module_name_or_class, str): - if target_module_name_or_class == module.__class__.__name__: - cnt += 1 - else: - if isinstance(module, target_module_name_or_class): - cnt += 1 - return cnt - - -@pytest.mark.parametrize("scheme", ["MXFP4", "MXFP8"]) -def test_quantization(setup_gpt_oss, scheme): - """Test quantization with the scheme.""" - model, tokenizer, output_dir, config = setup_gpt_oss - quantized_model = quantize_model(model, tokenizer, output_dir, scheme) - - # Ensure the quantized model is not None - assert quantized_model is not None, "Quantized model should not be None." - from auto_round.export.export_to_autoround.qlinear_fp import QuantLinear - from auto_round.modelling.gpt_oss import GPTOssSingleExpert - - single_expert_cnt = count_modules_by_type(quantized_model, GPTOssSingleExpert) - quant_linear_cnt = count_modules_by_type(quantized_model, QuantLinear) - assert ( - single_expert_cnt == config.num_local_experts - ), f"Expected {config.num_local_experts} GPTOssSingleExpert modules, found {single_expert_cnt}." - assert ( - quant_linear_cnt == config.num_hidden_layers * 3 * config.num_local_experts - ), f"Expected {config.num_hidden_layers * 3 * config.num_local_experts} QuantLinear modules, found {quant_linear_cnt}." 
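For reference, the module-count check removed here (and re-added in test_moe_model.py below) is plain arithmetic: with the single remaining decoder layer, the converted model should expose one GPTOssSingleExpert per local expert and three QuantLinear modules per expert per layer. A worked example, where num_local_experts = 32 is only an illustrative value (the fixture pins num_hidden_layers = 1 but leaves the expert count to the checkpoint config):

    num_hidden_layers = 1     # forced by the setup_gpt_oss fixture
    num_local_experts = 32    # illustrative; the real test reads it from config
    expected_single_experts = num_local_experts                          # 32
    expected_quant_linears = num_hidden_layers * 3 * num_local_experts   # 96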
- - print(f"[{scheme}] Total {GPTOssSingleExpert.__name__} modules: {single_expert_cnt}") - print(f"[{scheme}] Total {QuantLinear.__name__} modules: {quant_linear_cnt}") - # clean the output directory after test - import shutil - - shutil.rmtree(output_dir, ignore_errors=True) diff --git a/test/test_cpu/test_moe_model.py b/test/test_cpu/test_moe_model.py index 62bac4efc..c30ab0e39 100644 --- a/test/test_cpu/test_moe_model.py +++ b/test/test_cpu/test_moe_model.py @@ -8,29 +8,35 @@ from ..helpers import get_model_path +gpt_oss_name_or_path = get_model_path("unsloth/gpt-oss-20b-BF16") +llama4_name_or_path = get_model_path("meta-llama/Llama-4-Scout-17B-16E-Instruct") + +# local path for debug +# llama4_name_or_path = get_model_path("/dataset/Llama-4-Scout-17B-16E-Instruct") + @pytest.fixture def setup_gpt_oss(): """Fixture to set up the GPT-OSS model and tokenizer.""" - model_name = get_model_path("unsloth/gpt-oss-20b-BF16") + model_name = gpt_oss_name_or_path tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) config.num_hidden_layers = 1 # Reduce layers for testing model = GptOssForCausalLM(config) - output_dir = "/tmp/test_quantized_gpt_oss" + output_dir = "./tmp/test_quantized_gpt_oss" return model, tokenizer, output_dir, config @pytest.fixture def setup_llama4(): """Fixture to set up the llama4 model and tokenizer.""" - model_name = get_model_path("meta-llama/Llama-4-Scout-17B-16E-Instruct") + model_name = llama4_name_or_path tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) config.vision_config.num_hidden_layers = 2 # Reduce layers for testing config.text_config.num_hidden_layers = 2 model = Llama4ForConditionalGeneration(config) - output_dir = "/tmp/test_quantized_llama4" + output_dir = "./tmp/test_quantized_llama4" return model, tokenizer, output_dir, config @@ -48,23 +54,52 @@ def quantize_model(model, tokenizer, output_dir, scheme, iters=0): return quantized_model -def test_gptoss(setup_gpt_oss): +def count_modules_by_type(model, target_module_name_or_class): + """Helper function to count modules of a specific type in the model.""" + cnt = 0 + for name, module in model.named_modules(): + if isinstance(target_module_name_or_class, str): + if target_module_name_or_class == module.__class__.__name__: + cnt += 1 + else: + if isinstance(module, target_module_name_or_class): + cnt += 1 + return cnt + + +@pytest.mark.parametrize("scheme", ["MXFP4", "MXFP8"]) +def test_gptoss(setup_gpt_oss, scheme): model, tokenizer, output_dir, config = setup_gpt_oss # Below parameter is set to be same as the full model # Remove it to avoid mismatch during quantized model loading delattr(model.config, "layer_types") - quantized_model = quantize_model(model, tokenizer, output_dir, "MXFP4") + quantized_model = quantize_model(model, tokenizer, output_dir, scheme) # Ensure the quantized model is not None assert quantized_model is not None, "Quantized model should not be None." 
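The module-level paths above rely on helpers.get_model_path, which this patch uses throughout to replace hard-coded locations such as "/models/opt-125m" with hub ids like "facebook/opt-125m"; its implementation is not part of these hunks. A minimal sketch of what such a resolver could look like, assuming a "/models" mirror root and this exact signature (both assumptions, not taken from the repository):

    import os

    def get_model_path(name_or_path: str, mirror_root: str = "/models") -> str:
        # Prefer a local mirror directory when it exists; otherwise return the
        # original id so transformers falls back to downloading from the hub.
        local_candidate = os.path.join(mirror_root, os.path.basename(name_or_path.rstrip("/")))
        return local_candidate if os.path.isdir(local_candidate) else name_or_path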
- - loaded_model = GptOssForCausalLM.from_pretrained(output_dir) - for n, m in quantized_model.named_modules(): - if m.__class__.__name__ == "QuantLinear": - loaded_m = loaded_model.get_submodule(n) - assert (loaded_m.weight_packed.to("cpu") == m.weight_packed.to("cpu")).all() + from auto_round.export.export_to_autoround.qlinear_fp import QuantLinear + from auto_round.modelling.gpt_oss import GPTOssSingleExpert + + single_expert_cnt = count_modules_by_type(quantized_model, GPTOssSingleExpert) + quant_linear_cnt = count_modules_by_type(quantized_model, QuantLinear) + assert ( + single_expert_cnt == config.num_local_experts + ), f"Expected {config.num_local_experts} GPTOssSingleExpert modules, found {single_expert_cnt}." + assert ( + quant_linear_cnt == config.num_hidden_layers * 3 * config.num_local_experts + ), f"Expected {config.num_hidden_layers * 3 * config.num_local_experts} QuantLinear modules, found {quant_linear_cnt}." + + print(f"[{scheme}] Total {GPTOssSingleExpert.__name__} modules: {single_expert_cnt}") + print(f"[{scheme}] Total {QuantLinear.__name__} modules: {quant_linear_cnt}") + + if scheme == "MXFP4": + loaded_model = GptOssForCausalLM.from_pretrained(output_dir) + for n, m in quantized_model.named_modules(): + if m.__class__.__name__ == "QuantLinear": + loaded_m = loaded_model.get_submodule(n) + assert (loaded_m.weight_packed.to("cpu") == m.weight_packed.to("cpu")).all() # clean the output directory after test shutil.rmtree(output_dir, ignore_errors=True) From cd26af94aa0787aedd6082f9298dda6d5e951382 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Tue, 23 Dec 2025 01:32:28 -0500 Subject: [PATCH 18/24] revert ark change and add some gguf tiny model back Signed-off-by: He, Xin3 --- test/test_ark/test_model.py | 145 +++++++++++++++++++----------- test/test_cpu/test_gguf_format.py | 10 +-- 2 files changed, 97 insertions(+), 58 deletions(-) diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py index b8dfdca5c..09d8bf25a 100644 --- a/test/test_ark/test_model.py +++ b/test/test_ark/test_model.py @@ -1,6 +1,10 @@ import shutil +import sys import pytest + +sys.path.insert(0, "../..") + import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -8,76 +12,111 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel -from ..helpers import model_infer +class LLMDataLoader: + def __init__(self): + self.batch_size = 1 + + def __iter__(self): + for i in range(2): + yield torch.ones([1, 10], dtype=torch.long) -class TestAutoRoundTorchBackend: - @pytest.fixture(autouse=True, scope="class") - def setup_and_teardown_class(self): - # ===== SETUP (setup_class) ===== - print("[Setup] Running before any test in class") +class TestAutoRoundARKBackend: - # Yield to hand control to the test methods - yield + @classmethod + def setup_class(self): + self.model_name = "facebook/opt-125m" + self.save_folder = "./saved" + self.llm_dataloader = LLMDataLoader() - # ===== TEARDOWN (teardown_class) ===== - print("[Teardown] Running after all tests in class") - shutil.rmtree("./saved", ignore_errors=True) + @classmethod + def teardown_class(self): + shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_torch_4bits_sym_cpu(self, opt_model, opt_tokenizer, dataloader): - bits, group_size, sym = 4, 32, True - autoround = AutoRound( - opt_model, - opt_tokenizer, - bits=bits, - group_size=group_size, - sym=sym, - iters=0, - seqlen=2, - 
dataset=dataloader, - ) - quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round:gptqmodel") + def model_infer(self, model, tokenizer): + prompts = [ + "Hello,my name is", + # "The president of the United States is", + # "The capital of France is", + # "The future of AI is", + ] - quantization_config = AutoRoundConfig(backend="ark") - model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, dtype=torch.float16, device_map="cpu", quantization_config=quantization_config - ) + inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - model_infer(model, tokenizer) - result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000) - print(result["results"]["lambada_openai"]["acc,none"]) - assert result["results"]["lambada_openai"]["acc,none"] > 0.28 - - shutil.rmtree("./saved", ignore_errors=True) - - def test_torch_4bits_sym_xpu(self, opt_model, opt_tokenizer, dataloader): - bits, group_size, sym = 4, 32, True - autoround = AutoRound( - opt_model, - opt_tokenizer, - bits=bits, - group_size=group_size, - sym=sym, - iters=0, - seqlen=2, - dataset=dataloader, + outputs = model.generate( + input_ids=inputs["input_ids"].to(model.device), + attention_mask=inputs["attention_mask"].to(model.device), + do_sample=False, ## change this to follow official usage + max_new_tokens=5, ) + generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs["input_ids"], outputs)] + + decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + + for i, prompt in enumerate(prompts): + print(f"Prompt: {prompt}") + print(f"Generated: {decoded_outputs[i]}") + print("-" * 50) + return decoded_outputs[0] + + def main_op(self, format, bits, group_size, sym, dtype, device, fast_cfg=True, tar_acc=0.28): + limit = 100 + if device == "xpu": + limit = 1000 + if not torch.xpu.is_available(): + pytest.skip("No XPU device") + if sym is False: + pytest.skip("No asym support for XPU") + model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto") + tokenizer = AutoTokenizer.from_pretrained(self.model_name) + if fast_cfg: + autoround = AutoRound( + model, tokenizer, bits=bits, group_size=group_size, sym=sym, iters=0, nsamples=1, disable_opt_rtn=True + ) + else: + autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sym) quantized_model_path = self.save_folder - autoround.quantize_and_save(output_dir=quantized_model_path, format="auto_round") ##will convert to gptq model + autoround.quantize_and_save(output_dir=quantized_model_path, format=format) ##will convert to gptq model quantization_config = AutoRoundConfig(backend="ark") model = AutoModelForCausalLM.from_pretrained( - quantized_model_path, dtype=torch.float16, device_map="xpu", quantization_config=quantization_config + quantized_model_path, dtype=dtype, device_map=device, quantization_config=quantization_config ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - model_infer(model, tokenizer) - result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=1000) + self.model_infer(model, tokenizer) + result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=limit) print(result["results"]["lambada_openai"]["acc,none"]) - assert result["results"]["lambada_openai"]["acc,none"] > 0.28 + assert 
result["results"]["lambada_openai"]["acc,none"] > tar_acc torch.xpu.empty_cache() shutil.rmtree(self.save_folder, ignore_errors=True) + + @pytest.mark.parametrize("format", ["auto_round", "auto_round:gptqmodel"]) + @pytest.mark.parametrize("bits, group_size, sym", [(4, 128, True), (8, 128, True)]) + @pytest.mark.parametrize("dtype", [torch.bfloat16]) + @pytest.mark.parametrize("device", ["cpu", "xpu"]) + def test_formats(self, format, bits, group_size, sym, dtype, device): + self.main_op(format, bits, group_size, sym, dtype, device) + + @pytest.mark.parametrize("format", ["auto_round:auto_awq"]) + @pytest.mark.parametrize("bits, group_size, sym", [(4, 32, True)]) + @pytest.mark.parametrize("dtype", [torch.float16]) + @pytest.mark.parametrize("device", ["cpu", "xpu"]) + def test_awq_fp16(self, format, bits, group_size, sym, dtype, device): + self.main_op(format, bits, group_size, sym, dtype, device) + + @pytest.mark.parametrize("format", ["auto_round"]) + @pytest.mark.parametrize("bits, group_size, sym", [(2, 32, False)]) + @pytest.mark.parametrize("dtype", [torch.bfloat16]) + @pytest.mark.parametrize("device", ["cpu"]) + def test_other_bits(self, format, bits, group_size, sym, dtype, device): + self.main_op(format, bits, group_size, sym, dtype, device, False, 0.2) + + +if __name__ == "__main__": + p = TestAutoRoundARKBackend() + p.setup_class() + p.test_formats("auto_round:auto_awq", 4, 32, True, torch.bfloat16, "xpu") + p.teardown_class() diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index 366819234..73491eb7f 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -23,10 +23,10 @@ def teardown_class(self): shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def test_basic_usage(self): + def test_basic_usage(self, tiny_gemma_model_path, tiny_qwen_model_path): python_path = sys.executable res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {get_model_path('benzart/gemma-2b-it-fine-tuning-for-code-test')} " + f"cd ../.. && {python_path} -m auto_round --model {tiny_gemma_model_path} " f" --bs 16 --iters 0 --nsamples 1 --format gguf:q4_k_m" ) if res > 0 or res == -1: @@ -34,7 +34,7 @@ def test_basic_usage(self): shutil.rmtree("./saved", ignore_errors=True) res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {self.model_name}" + f"cd ../.. 
&& {python_path} -m auto_round --model {tiny_qwen_model_path}" f" --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" ) if res > 0 or res == -1: @@ -156,8 +156,8 @@ def test_q4_k_m(self, dataloader): autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m,fake") shutil.rmtree("./saved", ignore_errors=True) - def test_all_format(self): - model_name = get_model_path("Qwen/Qwen2.5-1.5B-Instruct") + def test_all_format(self, tiny_qwen_model_path): + model_name = tiny_qwen_model_path python_path = sys.executable # for gguf_format in ["gguf:q4_0", "gguf:q4_1", "gguf:q4_k_m", "gguf:q6_k"]: for gguf_format in ["gguf:q4_k_m"]: From 95bd71e1e41537302a0acc8f7f17ed4dd0d857e3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Dec 2025 09:01:58 +0000 Subject: [PATCH 19/24] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- test/test_cuda/test_multiple_card_calib.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_cuda/test_multiple_card_calib.py b/test/test_cuda/test_multiple_card_calib.py index 5243b1c07..03c59bba6 100644 --- a/test/test_cuda/test_multiple_card_calib.py +++ b/test/test_cuda/test_multiple_card_calib.py @@ -56,4 +56,3 @@ def test_multiple_card_nvfp4(self): ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" - From f3369d6156a03396e742eba8110d69c15e0ba89b Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Tue, 23 Dec 2025 04:09:50 -0500 Subject: [PATCH 20/24] add test_ark change and minor fix Signed-off-by: He, Xin3 --- test/test_ark/test_model.py | 42 ++--------------------------- test/test_cpu/test_init.py | 6 ++--- test/test_cpu/test_torch_backend.py | 4 +-- test/test_cuda/test_transformers.py | 8 +++--- 4 files changed, 12 insertions(+), 48 deletions(-) diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py index 09d8bf25a..de4e9238e 100644 --- a/test/test_ark/test_model.py +++ b/test/test_ark/test_model.py @@ -2,65 +2,27 @@ import sys import pytest - -sys.path.insert(0, "../..") - import torch from transformers import AutoModelForCausalLM, AutoTokenizer from auto_round import AutoRound, AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate_user_model -from auto_round.testing_utils import require_autogptq, require_gptqmodel - -class LLMDataLoader: - def __init__(self): - self.batch_size = 1 - - def __iter__(self): - for i in range(2): - yield torch.ones([1, 10], dtype=torch.long) +from ..helpers import get_model_path, model_infer class TestAutoRoundARKBackend: @classmethod def setup_class(self): - self.model_name = "facebook/opt-125m" + self.model_name = get_model_path("facebook/opt-125m") self.save_folder = "./saved" - self.llm_dataloader = LLMDataLoader() @classmethod def teardown_class(self): shutil.rmtree(self.save_folder, ignore_errors=True) shutil.rmtree("runs", ignore_errors=True) - def model_infer(self, model, tokenizer): - prompts = [ - "Hello,my name is", - # "The president of the United States is", - # "The capital of France is", - # "The future of AI is", - ] - - inputs = tokenizer(prompts, return_tensors="pt", padding=False, truncation=True) - - outputs = model.generate( - input_ids=inputs["input_ids"].to(model.device), - attention_mask=inputs["attention_mask"].to(model.device), - do_sample=False, ## change this to follow official usage - max_new_tokens=5, - ) - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in 
zip(inputs["input_ids"], outputs)] - - decoded_outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - - for i, prompt in enumerate(prompts): - print(f"Prompt: {prompt}") - print(f"Generated: {decoded_outputs[i]}") - print("-" * 50) - return decoded_outputs[0] - def main_op(self, format, bits, group_size, sym, dtype, device, fast_cfg=True, tar_acc=0.28): limit = 100 if device == "xpu": diff --git a/test/test_cpu/test_init.py b/test/test_cpu/test_init.py index 6ebee954d..01785d679 100644 --- a/test/test_cpu/test_init.py +++ b/test/test_cpu/test_init.py @@ -1,8 +1,8 @@ from auto_round import AutoRound -def test_torch_compile(): - ar = AutoRound(model="facebook/opt-125m", scheme="NVFP4", enable_torch_compile=True) +def test_torch_compile(tiny_opt_model_path): + ar = AutoRound(model=tiny_opt_model_path, scheme="NVFP4", enable_torch_compile=True) assert not ar.enable_torch_compile, "NVFP4 cannot work with torch.compile." - ar = AutoRound(model="facebook/opt-125m", scheme="FP8_STATIC", enable_torch_compile=True) + ar = AutoRound(model=tiny_opt_model_path, scheme="FP8_STATIC", enable_torch_compile=True) assert not ar.enable_torch_compile, "FP8_STATIC cannot work with torch.compile." diff --git a/test/test_cpu/test_torch_backend.py b/test/test_cpu/test_torch_backend.py index 81e009c06..0be8f76e6 100644 --- a/test/test_cpu/test_torch_backend.py +++ b/test/test_cpu/test_torch_backend.py @@ -8,14 +8,14 @@ from auto_round.eval.evaluation import simple_evaluate_user_model from auto_round.testing_utils import require_autogptq, require_gptqmodel -from ..helpers import model_infer +from ..helpers import get_model_path, model_infer class TestAutoRoundTorchBackend: @classmethod def setup_class(self): - self.model_name = "facebook/opt-125m" + self.model_name = get_model_path("facebook/opt-125m") self.save_folder = "./saved" @classmethod diff --git a/test/test_cuda/test_transformers.py b/test/test_cuda/test_transformers.py index f6e5b4497..f37fe94ff 100644 --- a/test/test_cuda/test_transformers.py +++ b/test/test_cuda/test_transformers.py @@ -27,6 +27,8 @@ ) from transformers.utils import is_torch_available +from ..helpers import get_model_path + if is_torch_available(): import torch @@ -76,8 +78,8 @@ def test_quantized_model(self): output = self.quantized_model.generate(**input_ids, max_new_tokens=40, do_sample=False) assert self.tokenizer.decode(output[0], skip_special_tokens=True) in self.EXPECTED_OUTPUTS - def test_raise_if_non_quantized(self): - model_id = "facebook/opt-125m" + def test_raise_if_non_quantized(self, tiny_opt_model_path): + model_id = tiny_opt_model_path quantization_config = AutoRoundConfig(bits=4) with pytest.raises(ValueError): _ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config) @@ -185,7 +187,7 @@ def test_mixed_bits(self): """ Simple test that checks if auto-round work properly with mixed bits """ - model_name = "facebook/opt-125m" + model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) layer_config = { From e7a238bd911f3996376b129b1633148f7f191ed1 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 24 Dec 2025 00:29:32 -0500 Subject: [PATCH 21/24] update path for testing Signed-off-by: He, Xin3 --- test/test_cpu/test_cli_usage.py | 16 ++++++++-------- test/test_cpu/test_gguf_format.py | 10 +++++----- test/test_cuda/test_alg_ext.py | 4 ++-- test/test_cuda/test_gguf.py | 4 ++-- 
test/test_cuda/test_multiple_card_calib.py | 4 ++-- test/test_cuda/test_support_vlms.py | 10 +++++----- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/test/test_cpu/test_cli_usage.py b/test/test_cpu/test_cli_usage.py index 82466dc82..b3aecf2f1 100644 --- a/test/test_cpu/test_cli_usage.py +++ b/test/test_cpu/test_cli_usage.py @@ -21,24 +21,24 @@ def test_auto_round_cmd(self, tiny_opt_model_path, tiny_qwen_vl_model_path): python_path = sys.executable # Test llm script - res = os.system(f"cd ../.. && {python_path} -m auto_round -h") + res = os.system(f"cd .. && {python_path} -m auto_round -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa" + f"cd .. && {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" + f"cd .. && {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai" + f"cd .. && {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" @@ -46,23 +46,23 @@ def test_auto_round_cmd(self, tiny_opt_model_path, tiny_qwen_vl_model_path): # test mllm script # test auto_round_mllm --eval help - res = os.system(f"cd ../.. && {python_path} -m auto_round --eval -h") + res = os.system(f"cd .. && {python_path} -m auto_round --eval -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" # test auto_round_mllm --lmms help - res = os.system(f"cd ../.. && {python_path} -m auto_round --eval --lmms -h") + res = os.system(f"cd .. && {python_path} -m auto_round --eval --lmms -h") if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --mllm --model {tiny_qwen_vl_model_path} --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" + f"cd .. && {python_path} -m auto_round --mllm --model {tiny_qwen_vl_model_path} --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model {tiny_qwen_vl_model_path} --seqlen 32 --format auto_round" + f"cd .. 
&& {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model {tiny_qwen_vl_model_path} --seqlen 32 --format auto_round" " --quant_nontext_module --output_dir ./saved " ) if res > 0 or res == -1: diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index 73491eb7f..92e9d620e 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -26,7 +26,7 @@ def teardown_class(self): def test_basic_usage(self, tiny_gemma_model_path, tiny_qwen_model_path): python_path = sys.executable res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {tiny_gemma_model_path} " + f"cd .. && {python_path} -m auto_round --model {tiny_gemma_model_path} " f" --bs 16 --iters 0 --nsamples 1 --format gguf:q4_k_m" ) if res > 0 or res == -1: @@ -34,7 +34,7 @@ def test_basic_usage(self, tiny_gemma_model_path, tiny_qwen_model_path): shutil.rmtree("./saved", ignore_errors=True) res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {tiny_qwen_model_path}" + f"cd .. && {python_path} -m auto_round --model {tiny_qwen_model_path}" f" --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0" ) if res > 0 or res == -1: @@ -162,7 +162,7 @@ def test_all_format(self, tiny_qwen_model_path): # for gguf_format in ["gguf:q4_0", "gguf:q4_1", "gguf:q4_k_m", "gguf:q6_k"]: for gguf_format in ["gguf:q4_k_m"]: res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {model_name} " + f"cd .. && {python_path} -m auto_round --model {model_name} " f" --bs 16 --iters 1 --nsamples 1 --seqlen 16 --format {gguf_format}" ) if res > 0 or res == -1: @@ -170,7 +170,7 @@ def test_all_format(self, tiny_qwen_model_path): shutil.rmtree("../../tmp_autoround", ignore_errors=True) res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {model_name}" + f"cd .. && {python_path} -m auto_round --model {model_name}" f" --bs 16 --iters 0 --nsamples 1 --seqlen 16 --format fake,{gguf_format}" ) if res > 0 or res == -1: @@ -179,7 +179,7 @@ def test_all_format(self, tiny_qwen_model_path): # test mixed q2_k_s res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {model_name}" + f"cd .. && {python_path} -m auto_round --model {model_name}" f" --bs 16 --iters 0 --nsamples 1 --seqlen 16 --scheme GGUF:Q2_K_MIXED" ) if res > 0 or res == -1: diff --git a/test/test_cuda/test_alg_ext.py b/test/test_cuda/test_alg_ext.py index 6b04847ed..6cdbc82ab 100644 --- a/test/test_cuda/test_alg_ext.py +++ b/test/test_cuda/test_alg_ext.py @@ -49,13 +49,13 @@ def test_cli(self, tiny_opt_model_path): python_path = sys.executable res = os.system( - f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits --nsamples 1 --seqlen 32" + f"cd .. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits --nsamples 1 --seqlen 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" res = os.system( - f"cd ../.. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile --nsamples 1 --seqlen 32" + f"cd .. 
&& CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile --nsamples 1 --seqlen 32" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" diff --git a/test/test_cuda/test_gguf.py b/test/test_cuda/test_gguf.py index 7a3a0cd89..174deab2f 100644 --- a/test/test_cuda/test_gguf.py +++ b/test/test_cuda/test_gguf.py @@ -56,7 +56,7 @@ def test_gguf_format(self, tiny_qwen_model_path, dataloader): save_dir = os.path.join(os.path.dirname(__file__), "saved") res = os.system( - f"cd ../.. && {sys.executable} -m auto_round --model {tiny_qwen_model_path} --iter 2 " + f"cd .. && {sys.executable} -m auto_round --model {tiny_qwen_model_path} --iter 2 " f"--output_dir {save_dir} --nsample 2 --format gguf:q4_0 --device 0" ) print(save_dir) @@ -103,7 +103,7 @@ def test_q2_k_export(self, dataloader): def test_basic_usage(self, tiny_qwen_model_path): python_path = sys.executable res = os.system( - f"cd ../.. && {python_path} -m auto_round --model {tiny_qwen_model_path} --eval_task_by_task" + f"cd .. && {python_path} -m auto_round --model {tiny_qwen_model_path} --eval_task_by_task" f" --tasks piqa,openbookqa --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0 --eval_model_dtype bf16" ) if res > 0 or res == -1: diff --git a/test/test_cuda/test_multiple_card_calib.py b/test/test_cuda/test_multiple_card_calib.py index 03c59bba6..fedb3f328 100644 --- a/test/test_cuda/test_multiple_card_calib.py +++ b/test/test_cuda/test_multiple_card_calib.py @@ -41,7 +41,7 @@ def test_multiple_card_calib(self): ##test llm script res = os.system( - f"cd ../.. && {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --iters 1 --nsamples 1 --output_dir None" + f"cd .. && {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --iters 1 --nsamples 1 --output_dir None" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" @@ -52,7 +52,7 @@ def test_multiple_card_nvfp4(self): ##test llm script res = os.system( - f"cd ../.. && {python_path} -m auto_round --model facebook/opt-125m --scheme NVFP4 --devices '0,1' --iters 1 --nsamples 1 --enable_torch_compile --low_gpu_mem_usage" + f"cd .. && {python_path} -m auto_round --model facebook/opt-125m --scheme NVFP4 --devices '0,1' --iters 1 --nsamples 1 --enable_torch_compile --low_gpu_mem_usage" ) if res > 0 or res == -1: assert False, "cmd line test fail, please have a check" diff --git a/test/test_cuda/test_support_vlms.py b/test/test_cuda/test_support_vlms.py index 9efd53564..3358c8226 100644 --- a/test/test_cuda/test_support_vlms.py +++ b/test/test_cuda/test_support_vlms.py @@ -26,7 +26,7 @@ def test_qwen2(self): model_path = "/models/Qwen2-VL-2B-Instruct/" # test tune res = os.system( - f"cd ../.. && {self.python_path} -m auto_round --mllm " + f"cd .. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}" ) assert not (res > 0 or res == -1), "qwen2 tuning fail" @@ -81,7 +81,7 @@ def test_phi3(self): model_path = "/models/Phi-3.5-vision-instruct/" ## test tune res = os.system( - f"cd ../.. && {self.python_path} -m auto_round --mllm " + f"cd .. 
&& {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}" ) assert not (res > 0 or res == -1), "Phi-3.5 tuning fail" @@ -129,7 +129,7 @@ def test_phi3_vision_awq(self): model_path = "/models/Phi-3.5-vision-instruct/" ## test tune res = os.system( - f"cd ../.. && {self.python_path} -m auto_round --mllm " + f"cd .. && {self.python_path} -m auto_round --mllm " f"--model {model_path} --iter 2 --quant_nontext_module " f"--nsample 64 --seqlen 32 " f"--format auto_awq --output_dir {self.save_dir} --device {self.device}" @@ -177,7 +177,7 @@ def test_glm(self): model_path = "/models/glm-4v-9b/" ## test tune res = os.system( - f"cd ../.. && {self.python_path} -m auto_round " + f"cd .. && {self.python_path} -m auto_round " f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}" ) assert not (res > 0 or res == -1), "glm-4v-9b tuning fail" @@ -186,7 +186,7 @@ def test_granite_vision(self): model_path = "/models/granite-vision-3.2-2b" ## test tune res = os.system( - f"cd ../.. && {self.python_path} -m auto_round " + f"cd .. && {self.python_path} -m auto_round " f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}" ) assert not (res > 0 or res == -1), "granite-vision-3.2-2b tuning fail" From 7d1453b2826b9b0da9138d6f622e1539a299ed63 Mon Sep 17 00:00:00 2001 From: "Sun, Xuehao" Date: Wed, 24 Dec 2025 13:31:07 +0800 Subject: [PATCH 22/24] fix xpu ut path Signed-off-by: Sun, Xuehao --- .azure-pipelines/scripts/ut/run_ut_xpu.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.azure-pipelines/scripts/ut/run_ut_xpu.sh b/.azure-pipelines/scripts/ut/run_ut_xpu.sh index 2ab0aef64..740937d18 100644 --- a/.azure-pipelines/scripts/ut/run_ut_xpu.sh +++ b/.azure-pipelines/scripts/ut/run_ut_xpu.sh @@ -12,8 +12,7 @@ echo "##[endgroup]" uv pip list # test ark cpu part only before external xpu available -cd /auto-round/test/test_ark || exit 1 -find . -type f -exec sed -i '/sys\.path\.insert(0, "\.\.")/d' {} + +cd /auto-round/test || exit 1 export LD_LIBRARY_PATH=${HOME}/.venv/lib/:$LD_LIBRARY_PATH export COVERAGE_RCFILE=/auto-round/.azure-pipelines/scripts/ut/.coverage @@ -23,7 +22,7 @@ LOG_DIR=/auto-round/log_dir mkdir -p ${LOG_DIR} ut_log_name=${LOG_DIR}/ut.log -find . 
-name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run.sh +find ./test_ark -name "test*.py" | sed "s,\.\/,python -m pytest --cov=\"${auto_round_path}\" --cov-report term --html=report.html --self-contained-html --cov-report xml:coverage.xml --cov-append -vs --disable-warnings ,g" > run.sh cat run.sh bash run.sh 2>&1 | tee "${ut_log_name}" From 562689fe7b1349fc76f7c7de2961c270cf5c440d Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 24 Dec 2025 01:27:38 -0500 Subject: [PATCH 23/24] fix bug Signed-off-by: He, Xin3 --- test/test_ark/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 test/test_ark/__init__.py diff --git a/test/test_ark/__init__.py b/test/test_ark/__init__.py new file mode 100644 index 000000000..e69de29bb From 4ee57dadf8208395c0eae53d2597ebd0ee3bef6c Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 24 Dec 2025 03:34:33 -0500 Subject: [PATCH 24/24] fix bug Signed-off-by: He, Xin3 --- test/test_ark/test_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_ark/test_model.py b/test/test_ark/test_model.py index de4e9238e..bd4734609 100644 --- a/test/test_ark/test_model.py +++ b/test/test_ark/test_model.py @@ -48,7 +48,7 @@ def main_op(self, format, bits, group_size, sym, dtype, device, fast_cfg=True, t ) tokenizer = AutoTokenizer.from_pretrained(self.save_folder) - self.model_infer(model, tokenizer) + model_infer(model, tokenizer) result = simple_evaluate_user_model(model, tokenizer, batch_size=32, tasks="lambada_openai", limit=limit) print(result["results"]["lambada_openai"]["acc,none"]) assert result["results"]["lambada_openai"]["acc,none"] > tar_acc