1 change: 1 addition & 0 deletions test/common/datasets/doc_qa/prompt.json

Large diffs are not rendered by default.

11 changes: 6 additions & 5 deletions test/common/db_utils.py
@@ -65,11 +65,6 @@ def _get_db():
db_config = _get_db_config()
_db_enabled = db_config.get("enabled", False)

backup_str = db_config.get("backup", "results/")
_backup_path = Path(backup_str).resolve()
_backup_path.mkdir(parents=True, exist_ok=True)
logger.info(f"Backup directory set to: {_backup_path}")

if not _db_enabled:
return None

@@ -205,10 +200,16 @@ def write_to_db(table_name: str, data: Dict[str, Any]) -> bool:


def database_connection(build_id: str) -> None:
global _backup_path
logger.info(f"Setting test build ID: {build_id}")
_set_test_build_id(build_id)

db_config = _get_db_config()
backup_str = db_config.get("backup", "results/")
_backup_path = Path(backup_str).resolve()
_backup_path.mkdir(parents=True, exist_ok=True)
logger.info(f"Backup directory set to: {_backup_path}")

if not db_config.get("enabled", False):
logger.info("Database connection skipped because enabled=false.")
return
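For reviewers, a minimal usage sketch (not part of the PR) of the behaviour after this refactor: database_connection() now resolves and creates the backup directory before the enabled check, so result backups exist even when the database is disabled. The import path, build id, and the "results/" default below are assumptions based on this diff.

from pathlib import Path

from common import db_utils  # assumes test/common is on sys.path, as in the test suites

db_utils.database_connection("local-build-001")  # hypothetical build id

# With enabled=false in the config, no DB connection is opened, but the backup
# directory (default "results/") has still been created.
assert Path("results").resolve().is_dir()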
87 changes: 84 additions & 3 deletions test/common/uc_eval/task.py
@@ -1,7 +1,10 @@
import json
import os
import time
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Union

import pandas as pd
from common.uc_eval.utils.config_loader import ConfigLoader, TaskFactory
from common.uc_eval.utils.data_class import (
BenchmarkModeType,
@@ -241,6 +244,7 @@ def __init__(
self.prompt_tokens = perf_config.prompt_tokens
self.output_tokens = perf_config.output_tokens
self.prefix_cache_num = perf_config.prefix_cache_num
self.enable_warmup = perf_config.enable_warmup
self.prompt_seed = 0 if self.enable_prefix_cache else -1
self.stable_perf = self.benchmark_mode == BenchmarkModeType.STABLE_PREF
self.stable_rate = stable_rate
Expand Down Expand Up @@ -272,7 +276,11 @@ def process(self):
logger.info(
f"Performance benchmark running with: enable prefix cache: ({self.enable_prefix_cache}), {syntheric_params=}"
)
if self.enable_prefix_cache and self.prefix_cache_num[idx] > 0:
if (
self.enable_prefix_cache
and self.prefix_cache_num[idx] > 0
and self.enable_warmup
):
logger.info(f"Begin build kvcache...")
input_data = self.dataset.prepare_data(syntheric_params)
self.client.handle_requests_with_pool(
Expand Down Expand Up @@ -359,10 +367,11 @@ def __init__(
)
self.dataset_file_path = perf_config.dataset_file_path
self.max_tokens = model_config.payload.get("max_tokens")
self.enable_warmup = perf_config.enable_warmup

def process(self):
cases_list = self.dataset.prepare_data(self.dataset_file_path)
if self.enable_prefix_cache:
if self.enable_prefix_cache and self.enable_warmup:
logger.info("Begin build kvcache...")
self.client.handle_requests_with_pool(
cases_list, self.parallel_num, BAD_COMPLETION_TOKENS_THR
@@ -389,10 +398,38 @@ def __init__(
self.dataset_file_path = eval_config.dataset_file_path
self.max_tokens = model_config.payload.get("max_tokens")
self.eval_cls = eval_config.eval_class
self.prompt_split_ratio = eval_config.prompt_split_ratio
self.enable_warmup = eval_config.enable_warmup
self.round = getattr(eval_config, "round", 0)

def _split_prompt_by_tokens(
self, prompt: str, tokenizer, split_ratio: float
) -> str:
"""Split prompt by token ratio and return the first part."""
tokens = tokenizer.encode(prompt)
split_idx = int(len(tokens) * split_ratio)
first_tokens = tokens[:split_idx]
return tokenizer.decode(first_tokens, skip_special_tokens=False)

def process(self):
cases_list = self.dataset.prepare_data(self.dataset_file_path)
if self.enable_prefix_cache:

if self.prompt_split_ratio is not None and 0 < self.prompt_split_ratio < 1:
logger.info(
f"Applying prompt split ratio: {self.prompt_split_ratio} (only sending first {self.prompt_split_ratio*100:.0f}% of prompt)"
)
tokenizer = self.client.tokenizer
modified_cases = []
for case in cases_list:
case_name, context, question, answer = case
full_prompt = context + question
split_prompt = self._split_prompt_by_tokens(
full_prompt, tokenizer, self.prompt_split_ratio
)
modified_cases.append([case_name, split_prompt, "", answer])
cases_list = modified_cases

if self.enable_prefix_cache and self.enable_warmup:
logger.info("Begin build kvcache...")
self.client.handle_requests_with_pool(
cases_list, self.parallel_num, BAD_COMPLETION_TOKENS_THR
@@ -402,8 +439,52 @@ def process(self):
records: List[RequestRecord] = self.client.handle_requests_with_pool(
cases_list, self.parallel_num, self.max_tokens
)

if self.prompt_split_ratio is not None and 0 < self.prompt_split_ratio < 1:
logger.info(
f"Skipping accuracy evaluation when prompt_split_ratio={self.prompt_split_ratio} (service ran but no accuracy check)"
)
from common.uc_eval.utils.data_class import LatencyStatistics

empty_latency = LatencyStatistics()
empty_latency.metric_dict = {}
return empty_latency, len(records)

metric_result, match_record_list = self.benchmark.perf_show(
records, self.parallel_num
)
self.save_eval_cases_excel(match_record_list, self.eval_cls)
self.compare_first_round_results(match_record_list, self.round)
return metric_result, len(records)

def compare_first_round_results(
self, match_record_list: List[RequestRecord], round: int
):
if round == 0:
return
cache_file = "first_round_outputs.json"
if round == 1:
first_round_data = {r.case_name: r.output_data for r in match_record_list}
with open(cache_file, "w", encoding="utf-8") as f:
json.dump(first_round_data, f, ensure_ascii=False, indent=2)
logger.info(f"First round outputs saved to {cache_file}")
elif round == 2:
if not os.path.exists(cache_file):
return
with open(cache_file, "r", encoding="utf-8") as f:
first_round_data = json.load(f)
for r in match_record_list:
if r.case_name in first_round_data:
first_output = first_round_data[r.case_name]
is_match = first_output == r.output_data
logger.info(f"First Round Output: {first_output}")
logger.info(f"Second Round Output: {r.output_data}")
if not is_match:
logger.error(
f"Case {r.case_name}: The output results are inconsistent."
)
else:
logger.info(
f"Case {r.case_name}: The output results are consistent"
)
os.remove(cache_file)
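For context, a standalone sketch (not taken from this PR) of the token-ratio split performed by _split_prompt_by_tokens, assuming a Hugging Face tokenizer; the model name and prompt are illustrative. In the task above, the truncated text is sent as the context with an empty question and accuracy evaluation is skipped, so such a run only exercises the serving path.

from transformers import AutoTokenizer  # assumed tokenizer backend


def split_prompt_by_tokens(prompt: str, tokenizer, split_ratio: float) -> str:
    # Keep the first `split_ratio` fraction of the prompt, measured in tokens,
    # and decode those tokens back to text.
    tokens = tokenizer.encode(prompt)
    first_tokens = tokens[: int(len(tokens) * split_ratio)]
    return tokenizer.decode(first_tokens, skip_special_tokens=False)


tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative tokenizer
prompt = "The quick brown fox jumps over the lazy dog and then takes a long nap."
print(split_prompt_by_tokens(prompt, tokenizer, 0.5))  # roughly the first half, by token count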
6 changes: 6 additions & 0 deletions test/common/uc_eval/utils/config_loader.py
@@ -194,6 +194,12 @@ def create_task(
client_kwargs = {}
if data_type is DatasetType.MULTI_DIALOGUE:
client_kwargs["enable_prefix_cache"] = perf_config.enable_prefix_cache
elif data_type is DatasetType.DOC_QA and eval_config:
if (
hasattr(eval_config, "prompt_split_ratio")
and eval_config.prompt_split_ratio is not None
):
client_kwargs["prompt_split_ratio"] = eval_config.prompt_split_ratio
return (
cls._dataset[data_type](tokenizer_path),
cls._client[data_type](model_config, stream, **client_kwargs),
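A self-contained illustration of the conditional-kwargs pattern used in create_task(); every name below is hypothetical, not from the repo. Optional, dataset-specific settings are forwarded to the client constructor only when they are set, so client classes that do not define the parameter keep working unchanged.

from dataclasses import dataclass
from typing import Optional


@dataclass
class DemoEvalConfig:
    prompt_split_ratio: Optional[float] = None


class DemoClient:
    def __init__(self, model_name: str, prompt_split_ratio: float = 1.0):
        self.model_name = model_name
        self.prompt_split_ratio = prompt_split_ratio


def build_client(cfg: DemoEvalConfig) -> DemoClient:
    kwargs = {}
    if cfg.prompt_split_ratio is not None:  # forward only when explicitly configured
        kwargs["prompt_split_ratio"] = cfg.prompt_split_ratio
    return DemoClient("demo-model", **kwargs)


print(build_client(DemoEvalConfig()).prompt_split_ratio)     # 1.0 (constructor default)
print(build_client(DemoEvalConfig(0.5)).prompt_split_ratio)  # 0.5 (forwarded from config)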
6 changes: 5 additions & 1 deletion test/common/uc_eval/utils/data_class.py
@@ -36,11 +36,14 @@ class ModelConfig:
class EvalConfig:
data_type: str = ""
dataset_file_path: str = ""
enable_prefix_cache: str = False
enable_prefix_cache: bool = False
parallel_num: int = 1
benchmark_mode: str = "evaluate"
metrics: Optional[List[str]] = field(default_factory=list)
eval_class: Optional[str] = None
prompt_split_ratio: Optional[float] = None
enable_warmup: bool = True
round: int = 0


@dataclass
@@ -53,6 +56,7 @@ class PerfConfig:
output_tokens: List[int] = field(default_factory=list)
prefix_cache_num: List[float] = field(default_factory=list)
benchmark_mode: str = ""
enable_warmup: bool = True


@dataclass
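For quick reference, a short sketch of how the new EvalConfig fields behave with the defaults shown above; the values are illustrative, and the import path assumes test/common is on sys.path as in the test suites.

from common.uc_eval.utils.data_class import EvalConfig

# Defaults: no prompt splitting, warmup enabled, round comparison disabled (round=0).
default_cfg = EvalConfig(data_type="doc_qa")
print(default_cfg.prompt_split_ratio, default_cfg.enable_warmup, default_cfg.round)  # None True 0

# First round of the output-consistency check, with warmup turned off.
round1_cfg = EvalConfig(data_type="doc_qa", enable_warmup=False, round=1)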
99 changes: 99 additions & 0 deletions test/suites/E2E/test_accuracy.py
@@ -0,0 +1,99 @@
import dataclasses
import json

import pytest
from common.capture_utils import export_vars
from common.config_utils import config_utils as config_instance
from common.uc_eval.task import DocQaEvalTask
from common.uc_eval.utils.data_class import EvalConfig, ModelConfig


@pytest.fixture(scope="session")
def model_config() -> ModelConfig:
cfg = config_instance.get_config("models") or {}
field_name = [field.name for field in dataclasses.fields(ModelConfig)]
kwargs = {k: v for k, v in cfg.items() if k in field_name and v is not None}
if "payload" in kwargs and isinstance(kwargs["payload"], str):
try:
kwargs["payload"] = json.loads(kwargs["payload"])
except json.JSONDecodeError as e:
raise ValueError(f"Invalid payload JSON format: {e}")
return ModelConfig(**kwargs)


_DOC_QA_BASE_CONFIG = {
"data_type": "doc_qa",
"dataset_file_path": "../../common/uc_eval/datasets/doc_qa/prompt.json",
"enable_prefix_cache": True,
"parallel_num": 1,
"benchmark_mode": "evaluate",
"metrics": ["accuracy", "bootstrap-accuracy", "f1-score"],
"eval_class": "common.uc_eval.utils.metric:Includes",
}

doc_qa_eval_cases = [
pytest.param(
EvalConfig(
**{
**_DOC_QA_BASE_CONFIG,
"prompt_split_ratio": None,
"enable_warmup": False,
"round": 1,
}
),
id="doc-qa-full-prompt-warmup-evaluate",
),
pytest.param(
EvalConfig(
**{**_DOC_QA_BASE_CONFIG, "prompt_split_ratio": 0.5, "enable_warmup": False}
),
id="doc-qa-full-prompt-no-warmup-evaluate",
),
pytest.param(
EvalConfig(
**{
**_DOC_QA_BASE_CONFIG,
"prompt_split_ratio": None,
"enable_warmup": False,
"round": 2,
}
),
id="doc-qa-half-prompt-warmup-evaluate",
),
]

test_configs = [
pytest.param(
{"max_tokens": 1024, "ignore_eos": True, "temperature": 0.7},
False, # enable_clear_hbm
id="max_tokens_2048_clear_hbm_true",
),
]


@pytest.mark.feature("accu_test")
@pytest.mark.stage(2)
@pytest.mark.parametrize("eval_config", doc_qa_eval_cases)
@pytest.mark.parametrize("payload_updates,enable_clear_hbm", test_configs)
@export_vars
def test_doc_qa_perf(
eval_config: EvalConfig,
model_config: ModelConfig,
payload_updates: dict,
enable_clear_hbm: bool,
request: pytest.FixtureRequest,
):
file_save_path = config_instance.get_config("reports").get("base_dir")
if isinstance(model_config.payload, str):
model_config.payload = json.loads(model_config.payload)

model_config.payload.update(payload_updates)

if eval_config.prompt_split_ratio is None:
model_config.enable_clear_hbm = True
else:
model_config.enable_clear_hbm = enable_clear_hbm

task = DocQaEvalTask(model_config, eval_config, file_save_path)
result = task.run()
return {"_name": request.node.callspec.id, "_data": result}