From 24ce3368fb944c49b763f7efc6a08e2f6a1d87f4 Mon Sep 17 00:00:00 2001
From: Raghavendra Vedula
Date: Thu, 19 Feb 2026 14:10:41 -0800
Subject: [PATCH] Add rerank API support to load test script

---
 README.md              |  6 ++++
 llm_bench/README.md    | 19 ++++++++++++
 llm_bench/load_test.py | 70 +++++++++++++++++++++++++++++++++++++++---
 3 files changed, 91 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index c742baf..336c520 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,12 @@ Supported providers and API flavors:
 * Legacy HTTP endpoints (no streaming)
 * LLM-focused endpoints (with or without streaming)
 
+Supported API types:
+* Chat completions (`/v1/chat/completions`)
+* Text completions (`/v1/completions`)
+* Embeddings (`/v1/embeddings`)
+* Rerank (`/v1/rerank`)
+
 Captured metrics:
 * Overall latency
 * Number of generated tokens
diff --git a/llm_bench/README.md b/llm_bench/README.md
index 2755b30..bd1cbd9 100644
--- a/llm_bench/README.md
+++ b/llm_bench/README.md
@@ -66,6 +66,13 @@ Generation options:
 - `--stream`: stream the result back. Enabling this gives "time to first token" and "time per token" metrics
 - (optional) `--logprobs`: corresponds to `logprobs` API parameter. For some providers, it's needed for output token counting in streaming mode.
 
+Embeddings and rerank options:
+- `--embeddings`: use the `/v1/embeddings` API instead of completions
+- `--rerank`: use the `/v1/rerank` API. The generated prompt text is split into documents (by paragraph), and `--rerank-query` is used as the query.
+- `--rerank-query`: the search query string for rerank requests. Defaults to a generic query if not specified.
+- `--rerank-top-n`: number of top results to return from the rerank endpoint.
+- `--rerank-return-documents` / `--no-rerank-return-documents`: whether to include document text in the rerank response (default: true).
+
 ### Writing results
 
 Locust prints out the detailed summary including quantiles of various metrics. Additionally, the script prints out the summary block at the very end of the output that includes the model being tested.
@@ -129,6 +136,18 @@ Benchmark Fireworks public deployment with 1 request and 2 images (1024w x 1024h
 ```bash
 locust -u 1 -H https://api.fireworks.ai/inference -p 128 -o 200 --api-key $FIREWORKS_API_KEY --model=accounts/fireworks/models/llama-v3p1-8b-instruct --chat --prompt-images-with-resolutions 1024x1024 3084x1080
 ```
+Benchmark Fireworks rerank deployment with a single request and 115k prompt tokens:
+
+```bash
+locust -u 1 -r 2 -H https://api.fireworks.ai/inference --api-key $FIREWORKS_API_KEY -m "accounts/fireworks/models/qwen3-reranker-8b" --rerank --prompt-tokens 115000 -t 3min --tokenizer /path/to/tokenizer --summary-file rerank_results.csv
+```
+
+Benchmark Fireworks rerank with a custom query and top-5 results:
+
+```bash
+locust -u 1 -r 2 -H https://api.fireworks.ai/inference --api-key $FIREWORKS_API_KEY -m "accounts/fireworks/models/qwen3-reranker-8b" --rerank --rerank-query "How do I reset my password?" --rerank-top-n 5 --prompt-tokens 4096 -t 1min --tokenizer /path/to/tokenizer
+```
+
 Benchmark OpenAI deployment reading prompts from a file at 1 QPS:
 
 ```bash
diff --git a/llm_bench/load_test.py b/llm_bench/load_test.py
index 174a903..29cc8dc 100644
--- a/llm_bench/load_test.py
+++ b/llm_bench/load_test.py
@@ -166,7 +166,7 @@ def _create_dataset(cls, options: argparse.Namespace):
             path=os.path.join(os.path.dirname(os.path.abspath(__file__)), dataset_file),
             prompt="\n\n" + prompt,
             tokenizer_path=options.tokenizer,
-            chat=options.chat,
+            chat=options.chat and not getattr(options, "rerank", False),
             num_tokens=options.prompt_tokens,
             common_tokens=options.prompt_cache_max_len,
         )
@@ -578,7 +578,9 @@ def post_response_hook(self, headers, num_tokens, perf_metrics=None):
 
 class OpenAIProvider(BaseProvider):
     def get_url(self):
-        if self.parsed_options.embeddings:
+        if self.parsed_options.rerank:
+            return "/v1/rerank"
+        elif self.parsed_options.embeddings:
             return "/v1/embeddings"
         elif self.parsed_options.chat:
             return "/v1/chat/completions"
@@ -586,6 +588,20 @@
             return "/v1/completions"
 
     def format_payload(self, prompt, max_tokens, images):
+        if self.parsed_options.rerank:
+            documents = [doc.strip() for doc in prompt.split("\n\n") if doc.strip()]
+            query = self.parsed_options.rerank_query or "Find the most relevant document."
+            data = {
+                "model": self.model,
+                "query": query,
+                "documents": documents,
+            }
+            if self.parsed_options.rerank_top_n is not None:
+                data["top_n"] = self.parsed_options.rerank_top_n
+            if not self.parsed_options.rerank_return_documents:
+                data["return_documents"] = False
+            return data
+
         if self.parsed_options.embeddings:
             data = {
                 "model": self.model,
@@ -650,6 +666,17 @@
             return data
 
     def parse_output_json(self, data):
+        if self.parsed_options.rerank:
+            usage = data.get("usage", {})
+            scores = [f"{item['index']}:{item['relevance_score']:.4f}" for item in data.get("data", [])]
+            return ChunkMetadata(
+                text=", ".join(scores),
+                logprob_tokens=None,
+                completion_tokens=None,
+                prompt_tokens=usage.get("prompt_tokens"),
+                cached_tokens=None,
+            )
+
         if self.parsed_options.embeddings:
             return ChunkMetadata(
                 text=data["data"][0]["embedding"],
@@ -698,6 +725,8 @@
 class FireworksProvider(OpenAIProvider):
     def format_payload(self, prompt, max_tokens, images):
         data = super().format_payload(prompt, max_tokens, images)
+        if self.parsed_options.rerank:
+            return data
         # Enable perf_metrics_in_response to get speculation stats in streaming responses
         data["perf_metrics_in_response"] = True
         # Add prompt_cache_max_pct if specified (Fireworks-specific parameter)
@@ -1104,6 +1133,14 @@ def _do_generate_text(self):
                     print(f"WARNING: Received more chunks after [DONE]: {chunk}")
                 try:
                     now = time.perf_counter()
+                    if self.provider_formatter.parsed_options.rerank:
+                        t_first_token = now
+                        out = self.provider_formatter.parse_output_json(orjson.loads(chunk))
+                        if out.prompt_tokens:
+                            prompt_tokens = out.prompt_tokens
+                        if self.environment.parsed_options.show_response:
+                            combined_text = out.text
+                        break
                     if self.provider_formatter.parsed_options.embeddings:
                         t_first_token = now
                         if self.environment.parsed_options.show_response:
@@ -1175,7 +1212,7 @@
         dur_generation = now - t_first_token
         dur_first_token = t_first_token - t_start
 
-        if not self.provider_formatter.parsed_options.embeddings:
+        if not (self.provider_formatter.parsed_options.embeddings or self.provider_formatter.parsed_options.rerank):
             prompt_tokens = prompt_tokens or self.prompt_tokenizer_tokens
 
             token_parts = []
@@ -1209,7 +1246,7 @@
             num_tokens,
         )
 
-        if not self.provider_formatter.parsed_options.embeddings:
+        if not (self.provider_formatter.parsed_options.embeddings or self.provider_formatter.parsed_options.rerank):
            if prompt_tokens:
                 add_custom_metric("prompt_tokens", prompt_tokens)
             if cached_tokens is not None:
@@ -1293,6 +1330,31 @@ def init_parser(parser):
         default=False,
         help="Use /v1/embeddings API",
     )
+    parser.add_argument(
+        "--rerank",
+        action=argparse.BooleanOptionalAction,
+        default=False,
+        help="Use /v1/rerank API. The generated prompt text is split into documents (by paragraph), "
+        "and --rerank-query is used as the query.",
+    )
+    parser.add_argument(
+        "--rerank-query",
+        type=str,
+        default=None,
+        help="For rerank: the search query string. Defaults to a generic query if not specified.",
+    )
+    parser.add_argument(
+        "--rerank-top-n",
+        type=int,
+        default=None,
+        help="For rerank: number of top results to return.",
+    )
+    parser.add_argument(
+        "--rerank-return-documents",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help="For rerank: whether to return document text in response.",
+    )
     parser.add_argument(
         "--return-logits",
         type=int,
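In rerank mode, `OpenAIProvider.format_payload` turns the generated prompt into a `/v1/rerank` request body: the prompt is split into documents on blank lines, `--rerank-query` becomes the query, and `top_n` / `return_documents` are attached only when the corresponding flags are set. Below is a minimal standalone sketch of that request shape, mirroring the logic added above; the prompt text, model name, and query are illustrative placeholders, not values produced by the script.

```python
# Standalone sketch of the rerank payload built by OpenAIProvider.format_payload above.
# The prompt text, model name, and query below are illustrative placeholders.
prompt = "First generated paragraph.\n\nSecond generated paragraph.\n\nThird generated paragraph."

# Split the generated prompt into documents by paragraph, as the patch does.
documents = [doc.strip() for doc in prompt.split("\n\n") if doc.strip()]

payload = {
    "model": "accounts/fireworks/models/qwen3-reranker-8b",  # --model / -m
    "query": "How do I reset my password?",                  # --rerank-query
    "documents": documents,
}
payload["top_n"] = 2                 # included only when --rerank-top-n is given
payload["return_documents"] = False  # included only with --no-rerank-return-documents

# The load test POSTs this JSON to <host>/v1/rerank and, per parse_output_json,
# reads usage.prompt_tokens plus the per-document index/relevance_score pairs.
print(payload)
```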