6 changes: 6 additions & 0 deletions README.md
@@ -20,6 +20,12 @@ Supported providers and API flavors:
* Legacy HTTP endpoints (no streaming)
* LLM-focused endpoints (with or without streaming)

Supported API types:
* Chat completions (`/v1/chat/completions`)
* Text completions (`/v1/completions`)
* Embeddings (`/v1/embeddings`)
* Rerank (`/v1/rerank`)

Captured metrics:
* Overall latency
* Number of generated tokens
19 changes: 19 additions & 0 deletions llm_bench/README.md
@@ -66,6 +66,13 @@ Generation options:
- `--stream`: stream the result back. Enabling this gives "time to first token" and "time per token" metrics
- (optional) `--logprobs`: corresponds to the `logprobs` API parameter. For some providers, it's needed for output token counting in streaming mode.

Embeddings and rerank options:
- `--embeddings`: use the `/v1/embeddings` API instead of completions
- `--rerank`: use the `/v1/rerank` API. The generated prompt text is split into documents (by paragraph), and `--rerank-query` is used as the query (see the payload sketch after this list).
- `--rerank-query`: the search query string for rerank requests. Defaults to a generic query if not specified.
- `--rerank-top-n`: number of top results to return from the rerank endpoint.
- `--rerank-return-documents` / `--no-rerank-return-documents`: whether to include document text in the rerank response (default: true).
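
These flags map onto a `/v1/rerank` request body. As a rough illustration (field names follow the payload that `load_test.py` builds; the values below are made up):

```python
# Illustrative only: the rerank request body assembled from the flags above.
payload = {
    "model": "accounts/fireworks/models/qwen3-reranker-8b",  # --model / -m
    "query": "How do I reset my password?",                  # --rerank-query (or a generic default)
    # The generated prompt text is split on blank lines ("\n\n") into documents.
    "documents": ["First paragraph ...", "Second paragraph ..."],
    "top_n": 5,                  # only included when --rerank-top-n is given
    "return_documents": False,   # only included when --no-rerank-return-documents is passed
}
```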

### Writing results

Locust prints a detailed summary, including quantiles of various metrics. Additionally, the script prints a summary block at the very end of the output that includes the model being tested.
@@ -129,6 +136,18 @@ Benchmark Fireworks public deployment with 1 request and 2 images (1024w x 1024h
locust -u 1 -H https://api.fireworks.ai/inference -p 128 -o 200 --api-key $FIREWORKS_API_KEY --model=accounts/fireworks/models/llama-v3p1-8b-instruct --chat --prompt-images-with-resolutions 1024x1024 3084x1080
```

Benchmark Fireworks rerank deployment with a single request and 115k prompt tokens:

```bash
locust -u 1 -r 2 -H https://api.fireworks.ai/inference --api-key $FIREWORKS_API_KEY -m "accounts/fireworks/models/qwen3-reranker-8b" --rerank --prompt-tokens 115000 -t 3min --tokenizer /path/to/tokenizer --summary-file rerank_results.csv
```

Benchmark Fireworks rerank with a custom query and top-5 results:

```bash
locust -u 1 -r 2 -H https://api.fireworks.ai/inference --api-key $FIREWORKS_API_KEY -m "accounts/fireworks/models/qwen3-reranker-8b" --rerank --rerank-query "How do I reset my password?" --rerank-top-n 5 --prompt-tokens 4096 -t 1min --tokenizer /path/to/tokenizer
```
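
For reference, the benchmark summarizes each rerank result as `index:relevance_score` pairs and takes prompt token counts from the response's `usage` block. A minimal sketch of that parsing, mirroring `parse_output_json` in `load_test.py` (the response values below are invented):

```python
# Illustrative only: shape of a /v1/rerank response and how the benchmark summarizes it.
response = {
    "data": [
        {"index": 2, "relevance_score": 0.91},
        {"index": 0, "relevance_score": 0.35},
    ],
    "usage": {"prompt_tokens": 4096},
}
# Each result becomes "index:score"; usage.prompt_tokens feeds the prompt_tokens metric.
scores = ", ".join(f"{item['index']}:{item['relevance_score']:.4f}" for item in response["data"])
print(scores)  # "2:0.9100, 0:0.3500"
```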

Benchmark OpenAI deployment reading prompts from a file at 1 QPS:

```bash
70 changes: 66 additions & 4 deletions llm_bench/load_test.py
@@ -166,7 +166,7 @@ def _create_dataset(cls, options: argparse.Namespace):
            path=os.path.join(os.path.dirname(os.path.abspath(__file__)), dataset_file),
            prompt="\n\n" + prompt,
            tokenizer_path=options.tokenizer,
            chat=options.chat,
            chat=options.chat and not getattr(options, "rerank", False),
            num_tokens=options.prompt_tokens,
            common_tokens=options.prompt_cache_max_len,
        )
@@ -578,14 +578,30 @@ def post_response_hook(self, headers, num_tokens, perf_metrics=None):

class OpenAIProvider(BaseProvider):
    def get_url(self):
        if self.parsed_options.embeddings:
        if self.parsed_options.rerank:
            return "/v1/rerank"
        elif self.parsed_options.embeddings:
            return "/v1/embeddings"
        elif self.parsed_options.chat:
            return "/v1/chat/completions"
        else:
            return "/v1/completions"

    def format_payload(self, prompt, max_tokens, images):
        if self.parsed_options.rerank:
            documents = [doc.strip() for doc in prompt.split("\n\n") if doc.strip()]
            query = self.parsed_options.rerank_query or "Find the most relevant document."
            data = {
                "model": self.model,
                "query": query,
                "documents": documents,
            }
            if self.parsed_options.rerank_top_n is not None:
                data["top_n"] = self.parsed_options.rerank_top_n
            if not self.parsed_options.rerank_return_documents:
                data["return_documents"] = False
            return data

        if self.parsed_options.embeddings:
            data = {
                "model": self.model,
@@ -650,6 +666,17 @@ def format_payload(self, prompt, max_tokens, images):
        return data

    def parse_output_json(self, data):
        if self.parsed_options.rerank:
            usage = data.get("usage", {})
            scores = [f"{item['index']}:{item['relevance_score']:.4f}" for item in data.get("data", [])]
            return ChunkMetadata(
                text=", ".join(scores),
                logprob_tokens=None,
                completion_tokens=None,
                prompt_tokens=usage.get("prompt_tokens"),
                cached_tokens=None,
            )

        if self.parsed_options.embeddings:
            return ChunkMetadata(
                text=data["data"][0]["embedding"],
@@ -698,6 +725,8 @@ def parse_output_json(self, data):
class FireworksProvider(OpenAIProvider):
    def format_payload(self, prompt, max_tokens, images):
        data = super().format_payload(prompt, max_tokens, images)
        if self.parsed_options.rerank:
            return data
        # Enable perf_metrics_in_response to get speculation stats in streaming responses
        data["perf_metrics_in_response"] = True
        # Add prompt_cache_max_pct if specified (Fireworks-specific parameter)
@@ -1104,6 +1133,14 @@ def _do_generate_text(self):
print(f"WARNING: Received more chunks after [DONE]: {chunk}")
try:
now = time.perf_counter()
if self.provider_formatter.parsed_options.rerank:
t_first_token = now
out = self.provider_formatter.parse_output_json(orjson.loads(chunk))
if out.prompt_tokens:
prompt_tokens = out.prompt_tokens
if self.environment.parsed_options.show_response:
combined_text = out.text
break
if self.provider_formatter.parsed_options.embeddings:
t_first_token = now
if self.environment.parsed_options.show_response:
@@ -1175,7 +1212,7 @@ def _do_generate_text(self):
        dur_generation = now - t_first_token
        dur_first_token = t_first_token - t_start

        if not self.provider_formatter.parsed_options.embeddings:
        if not (self.provider_formatter.parsed_options.embeddings or self.provider_formatter.parsed_options.rerank):
            prompt_tokens = prompt_tokens or self.prompt_tokenizer_tokens

        token_parts = []
@@ -1209,7 +1246,7 @@ def _do_generate_text(self):
            num_tokens,
        )

        if not self.provider_formatter.parsed_options.embeddings:
        if not (self.provider_formatter.parsed_options.embeddings or self.provider_formatter.parsed_options.rerank):
            if prompt_tokens:
                add_custom_metric("prompt_tokens", prompt_tokens)
            if cached_tokens is not None:
@@ -1293,6 +1330,31 @@ def init_parser(parser):
        default=False,
        help="Use /v1/embeddings API",
    )
    parser.add_argument(
        "--rerank",
        action=argparse.BooleanOptionalAction,
        default=False,
        help="Use /v1/rerank API. The generated prompt text is split into documents (by paragraph), "
        "and --rerank-query is used as the query.",
    )
    parser.add_argument(
        "--rerank-query",
        type=str,
        default=None,
        help="For rerank: the search query string. Defaults to a generic query if not specified.",
    )
    parser.add_argument(
        "--rerank-top-n",
        type=int,
        default=None,
        help="For rerank: number of top results to return.",
    )
    parser.add_argument(
        "--rerank-return-documents",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="For rerank: whether to return document text in response.",
    )
    parser.add_argument(
        "--return-logits",
        type=int,