packages/prime/src/prime_cli/commands/env.py (176 additions, 10 deletions)
@@ -1,3 +1,4 @@
import asyncio
import hashlib
import json
import os
@@ -28,6 +29,11 @@
from ..utils.env_metadata import find_environment_metadata
from ..utils.eval_push import push_eval_results_to_hub
from ..utils.formatters import format_file_size
from ..utils.hosted_eval import (
HostedEvalConfig,
print_hosted_result,
run_hosted_evaluation,
)

app = typer.Typer(help="Manage verifiers environments", no_args_is_help=True)
console = Console()
@@ -2076,9 +2082,17 @@ def run_eval(
env_path: Optional[str],
endpoints_path: Optional[str] = None,
headers: Optional[List[str]] = None,
hosted: bool = False,
poll_interval: float = 10.0,
no_stream_logs: bool = False,
timeout_minutes: Optional[int] = None,
allow_sandbox_access: bool = False,
allow_instances_access: bool = False,
custom_secrets: Optional[str] = None,
eval_name: Optional[str] = None,
) -> None:
"""
Run verifiers' vf-eval with Prime Inference (local) or as a hosted evaluation on the platform.
"""
is_slug = (
"/" in environment and not environment.startswith("./") and not environment.startswith("/")
@@ -2087,31 +2101,177 @@ def run_eval(
upstream_owner = None
upstream_name = None
env_name_for_vf_eval = environment
environment_id = None
requested_version = "latest"

if is_slug:
env_slug = environment
if "@" in environment:
env_slug, requested_version = environment.rsplit("@", 1)

parts = env_slug.split("/")
if len(parts) == 2 and parts[0] and parts[1]:
upstream_owner, upstream_name = parts
env_name_for_vf_eval = upstream_name
else:
console.print(f"[red]Invalid environment slug format: {environment}[/red]")
raise typer.Exit(1)

if hosted:
if not is_slug or not upstream_owner or not upstream_name:
metadata = find_environment_metadata(
env_name=environment,
env_path=Path(env_path) if env_path else None,
)

console.print(
f"[dim]Using upstream environment {upstream_owner}/{upstream_name}[/dim]\n"
"[red]Error: Hosted evaluations require environment slug (owner/name).[/red]"
)

if metadata and metadata.get("owner") and metadata.get("name"):
suggested_slug = f"{metadata['owner']}/{metadata['name']}"
console.print("[yellow]Tip:[/yellow] Found local environment metadata.")
console.print(f"[dim]Try:[/dim] prime eval {suggested_slug} --hosted")
else:
console.print(
f"[dim]Example: prime eval primeintellect/{environment} --hosted[/dim]"
)

raise typer.Exit(1)

client = APIClient(require_auth=False)
try:
env_details = fetch_environment_details(
client, upstream_owner, upstream_name, requested_version
)
environment_id = env_details.get("id")
except APIError as e:
console.print(f"[red]Error: Environment '{environment}' not found on the hub.[/red]")
console.print(f"[dim]{e}[/dim]")
console.print()

metadata = find_environment_metadata(
env_name=upstream_name, env_path=Path(env_path) if env_path else None
)

if metadata and metadata.get("owner") == upstream_owner:
console.print(
"[yellow]Found local environment that hasn't been pushed yet.[/yellow]"
)
console.print()

should_push = typer.confirm(
"Would you like to push this environment to the hub now?", default=True
)

if should_push:
console.print()
console.print("[cyan]Pushing environment to hub...[/cyan]")

env_dir = env_path if env_path else Path.cwd()
result = subprocess.run(
["prime", "env", "push"], cwd=env_dir, capture_output=False, text=True
)

if result.returncode != 0:
console.print("[red]Failed to push environment.[/red]")
raise typer.Exit(1)

console.print()
console.print("[green]✓ Environment pushed successfully![/green]")
console.print("[cyan]Continuing with hosted evaluation...[/cyan]")
console.print()

try:
env_details = fetch_environment_details(
client, upstream_owner, upstream_name, requested_version
)
environment_id = env_details.get("id")
except APIError as e2:
console.print(
f"[red]Error: Still couldn't find environment after push: {e2}[/red]"
)
raise typer.Exit(1)
else:
console.print()
console.print("[yellow]Cancelled. To push manually, run:[/yellow]")
console.print(" prime env push")
raise typer.Exit(1)
else:
console.print("[dim]To publish your environment, run:[/dim]")
console.print(" prime env push")
raise typer.Exit(1)

if not environment_id:
console.print(f"[red]Error: Could not get environment ID for '{environment}'[/red]")
raise typer.Exit(1)

console.print(f"[dim]Using environment {upstream_owner}/{upstream_name}[/dim]\n")

# Parse env_args JSON if provided
parsed_env_args = None
if env_args:
try:
parsed_env_args = json.loads(env_args)
except json.JSONDecodeError as e:
console.print(f"[red]Error parsing --env-args: {e}[/red]")
raise typer.Exit(1)

# Parse custom_secrets JSON if provided
parsed_custom_secrets = None
if custom_secrets:
try:
parsed_custom_secrets = json.loads(custom_secrets)
except json.JSONDecodeError as e:
console.print(f"[red]Error parsing --custom-secrets: {e}[/red]")
raise typer.Exit(1)

# Create hosted eval config
hosted_config = HostedEvalConfig(
environment_id=environment_id,
inference_model=model,
num_examples=num_examples if num_examples is not None else 5,
rollouts_per_example=rollouts_per_example if rollouts_per_example is not None else 3,
env_args=parsed_env_args,
name=eval_name,
timeout_minutes=timeout_minutes,
allow_sandbox_access=allow_sandbox_access,
allow_instances_access=allow_instances_access,
custom_secrets=parsed_custom_secrets,
)

try:
result = asyncio.run(
run_hosted_evaluation(
config=hosted_config,
poll_interval=poll_interval,
stream_logs=not no_stream_logs,
)
)
print_hosted_result(result)

if result.status != "COMPLETED":
raise typer.Exit(1)
except APIError as e:
console.print(f"[red]Hosted evaluation failed: {e}[/red]")
raise typer.Exit(1)

return

if is_slug:
console.print(f"[dim]Using upstream environment {upstream_owner}/{upstream_name}[/dim]\n")

requested_version = "latest"
if "@" in environment:
_, requested_version = environment.rsplit("@", 1)

if not _is_environment_installed(env_name_for_vf_eval, requested_version):
console.print(f"[cyan]Installing {environment}...[/cyan]")
if not _install_single_environment(environment):
raise typer.Exit(1)
console.print()

is_resolved = True
else:
check_path = Path(env_path) if env_path else Path.cwd()
is_resolved = display_upstream_environment_info(
@@ -2392,6 +2552,11 @@ def eval_env(
"(used to locate .prime/.env-metadata.json for upstream resolution)"
),
),
hosted: bool = typer.Option(
False,
"--hosted",
help="Run evaluation on the Prime Intellect platform instead of locally",
),
) -> None:
"""Use 'prime eval' instead."""

@@ -2423,4 +2588,5 @@
env_path=env_path,
endpoints_path=None,
headers=None,
hosted=hosted,
)
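
The hosted path above is built on the new ..utils.hosted_eval helpers (HostedEvalConfig, run_hosted_evaluation, print_hosted_result), which are imported at the top of env.py but are not part of this diff. Below is a minimal sketch of the interface the call sites in run_eval appear to assume; field types, defaults, and the HostedEvalResult class are inferred from usage here, not taken from the real module.

```python
# Hypothetical sketch of the ..utils.hosted_eval module, reconstructed only from
# how run_eval uses it above. HostedEvalResult is an invented name; run_eval only
# reads `.status` from whatever run_hosted_evaluation returns.
from dataclasses import dataclass
from typing import Any, Dict, Optional


@dataclass
class HostedEvalConfig:
    # Mirrors the keyword arguments run_eval passes when it builds the config.
    environment_id: str
    inference_model: str
    num_examples: int = 5
    rollouts_per_example: int = 3
    env_args: Optional[Dict[str, Any]] = None
    name: Optional[str] = None
    timeout_minutes: Optional[int] = None
    allow_sandbox_access: bool = False
    allow_instances_access: bool = False
    custom_secrets: Optional[Dict[str, str]] = None


@dataclass
class HostedEvalResult:
    status: str  # run_eval treats anything other than "COMPLETED" as failure


async def run_hosted_evaluation(
    config: HostedEvalConfig,
    poll_interval: float = 10.0,
    stream_logs: bool = True,
) -> HostedEvalResult:
    """Submit the evaluation, poll its status every `poll_interval` seconds,
    and optionally stream logs until a terminal state (placeholder body)."""
    raise NotImplementedError


def print_hosted_result(result: HostedEvalResult) -> None:
    """Render a human-readable summary of the finished evaluation."""
    print(f"Hosted evaluation finished with status: {result.status}")
```

run_eval drives this with asyncio.run(...) and exits non-zero whenever the returned status is not "COMPLETED".
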
packages/prime/src/prime_cli/commands/evals.py (50 additions, 1 deletion)
@@ -641,13 +641,54 @@ def run_eval_cmd(
"--header",
help="Extra HTTP header for inference API ('Name: Value'). Repeatable.",
),
hosted: bool = typer.Option(
False,
"--hosted",
help="Run evaluation on the platform instead of locally",
),
poll_interval: float = typer.Option(
10.0,
"--poll-interval",
help="Polling interval in seconds for hosted evaluation status",
),
no_stream_logs: bool = typer.Option(
False,
"--no-stream-logs",
help="Disable log streaming for hosted evaluations",
),
timeout_minutes: Optional[int] = typer.Option(
None,
"--timeout-minutes",
help="Timeout in minutes for hosted evaluation (default: 120, max: 1440)",
),
allow_sandbox_access: bool = typer.Option(
False,
"--allow-sandbox-access",
help="Allow sandbox read/write access for hosted evaluations",
),
allow_instances_access: bool = typer.Option(
False,
"--allow-instances-access",
help="Allow pod/instance creation and management for hosted evaluations",
),
custom_secrets: Optional[str] = typer.Option(
None,
"--custom-secrets",
help='Custom secrets for hosted eval as JSON (e.g., \'{"API_KEY": "xxx"}\')',
),
eval_name: Optional[str] = typer.Option(
None,
"--eval-name",
help="Custom name for the hosted evaluation",
),
) -> None:
"""
Run verifiers' vf-eval with Prime Inference (local) or on the platform (--hosted).

Examples:
prime eval run primeintellect/wordle -m openai/gpt-4.1-mini -n 5
prime eval run wordle -m openai/gpt-4.1-mini -n 2 -r 3 -t 1024 -T 0.7
prime eval run primeintellect/gsm8k --hosted -m openai/gpt-4.1-mini -n 10
"""
run_eval(
environment=environment,
@@ -677,4 +718,12 @@ def run_eval_cmd(
env_path=env_path,
endpoints_path=endpoints_path,
headers=header,
hosted=hosted,
poll_interval=poll_interval,
no_stream_logs=no_stream_logs,
timeout_minutes=timeout_minutes,
allow_sandbox_access=allow_sandbox_access,
allow_instances_access=allow_instances_access,
custom_secrets=custom_secrets,
eval_name=eval_name,
)
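
Both --env-args and --custom-secrets arrive as JSON strings on the command line; run_eval parses them with json.loads and passes the resulting dicts into HostedEvalConfig. A small illustration of the expected shapes (the max_turns key is made up for this example; the secrets shape follows the --custom-secrets help text):

```python
import json

# Roughly what the CLI receives after shell quoting, e.g.
#   prime eval run primeintellect/gsm8k --hosted -m openai/gpt-4.1-mini -n 10 \
#       --env-args '{"max_turns": 8}' --custom-secrets '{"API_KEY": "xxx"}'
env_args = '{"max_turns": 8}'            # hypothetical environment argument
custom_secrets = '{"API_KEY": "xxx"}'    # shape from the --custom-secrets help text

parsed_env_args = json.loads(env_args)              # {"max_turns": 8}
parsed_custom_secrets = json.loads(custom_secrets)  # {"API_KEY": "xxx"}

# Malformed JSON is rejected up front, before anything is submitted:
try:
    json.loads("{not valid json}")
except json.JSONDecodeError as e:
    print(f"Error parsing --env-args: {e}")
```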