diff --git a/.changeset/floppy-experts-wash.md b/.changeset/early-hats-read.md similarity index 63% rename from .changeset/floppy-experts-wash.md rename to .changeset/early-hats-read.md index 79a30b391..341c69fba 100644 --- a/.changeset/floppy-experts-wash.md +++ b/.changeset/early-hats-read.md @@ -2,4 +2,4 @@ "@browserbasehq/stagehand": patch --- -remove unnecessary log +improve logging in agent diff --git a/.changeset/fifty-cats-sell.md b/.changeset/fifty-cats-sell.md deleted file mode 100644 index dfc981460..000000000 --- a/.changeset/fifty-cats-sell.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"@browserbasehq/stagehand": minor ---- - -extract links diff --git a/.changeset/fluffy-zoos-joke.md b/.changeset/fluffy-zoos-joke.md new file mode 100644 index 000000000..4485ca6f5 --- /dev/null +++ b/.changeset/fluffy-zoos-joke.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +move extract handler response log to after URL injection diff --git a/.changeset/gold-women-sell.md b/.changeset/gold-women-sell.md new file mode 100644 index 000000000..916342208 --- /dev/null +++ b/.changeset/gold-women-sell.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +export tool function & type to simplify defining custom tools diff --git a/.changeset/vast-vans-crash.md b/.changeset/hungry-lemons-push.md similarity index 55% rename from .changeset/vast-vans-crash.md rename to .changeset/hungry-lemons-push.md index 3fdc06f83..e0197417b 100644 --- a/.changeset/vast-vans-crash.md +++ b/.changeset/hungry-lemons-push.md @@ -2,4 +2,4 @@ "@browserbasehq/stagehand": patch --- -Fixes a redundant unnecessary log +add waitForTimeout to page diff --git a/.changeset/mean-jars-cross.md b/.changeset/mean-jars-cross.md new file mode 100644 index 000000000..5ba161e11 --- /dev/null +++ b/.changeset/mean-jars-cross.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Optimize screenshot handling in agent hybrid mode diff --git a/.changeset/mean-melons-repeat.md 
b/.changeset/mean-melons-repeat.md new file mode 100644 index 000000000..dce1d3f47 --- /dev/null +++ b/.changeset/mean-melons-repeat.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +fix: replaying cached actions (for agent & act) now uses the originally defined model, (instead of default model) when action fails and rerunning inference is needed diff --git a/.changeset/proud-ads-live.md b/.changeset/proud-ads-live.md new file mode 100644 index 000000000..696d67821 --- /dev/null +++ b/.changeset/proud-ads-live.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Recommend hybrid mode over DOM mode in agent, which is now considered legacy diff --git a/.changeset/quick-games-try.md b/.changeset/quick-games-try.md new file mode 100644 index 000000000..da6f7b87b --- /dev/null +++ b/.changeset/quick-games-try.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Add structured output to agent result + ensure close tool is always called diff --git a/.changeset/silent-wolves-sell.md b/.changeset/silent-wolves-sell.md new file mode 100644 index 000000000..8e6e1224b --- /dev/null +++ b/.changeset/silent-wolves-sell.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Fix ControlOrMeta keypress event diff --git a/.changeset/silly-emus-knock.md b/.changeset/silly-emus-knock.md new file mode 100644 index 000000000..8602be6b1 --- /dev/null +++ b/.changeset/silly-emus-knock.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Update agent to only calculate xpath when caching is enabled diff --git a/.changeset/silly-rooms-grow.md b/.changeset/silly-rooms-grow.md new file mode 100644 index 000000000..259b22da5 --- /dev/null +++ b/.changeset/silly-rooms-grow.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +add support for page.waitForSelector() diff --git a/.changeset/solid-rice-admire.md b/.changeset/solid-rice-admire.md deleted file mode 100644 index 2f0291d02..000000000 --- 
a/.changeset/solid-rice-admire.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"@browserbasehq/stagehand": minor ---- - -Added Gemini 2.5 Flash to Google supported models diff --git a/.changeset/strong-ideas-guess.md b/.changeset/strong-ideas-guess.md new file mode 100644 index 000000000..4a632a4b1 --- /dev/null +++ b/.changeset/strong-ideas-guess.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +update agent message handling diff --git a/.changeset/tiny-pens-serve.md b/.changeset/tiny-pens-serve.md new file mode 100644 index 000000000..3b42a8de2 --- /dev/null +++ b/.changeset/tiny-pens-serve.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +add page.snapshot() for capturing a stringified DOM snapshot of the page, including an xpath map & url map diff --git a/.cursorrules b/.cursorrules index fe68bca7b..eb86cb052 100644 --- a/.cursorrules +++ b/.cursorrules @@ -1,140 +1,263 @@ # Stagehand Project -This is a project that uses Stagehand, which amplifies Playwright with `act`, `extract`, and `observe` added to the Page class. +This is a project that uses Stagehand V3, a browser automation framework with AI-powered `act`, `extract`, `observe`, and `agent` methods. -`Stagehand` is a class that provides config, a `StagehandPage` object via `stagehand.page`, and a `StagehandContext` object via `stagehand.context`. +The main class can be imported as `Stagehand` from `@browserbasehq/stagehand`. -`Page` is a class that extends the Playwright `Page` class and adds `act`, `extract`, and `observe` methods. -`Context` is a class that extends the Playwright `BrowserContext` class. +**Key Classes:** -Use the following rules to write code for this project. 
+- `Stagehand`: Main orchestrator class providing `act`, `extract`, `observe`, and `agent` methods +- `context`: A `V3Context` object that manages browser contexts and pages +- `page`: Individual page objects accessed via `stagehand.context.pages()[i]` or created with `stagehand.context.newPage()` -- To take an action on the page like "click the sign in button", use Stagehand `act` like this: +## Initialize ```typescript -await page.act("Click the sign in button"); +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "LOCAL", // or "BROWSERBASE" + verbose: 2, // 0, 1, or 2 + model: "openai/gpt-4.1-mini", // or any supported model +}); + +await stagehand.init(); + +// Access the browser context and pages +const page = stagehand.context.pages()[0]; +const context = stagehand.context; + +// Create new pages if needed +const page2 = await stagehand.context.newPage(); ``` -- To plan an instruction before taking an action, use Stagehand `observe` to get the action to execute. +## Act + +Actions are called on the `stagehand` instance (not the page). 
Use atomic, specific instructions: ```typescript -const [action] = await page.observe("Click the sign in button"); +// Act on the current active page +await stagehand.act("click the sign in button"); + +// Act on a specific page (when you need to target a page that isn't currently active) +await stagehand.act("click the sign in button", { page: page2 }); ``` -- The result of `observe` is an array of `ObserveResult` objects that can directly be used as params for `act` like this: +**Important:** Act instructions should be atomic and specific: - ```typescript - const [action] = await page.observe("Click the sign in button"); - await page.act(action); - ``` +- ✅ Good: "Click the sign in button" or "Type 'hello' into the search input" +- ❌ Bad: "Order me pizza" or "Type in the search bar and hit enter" (multi-step) -- When writing code that needs to extract data from the page, use Stagehand `extract`. Explicitly pass the following params by default: +### Observe + Act Pattern (Recommended) + +Cache the results of `observe` to avoid unexpected DOM changes: + +```typescript +const instruction = "Click the sign in button"; + +// Get candidate actions +const actions = await stagehand.observe(instruction); + +// Execute the first action +await stagehand.act(actions[0]); +``` + +To target a specific page: ```typescript -const { someValue } = await page.extract({ - instruction: the instruction to execute, - schema: z.object({ - someValue: z.string(), - }), // The schema to extract +const actions = await stagehand.observe("select blue as the favorite color", { + page: page2, }); +await stagehand.act(actions[0], { page: page2 }); ``` -## Initialize +## Extract + +Extract data from pages using natural language instructions. The `extract` method is called on the `stagehand` instance. 
+ +### Basic Extraction (with schema) ```typescript -import { Stagehand } from "@browserbasehq/stagehand"; -import StagehandConfig from "./stagehand.config"; +import { z } from "zod/v3"; + +// Extract with explicit schema +const data = await stagehand.extract( + "extract all apartment listings with prices and addresses", + z.object({ + listings: z.array( + z.object({ + price: z.string(), + address: z.string(), + }), + ), + }), +); -const stagehand = new Stagehand(StagehandConfig); -await stagehand.init(); +console.log(data.listings); +``` + +### Simple Extraction (without schema) + +```typescript +// Extract returns a default object with 'extraction' field +const result = await stagehand.extract("extract the sign in button text"); + +console.log(result); +// Output: { extraction: "Sign in" } -const page = stagehand.page; // Playwright Page with act, extract, and observe methods -const context = stagehand.context; // Playwright BrowserContext +// Or destructure directly +const { extraction } = await stagehand.extract( + "extract the sign in button text", +); +console.log(extraction); // "Sign in" ``` -## Act +### Targeted Extraction -You can cache the results of `observe` and use them as params for `act` like this: +Extract data from a specific element using a selector: ```typescript -const instruction = "Click the sign in button"; -const cachedAction = await getCache(instruction); +const reason = await stagehand.extract( + "extract the reason why script injection fails", + z.string(), + { selector: "/html/body/div[2]/div[3]/iframe/html/body/p[2]" }, +); +``` -if (cachedAction) { - await page.act(cachedAction); -} else { - try { - const results = await page.observe(instruction); - await setCache(instruction, results); - await page.act(results[0]); - } catch (error) { - await page.act(instruction); // If the action is not cached, execute the instruction directly - } -} +### URL Extraction + +When extracting links or URLs, use `z.string().url()`: + +```typescript 
+const { links } = await stagehand.extract( + "extract all navigation links", + z.object({ + links: z.array(z.string().url()), + }), +); ``` -Be sure to cache the results of `observe` and use them as params for `act` to avoid unexpected DOM changes. Using `act` without caching will result in more unpredictable behavior. +### Extracting from a Specific Page -Act `action` should be as atomic and specific as possible, i.e. "Click the sign in button" or "Type 'hello' into the search input". -AVOID actions that are more than one step, i.e. "Order me pizza" or "Type in the search bar and hit enter". +```typescript +// Extract from a specific page (when you need to target a page that isn't currently active) +const data = await stagehand.extract( + "extract the placeholder text on the name field", + { page: page2 }, +); +``` -## Extract +## Observe -If you are writing code that needs to extract data from the page, use Stagehand `extract`. +Plan actions before executing them. Returns an array of candidate actions: ```typescript -const signInButtonText = await page.extract("extract the sign in button text"); +// Get candidate actions on the current active page +const [action] = await stagehand.observe("Click the sign in button"); + +// Execute the action +await stagehand.act(action); ``` -You can also pass in params like an output schema in Zod, and a flag to use text extraction: +Observing on a specific page: ```typescript -const data = await page.extract({ - instruction: "extract the sign in button text", - schema: z.object({ - text: z.string(), - }), +// Target a specific page (when you need to target a page that isn't currently active) +const actions = await stagehand.observe("find the next page button", { + page: page2, }); +await stagehand.act(actions[0], { page: page2 }); ``` -`schema` is a Zod schema that describes the data you want to extract. 
To extract an array, make sure to pass in a single object that contains the array, as follows: +## Agent + +Use the `agent` method to autonomously execute complex, multi-step tasks. + +### Basic Agent Usage ```typescript -const data = await page.extract({ - instruction: "extract the text inside all buttons", - schema: z.object({ - text: z.array(z.string()), - }), - useTextExtract: true, // Set true for larger-scale extractions (multiple paragraphs), or set false for small extractions (name, birthday, etc) +const page = stagehand.context.pages()[0]; +await page.goto("https://www.google.com"); + +const agent = stagehand.agent({ + model: "google/gemini-2.0-flash", + executionModel: "google/gemini-2.0-flash", +}); + +const result = await agent.execute({ + instruction: "Search for the stock price of NVDA", + maxSteps: 20, }); + +console.log(result.message); ``` -## Agent +### Computer Use Agent (CUA) -Use the `agent` method to automonously execute larger tasks like "Get the stock price of NVDA" +For more advanced scenarios using computer-use models: ```typescript -// Navigate to a website -await stagehand.page.goto("https://www.google.com"); +const agent = stagehand.agent({ + mode: "cua", // Enable Computer Use Agent mode + model: "anthropic/claude-sonnet-4-20250514", + // or "google/gemini-2.5-computer-use-preview-10-2025" + systemPrompt: `You are a helpful assistant that can use a web browser. 
+ Do not ask follow up questions, the user will trust your judgement.`, +}); + +await agent.execute({ + instruction: "Apply for a library card at the San Francisco Public Library", + maxSteps: 30, +}); +``` +### Agent with Custom Model Configuration + +```typescript const agent = stagehand.agent({ - // You can use either OpenAI or Anthropic - provider: "openai", - // The model to use (claude-3-7-sonnet-20250219 or claude-3-5-sonnet-20240620 for Anthropic) - model: "computer-use-preview", - - // Customize the system prompt - instructions: `You are a helpful assistant that can use a web browser. - Do not ask follow up questions, the user will trust your judgement.`, - - // Customize the API key - options: { - apiKey: process.env.OPENAI_API_KEY, + model: { + modelName: "google/gemini-2.5-computer-use-preview-10-2025", + apiKey: process.env.GEMINI_API_KEY, }, + systemPrompt: `You are a helpful assistant.`, }); +``` -// Execute the agent -await agent.execute( - "Apply for a library card at the San Francisco Public Library" -); +### Agent with Integrations (MCP/External Tools) + +```typescript +const agent = stagehand.agent({ + integrations: [`https://mcp.exa.ai/mcp?exaApiKey=${process.env.EXA_API_KEY}`], + systemPrompt: `You have access to the Exa search tool.`, +}); +``` + +## Advanced Features + +### DeepLocator (XPath Targeting) + +Target specific elements across shadow DOM and iframes: + +```typescript +await page + .deepLocator("/html/body/div[2]/div[3]/iframe/html/body/p") + .highlight({ + durationMs: 5000, + contentColor: { r: 255, g: 0, b: 0 }, + }); +``` + +### Multi-Page Workflows + +```typescript +const page1 = stagehand.context.pages()[0]; +await page1.goto("https://example.com"); + +const page2 = await stagehand.context.newPage(); +await page2.goto("https://example2.com"); + +// Act/extract/observe operate on the current active page by default +// Pass { page } option to target a specific page +await stagehand.act("click button", { page: page1 }); +await 
stagehand.extract("get title", { page: page2 }); ``` diff --git a/.env.example b/.env.example index f7b468d6f..f25a24d6e 100644 --- a/.env.example +++ b/.env.example @@ -9,4 +9,4 @@ ENABLE_CACHING=false EVAL_MODELS="gpt-4o,claude-3-5-sonnet-latest" EXPERIMENTAL_EVAL_MODELS="gpt-4o,claude-3-5-sonnet-latest,o1-mini,o1-preview" EVAL_CATEGORIES="observe,act,combination,extract,experimental" -STAGEHAND_API_URL="http://localhost:80" +AGENT_EVAL_MAX_STEPS=50 \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 000000000..47f85d4d2 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,76 @@ +--- +name: Bug report +about: Detailed descriptions help us resolve faster +title: '' +labels: '' +assignees: '' + +--- + +**Before submitting an issue, please:** + +- [ ] Check the [documentation](https://docs.stagehand.dev/) for relevant information +- [ ] Search existing [issues](https://github.com/browserbase/stagehand/issues) to avoid duplicates + +## Environment Information + +Please provide the following information to help us reproduce and resolve your issue: + +**Stagehand:** + +- Language/SDK: [TypeScript, Python, MCP…] +- Stagehand version: [e.g., 1.0.0] + +**AI Provider:** + +- Provider: [e.g., OpenAI, Anthropic, Azure OpenAI] +- Model: [e.g., gpt-4o, claude-3-7-sonnet-latest] + +## Issue Description + +``` +[Describe the current behavior here] + +``` + +### Steps to Reproduce + +1. +2. +3. 
+ +### Minimal Reproduction Code + +```tsx +// Your minimal reproduction code here +import { Stagehand } from '@browserbasehq/stagehand'; + +const stagehand = new Stagehand({ + // IMPORTANT: include your stagehand config +}); + +// Steps that reproduce the issue + +``` + +### Error Messages / Log trace + +``` +[Paste error messages/logs here] + +``` + +### Screenshots / Videos + +``` +[Attach screenshots or videos here] + +``` + +### Related Issues + +Are there any related issues or PRs? + +- Related to: #[issue number] +- Duplicate of: #[issue number] +- Blocks: #[issue number] diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 000000000..75889eb82 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,23 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: '' +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. 
+ +**Are you willing to contribute to implementing this feature or fix?** + +- [ ] Yes, I can submit a PR +- [ ] Yes, but I need guidance +- [ ] No, I cannot contribute at this time diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 66fdc2dd8..35f415ff1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,147 +7,232 @@ on: - synchronize - labeled - unlabeled + paths-ignore: + - "packages/docs/**" env: - EVAL_MODELS: "gpt-4o,gpt-4o-mini,claude-3-5-sonnet-latest" - EVAL_CATEGORIES: "observe,act,combination,extract,text_extract,targeted_extract" + EVAL_MODELS: "openai/gpt-4.1,google/gemini-2.0-flash,anthropic/claude-haiku-4-5" + EVAL_CATEGORIES: "observe,act,combination,extract,targeted_extract,agent" + EVAL_MAX_CONCURRENCY: 25 + EVAL_TRIAL_COUNT: 3 concurrency: group: ${{ github.ref }} cancel-in-progress: true jobs: + determine-changes: + runs-on: ubuntu-latest + outputs: + core: ${{ steps.filter.outputs.core }} + evals: ${{ steps.filter.outputs.evals }} + docs-only: ${{ steps.filter.outputs.docs-only }} + steps: + - name: Check out repository code + uses: actions/checkout@v4 + + - uses: dorny/paths-filter@v3 + id: filter + with: + filters: | + core: + - '.github/workflows/ci.yml' + - 'packages/core/**' + - 'package.json' + - 'pnpm-lock.yaml' + - 'turbo.json' + evals: + - 'packages/evals/**' + - 'package.json' + - 'pnpm-lock.yaml' + docs-only: + - '**/*.md' + - 'examples/**' + - '!packages/**/*.md' + determine-evals: + needs: [determine-changes] runs-on: ubuntu-latest outputs: + skip-all-evals: ${{ steps.check-labels.outputs.skip-all-evals }} + run-regression: ${{ steps.check-labels.outputs.run-regression }} run-combination: ${{ steps.check-labels.outputs.run-combination }} run-extract: ${{ steps.check-labels.outputs.run-extract }} run-act: ${{ steps.check-labels.outputs.run-act }} run-observe: ${{ steps.check-labels.outputs.run-observe }} - run-text-extract: ${{ steps.check-labels.outputs.run-text-extract }} 
run-targeted-extract: ${{ steps.check-labels.outputs.run-targeted-extract }} + run-agent: ${{ steps.check-labels.outputs.run-agent }} steps: - id: check-labels run: | + # Check if skip-evals label is present + if [[ "${{ contains(github.event.pull_request.labels.*.name, 'skip-evals') }}" == "true" ]]; then + echo "skip-evals label found - skipping all evals" + echo "skip-all-evals=true" >> $GITHUB_OUTPUT + echo "run-regression=false" >> $GITHUB_OUTPUT + echo "run-combination=false" >> $GITHUB_OUTPUT + echo "run-extract=false" >> $GITHUB_OUTPUT + echo "run-act=false" >> $GITHUB_OUTPUT + echo "run-observe=false" >> $GITHUB_OUTPUT + echo "run-targeted-extract=false" >> $GITHUB_OUTPUT + echo "run-agent=false" >> $GITHUB_OUTPUT + exit 0 + fi + + # Skip evals if only docs/examples changed (and not on main) + if [[ "${{ needs.determine-changes.outputs.docs-only }}" == "true" && "${{ needs.determine-changes.outputs.core }}" == "false" && "${{ needs.determine-changes.outputs.evals }}" == "false" && "${{ github.ref }}" != "refs/heads/main" ]]; then + echo "Only docs/examples changed - skipping evals" + echo "skip-all-evals=true" >> $GITHUB_OUTPUT + echo "run-regression=false" >> $GITHUB_OUTPUT + echo "run-combination=false" >> $GITHUB_OUTPUT + echo "run-extract=false" >> $GITHUB_OUTPUT + echo "run-act=false" >> $GITHUB_OUTPUT + echo "run-observe=false" >> $GITHUB_OUTPUT + echo "run-targeted-extract=false" >> $GITHUB_OUTPUT + echo "run-agent=false" >> $GITHUB_OUTPUT + exit 0 + fi + # Default to running all tests on main branch if [[ "${{ github.ref }}" == "refs/heads/main" ]]; then echo "Running all tests for main branch" + echo "skip-all-evals=false" >> $GITHUB_OUTPUT + echo "run-regression=true" >> $GITHUB_OUTPUT echo "run-combination=true" >> $GITHUB_OUTPUT echo "run-extract=true" >> $GITHUB_OUTPUT echo "run-act=true" >> $GITHUB_OUTPUT echo "run-observe=true" >> $GITHUB_OUTPUT - echo "run-text-extract=true" >> $GITHUB_OUTPUT echo "run-targeted-extract=true" >> 
$GITHUB_OUTPUT + echo "run-agent=true" >> $GITHUB_OUTPUT exit 0 fi + # Check for skip-regression-evals label + if [[ "${{ contains(github.event.pull_request.labels.*.name, 'skip-regression-evals') }}" == "true" ]]; then + echo "skip-regression-evals label found - regression evals will be skipped" + echo "run-regression=false" >> $GITHUB_OUTPUT + else + echo "Regression evals will run by default" + echo "run-regression=true" >> $GITHUB_OUTPUT + fi + # Check for specific labels + echo "skip-all-evals=false" >> $GITHUB_OUTPUT echo "run-combination=${{ contains(github.event.pull_request.labels.*.name, 'combination') }}" >> $GITHUB_OUTPUT echo "run-extract=${{ contains(github.event.pull_request.labels.*.name, 'extract') }}" >> $GITHUB_OUTPUT echo "run-act=${{ contains(github.event.pull_request.labels.*.name, 'act') }}" >> $GITHUB_OUTPUT echo "run-observe=${{ contains(github.event.pull_request.labels.*.name, 'observe') }}" >> $GITHUB_OUTPUT - echo "run-text-extract=${{ contains(github.event.pull_request.labels.*.name, 'text-extract') }}" >> $GITHUB_OUTPUT echo "run-targeted-extract=${{ contains(github.event.pull_request.labels.*.name, 'targeted-extract') }}" >> $GITHUB_OUTPUT + echo "run-agent=${{ contains(github.event.pull_request.labels.*.name, 'agent') }}" >> $GITHUB_OUTPUT run-lint: + needs: [determine-changes] + if: needs.determine-changes.outputs.core == 'true' || needs.determine-changes.outputs.evals == 'true' runs-on: ubuntu-latest steps: - name: Check out repository code uses: actions/checkout@v4 + - name: Setup pnpm + uses: pnpm/action-setup@v4 + - name: Set up Node.js uses: actions/setup-node@v4 with: node-version: "20" + cache: "pnpm" - name: Install dependencies - run: | - rm -rf node_modules - rm -f package-lock.json - npm install + run: pnpm install --frozen-lockfile - name: Run Lint - run: npm run lint + run: pnpm run lint run-build: + needs: [determine-changes] + if: needs.determine-changes.outputs.core == 'true' || needs.determine-changes.outputs.evals 
== 'true' runs-on: ubuntu-latest steps: - name: Check out repository code uses: actions/checkout@v4 + - name: Setup pnpm + uses: pnpm/action-setup@v4 + - name: Set up Node.js uses: actions/setup-node@v4 with: node-version: "20" + cache: "pnpm" - name: Install dependencies - run: | - rm -rf node_modules - rm -f package-lock.json - npm install + run: pnpm install --frozen-lockfile - name: Run Build - run: npm run build - - run-e2e-tests: - needs: [run-lint, run-build] - runs-on: ubuntu-latest - timeout-minutes: 50 - env: - HEADLESS: true - steps: - - name: Check out repository code - uses: actions/checkout@v4 + run: pnpm run build - - name: Set up Node.js - uses: actions/setup-node@v4 + - name: Upload build artifacts + uses: actions/upload-artifact@v4 with: - node-version: "20" + name: build-artifacts + path: | + packages/core/dist/** + packages/core/lib/** + retention-days: 1 - - name: Install dependencies - run: | - rm -rf node_modules - rm -f package-lock.json - npm install - - - name: Install Playwright browsers - run: npm exec playwright install --with-deps - - - name: Build Stagehand - run: npm run build - - - name: Run E2E Tests (Deterministic Playwright) - run: npm run e2e + - name: Run Vitest + run: pnpm --filter @browserbasehq/stagehand run test:vitest run-e2e-local-tests: needs: [run-lint, run-build] runs-on: ubuntu-latest timeout-minutes: 50 + if: > + github.event_name == 'push' || + (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} + BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} + BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} HEADLESS: true steps: - name: Check out repository code uses: actions/checkout@v4 + - name: Setup pnpm + uses: pnpm/action-setup@v4 + - name: Set up Node.js uses: 
actions/setup-node@v4 with: node-version: "20" + cache: "pnpm" - - name: Install dependencies + - name: Install stable Chromium run: | - rm -rf node_modules - rm -f package-lock.json - npm install + set -euo pipefail + CHROME_VERSION=$(curl -s https://googlechromelabs.github.io/chrome-for-testing/last-known-good-versions.json | jq -r '.channels.Stable.version') + DOWNLOAD_URL="https://storage.googleapis.com/chrome-for-testing-public/${CHROME_VERSION}/linux64/chrome-linux64.zip" + INSTALL_DIR="${RUNNER_TEMP}/chrome-stable" + mkdir -p "$INSTALL_DIR" + curl -sSL "$DOWNLOAD_URL" -o "$INSTALL_DIR/chrome-linux64.zip" + unzip -q "$INSTALL_DIR/chrome-linux64.zip" -d "$INSTALL_DIR" + CHROME_BIN="$INSTALL_DIR/chrome-linux64/chrome" + chmod +x "$CHROME_BIN" + echo "Installed Chromium version: $CHROME_VERSION" + "$CHROME_BIN" --version + echo "CHROME_PATH=$CHROME_BIN" >> $GITHUB_ENV - - name: Install Playwright browsers - run: npm exec playwright install --with-deps + - name: Install dependencies + run: pnpm install --frozen-lockfile - name: Build Stagehand - run: npm run build + run: pnpm run build - name: Run local E2E Tests (Deterministic Playwright) - run: npm run e2e:local + run: pnpm run e2e:local --log-order=stream run-e2e-bb-tests: needs: [run-lint, run-build] @@ -159,6 +244,7 @@ jobs: env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} HEADLESS: true @@ -166,29 +252,28 @@ jobs: - name: Check out repository code uses: actions/checkout@v4 + - name: Setup pnpm + uses: pnpm/action-setup@v4 + - name: Set up Node.js uses: actions/setup-node@v4 with: node-version: "20" + cache: "pnpm" - name: Install dependencies - run: | - rm -rf node_modules - rm -f package-lock.json - npm install - - - name: Install Playwright browsers - run: npm 
exec playwright install --with-deps + run: pnpm install --frozen-lockfile - name: Build Stagehand - run: npm run build + run: pnpm run build - name: Run E2E Tests (browserbase) - run: npm run e2e:bb + run: pnpm run e2e:bb --log-order=stream run-regression-evals: needs: - [run-e2e-bb-tests, run-e2e-tests, run-e2e-local-tests, determine-evals] + [run-e2e-bb-tests, run-e2e-local-tests, run-build, determine-evals] + if: needs.determine-evals.outputs.skip-all-evals != 'true' && needs.determine-evals.outputs.run-regression == 'true' runs-on: ubuntu-latest timeout-minutes: 9 outputs: @@ -196,6 +281,7 @@ jobs: env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} @@ -205,25 +291,25 @@ jobs: - name: Check out repository code uses: actions/checkout@v4 + - name: Setup pnpm + uses: pnpm/action-setup@v4 + - name: Set up Node.js uses: actions/setup-node@v4 with: node-version: "20" + cache: "pnpm" - name: Install dependencies - run: | - rm -rf node_modules - rm -f package-lock.json - npm install - - - name: Build Stagehand - run: npm run build + run: pnpm install --frozen-lockfile - - name: Install Playwright browsers - run: npm exec playwright install --with-deps + - name: Download build artifacts + uses: actions/download-artifact@v4 + with: + name: build-artifacts - name: Run Regression Evals - run: npm run evals category regression trials=2 concurrency=20 env=BROWSERBASE + run: pnpm run evals category regression trials=2 concurrency=20 env=BROWSERBASE - name: Log Regression Evals Performance run: | @@ -242,12 +328,13 @@ jobs: fi run-combination-evals: - needs: [run-regression-evals, determine-evals] + needs: [run-regression-evals, run-build, determine-evals] runs-on: ubuntu-latest 
timeout-minutes: 40 env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} @@ -267,30 +354,30 @@ jobs: echo "has_label=true" >> $GITHUB_OUTPUT fi + - name: Setup pnpm + if: needs.determine-evals.outputs.run-combination == 'true' + uses: pnpm/action-setup@v4 + - name: Set up Node.js if: needs.determine-evals.outputs.run-combination == 'true' uses: actions/setup-node@v4 with: node-version: "20" + cache: "pnpm" - name: Install dependencies if: needs.determine-evals.outputs.run-combination == 'true' - run: | - rm -rf node_modules - rm -f package-lock.json - npm install + run: pnpm install --frozen-lockfile - - name: Build Stagehand + - name: Download build artifacts if: needs.determine-evals.outputs.run-combination == 'true' - run: npm run build - - - name: Install Playwright browsers - if: needs.determine-evals.outputs.run-combination == 'true' - run: npm exec playwright install --with-deps + uses: actions/download-artifact@v4 + with: + name: build-artifacts - name: Run Combination Evals if: needs.determine-evals.outputs.run-combination == 'true' - run: npm run evals category combination + run: pnpm run evals category combination - name: Log Combination Evals Performance if: needs.determine-evals.outputs.run-combination == 'true' @@ -307,12 +394,13 @@ jobs: fi run-act-evals: - needs: [run-combination-evals, determine-evals] + needs: [run-regression-evals, run-build, determine-evals] runs-on: ubuntu-latest timeout-minutes: 25 env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} BROWSERBASE_API_KEY: ${{ 
secrets.BROWSERBASE_API_KEY }} BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} @@ -332,30 +420,30 @@ jobs: echo "has_label=true" >> $GITHUB_OUTPUT fi + - name: Setup pnpm + if: needs.determine-evals.outputs.run-act == 'true' + uses: pnpm/action-setup@v4 + - name: Set up Node.js if: needs.determine-evals.outputs.run-act == 'true' uses: actions/setup-node@v4 with: node-version: "20" + cache: "pnpm" - name: Install dependencies if: needs.determine-evals.outputs.run-act == 'true' - run: | - rm -rf node_modules - rm -f package-lock.json - npm install - - - name: Build Stagehand - if: needs.determine-evals.outputs.run-act == 'true' - run: npm run build + run: pnpm install --frozen-lockfile - - name: Install Playwright browsers + - name: Download build artifacts if: needs.determine-evals.outputs.run-act == 'true' - run: npm exec playwright install --with-deps + uses: actions/download-artifact@v4 + with: + name: build-artifacts - name: Run Act Evals if: needs.determine-evals.outputs.run-act == 'true' - run: npm run evals category act + run: pnpm run evals category act - name: Log Act Evals Performance if: needs.determine-evals.outputs.run-act == 'true' @@ -375,12 +463,13 @@ jobs: fi run-extract-evals: - needs: [run-act-evals, determine-evals] + needs: [run-regression-evals, run-build, determine-evals] runs-on: ubuntu-latest timeout-minutes: 50 env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} @@ -400,46 +489,37 @@ jobs: echo "has_label=true" >> $GITHUB_OUTPUT fi + - name: Setup pnpm + if: needs.determine-evals.outputs.run-extract == 'true' + uses: pnpm/action-setup@v4 + - name: Set up Node.js if: needs.determine-evals.outputs.run-extract == 'true' uses: 
actions/setup-node@v4 with: node-version: "20" + cache: "pnpm" - name: Install dependencies if: needs.determine-evals.outputs.run-extract == 'true' - run: | - rm -rf node_modules - rm -f package-lock.json - npm install - - - name: Build Stagehand - if: needs.determine-evals.outputs.run-extract == 'true' - run: npm run build + run: pnpm install --frozen-lockfile - - name: Install Playwright browsers + - name: Download build artifacts if: needs.determine-evals.outputs.run-extract == 'true' - run: npm exec playwright install --with-deps + uses: actions/download-artifact@v4 + with: + name: build-artifacts # 1. Run extract category with domExtract - name: Run Extract Evals (domExtract) if: needs.determine-evals.outputs.run-extract == 'true' - run: npm run evals category extract -- --extract-method=domExtract + run: pnpm run evals category extract -- --extract-method=domExtract - name: Save Extract Dom Results if: needs.determine-evals.outputs.run-extract == 'true' run: mv eval-summary.json eval-summary-extract-dom.json - # 2. Then run extract category with textExtract - - name: Run Extract Evals (textExtract) - if: needs.determine-evals.outputs.run-extract == 'true' - run: npm run evals category extract -- --extract-method=textExtract - - - name: Save Extract Text Results - if: needs.determine-evals.outputs.run-extract == 'true' - run: mv eval-summary.json eval-summary-extract-text.json - - # 3. Log and Compare Extract Evals Performance + # 2. 
Log and Compare Extract Evals Performance - name: Log and Compare Extract Evals Performance if: needs.determine-evals.outputs.run-extract == 'true' run: | @@ -448,93 +528,20 @@ jobs: echo "DomExtract Extract category score: $dom_score%" echo "View domExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameDom}" - experimentNameText=$(jq -r '.experimentName' eval-summary-extract-text.json) - text_score=$(jq '.categories.extract' eval-summary-extract-text.json) - echo "TextExtract Extract category score: $text_score%" - echo "View textExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameText}" - # If domExtract <80% fail CI if (( $(echo "$dom_score < 80" | bc -l) )); then echo "DomExtract extract category score is below 80%. Failing CI." exit 1 fi - run-text-extract-evals: - needs: [run-extract-evals, determine-evals] - runs-on: ubuntu-latest - timeout-minutes: 120 - env: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} - BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} - BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} - HEADLESS: true - EVAL_ENV: browserbase - steps: - - name: Check out repository code - uses: actions/checkout@v4 - - - name: Check for 'text-extract' label - id: label-check - run: | - if [ "${{ needs.determine-evals.outputs.run-text-extract }}" != "true" ]; then - echo "has_label=false" >> $GITHUB_OUTPUT - echo "No label for TEXT-EXTRACT. Exiting with success." 
- else - echo "has_label=true" >> $GITHUB_OUTPUT - fi - - - name: Set up Node.js - if: needs.determine-evals.outputs.run-text-extract == 'true' - uses: actions/setup-node@v4 - with: - node-version: "20" - - - name: Install dependencies - if: needs.determine-evals.outputs.run-text-extract == 'true' - run: | - rm -rf node_modules - rm -f package-lock.json - npm install - - - name: Install Playwright browsers - if: needs.determine-evals.outputs.run-text-extract == 'true' - run: npm exec playwright install --with-deps - - - name: Build Stagehand - if: needs.determine-evals.outputs.run-text-extract == 'true' - run: npm run build - - - name: Run text_extract Evals (textExtract) - if: needs.determine-evals.outputs.run-text-extract == 'true' - run: npm run evals category text_extract -- --extract-method=textExtract - - - name: Save text_extract Results - if: needs.determine-evals.outputs.run-text-extract == 'true' - run: mv eval-summary.json eval-summary-text_extract-text.json - - - name: Log text_extract Evals Performance - if: needs.determine-evals.outputs.run-text-extract == 'true' - run: | - experimentNameText=$(jq -r '.experimentName' eval-summary-text_extract-text.json) - text_score=$(jq '.categories.text_extract' eval-summary-text_extract-text.json) - echo "TextExtract text_extract category score: $text_score%" - echo "View textExtract results: https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentNameText}" - - # If text_score <80% fail CI - if (( $(echo "$text_score < 80" | bc -l) )); then - echo "textExtract text_extract category score is below 80%. Failing CI." 
- exit 1 - fi - run-observe-evals: - needs: [run-text-extract-evals, determine-evals] + needs: [run-regression-evals, run-build, determine-evals] runs-on: ubuntu-latest timeout-minutes: 60 env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} @@ -554,30 +561,30 @@ jobs: echo "has_label=true" >> $GITHUB_OUTPUT fi + - name: Setup pnpm + if: needs.determine-evals.outputs.run-observe == 'true' + uses: pnpm/action-setup@v4 + - name: Set up Node.js if: needs.determine-evals.outputs.run-observe == 'true' uses: actions/setup-node@v4 with: node-version: "20" + cache: "pnpm" - name: Install dependencies if: needs.determine-evals.outputs.run-observe == 'true' - run: | - rm -rf node_modules - rm -f package-lock.json - npm install - - - name: Install Playwright browsers - if: needs.determine-evals.outputs.run-observe == 'true' - run: npm exec playwright install --with-deps + run: pnpm install --frozen-lockfile - - name: Build Stagehand + - name: Download build artifacts if: needs.determine-evals.outputs.run-observe == 'true' - run: npm run build + uses: actions/download-artifact@v4 + with: + name: build-artifacts - name: Run Observe Evals if: needs.determine-evals.outputs.run-observe == 'true' - run: npm run evals category observe + run: pnpm run evals category observe - name: Log Observe Evals Performance if: needs.determine-evals.outputs.run-observe == 'true' @@ -597,12 +604,13 @@ jobs: fi run-targeted-extract-evals: - needs: [run-observe-evals, determine-evals] + needs: [run-regression-evals, run-build, determine-evals] runs-on: ubuntu-latest timeout-minutes: 60 env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + 
GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} @@ -622,30 +630,30 @@ jobs: echo "has_label=true" >> $GITHUB_OUTPUT fi + - name: Setup pnpm + if: needs.determine-evals.outputs.run-targeted-extract == 'true' + uses: pnpm/action-setup@v4 + - name: Set up Node.js if: needs.determine-evals.outputs.run-targeted-extract == 'true' uses: actions/setup-node@v4 with: node-version: "20" + cache: "pnpm" - name: Install dependencies if: needs.determine-evals.outputs.run-targeted-extract == 'true' - run: | - rm -rf node_modules - rm -f package-lock.json - npm install + run: pnpm install --frozen-lockfile - - name: Install Playwright browsers + - name: Download build artifacts if: needs.determine-evals.outputs.run-targeted-extract == 'true' - run: npm exec playwright install --with-deps - - - name: Build Stagehand - if: needs.determine-evals.outputs.run-targeted-extract == 'true' - run: npm run build + uses: actions/download-artifact@v4 + with: + name: build-artifacts - name: Run targeted extract Evals if: needs.determine-evals.outputs.run-targeted-extract == 'true' - run: npm run evals category targeted_extract -- --extract-method=textExtract + run: pnpm run evals category targeted_extract - name: Log targeted extract Evals Performance if: needs.determine-evals.outputs.run-targeted-extract == 'true' @@ -663,3 +671,77 @@ jobs: echo "Eval summary not found for targeted_extract category. Failing CI." 
exit 1 fi + + run-agent-evals: + needs: [run-regression-evals, run-build, determine-evals] + runs-on: ubuntu-latest + timeout-minutes: 90 # Agent evals can be long-running + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} + BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} + BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }} + BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }} + HEADLESS: true + EVAL_ENV: browserbase + # Use agent models for agent evals in CI + EVAL_AGENT_MODELS: "computer-use-preview-2025-03-11,claude-3-7-sonnet-latest" + EVAL_TRIAL_COUNT: 2 # Reduce trials for agent evals + EVAL_MAX_CONCURRENCY: 10 # Lower concurrency for agent evals + steps: + - name: Check out repository code + uses: actions/checkout@v4 + + - name: Check for 'agent' label + id: label-check + run: | + if [ "${{ needs.determine-evals.outputs.run-agent }}" != "true" ]; then + echo "has_label=false" >> $GITHUB_OUTPUT + echo "No label for AGENT. Exiting with success." 
+ else + echo "has_label=true" >> $GITHUB_OUTPUT + fi + + - name: Setup pnpm + if: needs.determine-evals.outputs.run-agent == 'true' + uses: pnpm/action-setup@v4 + + - name: Set up Node.js + if: needs.determine-evals.outputs.run-agent == 'true' + uses: actions/setup-node@v4 + with: + node-version: "20" + cache: "pnpm" + + - name: Install dependencies + if: needs.determine-evals.outputs.run-agent == 'true' + run: pnpm install --frozen-lockfile + + - name: Download build artifacts + if: needs.determine-evals.outputs.run-agent == 'true' + uses: actions/download-artifact@v4 + with: + name: build-artifacts + + - name: Run Agent Evals + if: needs.determine-evals.outputs.run-agent == 'true' + run: pnpm run evals category agent + + - name: Log Agent Evals Performance + if: needs.determine-evals.outputs.run-agent == 'true' + run: | + experimentName=$(jq -r '.experimentName' eval-summary.json) + echo "View results at https://www.braintrust.dev/app/Browserbase/p/stagehand/experiments/${experimentName}" + if [ -f eval-summary.json ]; then + agent_score=$(jq '.categories.agent' eval-summary.json) + echo "Agent category score: $agent_score%" + # Lower threshold for agent evals since they're complex + if (( $(echo "$agent_score < 50" | bc -l) )); then + echo "Agent category score is below 50%. Failing CI." + exit 1 + fi + else + echo "Eval summary not found for agent category. Failing CI." 
+ exit 1 + fi diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml new file mode 100644 index 000000000..d51e7c4c5 --- /dev/null +++ b/.github/workflows/claude.yml @@ -0,0 +1,50 @@ +name: Claude Code + +on: + issue_comment: + types: [created] + pull_request_review_comment: + types: [created] + issues: + types: [opened, assigned] + pull_request_review: + types: [submitted] + +jobs: + claude: + if: | + (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || + (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) || + (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) || + (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude'))) + runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write + issues: write + id-token: write + actions: read # Required for Claude to read CI results on PRs + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Run Claude Code + id: claude + uses: anthropics/claude-code-action@v1 + with: + anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} + + # This is an optional setting that allows Claude to read CI results on PRs + additional_permissions: | + actions: read + + # Optional: Give a custom prompt to Claude. If this is not specified, Claude will perform the instructions specified in the comment that tagged it. + # prompt: 'Update the pull request description to include a summary of changes.' 
+ + # Optional: Add claude_args to customize behavior and configuration + # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md + # or https://code.claude.com/docs/en/cli-reference for available options + # claude_args: '--allowed-tools Bash(gh pr:*)' + diff --git a/.github/workflows/feature-parity.yml b/.github/workflows/feature-parity.yml new file mode 100644 index 000000000..9a40db33e --- /dev/null +++ b/.github/workflows/feature-parity.yml @@ -0,0 +1,147 @@ +name: Feature Parity + +on: + pull_request: + types: + - opened + - synchronize + - labeled + - unlabeled + paths-ignore: + - "packages/docs/**" + +jobs: + check-parity-label: + runs-on: ubuntu-latest + if: github.event.action == 'labeled' && github.event.label.name == 'parity' + permissions: + contents: read + pull-requests: write + issues: write + steps: + - name: Check out repository code + uses: actions/checkout@v4 + + - name: Check user permissions + uses: actions/github-script@v7 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const { data: permission } = await github.rest.repos.getCollaboratorPermissionLevel({ + owner: context.repo.owner, + repo: context.repo.repo, + username: context.actor + }); + + const hasWriteAccess = ['admin', 'write'].includes(permission.permission); + + if (!hasWriteAccess) { + // Remove the parity label if user doesn't have write access + await github.rest.issues.removeLabel({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + name: 'parity' + }); + + // Add a comment explaining why the label was removed + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: `❌ **Parity Label Removed**\n\n@${context.actor}, you do not have sufficient permissions to add the 'parity' label. 
Only users with write access can trigger feature parity issues.\n\nIf you believe this feature should be implemented in the Python SDK, please ask a maintainer to add the label.` + }); + + throw new Error(`User ${context.actor} does not have write access to add parity label`); + } + + console.log(`User ${context.actor} has ${permission.permission} access - proceeding with parity workflow`); + + - name: Generate GitHub App token + id: generate-token + uses: actions/create-github-app-token@v1 + with: + app-id: ${{ secrets.PARITY_APP_ID }} + private-key: ${{ secrets.PARITY_APP_PRIVATE_KEY }} + owner: browserbase + repositories: stagehand + + - name: Create issue in Python SDK repository + uses: actions/github-script@v7 + with: + github-token: ${{ steps.generate-token.outputs.token }} + script: | + const { data: pullRequest } = await github.rest.pulls.get({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: context.issue.number, + }); + + // Get PR comments for additional context + const { data: comments } = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + }); + + // Format comments for the issue description + let commentsSection = ''; + if (comments.length > 0) { + commentsSection = '\n\n## Recent Comments\n\n'; + comments.slice(-3).forEach(comment => { + commentsSection += `**@${comment.user.login}** commented:\n`; + commentsSection += `${comment.body.substring(0, 500)}${comment.body.length > 500 ? '...' 
: ''}\n\n`; + }); + } + + // Get list of changed files for context + const { data: files } = await github.rest.pulls.listFiles({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: context.issue.number, + }); + + const changedFiles = files.map(file => `- \`${file.filename}\``).join('\n'); + + const issueTitle = `[Feature Parity] ${pullRequest.title}`; + const issueBody = `## Feature Parity Request + + This issue was automatically created from a pull request in the TypeScript Stagehand repository that was labeled with 'parity'. + + ### Original PR Details + - **PR**: #${context.issue.number} - ${pullRequest.title} + - **Author**: @${pullRequest.user.login} + - **Link**: ${pullRequest.html_url} + + ### Description + ${pullRequest.body || 'No description provided.'} + + ### Changed Files + ${changedFiles} + + ${commentsSection} + + ### Action Required + Please review the changes in the original PR and implement equivalent functionality in the Python SDK if applicable. + + --- + *This issue was automatically generated by the Feature Parity workflow.*`; + + // Create the issue in the Python repository + const { data: issue } = await github.rest.issues.create({ + owner: 'browserbase', + repo: 'stagehand-python', + title: issueTitle, + body: issueBody, + labels: ['parity'] + }); + + console.log(`Created issue: ${issue.html_url}`); + + // Add a comment to the original PR confirming the issue was created + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: `🔄 **Feature Parity Issue Created**\n\nAn issue has been automatically created in the Python SDK repository to track parity implementation:\n${issue.html_url}` + }); diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 00975f83a..f3c5d1100 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -8,6 +8,7 @@ on: permissions: contents: write pull-requests: write 
+ id-token: write concurrency: ${{ github.workflow }}-${{ github.ref }} @@ -17,38 +18,41 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout Repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Setup Node.js 20.x - uses: actions/setup-node@v3 + uses: actions/setup-node@v4 with: node-version: 20.x registry-url: "https://registry.npmjs.org" + # Ensure npm 11.5.1 or later is installed for Trusted Publishing. + - name: Update npm + run: npm install -g npm@latest + - name: Install dependencies run: | rm -rf node_modules - rm -f package-lock.json - npm install + npm install -g pnpm + pnpm install --no-frozen-lockfile - name: Build - run: npm run build + run: pnpm run build - name: Create Release Pull Request or Publish to npm id: changesets uses: changesets/action@v1 with: - publish: npm run release + publish: pnpm run release env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} - name: Publish Canary if: github.ref == 'refs/heads/main' run: | - npm config set //registry.npmjs.org/:_authToken=${NODE_AUTH_TOKEN} git checkout main - npm run release-canary + pnpm run release-canary env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} diff --git a/.github/workflows/stagehand-server-release.yml b/.github/workflows/stagehand-server-release.yml new file mode 100644 index 000000000..08d5fac7d --- /dev/null +++ b/.github/workflows/stagehand-server-release.yml @@ -0,0 +1,307 @@ +name: Release stagehand/server + +on: + push: + branches: + - main + paths: + - packages/server/package.json + workflow_dispatch: + +permissions: + contents: write + +concurrency: ${{ github.workflow }}-${{ github.ref }} + +env: + OAS_PATH: packages/server/openapi.v3.yaml + +jobs: + detect: + name: Detect server version bump + runs-on: ubuntu-latest + outputs: + release: ${{ steps.meta.outputs.release }} + version: ${{ steps.meta.outputs.version }} + tag: ${{ steps.meta.outputs.tag }} + 
steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Determine release metadata + id: meta + shell: bash + run: | + set -euo pipefail + + after_version="$(node -p "require('./packages/server/package.json').version")" + tag="stagehand-server/v${after_version}" + + latest_tag="$(git tag -l 'stagehand-server/v*' --sort=-v:refname | head -n 1 || true)" + latest_version="${latest_tag#stagehand-server/v}" + + release="false" + if [ -z "${latest_tag}" ] || [ "${latest_version}" != "${after_version}" ]; then + release="true" + fi + + echo "release=${release}" >> "$GITHUB_OUTPUT" + echo "version=${after_version}" >> "$GITHUB_OUTPUT" + echo "tag=${tag}" >> "$GITHUB_OUTPUT" + + - name: Create stagehand/server tag + if: steps.meta.outputs.release == 'true' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + shell: bash + run: | + set -euo pipefail + + TAG="${{ steps.meta.outputs.tag }}" + VERSION="${{ steps.meta.outputs.version }}" + TARGET_SHA="${{ github.sha }}" + + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + + git fetch --tags --force + + if git rev-parse -q --verify "refs/tags/${TAG}" >/dev/null; then + echo "Tag already exists: ${TAG}" + exit 0 + fi + + git tag -a "${TAG}" "${TARGET_SHA}" -m "stagehand/server v${VERSION}" + git push origin "${TAG}" + + build_binaries: + name: Build SEA binaries (${{ matrix.binary_name }}) + needs: detect + if: needs.detect.outputs.release == 'true' + strategy: + fail-fast: false + matrix: + include: + - os: ubuntu-latest + platform: linux + arch: x64 + binary_name: stagehand-server-linux-x64 + + - os: ubuntu-latest + platform: linux + arch: arm64 + binary_name: stagehand-server-linux-arm64 + + - os: macos-15 + platform: darwin + arch: arm64 + binary_name: stagehand-server-darwin-arm64 + + - os: macos-15-intel + platform: darwin + arch: x64 + binary_name: stagehand-server-darwin-x64 + + - os: windows-latest + 
platform: win32 + arch: x64 + binary_name: stagehand-server-win32-x64.exe + + - os: windows-latest + platform: win32 + arch: arm64 + binary_name: stagehand-server-win32-arm64.exe + + runs-on: ${{ matrix.os }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup pnpm + uses: pnpm/action-setup@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: 22.x + cache: pnpm + + - name: Install dependencies + env: + PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD: "1" + run: pnpm install --frozen-lockfile + + - name: Build binary (Linux/macOS x64/host arch) + if: matrix.platform != 'win32' && !(matrix.platform == 'linux' && matrix.arch == 'arm64') + shell: bash + run: | + CI=true pnpm --filter @browserbasehq/stagehand-server build:binary + + - name: Build binary (Linux arm64) + if: matrix.platform == 'linux' && matrix.arch == 'arm64' + shell: bash + run: | + set -euo pipefail + + # Intentionally build the core SDK package (not `@browserbasehq/stagehand-server`), + # since the SEA bundle depends on the built SDK output and the server's `build` + # also runs OpenAPI generation (unnecessary for binary packaging here). + pnpm --filter @browserbasehq/stagehand build + cd packages/server + mkdir -p dist/sea + + pnpm exec esbuild src/server.ts \ + --bundle \ + --platform=node \ + --format=cjs \ + --outfile=dist/sea/bundle.cjs \ + --log-level=warning + + node --experimental-sea-config sea-config.json + + BLOB="dist/sea/sea-prep.blob" + if [ ! -f "${BLOB}" ]; then + echo "Missing ${BLOB}; SEA blob generation failed." >&2 + exit 1 + fi + + NODE_VERSION="$(node -p 'process.version')" + NODE_TARBALL="node-${NODE_VERSION}-linux-arm64.tar.xz" + NODE_URL="https://nodejs.org/dist/${NODE_VERSION}/${NODE_TARBALL}" + + curl -fsSL "${NODE_URL}" -o "${NODE_TARBALL}" + tar -xJf "${NODE_TARBALL}" + + NODE_EXE="node-${NODE_VERSION}-linux-arm64/bin/node" + if [ ! 
-f "${NODE_EXE}" ]; then + echo "Missing downloaded Node binary at ${NODE_EXE}" >&2 + exit 1 + fi + + OUT="dist/sea/${{ matrix.binary_name }}" + cp "${NODE_EXE}" "${OUT}" + chmod +x "${OUT}" + + pnpm exec postject "${OUT}" NODE_SEA_BLOB "${BLOB}" \ + --sentinel-fuse NODE_SEA_FUSE_fce680ab2cc467b6e072b8b5df1996b2 + + ls -lh "${OUT}" + + - name: Build binary (Windows) + if: matrix.platform == 'win32' && matrix.arch == 'x64' + shell: powershell + run: | + # Intentionally build the core SDK package (not `@browserbasehq/stagehand-server`), + # since the SEA bundle depends on the built SDK output and the server's `build` + # also runs OpenAPI generation (unnecessary for binary packaging here). + pnpm --filter @browserbasehq/stagehand build + Set-Location packages/server + node -e "require('fs').mkdirSync('dist/sea',{recursive:true})" + pnpm exec esbuild src/server.ts --bundle --platform=node --format=cjs --outfile=dist/sea/bundle.cjs --log-level=warning + node --experimental-sea-config sea-config.json + $blob = "dist/sea/sea-prep.blob" + if (!(Test-Path $blob)) { throw "Missing blob at $blob; SEA blob generation failed." } + Copy-Item (Get-Command node).Source -Destination "dist/sea/${{ matrix.binary_name }}" + pnpm exec postject "dist/sea/${{ matrix.binary_name }}" NODE_SEA_BLOB $blob ` + --sentinel-fuse NODE_SEA_FUSE_fce680ab2cc467b6e072b8b5df1996b2 + + - name: Build binary (Windows arm64) + if: matrix.platform == 'win32' && matrix.arch == 'arm64' + shell: powershell + run: | + # Intentionally build the core SDK package (not `@browserbasehq/stagehand-server`), + # since the SEA bundle depends on the built SDK output and the server's `build` + # also runs OpenAPI generation (unnecessary for binary packaging here). 
+ pnpm --filter @browserbasehq/stagehand build + Set-Location packages/server + node -e "require('fs').mkdirSync('dist/sea',{recursive:true})" + pnpm exec esbuild src/server.ts --bundle --platform=node --format=cjs --outfile=dist/sea/bundle.cjs --log-level=warning + node --experimental-sea-config sea-config.json + + $blob = "dist/sea/sea-prep.blob" + if (!(Test-Path $blob)) { throw "Missing blob at $blob; SEA blob generation failed." } + + $nodeVersion = node -p "process.version" + $zipName = "node-$nodeVersion-win-arm64.zip" + $url = "https://nodejs.org/dist/$nodeVersion/$zipName" + + $tmp = Join-Path $env:RUNNER_TEMP "node-arm64" + Remove-Item -Recurse -Force $tmp -ErrorAction SilentlyContinue + New-Item -ItemType Directory -Force -Path $tmp | Out-Null + + $zipPath = Join-Path $tmp $zipName + Invoke-WebRequest -Uri $url -OutFile $zipPath + Expand-Archive -Path $zipPath -DestinationPath $tmp + + $nodeExe = Join-Path $tmp "node-$nodeVersion-win-arm64\\node.exe" + if (!(Test-Path $nodeExe)) { throw "Missing downloaded Node binary at $nodeExe" } + + Copy-Item $nodeExe -Destination "dist/sea/${{ matrix.binary_name }}" + pnpm exec postject "dist/sea/${{ matrix.binary_name }}" NODE_SEA_BLOB $blob ` + --sentinel-fuse NODE_SEA_FUSE_fce680ab2cc467b6e072b8b5df1996b2 + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.binary_name }} + path: packages/server/dist/sea/${{ matrix.binary_name }} + retention-days: 7 + + release: + name: Publish GitHub Release + needs: [detect, build_binaries] + if: needs.detect.outputs.release == 'true' + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Prepare release assets directory + run: mkdir -p release-assets + + - name: Prepare stagehand/server release assets + run: | + set -euo pipefail + cp "${{ env.OAS_PATH }}" "release-assets/openapi.v3.stagehand-server-${{ needs.detect.outputs.version }}.yaml" + + - name: Download SEA 
binary artifacts + uses: actions/download-artifact@v4 + with: + pattern: stagehand-server-* + path: release-assets + merge-multiple: true + + - name: Create checksums + shell: bash + run: | + set -euo pipefail + cd release-assets + # Only checksum binaries (exclude openapi yaml). Avoid failing if no matches. + shopt -s nullglob + files=(stagehand-server-*) + bins=() + for f in "${files[@]}"; do + [[ "$f" == *openapi* ]] && continue + [[ -f "$f" ]] && bins+=("$f") + done + : > checksums.sha256 + if [ "${#bins[@]}" -gt 0 ]; then + shasum -a 256 "${bins[@]}" > checksums.sha256 + fi + + - name: Publish stagehand/server GitHub release + uses: softprops/action-gh-release@v2 + with: + tag_name: ${{ needs.detect.outputs.tag }} + name: stagehand/server v${{ needs.detect.outputs.version }} + generate_release_notes: true + files: | + release-assets/openapi.v3.stagehand-server-${{ needs.detect.outputs.version }}.yaml + release-assets/stagehand-server-* + release-assets/checksums.sha256 diff --git a/.github/workflows/stainless.yml b/.github/workflows/stainless.yml new file mode 100644 index 000000000..70703c23d --- /dev/null +++ b/.github/workflows/stainless.yml @@ -0,0 +1,60 @@ +name: Build SDKs for pull request + +on: + pull_request: + types: + - opened + - synchronize + - reopened + - closed + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number }} + cancel-in-progress: true + +env: + STAINLESS_ORG: ${{ vars.STAINLESS_ORG }} + STAINLESS_PROJECT: ${{ vars.STAINLESS_PROJECT }} + OAS_PATH: packages/server/openapi.v3.yaml + +jobs: + preview: + if: github.event.action != 'closed' + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 2 + + - name: Run preview builds + uses: stainless-api/upload-openapi-spec-action/preview@v1 + with: + stainless_api_key: ${{ secrets.STAINLESS_API_KEY }} + org: ${{ env.STAINLESS_ORG }} + project: ${{ 
env.STAINLESS_PROJECT }} + oas_path: ${{ env.OAS_PATH }} + config_path: stainless.yml + + merge: + if: github.event.action == 'closed' && github.event.pull_request.merged == true && github.event.pull_request.base.ref == 'main' + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 2 + - name: Run merge build + uses: stainless-api/upload-openapi-spec-action/merge@v1 + with: + stainless_api_key: ${{ secrets.STAINLESS_API_KEY }} + org: ${{ env.STAINLESS_ORG }} + project: ${{ env.STAINLESS_PROJECT }} + oas_path: ${{ env.OAS_PATH }} + config_path: stainless.yml diff --git a/.gitignore b/.gitignore index e5ea06bbe..f9a77a016 100644 --- a/.gitignore +++ b/.gitignore @@ -9,12 +9,20 @@ screenshot.png .env downloads/ dist/ -evals/**/public -lib/dom/build/ -evals/public +.browserbase/ +packages/evals/**/public +packages/core/lib/dom/build/ +packages/core/lib/v3/dom/build/ +packages/evals/public *.tgz evals/playground.ts tmp/ eval-summary.json -pnpm-lock.yaml +package-lock.json evals/deterministic/tests/BrowserContext/tmp-test.har +packages/core/lib/version.ts +packages/core/test-results/ +/examples/inference_summary +/inference_summary +.turbo +.idea diff --git a/.prettierignore b/.prettierignore index 9581fb07d..98ad8477c 100644 --- a/.prettierignore +++ b/.prettierignore @@ -1,3 +1,19 @@ pnpm-lock.yaml README.md -**/*.json \ No newline at end of file +**/*.json +docs/ +.github/ +dist/ +node_modules/ +lib/dom/build/ +lib/v3/dom/build/ +packages/core/dist/ +packages/core/lib/dom/build/ +packages/core/lib/v3/dom/build/ +packages/evals/dist/ +packages/docs/ +*.min.js +.browserbase/ +.browserbase/** +**/.browserbase/ +**/.browserbase/** \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b34a6abc..b37ce4d87 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,222 @@ # @browserbasehq/stagehand +## 3.0.0 + +### Major Changes + +- Removes 
internal Playwright dependency +- A generous 20-40% speed increase across `act`, `extract`, & `observe` calls +- Compatibility with Playwright, Puppeteer, and Patchright +- Automatic action caching (agent, stagehand.act). Go from CUA → deterministic scripts w/o inference +- A suite of non AI primitives: + - `page` + - `locator` (built in closed mode shadow root traversal, with xpaths & css selectors) + - `frameLocator` + - `deepLocator` (crosses iframes & shadow roots) +- bun compatibility +- Simplified extract schemas +- CSS selector support (id-based support coming soon) +- Targeted extract and observe across iframes & shadow roots +- More intuitive type names (observeResult is now action, act accepts an instruction string instead of an action string, solidified ModelConfiguration) + +Check the [migration guide](https://docs.stagehand.dev/v3/migrations/v2) for more information + +## 2.5.0 + +### Minor Changes + +- [#981](https://github.com/browserbase/stagehand/pull/981) [`8244ab2`](https://github.com/browserbase/stagehand/commit/8244ab247cd679962685ae2f7c54e874ce1fa614) Thanks [@sameelarif](https://github.com/sameelarif)! - Added support for `stagehand.agent` to interact with MCP servers as well as custom tools to be passed in. For more information, reference the [MCP integrations documentation](https://docs.stagehand.dev/best-practices/mcp-integrations) + +### Patch Changes + +- [#959](https://github.com/browserbase/stagehand/pull/959) [`09b5e1e`](https://github.com/browserbase/stagehand/commit/09b5e1e9c23c845903686db6665cc968ac34efbb) Thanks [@filip-michalsky](https://github.com/filip-michalsky)! - add webvoyager evals + +- [#1049](https://github.com/browserbase/stagehand/pull/1049) [`e3734b9`](https://github.com/browserbase/stagehand/commit/e3734b9c98352d5f0a4eca49791b0bbf2130ab41) Thanks [@miguelg719](https://github.com/miguelg719)! 
- Support local MCP server connections + +- [#1025](https://github.com/browserbase/stagehand/pull/1025) [`be85b19`](https://github.com/browserbase/stagehand/commit/be85b19679a826f19702e00f0aae72fce1118ec8) Thanks [@tkattkat](https://github.com/tkattkat)! - add support for custom baseUrl within openai provider + +- [#1040](https://github.com/browserbase/stagehand/pull/1040) [`88d1565`](https://github.com/browserbase/stagehand/commit/88d1565c65bb65a104fea2d5f5e862bbbda69677) Thanks [@miguelg719](https://github.com/miguelg719)! - Allow OpenAI CUA to take in an optional baseURL + +- [#1046](https://github.com/browserbase/stagehand/pull/1046) [`ab5d6ed`](https://github.com/browserbase/stagehand/commit/ab5d6ede19aabc059badc4247f1cb2c6c9e71bae) Thanks [@tkattkat](https://github.com/tkattkat)! - Add support for gpt-5 in operator agent + +## 2.4.4 + +### Patch Changes + +- [#1012](https://github.com/browserbase/stagehand/pull/1012) [`9e8c173`](https://github.com/browserbase/stagehand/commit/9e8c17374fdc8fbe7f26e6cf802c36bd14f11039) Thanks [@miguelg719](https://github.com/miguelg719)! - Fix disabling api validation whenever a customLLM client is provided + +## 2.4.3 + +### Patch Changes + +- [#951](https://github.com/browserbase/stagehand/pull/951) [`f45afdc`](https://github.com/browserbase/stagehand/commit/f45afdccc8680650755fee66ffbeac32b41e075d) Thanks [@miguelg719](https://github.com/miguelg719)! - Patch GPT-5 new api format + +- [#954](https://github.com/browserbase/stagehand/pull/954) [`261bba4`](https://github.com/browserbase/stagehand/commit/261bba43fa79ac3af95328e673ef3e9fced3279b) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add support for shadow DOMs (open & closed mode) when experimental: true + +- [#944](https://github.com/browserbase/stagehand/pull/944) [`8de7bd8`](https://github.com/browserbase/stagehand/commit/8de7bd8635c2051cd8025e365c6c8aa83d81c7e7) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- Bump zod version compatibility and add pathing spec + +- [#919](https://github.com/browserbase/stagehand/pull/919) [`3d80421`](https://github.com/browserbase/stagehand/commit/3d804210a106a6828c7fa50f8b765b10afd4cc6a) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - enable scrolling inside of iframes + +- [#963](https://github.com/browserbase/stagehand/pull/963) [`0ead63d`](https://github.com/browserbase/stagehand/commit/0ead63d6526f6c286362b74b6407c8bebc900e69) Thanks [@tkattkat](https://github.com/tkattkat)! - Properly handle images in evaluator + clean up response parsing logic + +- [#961](https://github.com/browserbase/stagehand/pull/961) [`8422828`](https://github.com/browserbase/stagehand/commit/8422828c4cd5fd5ebcf348cfbdb40c768bb76dd9) Thanks [@tkattkat](https://github.com/tkattkat)! - Add more evals for stagehand agent + +- [#946](https://github.com/browserbase/stagehand/pull/946) [`b769206`](https://github.com/browserbase/stagehand/commit/b7692060f98a2f49aeeefb90d8789ed034b08ec2) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix: unable to act on/get content from some same process iframes + +- [#962](https://github.com/browserbase/stagehand/pull/962) [`72d2683`](https://github.com/browserbase/stagehand/commit/72d2683202af7e578d98367893964b33e0828de5) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - handle namespaced elements in xpath build step + +## 2.4.2 + +### Patch Changes + +- [#865](https://github.com/browserbase/stagehand/pull/865) [`6b4e6e3`](https://github.com/browserbase/stagehand/commit/6b4e6e3f31d5496cf15728e9018eddeb04839542) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - improve type safety for trimTrailingTextNode + +- [#897](https://github.com/browserbase/stagehand/pull/897) [`e77d018`](https://github.com/browserbase/stagehand/commit/e77d0188683ebf596dfb78dfafbbca1dc32993f0) Thanks [@miguelg719](https://github.com/miguelg719)! 
- Fix selfHeal to remember initially received arguments + +- [#920](https://github.com/browserbase/stagehand/pull/920) [`c20adb9`](https://github.com/browserbase/stagehand/commit/c20adb95539fed8c56a4aa413262a9c65a8e6474) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix: tab handling on API + +- [#882](https://github.com/browserbase/stagehand/pull/882) [`b86df93`](https://github.com/browserbase/stagehand/commit/b86df93b9136aae96292121a29c25f3d74d84bf7) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - remove elements that don't have xpaths from observe response + +- [#905](https://github.com/browserbase/stagehand/pull/905) [`023c2c2`](https://github.com/browserbase/stagehand/commit/023c2c273b46d3792d7e5d3c902089487b16b531) Thanks [@tkattkat](https://github.com/tkattkat)! - Delete old images from anthropic cua client + +- [#925](https://github.com/browserbase/stagehand/pull/925) [`8c28647`](https://github.com/browserbase/stagehand/commit/8c2864755ecd05c8f7de235d4198deec0dd5f78e) Thanks [@miguelg719](https://github.com/miguelg719)! - Remove \_refreshPageFromApi() + +- [#887](https://github.com/browserbase/stagehand/pull/887) [`87e09c6`](https://github.com/browserbase/stagehand/commit/87e09c618940f364ec8af00455a19a17ec63cbd3) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix: allow xpaths with prepended 'xpath=' for targeted extract + +- [#864](https://github.com/browserbase/stagehand/pull/864) [`a611115`](https://github.com/browserbase/stagehand/commit/a61111525d70b450bdfc43f112380f44899c9e97) Thanks [@miguelg719](https://github.com/miguelg719)! - Temporarily patch custom clients serialization error on api + +- [#881](https://github.com/browserbase/stagehand/pull/881) [`69913fe`](https://github.com/browserbase/stagehand/commit/69913fe1dfb8201ae2aeffa5f049fb46ab02cbc2) Thanks [@miguelg719](https://github.com/miguelg719)!
- Pass sdk version number to API for debugging + +- [#913](https://github.com/browserbase/stagehand/pull/913) [`b1b83a1`](https://github.com/browserbase/stagehand/commit/b1b83a1d334fe76e5f5f9dd32dc92c16b7d40ce6) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - move iframe out of 'experimental' + +- [#891](https://github.com/browserbase/stagehand/pull/891) [`be8497c`](https://github.com/browserbase/stagehand/commit/be8497cb6b142cc893cea9692b8c47bd19514c60) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix: nested iframe xpath bug + +- [#883](https://github.com/browserbase/stagehand/pull/883) [`98704c9`](https://github.com/browserbase/stagehand/commit/98704c9ed225ca25bbde4bb3dc286936e9c54471) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - add timeout for JS click + +- [#907](https://github.com/browserbase/stagehand/pull/907) [`04978bd`](https://github.com/browserbase/stagehand/commit/04978bdd30d2edcbc69eb9fd91358a16975ea2eb) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - store mapping of CDP frame ID -> page + +## 2.4.1 + +### Patch Changes + +- [#856](https://github.com/browserbase/stagehand/pull/856) [`8a43c5a`](https://github.com/browserbase/stagehand/commit/8a43c5a86d4da40cfaedd9cf2e42186928bdf946) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - set download behaviour by default + +- [#857](https://github.com/browserbase/stagehand/pull/857) [`890ffcc`](https://github.com/browserbase/stagehand/commit/890ffccac5e0a60ade64a46eb550c981ffb3e84a) Thanks [@miguelg719](https://github.com/miguelg719)! - return "not-supported" for elements inside the shadow-dom + +- [#844](https://github.com/browserbase/stagehand/pull/844) [`64c1072`](https://github.com/browserbase/stagehand/commit/64c10727bda50470483a3eb175c02842db0923a1) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- don't automatically close tabs + +- [#860](https://github.com/browserbase/stagehand/pull/860) [`b077d3f`](https://github.com/browserbase/stagehand/commit/b077d3f48a97f47a71ccc79ae39b41e7f07f9c04) Thanks [@miguelg719](https://github.com/miguelg719)! - Set default schema on extract options with no schema + +- [#842](https://github.com/browserbase/stagehand/pull/842) [`8bcb5d7`](https://github.com/browserbase/stagehand/commit/8bcb5d77debf6bf7601fd5c090efd7fde75c5d5e) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - improved handling for OS level dropdowns + +- [#846](https://github.com/browserbase/stagehand/pull/846) [`7bf10c5`](https://github.com/browserbase/stagehand/commit/7bf10c55b267078fe847c1d7f7a60d604f9c7c94) Thanks [@miguelg719](https://github.com/miguelg719)! - Filter attaching to target worker / shared_worker + +## 2.4.0 + +### Minor Changes + +- [#819](https://github.com/browserbase/stagehand/pull/819) [`6a18c1e`](https://github.com/browserbase/stagehand/commit/6a18c1ee1e46d55c6e90c4d5572e17ed8daa140c) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - try playwright click and fall back to JS click event + +### Patch Changes + +- [#826](https://github.com/browserbase/stagehand/pull/826) [`124e0d3`](https://github.com/browserbase/stagehand/commit/124e0d3bb54ddb6738ede6d7aa99a945ef1cacd1) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix issue where we are unable to take actions on text nodes + +- [#818](https://github.com/browserbase/stagehand/pull/818) [`1660751`](https://github.com/browserbase/stagehand/commit/1660751cd14cb5b27d44f8167216afb8d1c3c45c) Thanks [@miguelg719](https://github.com/miguelg719)! - Added CUA support for Claude 4 models + +- [#821](https://github.com/browserbase/stagehand/pull/821) [`cadac9d`](https://github.com/browserbase/stagehand/commit/cadac9da09123d12e5d496a0e8b12660964c1b33) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- use playwright instead of playwright test + +- [#832](https://github.com/browserbase/stagehand/pull/832) [`759da55`](https://github.com/browserbase/stagehand/commit/759da55775eb2df81d56ae18c0f386fd9b02a9f0) Thanks [@miguelg719](https://github.com/miguelg719)! - Fix \_refreshPageFromAPI to use parametrized apiKey + +- [#810](https://github.com/browserbase/stagehand/pull/810) [`a175a51`](https://github.com/browserbase/stagehand/commit/a175a519b8c14300db6f1ed30709e113d18e99db) Thanks [@miguelg719](https://github.com/miguelg719)! - Update logos + +- [#822](https://github.com/browserbase/stagehand/pull/822) [`8527a80`](https://github.com/browserbase/stagehand/commit/8527a80522c3eedb9516a6caa1a0e4e4be981a3d) Thanks [@miguelg719](https://github.com/miguelg719)! - Add model with date tag for OpenAI CUA + +- [#833](https://github.com/browserbase/stagehand/pull/833) [`55fca2f`](https://github.com/browserbase/stagehand/commit/55fca2f7da63cc0ef6e27b45a33f63c666cdce7e) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - adjust stagehandLogger.warn() level to be 1 instead of 0 + +## 2.3.1 + +### Patch Changes + +- [#796](https://github.com/browserbase/stagehand/pull/796) [`12a99b3`](https://github.com/browserbase/stagehand/commit/12a99b398d8a4c3eea3ca69a3cf793faaaf4aea3) Thanks [@miguelg719](https://github.com/miguelg719)! - Added an experimental flag to enable the newest and most experimental features + +- [#807](https://github.com/browserbase/stagehand/pull/807) [`2451797`](https://github.com/browserbase/stagehand/commit/2451797f64c0efa4a72fd70265110003c8d0a6cd) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - include version number in StagehandDefaultError message + +- [#803](https://github.com/browserbase/stagehand/pull/803) [`1d631a5`](https://github.com/browserbase/stagehand/commit/1d631a57a197390f672b718ae5199991ab27cfb1) Thanks [@miguelg719](https://github.com/miguelg719)!
- Enable session affinity for cache optimization + +- [#804](https://github.com/browserbase/stagehand/pull/804) [`9c398bb`](https://github.com/browserbase/stagehand/commit/9c398bb9ec2d10bdb53ad5aa7e3b58cce24fdb2b) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - update operatorResponseSchema based on new openai spec + +- [#786](https://github.com/browserbase/stagehand/pull/786) [`c19ad7f`](https://github.com/browserbase/stagehand/commit/c19ad7f1e082e91fdeaa9c2ef63767a5a2b3a195) Thanks [@miguelg719](https://github.com/miguelg719)! - Handle reroute to account for rollout + +## 2.3.0 + +### Minor Changes + +- [#737](https://github.com/browserbase/stagehand/pull/737) [`6ef6073`](https://github.com/browserbase/stagehand/commit/6ef60730cab0ad9025f44b6eeb2c83751d1dcd35) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - deprecate useTextExtract and remove functionality + +### Patch Changes + +- [#741](https://github.com/browserbase/stagehand/pull/741) [`5680d25`](https://github.com/browserbase/stagehand/commit/5680d2509352c383ad502c9f4fabde01fa638833) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - use safeparse for zod validation + +- [#783](https://github.com/browserbase/stagehand/pull/783) [`4de92a8`](https://github.com/browserbase/stagehand/commit/4de92a8af461fc95063faf39feee1d49259f58ba) Thanks [@miguelg719](https://github.com/miguelg719)! - Fix the readme logo link + +## 2.2.1 + +### Patch Changes + +- [#721](https://github.com/browserbase/stagehand/pull/721) [`be8652e`](https://github.com/browserbase/stagehand/commit/be8652e770b57fdb3299fa0b2efa4eb0e816434e) Thanks [@miguelg719](https://github.com/miguelg719)! - Fix stagehand.close() functionality to include calling browser.close() + +- [#724](https://github.com/browserbase/stagehand/pull/724) [`6b413b7`](https://github.com/browserbase/stagehand/commit/6b413b7ad00b13ca0bd53ee2e7393023821408b6) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- rm refine step in extract + +- [#712](https://github.com/browserbase/stagehand/pull/712) [`7eafbd9`](https://github.com/browserbase/stagehand/commit/7eafbd9b1a73b37effa444929767df7c592caf02) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - deprecated `onlyVisible` param and remove its functionality + +- [#725](https://github.com/browserbase/stagehand/pull/725) [`1b50aa6`](https://github.com/browserbase/stagehand/commit/1b50aa61cf0a429dd6cb2760a08f7f698a50454b) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - dont overwrite .describe() when user defines a zod schema with z.string().url().describe() + +- [#717](https://github.com/browserbase/stagehand/pull/717) [`f2b7f1f`](https://github.com/browserbase/stagehand/commit/f2b7f1f284eef1f96753319b66c7d0b273a6f8cd) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - don't publish uncompiled ts to npm + +- [#719](https://github.com/browserbase/stagehand/pull/719) [`c8d672f`](https://github.com/browserbase/stagehand/commit/c8d672f7c410c256defbc2e87ead99239837aa28) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - fix `Invalid schema for response_format` error when extracting links + +- [#722](https://github.com/browserbase/stagehand/pull/722) [`bebf204`](https://github.com/browserbase/stagehand/commit/bebf2044502333c694743078c5b0c9deae11fb79) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - replace NBSP with regular space & remove special characters from dom+a11y tree + +- [#714](https://github.com/browserbase/stagehand/pull/714) [`37d6810`](https://github.com/browserbase/stagehand/commit/37d6810a704773d0383a86f98f5f17c7d5b21975) Thanks [@miguelg719](https://github.com/miguelg719)! 
- Fix the native AI SDK client implementation to optionally take in an API key + +## 2.2.0 + +### Minor Changes + +- [#655](https://github.com/browserbase/stagehand/pull/655) [`8814af9`](https://github.com/browserbase/stagehand/commit/8814af9ece99fddc3dd9fb32671d0513a3a00c67) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - extract links + +- [#675](https://github.com/browserbase/stagehand/pull/675) [`35c55eb`](https://github.com/browserbase/stagehand/commit/35c55ebf6c2867801a0a6f6988a883c8cb90cf9a) Thanks [@tkattkat](https://github.com/tkattkat)! - Added Gemini 2.5 Flash to Google supported models + +- [#668](https://github.com/browserbase/stagehand/pull/668) [`5c6d2cf`](https://github.com/browserbase/stagehand/commit/5c6d2cf89c9fbf198485506ed9ed75e07aec5cd4) Thanks [@miguelg719](https://github.com/miguelg719)! - Added a new class - Stagehand Evaluator - that wraps around a Stagehand object to determine whether a task is successful or not. Currently used for agent evals + +### Patch Changes + +- [#706](https://github.com/browserbase/stagehand/pull/706) [`18ac6fb`](https://github.com/browserbase/stagehand/commit/18ac6fba30f45b7557cecb890f4e84c75de8383c) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - remove unused fillInVariables fn + +- [#692](https://github.com/browserbase/stagehand/pull/692) [`6b95248`](https://github.com/browserbase/stagehand/commit/6b95248d6e02e5304ce4dd60499e31fc42af57eb) Thanks [@miguelg719](https://github.com/miguelg719)! - Updated the list of OpenAI models (4.1, o3...) + +- [#688](https://github.com/browserbase/stagehand/pull/688) [`7d81b3c`](https://github.com/browserbase/stagehand/commit/7d81b3c951c1f3dfc46845aefcc26ff175299bca) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! 
- wrap page.evaluate to make sure we have injected browser side scripts before calling them + +- [#664](https://github.com/browserbase/stagehand/pull/664) [`b5ca00a`](https://github.com/browserbase/stagehand/commit/b5ca00a25ad0c33a5f4d3198e1bc59edb9956e7c) Thanks [@miguelg719](https://github.com/miguelg719)! - remove unnecessary log + +- [#683](https://github.com/browserbase/stagehand/pull/683) [`8f0f97b`](https://github.com/browserbase/stagehand/commit/8f0f97bc491e23ff0078c802aaf509fd04173c37) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - use javascript click instead of playwright + +- [#705](https://github.com/browserbase/stagehand/pull/705) [`346ef5d`](https://github.com/browserbase/stagehand/commit/346ef5d0132dc1418dac18d26640a8df0435af57) Thanks [@miguelg719](https://github.com/miguelg719)! - Fixed removing a hanging observation map that is no longer used + +- [#698](https://github.com/browserbase/stagehand/pull/698) [`c145bc1`](https://github.com/browserbase/stagehand/commit/c145bc1d90ffd0d71c412de3af1c26c121e0b101) Thanks [@sameelarif](https://github.com/sameelarif)! - Fixing LLM client support to natively integrate with AI SDK + +- [#687](https://github.com/browserbase/stagehand/pull/687) [`edd6d3f`](https://github.com/browserbase/stagehand/commit/edd6d3feb47aac9f312a5edad78bf850ae1541db) Thanks [@miguelg719](https://github.com/miguelg719)! - Fixed the schema input for Gemini's response model + +- [#678](https://github.com/browserbase/stagehand/pull/678) [`5ec43d8`](https://github.com/browserbase/stagehand/commit/5ec43d8b9568c0f86b3e24bd83d1826c837656ed) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - allow form filling when form is not top-most element + +- [#694](https://github.com/browserbase/stagehand/pull/694) [`b8cc164`](https://github.com/browserbase/stagehand/commit/b8cc16405b712064a54c8cd591750368a47f35ea) Thanks [@seanmcguire12](https://github.com/seanmcguire12)!
- add telemetry for cua agents to stagehand.metrics + +- [#699](https://github.com/browserbase/stagehand/pull/699) [`d9f4243`](https://github.com/browserbase/stagehand/commit/d9f4243f6a8c8d4f3003ad6589f7eb4da6d23d0f) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - rm deprecated primitives from stagehand object + +- [#710](https://github.com/browserbase/stagehand/pull/710) [`9f4ab76`](https://github.com/browserbase/stagehand/commit/9f4ab76a0c1f0c2171290765c48c3bcea5b50e0f) Thanks [@seanmcguire12](https://github.com/seanmcguire12)! - support targeted extract for domExtract + +- [#677](https://github.com/browserbase/stagehand/pull/677) [`bc5a731`](https://github.com/browserbase/stagehand/commit/bc5a731241f7f4c5040dd672d8e3787555766421) Thanks [@miguelg719](https://github.com/miguelg719)! - Fixes a redundant unnecessary log + ## 2.1.0 ### Minor Changes diff --git a/README.md b/README.md index 788d6c073..56dda5f6d 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,29 @@ -
-
+ +
+ +
+ + ) +} \ No newline at end of file diff --git a/packages/docs/snippets/v3-banner.mdx b/packages/docs/snippets/v3-banner.mdx new file mode 100644 index 000000000..01ca5b521 --- /dev/null +++ b/packages/docs/snippets/v3-banner.mdx @@ -0,0 +1,10 @@ +{/* + V3Banner - Currently a no-op component + + This component is imported across 50+ pages in v3 docs. + Keeping it as a no-op rather than removing allows us to easily + add a new banner message in the future without editing every file. + + To add a banner, replace the null return with your JSX content. +*/} +export const V3Banner = () => null; diff --git a/packages/docs/v2/basics/act.mdx b/packages/docs/v2/basics/act.mdx new file mode 100644 index 000000000..4ac521be0 --- /dev/null +++ b/packages/docs/v2/basics/act.mdx @@ -0,0 +1,505 @@ +--- +title: Act +description: 'Interact with a web page' +--- + +## What is `act()`? +``` typescript +page.act("click on add to cart") +``` +`act` enables Stagehand to perform **individual** actions on a web page. Use it to build self-healing and deterministic automations that adapt to website changes. + +## Why use `act()`? + + + + Write automation in plain English. No selectors or complex syntax. + + + Build automations step by step. Define exactly what happens at every moment. + + + Actions automatically adapt when websites change. + + + Cache actions to avoid LLM calls and ensure consistent execution across runs. + + + +## Using `act()` + +Use `act` to perform single actions in your automation. Here's how to click a button: + + +```typescript TypeScript +await page.goto("https://example-store.com"); +await page.act("click the add to cart button"); +``` + +```python Python +await page.goto("https://example-store.com") +await page.act("click the add to cart button") +``` + + +With `act`, breaking complex actions into small, single-step actions works best. If you need to orchestrate multi-step flows, use multiple `act` commands or `agent`. 
+ + + +| Action | Example instruction | +|--------|---------------------| +| Click | `click the button` | +| Fill | `fill the field with ` | +| Type | `type into the search box` | +| Press | `press in the search field` | +| Scroll | `scroll to ` | +| Select from dropdown | `select from the dropdown` | + + + + +Break your task into single-step actions. + + +```typescript TypeScript +// Break it into single-step actions +await page.act("open the filters panel"); +await page.act("choose 4-star rating"); +await page.act("click the apply button"); +``` + +```python Python +# Break it into single-step actions +await page.act("open the filters panel") +await page.act("choose 4-star rating") +await page.act("click the apply button") +``` + + + + +For multi-step tasks, use [`agent()`](/v2/basics/agent) instead. + + +```typescript TypeScript +// Too complex - trying to do multiple things at once +await page.act("open the filters panel, choose 4-star rating, and click apply"); +``` + +```python Python +# Too complex - trying to do multiple things at once +await page.act("open the filters panel, choose 4-star rating, and click apply") +``` + + + + +### Advanced Configuration + +For advanced scenarios, you can configure additional options: + + +```typescript TypeScript +// Dynamic food search with advanced options +const foodItem = "organic quinoa"; + +await page.act({ + action: "Type %foodItem% in the search box and press enter", + variables: { + foodItem: foodItem + }, + modelName: "google/gemini-2.5-pro", + modelClientOptions: { + modelApiKey: process.env.GOOGLE_API_KEY, + }, + iframes: true, // Search within iframes if needed + domSettleTimeoutMs: 45000, // Wait longer for dynamic content + timeoutMs: 60000 // Extended timeout for slow-loading forms +}); +``` + +```python Python +# Dynamic food search with advanced options +food_item = "organic quinoa" + +await page.act({ + "action": "Type %foodItem% in the search box and press enter", + "variables": { + "foodItem": 
food_item + }, + "modelName": "google/gemini-2.5-pro", + "modelClientOptions": { + "modelApiKey": os.environ.get("GOOGLE_API_KEY") + }, + "iframes": True, # Search within iframes if needed + "domSettleTimeoutMs": 45000, # Wait longer for dynamic content + "timeoutMs": 60000 # Extended timeout for slow-loading forms +}) +``` + + + +Shadow DOM support is now available! Set `experimental: true` in your Stagehand configuration to enable it. See the [configuration guide](/v2/configuration/browser) for more details. + + + + + + +## Best practices + +### Ensure reliable actions + +Use `observe()` to discover candidate actions on the current page and plan reliably. It returns a list of suggested actions (with selector, description, method, and arguments). You can pass an observed action directly to `act` to execute it. + + +```typescript TypeScript +const [action] = await page.observe("click the login button"); + +if (action) { + await page.act(action); +} +``` + +```python Python +results = await page.observe("click the login button") + +if results: + await page.act(results[0]) +``` + + + + Plan actions with `observe()` before executing with `act`. + + +### Reduce model costs + +Cache observed actions to avoid repeated LLM calls and ensure consistent execution. 
+ + +```typescript TypeScript +// Cost-optimized actions with caching +const actionCache = new Map(); + +const getCachedAction = async (instruction: string) => { + if (actionCache.has(instruction)) { + return actionCache.get(instruction); + } + + const [action] = await page.observe(instruction); + actionCache.set(instruction, action); + return action; +}; + +// Reuse cached actions +const loginAction = await getCachedAction("click the login button"); +await page.act(loginAction); +``` + +```python Python +# Cost-optimized actions with caching +action_cache = {} + +async def get_cached_action(instruction: str): + if instruction in action_cache: + return action_cache[instruction] + + results = await page.observe(instruction) + if results: + action = results[0] + action_cache[instruction] = action + return action + + return None + +# Reuse cached actions +login_action = await get_cached_action("click the login button") +if login_action: + await page.act(login_action) +``` + + + + Learn advanced caching techniques and patterns for optimal performance. + + +### Secure your automations + +Variables will not be shared with LLM providers. Use them for passwords, API keys, and other sensitive data. + + + +Load sensitive data from environment variables using `.env` files. Never hardcode API keys, passwords, or other secrets directly in your code. 
+ + + +```typescript TypeScript +await page.act({ + action: "enter %username% in the email field", + variables: { + username: "user@example.com" + } +}); + +await page.act({ + action: "enter %password% in the password field", + variables: { + password: process.env.USER_PASSWORD + } +}); +``` + +```python Python +# If using Python, set `use_api: true` in your Stagehand configuration + +await page.act( + "enter %username% in the email field", + variables={ + "username": "user@example.com" + } +) + +await page.act( + "enter %password% in the password field", + variables={ + "password": os.environ.get("USER_PASSWORD") + } +) +``` + + + +When handling sensitive data, set `verbose: 0` in your Stagehand configuration to prevent secrets from appearing in logs. See the [configuration guide](/v2/configuration/browser) for more details. + + + + Complete guide to securing your browser automations with best practices and configurations. + + +## Troubleshooting + + + + + +**Problem**: `act` fails with "method not supported" error + +**Solutions**: +- Use clear and detailed instructions for what you want to accomplish +- Review our [evals](https://stagehand.dev/evals) to find the best models for your use case +- Use [`observe()`](/v2/basics/observe) and verify the resulting action is within a list of expected actions + +**Solution 1: Validate with observe** + + +```typescript TypeScript +const prompt = "click the submit button"; +const expectedMethod = "click"; + +try { + await page.act(prompt); +} catch (error) { + if (error.message.includes("method not supported")) { + // Observe the same prompt to get the planned action + const [action] = await page.observe(prompt); + + if (action && action.method === expectedMethod) { + await page.act(action); + } else { + throw new Error(`Unsupported method: expected "${expectedMethod}", got "${action?.method}"`); + } + } else { + throw error; + } +} +``` + +```python Python +prompt = "click the submit button" +expected_method = "click" + 
+try: + await page.act(prompt) +except Exception as error: + if "method not supported" in str(error): + # Observe the same prompt to get the planned action + results = await page.observe(prompt) + + if results and results[0].method == expected_method: + await page.act(results[0]) + else: + method = results[0].method if results else "unknown" + raise Exception(f'Unsupported method: expected "{expected_method}", got "{method}"') + else: + raise error +``` + + +**Solution 2: Retry with exponential backoff** + + +```typescript TypeScript +// Retry with exponential backoff for intermittent issues +const prompt = "click the submit button"; +const maxRetries = 3; + +for (let attempt = 0; attempt <= maxRetries; attempt++) { + try { + await page.act(prompt, { timeoutMs: 10000 + (attempt * 5000) }); + break; // Success, exit retry loop + } catch (error) { + if (error.message.includes("method not supported") && attempt < maxRetries) { + // Exponential backoff: wait 2^attempt seconds + const delay = Math.pow(2, attempt) * 1000; + console.log(`Retry ${attempt + 1}/${maxRetries} after ${delay}ms`); + await new Promise(resolve => setTimeout(resolve, delay)); + } else { + throw error; + } + } +} +``` + +```python Python +# Retry with exponential backoff for intermittent issues +import asyncio + +prompt = "click the submit button" +max_retries = 3 + +for attempt in range(max_retries + 1): + try: + timeout = 10000 + (attempt * 5000) + await page.act(prompt, {"timeoutMs": timeout}) + break # Success, exit retry loop + except Exception as error: + if "method not supported" in str(error) and attempt < max_retries: + # Exponential backoff: wait 2^attempt seconds + delay = 2 ** attempt + print(f"Retry {attempt + 1}/{max_retries} after {delay}s") + await asyncio.sleep(delay) + else: + raise error +``` + + + + + +**Problem**: `act` times out or fails to complete action (often due to element not found) + +**Solutions**: +- Ensure page has fully loaded +- Check if content is in iframes: 
[Learn more about working with iframes](/v2/best-practices/working-with-iframes) +- Increase action timeout +- Use `observe()` first to verify element exists + + +```typescript TypeScript +// Handle timeout and element not found issues +try { + await page.act("click the submit button", { timeoutMs: 30000 }); +} catch (error) { + // Check if page is fully loaded + await page.waitForLoadState('domcontentloaded'); + + // Use observe to check element state + const [element] = await page.observe("find the submit button"); + + if (element) { + console.log("Element found, trying more specific instruction"); + await page.act("click the submit button at the bottom of the form"); + } else { + console.log("Element not found, trying alternative selector"); + await page.act("click the button with text 'Submit'"); + } +} +``` + +```python Python +# Handle timeout and element not found issues +try: + await page.act("click the submit button", {"timeoutMs": 30000}) +except Exception as error: + # Check if page is fully loaded + await page.wait_for_load_state('domcontentloaded') + + # Use observe to check element state + results = await page.observe("find the submit button") + + if results: + print("Element found, trying more specific instruction") + await page.act("click the submit button at the bottom of the form") + else: + print("Element not found, trying alternative selector") + await page.act("click the button with text 'Submit'") +``` + + + + +**Problem**: `act` performs action on wrong element + +**Solutions**: +- Be more specific in instructions: include visual cues, position, or context +- Use `observe()` to preview which element will be selected +- Add contextual information: "the search button in the header" +- Use unique identifiers when available + + +```typescript TypeScript +// More precise element targeting +// Instead of: +await page.act("click the button"); + +// Use specific context: +await page.act("click the red 'Delete' button next to the user John Smith"); + +//
Or preview with observe first: +const [action] = await page.observe("click the submit button in the checkout form"); +if (action.description.includes("checkout")) { + await page.act(action); +} +``` + +```python Python +# More precise element targeting +# Instead of: +await page.act("click the button") + +# Use specific context: +await page.act("click the red 'Delete' button next to the user John Smith") + +# Or preview with observe first: +results = await page.observe("click the submit button in the checkout form") +if results and "checkout" in results[0].description: + await page.act(results[0]) +``` + + + + + + + +## Next steps + + + + + Use `Agent` to autonomously execute multi-step tasks and complex workflows. + + + + Speed up repeated automations by caching actions. + + + + Use `extract` with a data schema to pull clean, typed data from any page. + + + + Learn best practices for interacting with elements inside iframes. + + \ No newline at end of file diff --git a/packages/docs/v2/basics/agent.mdx b/packages/docs/v2/basics/agent.mdx new file mode 100644 index 000000000..d55781ef0 --- /dev/null +++ b/packages/docs/v2/basics/agent.mdx @@ -0,0 +1,313 @@ +--- +title: Agent +description: 'Automate complex workflows with AI powered browser agents' +--- + +## What is `agent()`? + +``` typescript +agent.execute("apply for a job at browserbase") +``` +`agent` turns high level tasks into **fully autonomous** browser workflows. You can customize the agent by specifying the LLM provider and model, setting custom instructions for behavior, and configuring max steps. + +Agent + +## Why use `agent()`? + + + + Execute complex sequences automatically. + + + Sees and understands web interfaces like humans do using computer vision.
+ + + + +## Using `agent()` + +There are two ways to create agents in Stagehand: + +### Computer Use Agents + +Use computer use agents with specialized models from OpenAI or Anthropic: + + +```typescript TypeScript +const agent = stagehand.agent({ + provider: "anthropic", + model: "claude-sonnet-4-20250514", + instructions: "You are a helpful assistant that can use a web browser.", + options: { + apiKey: process.env.ANTHROPIC_API_KEY, + }, +}); +await agent.execute("apply for a job at Browserbase") +``` + +```python Python +agent = stagehand.agent( + model="claude-sonnet-4-20250514", + instructions="You are a helpful assistant that can use a web browser.", + options={ + "api_key": os.getenv("ANTHROPIC_API_KEY"), + }, +) +await agent.execute("apply for a job at Browserbase") +``` + + +View or run the example template [here](https://www.browserbase.com/templates/gemini-cua) + +### Use Stagehand Agent with Any LLM + +Use the agent without specifying a provider to utilize any model or LLM provider: + +Non CUA agents are currently only supported in TypeScript + +```typescript TypeScript +const agent = stagehand.agent(); +await agent.execute("apply for a job at Browserbase") +``` + + +## MCP Integrations + +Agents can be enhanced with external tools and services through MCP (Model Context Protocol) integrations. This allows your agent to access external APIs and data sources beyond just browser interactions. + + +```typescript TypeScript (Pass URL) +const agent = stagehand.agent({ + provider: "openai", + model: "computer-use-preview", + integrations: [ + `https://mcp.exa.ai/mcp?exaApiKey=${process.env.EXA_API_KEY}`, + ], + instructions: `You have access to web search through Exa. 
Use it to find current information before browsing.`, + options: { + apiKey: process.env.OPENAI_API_KEY, + }, +}); + +await agent.execute("Search for the best headphones of 2025 and go through checkout for the top recommendation"); +``` + +```typescript TypeScript (Create Connection) +import { connectToMCPServer } from "@browserbasehq/stagehand"; + +const supabaseClient = await connectToMCPServer( + `https://server.smithery.ai/@supabase-community/supabase-mcp/mcp?api_key=${process.env.SMITHERY_API_KEY}` +); + +const agent = stagehand.agent({ + provider: "openai", + model: "computer-use-preview", + integrations: [supabaseClient], + instructions: `You can interact with Supabase databases. Use these tools to store and retrieve data.`, + options: { + apiKey: process.env.OPENAI_API_KEY, + }, +}); + +await agent.execute("Search for restaurants and save the first result to the database"); +``` + + + +MCP integrations enable agents to be more powerful by combining browser automation with external APIs, databases, and services. The agent can intelligently decide when to use browser actions versus external tools. + + + +Stagehand uses a 1288x711 viewport by default (the optimal size for Computer Use Agents). Other viewport sizes may reduce performance. If you need to modify the viewport, you can edit in the [Browser Configuration](/v2/configuration/browser). + + + +## Available Models + +Use specialized computer use models (e.g., `computer-use-preview` from OpenAI or `claude-sonnet-4-20250514` from Anthropic) + + + Check out the guide on how to use different models with Stagehand. + + +## Agent Execution Configuration + +Control the maximum number of steps the agent can take to complete the task using the `maxSteps` parameter. 
+ + +```typescript TypeScript +// Set maxSteps to control how many actions the agent can take +await agent.execute({ + instruction: "Sign me up for a library card", + maxSteps: 15 // Agent will stop after 15 steps if task isn't complete +}); +``` + +```python Python +# Set max_steps to control how many actions the agent can take +result = await agent.execute({ + "instruction": "Sign me up for a library card", + "max_steps": 15 # Agent will stop after 15 steps if task isn't complete +}) +``` + + +For complex tasks, increase the `maxSteps` limit and check task success. + + +```typescript TypeScript +// Complex multi-step task requiring more actions +const result = await agent.execute({ + instruction: "Find and apply for software engineering jobs, filtering by remote work and saving 3 applications", + maxSteps: 30, // Higher limit for complex workflows +}); + +// Check if the task completed successfully +if (result.success === true) { + console.log("Task completed successfully!"); +} else { + console.log("Task failed or was incomplete"); +} +``` + +```python Python +# Complex multi-step task requiring more actions +result = await agent.execute({ + "instruction": "Find and apply for software engineering jobs, filtering by remote work and saving 3 applications", + "max_steps": 30 # Higher limit for complex workflows +}) + +# Check if the task completed successfully +if result.success == True: + print("Task completed successfully!") +else: + print("Task failed or was incomplete") +``` + + +## Best Practices + +Following these best practices will improve your agent's success rate, reduce execution time, and minimize unexpected errors during task completion. 
+ +### Start on the Right Page +Navigate to your target page before executing tasks: + + + + +```typescript TypeScript +await page.goto('https://github.com/browserbase/stagehand'); +await agent.execute('Get me the latest PR on the stagehand repo'); +``` + +```python Python +await page.goto("https://github.com/browserbase/stagehand") +result = await agent.execute("Get me the latest PR on the stagehand repo") +``` + + + + + +```typescript TypeScript +await agent.execute('Go to GitHub and find the latest PR on browserbase/stagehand'); +``` + +```python Python +result = await agent.execute("Go to GitHub and find the latest PR on browserbase/stagehand") +``` + + + + + +### Be Specific +Provide detailed instructions for better results: + + + + +```typescript TypeScript +await agent.execute("Find Italian restaurants in Brooklyn that are open after 10pm and have outdoor seating"); +``` + +```python Python +result = await agent.execute("Find Italian restaurants in Brooklyn that are open after 10pm and have outdoor seating") +``` + + + + + +```typescript TypeScript +await agent.execute("Find a restaurant"); +``` + +```python Python +result = await agent.execute("Find a restaurant") +``` + + + + +## Troubleshooting + + + + + +**Problem**: Agent stops before finishing the requested task + +**Solutions**: +- Check if the agent is hitting the maxSteps limit (default is 20) +- Increase maxSteps for complex tasks: `maxSteps: 30` or higher +- Break very complex tasks into smaller sequential executions + +```typescript +// Increase maxSteps for complex tasks +await agent.execute({ + instruction: "Complete the multi-page registration form with all required information", + maxSteps: 40 // Increased limit for complex task +}); + +// Or break into smaller tasks with success checking +const firstResult = await agent.execute({ + instruction: "Fill out page 1 of the registration form", + maxSteps: 15 +}); + +// Only proceed if the first task was successful +if (firstResult.success === 
true) { + await agent.execute({ + instruction: "Navigate to page 2 and complete remaining fields", + maxSteps: 15 + }); +} else { + console.log("First task failed, stopping execution"); +} +``` + + + +**Problem**: Agent clicks on wrong elements or fails to interact with the correct UI components + +**Solutions**: +- Ensure proper viewport size: Stagehand uses `1288x711` by default (optimal for Computer Use models) +- Avoid changing viewport dimensions as other sizes may reduce performance + + + + + + +## Next steps + + + + Execute actions efficiently using observe results + + + + Extract structured data from observed elements + + \ No newline at end of file diff --git a/packages/docs/v2/basics/extract.mdx b/packages/docs/v2/basics/extract.mdx new file mode 100644 index 000000000..33f0f9134 --- /dev/null +++ b/packages/docs/v2/basics/extract.mdx @@ -0,0 +1,433 @@ +--- +title: Extract +description: Extract structured data from a webpage +--- + +## What is `extract()`? + +```typescript +page.extract("extract the name of the repository"); +``` + +`extract` grabs structured data from a webpage. You can define your schema with [zod](https://github.com/colinhacks/zod) (TypeScript) or [pydantic](https://github.com/pydantic/pydantic) (Python). If you do not want to define a schema, you can also call `extract` with just a [natural language prompt](#prompt-only-extraction), or call `extract` [with no parameters](#extract-with-no-parameters). + +## Why use `extract()`? + + + + Turn messy webpage data into clean objects that follow a schema. + + + Build resilient extractions that don't break when the website changes + + + + +For TypeScript, the extract schemas are defined using zod schemas. + +For Python, the extract schemas are defined using pydantic models. 
+ + +## Using `extract()` + +### Single object Extraction + +Here is how an `extract` call might look for a single object: + + +```typescript TypeScript +import { z } from 'zod/v3'; + +const item = await page.extract({ + instruction: "extract the price of the item", + schema: z.object({ + price: z.number(), + }), +}); +``` + +```python Python +from pydantic import BaseModel + +class Extraction(BaseModel): + price: float + +item = await page.extract( + "extract the price of the item", + schema=Extraction +) +``` + + +Your output schema will look like: +```Example +{ price: number } +``` + +### List of objects Extraction + +Here is how an `extract` call might look for a list of objects. + + +```typescript TypeScript +import { z } from 'zod/v3'; + +const apartments = await page.extract({ + instruction: + "Extract ALL the apartment listings and their details, including address, price, and square feet.", + schema: z.object({ + list_of_apartments: z.array( + z.object({ + address: z.string(), + price: z.string(), + square_feet: z.string(), + }), + ), + }) +}) + +console.log("the apartment list is: ", apartments); +``` + +```python Python +from pydantic import BaseModel + +class Apartment(BaseModel): + address: str + price: str + square_feet: str + +class Apartments(BaseModel): + list_of_apartments: list[Apartment] + +apartments = await page.extract( + "Extract ALL the apartment listings and their details as a list, including address, price, and square feet for each apartment", + schema=Apartments +) + +print("the apartment list is: ", apartments) +``` + + +Your output schema will look like: +```Example +list_of_apartments: [ + { + address: "street address here", + price: "$1234.00", + square_feet: "700" + }, + { + address: "another address here", + price: "1010.00", + square_feet: "500" + }, + ... 
+] +``` + +### Prompt-only Extraction + +You can call `extract` with just a natural language prompt: + + +```typescript TypeScript +const result = await page.extract("extract the name of the repository"); +``` + +```python Python +result = await page.extract("extract the name of the repository") +``` + + +When you call `extract` with just a prompt, your output schema will look like: +```Example +{ extraction: string } +``` + +### Extract with no parameters + +Here is how you can call `extract` with no parameters. + + +```typescript TypeScript +const pageText = await page.extract(); +``` + +```python Python +pageText = await page.extract() +``` + + +Output schema: +```Example +{ pageText: string } +``` + +Calling `extract` with no parameters will return a hierarchical tree representation of the root DOM. This will not be passed through an LLM. It will look something like this: + +``` +Accessibility Tree: +[0-2] RootWebArea: What is Stagehand? - 🤘 Stagehand + [0-37] scrollable + [0-118] body + [0-241] scrollable + [0-242] div + [0-244] link: 🤘 Stagehand home page light logo + [0-245] span + [0-246] StaticText: 🤘 Stagehand + [0-247] StaticText: home page +``` + +## Best practices + + +### Extract with Context + +You can provide additional context to your schema to help the model extract the data more accurately.
+ + +```typescript TypeScript +import { z } from 'zod/v3'; + +const apartments = await page.extract({ + instruction: + "Extract ALL the apartment listings and their details, including address, price, and square feet.", + schema: z.object({ + list_of_apartments: z.array( + z.object({ + address: z.string().describe("the address of the apartment"), + price: z.string().describe("the price of the apartment"), + square_feet: z.string().describe("the square footage of the apartment"), + }), + ), + }) +}) +``` + +```python Python +from pydantic import BaseModel, Field + +class Apartment(BaseModel): + address: str = Field(..., description="the address of the apartment") + price: str = Field(..., description="the price of the apartment") + square_feet: str = Field(..., description="the square footage of the apartment") + +class Apartments(BaseModel): + list_of_apartments: list[Apartment] + +apartments = await page.extract( + "Extract ALL the apartment listings and their details as a list. For each apartment, include: the address of the apartment, the price of the apartment, and the square footage of the apartment", + schema=Apartments +) +``` + + +### Link Extraction + +To extract links or URLs, in the TypeScript version of Stagehand, you'll need to define the relevant field as `z.string().url()`. +In Python, you'll need to define it as `HttpUrl`. + + +Here is how an `extract` call might look for extracting a link or URL. This also works for image links. 
+ + +```typescript TypeScript +import { z } from 'zod/v3'; + +const extraction = await page.extract({ + instruction: "extract the link to the 'contact us' page", + schema: z.object({ + link: z.string().url(), // note the usage of z.string().url() here + }), +}); + +console.log("the link to the contact us page is: ", extraction.link); +``` + +```python Python +from pydantic import BaseModel, HttpUrl + +class Extraction(BaseModel): + link: HttpUrl # note the usage of HttpUrl here + +extraction = await page.extract( + "extract the link to the 'contact us' page", + schema=Extraction +) + +print("the link to the contact us page is: ", extraction.link) +``` + + + +Inside Stagehand, extracting links works by asking the LLM to select an ID. Stagehand looks up that ID in a mapping of IDs -> URLs. When logging the LLM trace, you should expect to see IDs. The actual URLs will be included in the final `ExtractResult`. + + +## Troubleshooting + + + +**Problem**: `extract()` returns empty or incomplete data + +**Solutions**: +- **Check your instruction clarity**: Make sure your instruction is specific and describes exactly what data you want to extract +- **Verify the data exists**: Use `page.observe()` first to confirm the data is present on the page +- **Wait for dynamic content**: If the page loads content dynamically, use `page.act("wait for the content to load")` before extracting + +**Solution: Wait for content before extracting** + +```typescript TypeScript +// Wait for content before extracting +await page.act("wait for the product listings to load"); +const products = await page.extract({ + instruction: "extract all product names and prices", + schema: z.object({ + products: z.array(z.object({ + name: z.string(), + price: z.string() + })) + }) +}); +``` + +```python Python +# Wait for content before extracting +await page.act("wait for the product listings to load") +products = await page.extract( + "extract all product names and prices", + schema=ProductList +) +``` + 
+ + + +**Problem**: Getting schema validation errors or type mismatches + +**Solutions**: +- **Use optional fields**: Make fields optional with `z.optional()` (TypeScript) or `Optional[type]` (Python) if the data might not always be present +- **Use flexible types**: Consider using `z.string()` instead of `z.number()` for prices that might include currency symbols +- **Add descriptions**: Use `.describe()` (TypeScript) or `Field(description="...")` (Python) to help the model understand field requirements + +**Solution: More flexible schema** + +```typescript TypeScript +const schema = z.object({ + price: z.string().describe("price including currency symbol, e.g., '$19.99'"), + availability: z.string().optional().describe("stock status if available"), + rating: z.number().optional() +}); +``` + +```python Python +class FlexibleProduct(BaseModel): + price: str = Field(description="price including currency symbol, e.g., '$19.99'") + availability: Optional[str] = Field(default=None, description="stock status if available") + rating: Optional[float] = None +``` + + + + +**Problem**: Extraction results vary between runs + +**Solutions**: +- **Be more specific in instructions**: Instead of "extract prices", use "extract the numerical price value for each item" +- **Use context in schema descriptions**: Add field descriptions to guide the model +- **Combine with observe**: Use `page.observe()` to understand the page structure first + +**Solution: Validate with observe first** + +```typescript TypeScript +// First observe to understand the page structure +const elements = await page.observe("find all product listings"); +console.log("Found elements:", elements.map(e => e.description)); + +// Then extract with specific targeting +const products = await page.extract({ + instruction: "extract name and price from each product listing shown on the page", + schema: z.object({ + products: z.array(z.object({ + name: z.string().describe("the product title or name"), + price: 
z.string().describe("the price as displayed, including currency") + })) + }) +}); +``` + +```python Python +# First observe to understand the page structure +elements = await page.observe("find all product listings") +print("Found elements:", [e.description for e in elements]) + +# Then extract with specific targeting +products = await page.extract( + "extract name and price from each product listing shown on the page", + schema=ProductSchema +) +``` + + + + +**Problem**: Extraction is slow or timing out + +**Solutions**: +- **Reduce scope**: Extract smaller chunks of data in multiple calls rather than everything at once +- **Use targeted instructions**: Be specific about which part of the page to focus on +- **Consider pagination**: For large datasets, extract one page at a time +- **Increase timeout**: Use `timeoutMs` parameter for complex extractions + +**Solution: Break down large extractions** + +```typescript TypeScript +// Instead of extracting everything at once +const allData = []; +const pageNumbers = [1, 2, 3, 4, 5]; + +for (const pageNum of pageNumbers) { + await page.act(`navigate to page ${pageNum}`); + + const pageData = await page.extract({ + instruction: "extract product data from the current page only", + schema: ProductPageSchema, + timeoutMs: 60000 // 60 second timeout + }); + + allData.push(...pageData.products); +} +``` + +```python Python +# Instead of extracting everything at once +all_data = [] +page_numbers = [1, 2, 3, 4, 5] + +for page_num in page_numbers: + await page.act(f"navigate to page {page_num}") + + page_data = await page.extract( + "extract product data from the current page only", + schema=ProductPageSchema, + timeout_ms=60000 # 60 second timeout + ) + + all_data.extend(page_data.products) +``` + + + + +## Next steps + + + + + Execute actions efficiently using observe results + + + + Analyze pages with observe() + + \ No newline at end of file diff --git a/packages/docs/v2/basics/observe.mdx 
b/packages/docs/v2/basics/observe.mdx new file mode 100644 index 000000000..adecd2158 --- /dev/null +++ b/packages/docs/v2/basics/observe.mdx @@ -0,0 +1,266 @@ +--- +title: Observe +sidebarTitle: Observe +description: 'Find suggested actions for your workflows' +--- + +## What is `observe()`? +``` typescript +page.observe("Find the login button") +``` + +`observe` allows you to turn any page into a checklist of reliable, executable actions. It discovers key elements, ranks likely next steps, and returns structured actions (selector, method, args) you can run instantly with `act` or use to precisely target `extract` so workflows are faster, cheaper, and more resilient. + +## Why use `observe()`? + + + + When you're unsure what's on a page or need to discover available actions + + + When building complex workflows, plan ahead all the actions you'll need to take + + + When you want to remember actions for the future and avoid LLM calls + + + Before performing critical actions to ensure elements exist + + + +## Using `observe()` + +Calling `observe` supercharges other Stagehand methods. Use it to plan workflows, speed up `act`, and precisely target `extract`. Using `observe` helps you explore what's possible on a page by giving you a list of suggested actions. + + +```typescript TypeScript +// Plan & validate +const buttons = await page.observe("Find the log in / sign up buttons"); +``` +```python Python +# Plan & validate +buttons = await page.observe("Find the log in / sign up buttons") +``` + + +This will return a list of suggestions with the following structure +```json +{ + "selector": "xpath=/html/body/header/div/button[1]", + "description": "Log in button in the top right corner", + "method": "click", + "arguments": [] +} +``` + +### Observe with Act + +You can **validate** the action (method, selector, arguments...) and then pass it to `act` to **avoid extra LLM inference**. 
+ + +**Performance Tip**: Acting on multiple `observe` suggestions will minimize the number of LLM calls for multi-step actions and speed up your workflow 2-3x. + + + +```typescript TypeScript +await page.act(buttons[0]); // No LLM! +``` +```python Python +await page.act(buttons[0]) # No LLM! +``` + + +#### Plan ahead + +You can use multiple suggestions from `observe` to preview a batch of actions. For example, when filling a form you could ask `observe` to find all the fields and then pass them in to `act`. **Call the LLM once, act multiple times**. + + +```typescript TypeScript +const fields = await page.observe("Find all the fields in the form"); +for (const field of fields) { + await page.act(field); // No LLM! +} +``` +```python Python +fields = await page.observe("Find all the fields in the form") +for field in fields: + await page.act(field) # No LLM! +``` + + +### Observe and Extract + +Using `observe` to focus `extract` on a specific section of the page (like a table, a form, a list...) minimizes the context needed for an extraction. + +**Savings Tip**: Pass the selector to `extract` to reduce LLM token usage by 10x for verbose websites! 
+ + + +```typescript TypeScript +// Use observe to validate elements before extraction +const [ table ] = await page.observe("Find the data table"); + +const { data } = await page.extract({ + instruction: "Extract data from the table", + schema: z.object({ + data: z.string() + }), + selector: table.selector // Reduce context scope needed for extraction +}); +``` +```python Python +# Use observe to validate elements before extraction +[ table ] = await page.observe("Find the data table") + +extraction = await page.extract( + "Extract data from the table", + schema=Data, # Pydantic schema + selector=table.selector # Reduce context scope needed for extraction +) +``` + + +## Best Practices + +### Choose the right commands + + + + +- Use `observe` when a yes/no answer will gate an action (e.g., "Find the Submit button"), then conditionally `act`. +- Use `extract` for information-only questions (e.g., "What’s the page title?", "How many results are listed?"). + + + + + +- Don’t call `extract` to locate elements you plan to click next. +- Don’t call `observe` to answer info-only questions that won’t lead to an action. + + + +- **Discover and plan with `observe`**: Use `observe("Find…")` to map actionable elements and preview next steps. +- **Scope `extract` with selectors from `observe`**: First `observe("Find the data table")`, then pass `selector` to `extract` to reduce tokens and boost accuracy. + +### Conserve LLM tokens + +Optimize performance by directly passing `ObserveResult` to `act` (e.g., `await page.act(results[0])`) to save LLM tokens. Batch operations by using `observe` once to find elements, then act on each. Cache and reuse stable `observe` results for familiar pages, using self-healing if layouts change. + + + Check out the guide on how to build your own action cache + + +### Improve Accuracy + +Be precise with instructions, e.g., "Find the primary CTA in the hero" for better results. For iframes, set `iframes: true` and wait for `networkidle`. 
Use `observe` selectors in `extract` to limit context. + + + Check out the guide on how to improve the accuracy of your results + + +### Action Validation + +Before performing critical actions, validate the suggestion's `method`, `selector`, and `arguments` to prevent misclicks. If a direct `act` fails, use `observe` with the same prompt to verify the method, then proceed with the suggested action. + + +```typescript TypeScript +const prompt = "click the submit button"; +const expectedMethod = "click"; + +try { + await page.act(prompt); +} catch (error) { + if (error.message.includes("method not supported")) { + // Observe the same prompt to get the planned action + const [action] = await page.observe(prompt); + + if (action && action.method === expectedMethod) { + await page.act(action); + } else { + throw new Error(`Unsupported method: expected "${expectedMethod}", got "${action?.method}"`); + } + } else { + throw error; + } +} +``` + +```python Python +prompt = "click the submit button" +expected_method = "click" + +try: + await page.act(prompt) +except Exception as error: + if "method not supported" in str(error): + # Observe the same prompt to get the planned action + results = await page.observe(prompt) + + if results and results[0].method == expected_method: + await page.act(results[0]) + else: + method = results[0].method if results else "unknown" + raise Exception(f'Unsupported method: expected "{expected_method}", got "{method}"') + else: + raise error +``` + + +## Troubleshooting + + + +**Problem**: `observe` returns empty array + +**Solutions**: +- Make sure the element exists on the page +- Use explicit instructions to find the element +- Ensure page has fully loaded +- Look at the [debugging logs](/v2/configuration/logging), if the element is there then the LLM might be hallucinating/not catching it. 
+ + + +**Problem**: Descriptions don't match actual elements + +**Solutions**: +- Use more capable models: check [evals](https://stagehand.dev/evals) for the best models for your use case +- Provide more specific instructions +- Log inference to file (see [debugging logs](/v2/configuration/logging#llm-inference-logging)) to get an LLM trace + + + +**Problem**: The method identified is not valid + +**Solutions**: +- Check the [supported actions](/v2/basics/act) +- Provide more specific instructions +- Validate the method; if it is invalid, override it with one of the supported ones + + + + + +## Next Steps + + + +Execute actions efficiently using `observe` results + + + +Extract structured data from observed elements + + + +Monitor and debug observation performance + + + +Advanced patterns and optimization techniques + + + + + + + diff --git a/packages/docs/v2/best-practices/agent-fallbacks.mdx b/packages/docs/v2/best-practices/agent-fallbacks.mdx new file mode 100644 index 000000000..85441debe --- /dev/null +++ b/packages/docs/v2/best-practices/agent-fallbacks.mdx @@ -0,0 +1,74 @@ +--- +title: Agent Fallbacks +description: "A failsafe when unexpected page changes add extra steps" +--- + +## When to use + +Use an agent fallback as a failsafe when a one-step action unexpectedly becomes a multi-step flow. + +## How it works + +1. [`act()`](/v2/basics/act) is attempted for the direct action +2. If it fails, [`agent()`](/v2/basics/agent) figures out the new path +3. Agent completes all needed steps (open menu → click button) + +### Example scenario + +**Before**: Sign in button was in the header +**After**: Sign in now requires: Click account menu → Click "Sign in" option + +A single `act("click sign in")` can't handle this change. The agent fallback can discover and execute both steps.
+ + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +try { + await page.act("click the 'Sign In' button"); +} catch (err) { + console.log("Agent fallback triggered"); + + const agent = stagehand.agent({ + provider: "anthropic", + model: "claude-sonnet-4-20250514", + instructions: "You are a helpful assistant that can use a web browser.", + }); + + const result = await agent.execute({ + instruction: "Find and click Sign In button", + maxSteps: 10, + }); + + console.log(result.success ? "Agent fallback success" : "Agent fallback failed"); + + if (!result.success) throw err; +} +``` + +```python Python +from stagehand import Stagehand + +try: + await page.act("click the 'Sign In' button") +except Exception as err: + print("Agent fallback triggered") + + agent = stagehand.agent({ + "provider": "anthropic", + "model": "claude-sonnet-4-20250514", + "instructions": "Complete the action, handling any new steps required.", + }) + + result = await agent.execute({ + "instruction": "Find and click Sign In button", + "max_steps": 10, + }) + + print("Agent fallback success" if result.success else "Agent fallback failed") + + if not result.success: + raise err +``` + + diff --git a/packages/docs/v2/best-practices/build-agent.mdx b/packages/docs/v2/best-practices/build-agent.mdx new file mode 100644 index 000000000..160022757 --- /dev/null +++ b/packages/docs/v2/best-practices/build-agent.mdx @@ -0,0 +1,209 @@ +--- +title: 'Build a web browsing agent' +description: 'Build an AI agent that can autonomously control a browser with Stagehand' +--- +import { Excalidraw } from '/snippets/excalidraw.mdx'; + +Stagehand gives AI agents powerful tools to control a browser completely autonomously. Watch below as a Stagehand agent autonomously navigates to a URL, takes actions on the page, and extracts structured data to answer a question. +There are quite a few ways to build an agent with Stagehand. Let's look at a few of them.
+ +![Agent](/media/stagehand-agent.gif) + +## Stagehand MCP + +The above example is a Claude agent that uses Stagehand to control a browser. At the time of writing, [multimodal tool calling](https://sdk.vercel.ai/docs/ai-sdk-core/tools-and-tool-calling#multi-modal-tool-results) is only supported in Claude 3.5/3.7 Sonnet. +This means Claude is intelligent enough to know when to request a browser screenshot, and it can then use that screenshot to make decisions about what actions to take next. + + + +Control a browser with Browserbase MCP powered by Stagehand + + + +What's really interesting about this is that the agent is able to reason about the browser state and take actions separately from one another! +Claude is able to reason about the browser state, while Stagehand is able to take actions on the page with GPT-4o-mini or a computer use model. +Stagehand is even smart enough to know when to use GPT-4o-mini and when to use a computer use model, i.e. on iframe detection. + + + +We've found great success from having Claude as the "Trajectory" agent calling Stagehand tools when it sees fit! +While MCP is really nascent, we're excited to see where it goes. + +## Stagehand + Computer Use Models + +Stagehand lets you leverage powerful computer use APIs from OpenAI and Anthropic with just one line of code. + + +```typescript TypeScript +await page.goto("https://github.com/browserbase/stagehand"); + +// Create a Computer Use agent with just one line of code! +const agent = stagehand.agent({ + provider: "openai", + model: "computer-use-preview" +}); + +// Use the agent to execute a task +const result = await agent.execute("Extract the top contributor's username"); +console.log(result); +``` +```python Python +await page.goto("https://github.com/browserbase/stagehand-python") + +# Create a Computer Use agent with just one line of code!
+agent = stagehand.agent( + model="computer-use-preview" +) + +# Use the agent to execute a task +result = await agent.execute("Extract the top contributor's username") +print(result) +``` + + + + +Check out our docs page for instructions on how to use computer use models with Stagehand. + + +Check out a live demo of a Browserbase browser controlled by OpenAI's Computer Using Agent (CUA) model. + + + +## Sequential Tool Calling (Open Operator) + +In January 2025, Browserbase released [Open Operator](https://operator.browserbase.com). +Open Operator is able to reason about the browser state and take actions accordingly to accomplish larger tasks like "order me a pizza". +It works by calling Stagehand tools in sequence: + +1. If there's no URL, go to a default URL. +1. Examine the browser state. Ask an LLM to reason about what to do next. +1. Use `page.act()` to execute the LLM-suggested action. +1. Repeat + + + +Incorporating `stagehand.agent` into your browser automation is as easy as adding a single line of code: + + +Python currently supports `stagehand.agent` with Computer Use Agent (CUA) models. The default implementation is coming soon. + + + +```typescript TypeScript +await stagehand.page.goto("https://github.com/browserbase/stagehand"); + +// Open Operator will use the default LLM from Stagehand config +const operator = stagehand.agent(); +const { message, actions } = await operator.execute( + "Extract the top contributor's username" +); + +console.log(message); +``` + + +### Replay the agent's actions + +You can replay the agent's actions exactly the same way you would with a regular Stagehand agent. You can even automatically cache the actions to avoid unnecessary LLM calls on a repeated run. + +Let's use the `replay` function below to save the actions to a Stagehand script file, which will reproduce the same actions the agent did, with cached actions built in. 
+ + +```typescript +import { AgentAction, AgentResult } from "@browserbasehq/stagehand"; +import { exec } from "child_process"; +import fs from "fs/promises"; + +export async function replay(result: AgentResult) { + const history = result.actions; + const replay = history + .map((action: AgentAction) => { + switch (action.type) { + case "act": + if (!action.playwrightArguments) { + throw new Error("No playwright arguments provided"); + } + return `await page.act(${JSON.stringify( + action.playwrightArguments + )})`; + case "extract": + return `await page.extract("${action.parameters}")`; + case "goto": + return `await page.goto("${action.parameters}")`; + case "wait": + return `await page.waitForTimeout(${parseInt( + action.parameters as string + )})`; + case "navback": + return `await page.goBack()`; + case "refresh": + return `await page.reload()`; + case "close": + return `await stagehand.close()`; + default: + return `await stagehand.oops()`; + } + }) + .join("\n"); + + console.log("Replay:"); + const boilerplate = ` +import { Page, BrowserContext, Stagehand } from "@browserbasehq/stagehand"; + +export async function main(stagehand: Stagehand) { + const page = stagehand.page + ${replay} +} + `; + await fs.writeFile("replay.ts", boilerplate); + + // Format the replay file with prettier + await new Promise((resolve, reject) => { + exec( + "npx prettier --write replay.ts", + (error: any, stdout: any, stderr: any) => { + if (error) { + console.error(`Error formatting replay.ts: ${error}`); + reject(error); + return; + } + resolve(stdout); + } + ); + }); +} +``` + + +Here's the replay output of an instruction like `"Get me the stock price of NVDA"`: + +```typescript {14-22} replay.ts +import { Page, BrowserContext, Stagehand } from "@browserbasehq/stagehand"; + +export async function main({ + page, + context, + stagehand, +}: { + page: Page; // Playwright Page with act, extract, and observe methods + context: BrowserContext; // Playwright BrowserContext + stagehand: 
Stagehand; // Stagehand instance +}) { + await page.goto("https://www.google.com"); + + // Replay will default to Playwright first to avoid unnecessary LLM calls! + // If the Playwright action fails, Stagehand AI will take over and self-heal + await page.act({ + description: "The search combobox where users can type their queries.", + method: "fill", + arguments: ["NVDA stock price"], + selector: + "xpath=/html/body[1]/div[1]/div[3]/form[1]/div[1]/div[1]/div[1]/div[1]/div[2]/textarea[1]", + }); + await page.extract( + "the displayed NVDA stock price in the search suggestions", + ); + await stagehand.close(); +} +``` \ No newline at end of file diff --git a/packages/docs/v2/best-practices/caching.mdx b/packages/docs/v2/best-practices/caching.mdx new file mode 100644 index 000000000..6835192b0 --- /dev/null +++ b/packages/docs/v2/best-practices/caching.mdx @@ -0,0 +1,206 @@ +--- +title: Caching Actions +description: You can cache actions in Stagehand to avoid redundant LLM calls. +--- + +Caching actions in Stagehand is useful for actions that are expensive to run, or when the underlying DOM structure is not expected to change. + +## Using `observe` to preview an action +`observe` lets you preview an action before taking it. If you are satisfied with the action preview, you can run it in `page.act` with no further LLM calls. 
+ + +```typescript TypeScript +const [actionPreview] = await page.observe("Click the quickstart link"); + +/** actionPreview is a JSON-ified version of a Playwright action: +{ + description: "The quickstart link", + method: "click", + selector: "/html/body/div[1]/div[1]/a", + arguments: [], +} +**/ + +// NO LLM INFERENCE when calling act on the preview +await page.act(actionPreview) +``` + +```python Python +actions = await page.observe("Click the quickstart link") +action_preview = actions[0] + +# action_preview is a dictionary version of a Playwright action: +# { +# "description": "The quickstart link", +# "method": "click", +# "selector": "/html/body/div[1]/div[1]/a", +# "arguments": [], +# } + +# NO LLM INFERENCE when calling act on the preview +await page.act(action_preview) +``` + + +## Simple caching + +Let's use a simple file-based cache for this example. We'll write a getter and a setter functions that can read and write to a JSON file: + + +```typescript TypeScript +// Get the cached value (undefined if it doesn't exist) +async function getCache(key: string): Promise { + try { + const cache = await readFile("cache.json"); + const parsed = JSON.parse(cache); + return parsed[key]; + } catch { + return undefined; + } +} + +// Set the cache value +async function setCache(key: string, value: ObserveResult): Promise { + const cache = await readFile("cache.json"); + const parsed = JSON.parse(cache); + parsed[key] = value; + await writeFile("cache.json", JSON.stringify(parsed)); +} +``` + +```python Python +# Get the cached value (None if it doesn't exist) +async def get_cache(key: str) -> Optional[Dict[str, Any]]: + try: + async with aiofiles.open("cache.json", 'r') as f: + cache_content = await f.read() + parsed = json.loads(cache_content) + return parsed.get(key) + except (FileNotFoundError, json.JSONDecodeError): + return None + +# Set the cache value +async def set_cache(key: str, value: Dict[str, Any]) -> None: + try: + async with 
aiofiles.open("cache.json", 'r') as f: + cache_content = await f.read() + parsed = json.loads(cache_content) + except (FileNotFoundError, json.JSONDecodeError): + parsed = {} + + parsed[key] = value + + async with aiofiles.open("cache.json", 'w') as f: + await f.write(json.dumps(parsed)) +``` + + +### Act with cache +Let's write a function that will check the cache, get the action, and run it. If the action fails, we'll attempt to "self-heal", i.e. retry it with `page.act` directly. + + +```typescript TypeScript +// Check the cache, get the action, and run it +// If selfHeal is true, we'll attempt to self-heal if the action fails +async function actWithCache(page: Page, key: string, prompt: string, selfHeal = false) { + try { + const cacheExists = await getCache(key); + + let action: ObserveResult; + if (cacheExists) { + // Get the cached action + action = await getCache(key); + } else { + // Get the observe result (the action) + [action] = await page.observe(prompt); + + // Cache the action + await setCache(key, action); + } + + // Run the action (no LLM inference) + await page.act(action); + } catch (e) { + console.error(e); + // in selfHeal mode, we'll retry the action + if (selfHeal) { + console.log("Attempting to self-heal..."); + await page.act(prompt); + } + else { + throw e; + } + } +} +``` + +```python Python +# Check the cache, get the action, and run it +# If self_heal is true, we'll attempt to self-heal if the action fails +async def act_with_cache(page, key: str, prompt: str, self_heal: bool = False): + try: + cache_exists = await get_cache(key) + + if cache_exists: + # Get the cached action + action = await get_cache(key) + else: + # Get the observe result (the action) + actions = await page.observe(prompt) + action = actions[0] + + # Cache the action + await set_cache(key, action) + + # Run the action (no LLM inference) + await page.act(action) + except Exception as e: + print(f"Error: {e}") + # in self_heal mode, we'll retry the action +
if self_heal: + print("Attempting to self-heal...") + await page.act(prompt) + else: + raise e +``` + + +You can now use `actWithCache` to run an action with caching: + + +```typescript TypeScript +const prompt = "Click the quickstart link"; +const key = prompt; // Simple cache key +// Attempt cached action or self-heal +await actWithCache(page, key, prompt); +``` + +```python Python +prompt = "Click the quickstart link" +key = prompt # Simple cache key +# Attempt cached action or self-heal +await act_with_cache(page, key, prompt) +``` + + +## Advanced caching + +The above example is simple, but you may want to cache actions based on the page contents. Also, if you have duplicate prompts, you should use a more unique key. + +We want to leave caching logic up to you, but give you all the tools you need to implement your own caching strategy. + +You can directly access the DOM and accessibility tree from Playwright's page object. Here's an example of how to access the page content: + + +```typescript TypeScript +// Get the page content +const pageContent = await page.content(); +``` + +```python Python +# Get the page content +page_content = await page.content() +``` + + +You may also want to use the accessibility tree, the DOM, or any other information to create a more unique key. You can do this as you please, with very similar logic to the above example. \ No newline at end of file diff --git a/packages/docs/v2/best-practices/computer-use.mdx b/packages/docs/v2/best-practices/computer-use.mdx new file mode 100644 index 000000000..b0c9cb8eb --- /dev/null +++ b/packages/docs/v2/best-practices/computer-use.mdx @@ -0,0 +1,199 @@ +--- +title: Computer Use Agents +description: Incorporate Computer Use APIs from Anthropic and OpenAI with one line of code in Stagehand. +--- + +## What is a Computer Use Agent? 
+ + +You might've heard of [Claude Computer Use](https://www.anthropic.com/news/3-5-models-and-computer-use) or [OpenAI's Computer Using Agent](https://openai.com/index/computer-using-agent/). + +These are powerful tools that can convert natural language into actions on the computer. However, you'd otherwise need to write your own code to convert these actions into Playwright commands. + +Stagehand not only handles the execution of Computer Use outputs, but also lets you hot-swap between OpenAI and Anthropic models with one line of code. + +## How to use a Computer Use Agent in Stagehand + +Stagehand lets you use Computer Use Agents with one line of code: + + +**IMPORTANT! Configure your browser dimensions** + +Computer Use Agents will often return XY-coordinates to click on the screen, so you'll need to configure your browser dimensions. + +If not specified, the default browser dimensions are 1024x768. You can also configure the browser dimensions in the `browserbaseSessionCreateParams` or `localBrowserLaunchOptions` options. 
+ + + +### Configuring browser dimensions + +Browser configuration differs by environment: + + + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + apiKey: process.env.BROWSERBASE_API_KEY /* API key for authentication */, + projectId: process.env.BROWSERBASE_PROJECT_ID /* Project identifier */, + + browserbaseSessionCreateParams: { + projectId: process.env.BROWSERBASE_PROJECT_ID!, + browserSettings: { + blockAds: true, + viewport: { + width: 1024, + height: 768, + }, + }, + }, +}); + +await stagehand.init(); +``` +```python Python +import os +from stagehand import Stagehand, StagehandConfig + +stagehand = Stagehand(StagehandConfig( + env="BROWSERBASE", + api_key=os.getenv("BROWSERBASE_API_KEY"), # API key for authentication + project_id=os.getenv("BROWSERBASE_PROJECT_ID"), # Project identifier + + browserbase_session_create_params={ + "projectId": os.getenv("BROWSERBASE_PROJECT_ID"), + "browserSettings": { + "blockAds": True, + "viewport": { + "width": 1024, + "height": 768, + }, + }, + }, +)) + +await stagehand.init() +``` + + + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "LOCAL", + localBrowserLaunchOptions: { + headless: false, + viewport: { + width: 1024, + height: 768, + }, + } +}); + +await stagehand.init(); +``` +```python Python +from stagehand import Stagehand, StagehandConfig + +stagehand = Stagehand(StagehandConfig( + env="LOCAL", + local_browser_launch_options={ + "headless": False, + "viewport": { + "width": 1024, + "height": 768, + }, + } +)) + +await stagehand.init() +``` + + + + +### Direct your Computer Use Agent + +Call `execute` on the agent to assign a task to the agent. 
+ +```typescript TypeScript +// Navigate to a website +await stagehand.page.goto("https://www.google.com"); + +const agent = stagehand.agent({ + // You can use either OpenAI or Anthropic + provider: "anthropic", + // The model to use (computer-use-preview for OpenAI) + model: "claude-sonnet-4-20250514", + + // Customize the system prompt + instructions: `You are a helpful assistant that can use a web browser. + Do not ask follow up questions, the user will trust your judgement.`, + + // Customize the API key + options: { + apiKey: process.env.ANTHROPIC_API_KEY, + }, +}); + +// Execute the agent +await agent.execute("Apply for a library card at the San Francisco Public Library"); +``` + +```python Python +import os + +# Navigate to a website +await stagehand.page.goto("https://www.google.com") + +agent = stagehand.agent( + # The model to use + model="computer-use-preview", + + # Customize the system prompt + instructions="You are a helpful assistant that can use a web browser. Do not ask follow up questions, the user will trust your judgement.", + + # Customize the API key + options={ + "apiKey": os.getenv("OPENAI_API_KEY"), + }, +) + +# Execute the agent +await agent.execute("Apply for a library card at the San Francisco Public Library") +``` + + +You can also define the maximum number of steps the agent can take with: + + +```typescript TypeScript +await agent.execute({ + instruction: "Apply for a library card at the San Francisco Public Library", + maxSteps: 10, +}); +``` + +```python Python +await agent.execute( + "Apply for a library card at the San Francisco Public Library", + max_steps=10, +) +``` + + +View or run the example templates [here](https://www.browserbase.com/templates?category=Computer+Use+Agents) diff --git a/packages/docs/v2/best-practices/contributing.mdx b/packages/docs/v2/best-practices/contributing.mdx new file mode 100644 index 000000000..25c66df8e --- /dev/null +++ b/packages/docs/v2/best-practices/contributing.mdx @@ -0,0 +1,52 @@
+--- +title: 'Contribute to Stagehand' +description: 'Best practices for making a meaningful contribution to Stagehand' +--- + +# Codeowners and Subject-Matter Experts + +Any contribution must be explicitly approved by a codeowner. Officially, Stagehand codeowners are as follows: + +- [**Paul Klein**](https://github.com/pkiv) +- [**Miguel Gonzalez**](https://github.com/miguelg719) +- [**Sean McGuire**](https://github.com/seanmcguire12) +- [**Anirudh Kamath**](https://github.com/kamath) +- [**Sameel Arif**](https://github.com/sameelarif) +- [**Filip Michalsky**](https://github.com/filip-michalsky) + +Special thanks to [Jeremy Press](https://github.com/jeremypress), [Navid Pour](https://github.com/navidkpr), and [all the contributors](https://github.com/browserbase/stagehand/graphs/contributors) for your help in making Stagehand the best browser automation framework. + +***Please do not hesitate to reach out to anyone listed here in the [public Discord server](https://stagehand.dev/discord)*** + +## General Workflow + +Get listed as [one of our beloved contributors](https://github.com/browserbase/stagehand/graphs/contributors)! + +1. **Discuss your proposed contribution before starting.** Not doing this runs you the risk of entirely discarding something you put considerable time and effort into. You can DM Miguel on [Discord](https://stagehand.dev/discord) for a 1on1 call. +2. **Open a Pull Request.** Create a fork of this repository, and follow [GitHub’s instructions to create a Pull Request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork). This allows our team to review your contribution and leave comments. +3. **Wait for Review**. We'll do our best to get to your contribution as soon as possible. If it's been 2-3 days and you have yet to receive any comments, DM Miguel on [Discord](https://stagehand.dev/discord) +4. 
**Merge into `evals` branch.** We don’t let external contributors [run our CI via GitHub Actions](https://github.com/browserbase/stagehand/blob/main/.github/workflows/ci.yml) to prevent spam and misuse. If your contribution passes an initial screen, we’ll run our evals on it + 1. By default, all PRs run the following tests that you can also run from the repo source: + 1. Lint (`npm run lint`) - Runs `prettier` and `eslint`. If this fails, you can most likely run `npm run format` to fix some simple linting errors. + 2. Build (`npm run build`) - Lints and builds TS → JS in `dist/` via `tsup` + 3. End-to-End (`npm run e2e`) - These are deterministic end-to-end Playwright tests to ensure the integrity of basic Playwright functionality of [`stagehand.page`](http://stagehand.page) and `stagehand.context` as well as compatibility with the Browserbase API + 4. Combination (`npm run evals category combination`) - This runs AI-based end-to-end tests using combinations of `act`, `extract`, and `observe` + 2. If you’re changing anything about `act`, `extract`, or `observe` itself, we might also run specific act/extract/observe evals to ensure existing functionality doesn’t significantly drop. + + ![CI](/images/CI.png) + +5. **Cleanup and merge to main**. Once it’s in `evals`, unfortunately the original contributor can’t make any further changes. The internal Stagehand team will be responsible for cleaning up the code and bringing it into main. + +## Contribution Guidelines + +1. **Use draft PRs.** If your PR is a work in progress, please convert it to a draft (see below) while you’re working on it, and mark it for review/add reviewers when you’re ready. This helps us prevent clutter in the review queue. + + ![Draft PR](/images/pr_draft.png) + +2. **Provide a reproducible test plan.** Include an eval (preferred) or example. We can’t merge your PR if we can’t run anything that specifically highlights your contribution. + 1. 
Write a script in [`evals/tasks`](https://github.com/browserbase/stagehand/tree/v2/evals/tasks) as `someTask.ts` + 2. Add your script to [`evals.config.json`](https://github.com/browserbase/stagehand/blob/v2/evals/evals.config.json) with default category `combination` (*or act/extract/observe if you’re* *only* *testing* *act/extract/observe*). +3. **Add a changeset.** Run `npx changeset` in TS or `uvx changeset` in Python to add a changeset that will directly reflect in the `CHANGELOG` in the upcoming release. + 1. `patch` - no net new functionality to an end-user + 2. `minor` - some net new functionality to an end-user (new function parameter, new exposed type, etc.) + 3. `major` - you shouldn’t be committing a major change diff --git a/packages/docs/v2/best-practices/cost-optimization.mdx b/packages/docs/v2/best-practices/cost-optimization.mdx new file mode 100644 index 000000000..0602b9db2 --- /dev/null +++ b/packages/docs/v2/best-practices/cost-optimization.mdx @@ -0,0 +1,258 @@ +--- +title: Cost Optimization +sidebarTitle: Cost Optimization +description: Minimize costs while maintaining automation performance +--- + +Cost optimization in Stagehand involves balancing LLM inference costs and browser infrastructure costs. This guide provides practical strategies to reduce your automation expenses. + +## Quick Wins + +Start with these simple optimizations that can reduce costs: + +### 1. Use the Right Model for the Job + +We don't recommend using larger, more premium models for simple tasks. See our [evaluation results](https://stagehand.dev/evals) for model performance and cost comparisons across different task types. + + + + Choose the right LLM for your budget and accuracy requirements + + + See how different models perform on different tasks + + + +### 2. Implement Smart Caching + +Cache successful actions to avoid repeated LLM calls. 
Learn the basics in our [Caching Guide](/v2/best-practices/caching): + + +```typescript TypeScript +// Cache successful actions +const [action] = await page.observe("Click the sign in button"); +await setCache("sign_in_button", action); + +// Reuse cached action (no LLM cost) +const cachedAction = await getCache("sign_in_button"); +if (cachedAction) { + await page.act(cachedAction); +} else { + await page.act(action); +} +``` +```python Python +# Cache successful actions +actions = await page.observe("Click the sign in button") +action = actions[0] +await set_cache("sign_in_button", action) + +# Reuse cached action (no LLM cost) +cached_action = await get_cache("sign_in_button") +if cached_action: + await page.act(cached_action) +else: + await page.act(action) +``` + + + + + Reduce costs with smart action caching and observe patterns + + + +### 3. Optimize Browser Sessions + +Reuse sessions when possible and set appropriate timeouts. See [Browser Configuration](/v2/configuration/browser) for details: + + +```typescript TypeScript +const stagehand = new Stagehand({ + env: "BROWSERBASE", + browserbaseSessionCreateParams: { + timeout: 1800, // 30 minutes instead of default 1 hour + keepAlive: true, // Keep session alive between tasks + } +}); +``` +```python Python +stagehand = Stagehand( + env="BROWSERBASE", + browserbase_session_create_params={ + "timeout": 1800, # 30 minutes instead of default 1 hour + "keep_alive": True, # Keep session alive between tasks + } +) +``` + + + + + Optimize Browserbase infrastructure costs and session management + + + +## Advanced Strategies + +### Intelligent Model Switching + +Automatically fall back to cheaper models for simple tasks: + + +```typescript TypeScript +// Use models from least to most expensive based on task complexity +// See stagehand.dev/evals for performance comparisons +async function smartAct(page: Page, prompt: string) { + const models = ["cheaper-model", "premium-model"]; + + for (const model of models) { + try 
{ + const stagehand = new Stagehand({ modelName: model }); + await stagehand.init(); + const [action] = await stagehand.page.observe(prompt); + await stagehand.page.act(action); + return; + } catch (error) { + console.log(`Falling back to ${model}...`); + } + } +} +``` +```python Python +# Use models from least to most expensive based on task complexity +# See stagehand.dev/evals for performance comparisons +async def smart_act(page, prompt: str): + models = ["cheaper-model", "premium-model"] + + for model in models: + try: + stagehand = Stagehand(model_name=model) + await stagehand.init() + actions = await stagehand.page.observe(prompt) + action = actions[0] + await stagehand.page.act(action) + return + except Exception: + print(f"Falling back to {model}...") +``` + + +### Session Pooling + +Reuse browser sessions across multiple tasks: + + +```typescript TypeScript +class SessionManager { + private sessions = new Map(); + + async getSession(taskType: string): Promise { + if (this.sessions.has(taskType)) { + return this.sessions.get(taskType)!; + } + + const stagehand = new Stagehand({ env: "BROWSERBASE" }); + await stagehand.init(); + this.sessions.set(taskType, stagehand); + return stagehand; + } +} +``` +```python Python +class SessionManager: + def __init__(self): + self.sessions = {} + + async def get_session(self, task_type: str): + if task_type in self.sessions: + return self.sessions[task_type] + + stagehand = Stagehand(env="BROWSERBASE") + await stagehand.init() + self.sessions[task_type] = stagehand + return stagehand +``` + + +## Cost Monitoring + +Track your spending to identify optimization opportunities. 
See our [Observability Guide](/configuration/observability) for detailed metrics: + + +```typescript TypeScript +// Monitor token usage +const metrics = stagehand.metrics; +console.log(`Total tokens: ${metrics.totalPromptTokens + metrics.totalCompletionTokens}`); +console.log(`Estimated cost: $${(metrics.totalPromptTokens + metrics.totalCompletionTokens) * 0.00001}`); +``` +```python Python +# Monitor token usage +metrics = stagehand.metrics +total_tokens = metrics['total_prompt_tokens'] + metrics['total_completion_tokens'] +print(f"Total tokens: {total_tokens}") +print(f"Estimated cost: ${total_tokens * 0.00001:.4f}") +``` + + + + + Monitor usage patterns and track costs in real-time + + + +## Budget Controls + +Set spending limits to prevent unexpected costs: + + +```typescript TypeScript +class BudgetGuard { + private dailySpend = 0; + private maxDailyBudget: number; + + constructor(maxDailyBudget: number = 25) { + this.maxDailyBudget = maxDailyBudget; + } + + checkBudget(estimatedCost: number): void { + if (this.dailySpend + estimatedCost > this.maxDailyBudget) { + throw new Error(`Daily budget exceeded: $${this.maxDailyBudget}`); + } + this.dailySpend += estimatedCost; + } +} +``` +```python Python +class BudgetGuard: + def __init__(self, max_daily_budget: float = 25.0): + self.daily_spend = 0 + self.max_daily_budget = max_daily_budget + + def check_budget(self, estimated_cost: float) -> None: + if self.daily_spend + estimated_cost > self.max_daily_budget: + raise Exception(f"Daily budget exceeded: ${self.max_daily_budget}") + self.daily_spend += estimated_cost +``` + + + +## Related Resources + + + + Choose the right LLM for your budget and accuracy requirements + + + + Reduce costs with smart action caching and observe patterns + + + + Monitor usage patterns and track costs in real-time + + + + Optimize Browserbase infrastructure costs and session management + + \ No newline at end of file diff --git a/packages/docs/v2/best-practices/deployments.mdx 
b/packages/docs/v2/best-practices/deployments.mdx new file mode 100644 index 000000000..0579b7ad7 --- /dev/null +++ b/packages/docs/v2/best-practices/deployments.mdx @@ -0,0 +1,238 @@ +--- +title: 'Deploying Stagehand' +description: 'Deploy your AI agents and automations to the cloud' +--- + + +**🌟 Preview: Browser Functions** - Deploy your web automation code directly on Browserbase with browser functions. Scale your `act()` automations in the cloud with zero infrastructure setup. Reach out to hello@browserbase.com to get beta access. + + +## Deploy on Vercel + +Securely run Stagehand on Browserbase inside a Vercel Function. This guide shows a minimal, production-safe HTTP endpoint you can call directly or on a schedule. + +### 1. Install Vercel CLI + +To download and install Vercel CLI, run one of the following commands: + + +```bash pnpm +pnpm i -g vercel +``` +```bash yarn +yarn global add vercel +``` +```bash npm +npm i -g vercel +``` +```bash bun +bun add -g vercel +``` + + +### 2. Project layout + +```text +your-project/ + api/ + run.ts + package.json + tsconfig.json + vercel.json +``` + +Create the structure with: + +```bash +mkdir -p api +touch api/run.ts package.json vercel.json tsconfig.json +``` + +### 3. 
`api/run.ts` (Node.js runtime) + +```typescript +// api/run.ts +import type { VercelRequest, VercelResponse } from "@vercel/node"; +import { Stagehand } from "@browserbasehq/stagehand"; +import { z } from "zod/v3"; + +export default async function handler(req: VercelRequest, res: VercelResponse): Promise { + try { + const stagehand = new Stagehand({ + env: "BROWSERBASE", + apiKey: process.env.BROWSERBASE_API_KEY!, + projectId: process.env.BROWSERBASE_PROJECT_ID!, + disablePino: true, + modelName: "google/gemini-2.5-flash", + modelClientOptions: { + apiKey: process.env.GOOGLE_API_KEY!, + }, + // optional session params + browserbaseSessionCreateParams: { + projectId: process.env.BROWSERBASE_PROJECT_ID!, + region: "us-west-2", + browserSettings: { + blockAds: true, + }, + }, + }); + + await stagehand.init(); + const page = stagehand.page; + + await page.goto("https://www.stagehand.dev/"); + await page.act("click the evals button"); + + const { extraction } = await page.extract("extract the fastest model"); + const data = { model: extraction ?? "" }; + + await stagehand.close(); + + res.status(200).json({ ok: true, data: data.model }); + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + res.status(500).json({ ok: false, error: msg }); + } +} +``` + +### 4. `package.json` + +```json +{ + "name": "bb-stagehand-on-vercel", + "private": true, + "type": "module", + "engines": { "node": ">=18" }, + "dependencies": { + "@browserbasehq/stagehand": "^2.4.3", + "zod": "^3.25.0" + }, + "devDependencies": { + "typescript": "^5.6.0", + "@types/node": "^20.12.12", + "@vercel/node": "^3.2.20" + } +} +``` + +### 5. `tsconfig.json` + +```json +{ + "compilerOptions": { + "target": "ES2022", + "module": "ES2022", + "moduleResolution": "node", + "outDir": ".vercel/output/functions", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "types": ["node"] + }, + "include": ["api/**/*.ts"] +} +``` + +### 6. 
`vercel.json` + +```json +{ + "$schema": "https://openapi.vercel.sh/vercel.json", + "functions": { + "api/run.ts": { + "maxDuration": 60 + } + } +} +``` + +See Vercel's [configuring functions](https://vercel.com/docs/functions/configuring-functions) docs for more details. + +### 7. Link your project + +Link your local folder to a Vercel project before configuring environment variables: + +```bash +# authenticate if needed +vercel login + +# link the current directory to a Vercel project (interactive) +vercel link +``` + +### 8. Environment variables + +Do not commit `.env` in production. Add variables via Vercel CLI: + +```bash +vercel env add BROWSERBASE_API_KEY +vercel env add BROWSERBASE_PROJECT_ID +# (and your model key if needed) +vercel env add GOOGLE_API_KEY +``` + +See also: [Browser Environment](/configuration/environment) for details on required variables. + +### 9. Test locally + +Replicate the Vercel environment locally to exercise your Function before deploying. Run from the project root. + +```bash +# ensure dependencies are installed +npm install + +# start the local Vercel dev server +vercel dev --listen 5005 +``` + +### 10. Deploy + +```bash +vercel +vercel --prod +``` + +### Execute the function + +#### Configure Protection Bypass for Automation + +Before invoking the production URL, create a Protection Bypass for Automation: + +1. Generate a 32-character secret (you can use `openssl rand -hex 16`) +2. Go to your project in Vercel +3. Navigate to Settings → Deployment Protection +4. 
Add the secret to "Protection Bypass for Automation" + +Then invoke the function with the bypass header: + +```bash +curl -X POST \ + -H "x-vercel-protection-bypass: <your-bypass-secret>" \ + https://<your-deployment-domain>/api/run +``` + +### Optional: Cron on Vercel + +Hit the same endpoint on a schedule by extending `vercel.json`: + +```json +{ + "$schema": "https://openapi.vercel.sh/vercel.json", + "functions": { + "api/run.ts": { + "maxDuration": 60 + } + }, + "crons": [ + { "path": "/api/run", "schedule": "0 * * * *" } + ] +} +``` + +### Features +- **No local browsers needed** with `env: "BROWSERBASE"`. [Browserbase](https://www.browserbase.com/) provides the browsers. +- **Fast functionality**: Offload browser work to Browserbase and return JSON promptly. +- **Long-running tasks**: Raise `maxDuration` and/or consider Edge runtime limits depending on plan. + diff --git a/packages/docs/v2/best-practices/mcp-integrations.mdx b/packages/docs/v2/best-practices/mcp-integrations.mdx new file mode 100644 index 000000000..b29476941 --- /dev/null +++ b/packages/docs/v2/best-practices/mcp-integrations.mdx @@ -0,0 +1,269 @@ +--- +title: "MCP Integrations" +description: "Using Model Context Protocol (MCP) integrations to enhance agent capabilities" +--- + +## What are MCP Integrations? + +MCP (Model Context Protocol) integrations allow you to connect your Stagehand agents to external tools, APIs, and services. This enables agents to perform actions beyond browser automation, such as web search, database operations, and API calls. + + +MCP integrations make your agents more powerful by combining browser automation with external capabilities. The agent can intelligently decide when to use browser actions versus external tools. + + +## Connection Options + +There are two options for connecting to MCP servers: + +1. **Pass a URL directly** - The simplest approach for quick setup +2. 
**Create a connection first** - Gives you more control over the connection + + +MCP client support is currently only available in TypeScript. + + +## Passing a URL + +The simplest way to add MCP integrations is by providing server URLs directly in the agent configuration: + +```typescript +const agent = stagehand.agent({ + provider: "openai", + model: "computer-use-preview", + integrations: [ + `https://mcp.exa.ai/mcp?exaApiKey=${process.env.EXA_API_KEY}`, + ], + instructions: `You have access to web search through Exa. Use it to find current information before browsing.`, + options: { + apiKey: process.env.OPENAI_API_KEY, + }, +}); + +await agent.execute("Search for the best headphones of 2025 and go through checkout for the top recommendation"); +``` + +## Creating a Connection First + +Alternatively, you can establish MCP connections first and then pass the client objects: + +```typescript +import { connectToMCPServer } from "@browserbasehq/stagehand"; + +// Connect to MCP server +const supabaseClient = await connectToMCPServer( + `https://server.smithery.ai/@supabase-community/supabase-mcp/mcp?api_key=${process.env.SMITHERY_API_KEY}` +); + +// You can also pass the config to start a local MCP server +const notionClient = await connectToMCPServer({ + command: "npx", + args: ["-y", "@notionhq/notion-mcp-server"], + env: { + NOTION_TOKEN: process.env.NOTION_TOKEN, + }, +}); + +// Use the connected client +const agent = stagehand.agent({ + provider: "openai", + model: "computer-use-preview", + integrations: [supabaseClient, notionClient], + instructions: `You can interact with Supabase databases and Notion. 
Use these tools to store and retrieve data.`, + options: { + apiKey: process.env.OPENAI_API_KEY, + }, +}); + +await agent.execute("Search for restaurants in New Brunswick, NJ and save the first result to the database"); +``` + + + +## Multiple Integrations + +You can combine multiple MCP integrations in a single agent: + +```typescript +const databaseClient = await connectToMCPServer(/* database config */); + +const agent = stagehand.agent({ + integrations: [ + `https://search-service.example.com/mcp?apiKey=${process.env.SEARCH_API_KEY}`, + databaseClient + ], + instructions: `You have access to external tools for search and data storage. Use these tools strategically to complete tasks efficiently.` +}); +``` + +## Best Practices + +### Choose the Right Connection Approach + + +**When to use:** +- Simple setup requirements +- Standard API configurations +- Getting started quickly + +**Benefits:** +- Minimal code required +- Automatic connection handling +- Easy to configure + + + +**When to use:** +- Custom connection options +- Connection reuse across agents +- Advanced error handling + +**Benefits:** +- Full control over connections +- Better error handling +- Connection pooling capabilities + + + +### Environment Variables + +Always use environment variables for API keys and sensitive information: + +```bash +# .env file +SEARCH_API_KEY=your_search_service_key +MCP_SERVICE_API_KEY=your_mcp_service_key +OPENAI_API_KEY=your_openai_key +DATABASE_URL=your_database_url +DATABASE_API_KEY=your_database_key +``` + +### Instructions Best Practices + +Provide clear instructions about available tools: + + + +```typescript +instructions: `You have access to: +1. Web search tools - Use to find current information +2. Database tools - Use to store/retrieve data +3. Browser automation - Use for web interactions + +Always search for current information before making decisions. 
+Store important data for later reference.` +``` + + + +```typescript +instructions: "You can search and save data." +``` + + + +### Error Handling + +Implement proper error handling for MCP connections: + +```typescript +try { + const client = await connectToMCPServer(serverUrl); + + const agent = stagehand.agent({ + integrations: [client], + // ... other config + }); + + const result = await agent.execute(instruction); +} catch (error) { + console.error("MCP integration failed:", error); + // Handle fallback behavior +} +``` + +## Troubleshooting + + + +**Problem:** MCP server connections timing out + +**Solutions:** +- Verify server URLs are correct and accessible +- Check network connectivity +- Ensure API keys are valid and have proper permissions +- Try connecting to servers individually to isolate issues + + + +**Problem:** Agent not using available MCP tools + +**Solutions:** +- Make instructions more specific about when to use tools +- Ensure API keys are properly configured +- Check that the MCP server supports the expected tools +- Verify tool descriptions are clear and actionable + + + +**Problem:** API key or authentication failures + +**Solutions:** +- Verify all required environment variables are set +- Check API key validity and permissions +- Ensure URLs include necessary authentication parameters +- Test MCP connections independently before using in agents + + + +## Examples + +### Web Search + Browser Automation +```typescript +const agent = stagehand.agent({ + integrations: [`https://mcp.exa.ai/mcp?exaApiKey=${process.env.EXA_API_KEY}`], + instructions: `First search for current information, then use the browser to complete tasks based on what you find.` +}); + +await agent.execute("Find the best laptop deals for 2025 and navigate to purchase the top recommendation"); +``` + +### Data Extraction + Storage +```typescript +const supabaseClient = await connectToMCPServer(/* config */); + +const agent = stagehand.agent({ + integrations: 
[supabaseClient], + instructions: `Extract data from websites and store it using available database tools.` +}); + +await agent.execute("Extract all restaurant information from this directory and save it to the database"); +``` + +### Multi-tool Workflow +```typescript +const agent = stagehand.agent({ + integrations: [ + `https://mcp.exa.ai/mcp?exaApiKey=${process.env.EXA_API_KEY}`, + supabaseClient + ], + instructions: `Use all available tools strategically: search for current info, browse websites, and store important data.` +}); + +await agent.execute("Research competitor pricing, compare with our site, and store the analysis"); +``` + +## Further Reading + + + + Learn the fundamentals of Stagehand agents + + + + Set up your own MCP server + + + + Create custom MCP tools + + diff --git a/packages/docs/v2/best-practices/playwright-interop.mdx b/packages/docs/v2/best-practices/playwright-interop.mdx new file mode 100644 index 000000000..a9ec4dfc5 --- /dev/null +++ b/packages/docs/v2/best-practices/playwright-interop.mdx @@ -0,0 +1,37 @@ +--- +title: 'Playwright Interoperability' +description: 'How Stagehand interacts with Playwright' +--- + +Stagehand is built on top of [Playwright](https://playwright.dev/), so you can use Playwright methods directly through the Stagehand instance. + +## `page` and `context` + +`stagehand.page` and `stagehand.context` are instances of Playwright's `Page` and `BrowserContext` respectively. Use these methods to interact with the Playwright instance that Stagehand is using. 
+ + +```TypeScript TypeScript +const page = stagehand.page; +// Base Playwright methods work +await page.goto("https://github.com/browserbase/stagehand"); + +// Stagehand overrides Playwright objects +await page.act("click on the contributors") +``` + +```python Python +page = stagehand.page +# Base Playwright methods work +await page.goto("https://github.com/browserbase/stagehand") + +# Stagehand overrides Playwright objects +await page.act("click on the contributors") +``` + + +## Stagehand v. Playwright +Below is an example of how to extract a list of companies from the AI Grant website using both Stagehand and Playwright. + +Stagehand v. Playwright + +The above example with Stagehand can be easily reused to extract data from other websites, whereas the Playwright example would need to be rewritten for each new website. \ No newline at end of file diff --git a/packages/docs/v2/best-practices/prompting-best-practices.mdx b/packages/docs/v2/best-practices/prompting-best-practices.mdx new file mode 100644 index 000000000..9ba3683b9 --- /dev/null +++ b/packages/docs/v2/best-practices/prompting-best-practices.mdx @@ -0,0 +1,493 @@ +--- +title: Prompting Best Practices +description: "Write effective prompts for reliable Stagehand automation" +--- + +Good prompts make Stagehand reliable. Bad prompts cause failures. Here's how to write prompts that work consistently. + +## Act Method + +Use `act()` for single actions on web pages. Each action should be focused and clear. 
+ + +```typescript TypeScript +// Good - Single, specific actions +await page.act("click the 'Add to Cart' button"); +await page.act("type 'user@example.com' into the email field"); + +// Bad - Multiple actions combined +await page.act("fill out the form and submit it"); +await page.act("login with credentials and navigate to dashboard"); +``` + +```python Python +# Good - Single, specific actions +await page.act("click the 'Add to Cart' button") +await page.act("type 'user@example.com' into the email field") + +# Bad - Multiple actions combined +await page.act("fill out the form and submit it") +await page.act("login with credentials and navigate to dashboard") +``` + + +### Use Element Types, Not Colors + +Describe elements by their type and function rather than visual attributes like color. + + +```typescript TypeScript +// Good - Element types and descriptive text +await page.act("click the 'Sign In' button"); +await page.act("type into the email input field"); + +// Bad - Color-based descriptions +await page.act("click the blue button"); +await page.act("type into the white input"); +``` + +```python Python +# Good - Element types and descriptive text +await page.act("click the 'Sign In' button") +await page.act("type into the email input field") + +# Bad - Color-based descriptions +await page.act("click the blue button") +await page.act("type into the white input") +``` + + +### Use Descriptive Language + + +```typescript TypeScript +// Good - Clear element identification +await page.act("click the 'Next' button at the bottom of the form"); +await page.act("type into the search bar at the top of the page"); + +// Bad - Vague descriptions +await page.act("click next"); +await page.act("type into search"); +``` + +```python Python +# Good - Clear element identification +await page.act("click the 'Next' button at the bottom of the form") +await page.act("type into the search bar at the top of the page") + +# Bad - Vague descriptions +await page.act("click next") 
+await page.act("type into search") +``` + + +### Choose the Right Action Verbs + +- **Click** for buttons, links, checkboxes +- **Type** for text inputs +- **Select** for dropdowns +- **Check/uncheck** for checkboxes +- **Upload** for file inputs + + +```typescript TypeScript +// Good +await page.act("click the submit button"); +await page.act("select 'Option 1' from dropdown"); + +// Bad +await page.act("click submit"); +await page.act("choose option 1"); +``` + +```python Python +# Good +await page.act("click the submit button") +await page.act("select 'Option 1' from dropdown") + +# Bad +await page.act("click submit") +await page.act("choose option 1") +``` + + +### Protect Sensitive Data + +Variables keep sensitive information out of prompts and logs. + + +```typescript TypeScript +// Good - Secure approach +await page.act({ + action: "enter %username% in the email field", + variables: { + username: "user@example.com" + } +}); + +await page.act({ + action: "enter %password% in the password field", + variables: { + password: process.env.USER_PASSWORD + } +}); + +// Bad - Insecure approach +await page.act("type 'mySecretPassword123' into the password field"); +``` + +```python Python +import os + +# Good - Secure approach +await page.act( + "enter %username% in the email field", + variables={ + "username": "user@example.com" + } +) + +await page.act( + "enter %password% in the password field", + variables={ + "password": os.environ.get("USER_PASSWORD") + } +) + +# Bad - Insecure approach +await page.act("type 'mySecretPassword123' into the password field") +``` + + + +Set `verbose: 0` in your Stagehand config to prevent secrets from appearing in logs. + + +## Extract Method + +Use `extract()` to pull structured data from pages. Define clear schemas and provide context. + +### Schema Best Practices + +Use descriptive field names, correct types, and detailed descriptions. Field descriptions provide context that helps the agent understand exactly what to extract. 
+ + +```typescript TypeScript +// Good - Descriptive names, correct types, and helpful descriptions +const productData = await page.extract({ + instruction: "Extract product information", + schema: z.object({ + productTitle: z.string().describe("The main product name displayed on the page"), + priceInDollars: z.number().describe("Current selling price as a number, without currency symbol"), + isInStock: z.boolean().describe("Whether the product is available for purchase") + }) +}); + +// Bad - Generic names, wrong types, no descriptions +const data = await page.extract({ + instruction: "Get product details", + schema: z.object({ + name: z.string(), // Too generic, no context + price: z.string(), // Should be number + stock: z.string() // Should be boolean, no context + }) +}); +``` + +```python Python +from pydantic import BaseModel, Field + +# Good - Descriptive names, correct types, and helpful descriptions +class ProductData(BaseModel): + productTitle: str = Field(description="The main product name displayed on the page") + priceInDollars: float = Field(description="Current selling price as a number, without currency symbol") + isInStock: bool = Field(description="Whether the product is available for purchase") + +productData = await page.extract( + "Extract product information", + schema=ProductData +) + +# Bad - Generic names, wrong types, no descriptions +class Data(BaseModel): + name: str # Too generic, no context + price: str # Should be float, no context + stock: str # Should be bool, no context + +data = await page.extract( + "Get product details", + schema=Data +) +``` + + +### Handle Arrays Correctly + +Always wrap schemas in objects for reliable extraction. 
+ + +```typescript TypeScript +// Good - Array wrapped in object +const listings = await page.extract({ + instruction: "Extract all apartment listings", + schema: z.object({ + apartments: z.array(z.object({ + address: z.string(), + rent: z.number() + })) + }) +}); + +// Bad - Bare array +const listings = await page.extract({ + instruction: "Extract apartment listings", + schema: z.array(z.string()) // Don't do this +}); +``` + +```python Python +from pydantic import BaseModel +from typing import List + +# Good - Array wrapped in object +class Apartment(BaseModel): + address: str + rent: float + +class Listings(BaseModel): + apartments: List[Apartment] + +listings = await page.extract( + "Extract all apartment listings", + schema=Listings +) + +# Bad - Bare array (not supported) +# Don't do this - arrays must be wrapped in objects +``` + + +### Use Proper URL Types + +Specify URL types to tell Stagehand to extract URLs. Without proper URL types, Stagehand won't extract URLs. + + +```typescript TypeScript +// Good - Tells Stagehand to extract URLs +const links = await page.extract({ + instruction: "Extract navigation links", + schema: z.object({ + links: z.array(z.object({ + text: z.string(), + url: z.string().url() // Required for URL extraction + })) + }) +}); +``` + +```python Python +from pydantic import BaseModel, HttpUrl +from typing import List + +# Good - Tells Stagehand to extract URLs +class Link(BaseModel): + text: str + url: HttpUrl # Required for URL extraction + +class Links(BaseModel): + links: List[Link] + +links = await page.extract( + "Extract navigation links", + schema=Links +) +``` + + +## Observe Method + +Use `observe()` to discover actionable elements before acting on them. + +### Check Elements First + +Verify elements exist before taking action to avoid errors. 
+ + +```typescript TypeScript +// Check for elements first +const loginButtons = await page.observe("Find the login button"); + +if (loginButtons.length > 0) { + await page.act(loginButtons[0]); +} else { + console.log("No login button found"); +} +``` + +```python Python +# Check for elements first +login_buttons = await page.observe("Find the login button") + +if len(login_buttons) > 0: + await page.act(login_buttons[0]) +else: + print("No login button found") +``` + + +### Be Specific About Element Types + + +```typescript TypeScript +// Good - Specific element types +const submitButtons = await page.observe("Find submit button in the form"); +const dropdowns = await page.observe("Find the state dropdown menu"); + +// Bad - Too vague +const elements = await page.observe("Find submit stuff"); +const things = await page.observe("Find state selection"); +``` + +```python Python +# Good - Specific element types +submit_buttons = await page.observe("Find submit button in the form") +dropdowns = await page.observe("Find the state dropdown menu") + +# Bad - Too vague +elements = await page.observe("Find submit") +things = await page.observe("Find state selection") +``` + + +## Agent Method + +Use `agent()` for complex, multi-step workflows. Provide detailed instructions and set appropriate limits. + +### Navigate First + +Don't include navigation in agent tasks. Handle it separately. 
+ + +```typescript TypeScript +// Good - Navigate first +await page.goto('https://amazon.com'); +await agent.execute('Search for wireless headphones under $100 and add the best rated one to cart'); + +// Bad - Navigation in task +await agent.execute('Go to Amazon, search for headphones, and add one to cart'); +``` + +```python Python +# Good - Navigate first +await page.goto('https://amazon.com') +await agent.execute('Search for wireless headphones under $100 and add the best rated one to cart') + +# Bad - Navigation in task +await agent.execute('Go to Amazon, search for headphones, and add one to cart') +``` + + +### Be Highly Specific + +Detailed instructions lead to better results. + + +```typescript TypeScript +// Good - Detailed instructions +await agent.execute({ + instruction: "Find Italian restaurants in Brooklyn that are open after 10pm, have outdoor seating, and are rated 4+ stars. Save the top 3 results.", + maxSteps: 25 +}); + +// Bad - Vague instructions +await agent.execute("Find some good restaurants"); +``` + +```python Python +# Good - Detailed instructions +await agent.execute( + instruction="Find Italian restaurants in Brooklyn that are open after 10pm, have outdoor seating, and are rated 4+ stars. Save the top 3 results.", + max_steps=25 +) + +# Bad - Vague instructions +await agent.execute("Find some good restaurants") +``` + + +### Set Appropriate Step Limits + +Match step limits to task complexity. 
+ + +```typescript TypeScript +// Simple task - fewer steps +await agent.execute({ + instruction: "Subscribe to the newsletter with email 'user@example.com'", + maxSteps: 10 +}); + +// Complex task - more steps +await agent.execute({ + instruction: "Research and compare 5 project management tools with pricing and features", + maxSteps: 50 +}); +``` + +```python Python +# Simple task - fewer steps +await agent.execute( + instruction="Subscribe to the newsletter with email 'user@example.com'", + max_steps=10 +) + +# Complex task - more steps +await agent.execute( + instruction="Research and compare 5 project management tools with pricing and features", + max_steps=50 +) +``` + + +### Include Success Criteria + +Tell the agent how to know when it's done. + + +```typescript TypeScript +// Good - Clear success criteria +await agent.execute({ + instruction: "Add 3 smartphone cases to cart and confirm the cart shows exactly 3 items with total price", + maxSteps: 20 +}); + +// Bad - No validation +await agent.execute("Add some items to cart"); +``` + +```python Python +# Good - Clear success criteria +await agent.execute( + instruction="Add 3 smartphone cases to cart and confirm the cart shows exactly 3 items with total price", + max_steps=20 +) + +# Bad - No validation +await agent.execute("Add some items to cart") +``` + + +## Common Mistakes to Avoid + +- **Combining multiple actions** - Keep each `act()` call to one action +- **Using vague descriptions** - Be specific about which elements to interact with +- **Exposing sensitive data** - Always use variables for credentials +- **Skipping validation** - Check results before proceeding + +## Testing Your Prompts + +1. **Start simple** - Test basic functionality first +2. **Add complexity gradually** - Build up to complex workflows +3. **Monitor results** - Use logging to understand what's happening +4. **Iterate based on failures** - Refine prompts when they don't work +Remember: Good prompting is iterative. 
When in doubt, be more specific rather than less. \ No newline at end of file diff --git a/packages/docs/v2/best-practices/speed-optimization.mdx b/packages/docs/v2/best-practices/speed-optimization.mdx new file mode 100644 index 000000000..7bc72a674 --- /dev/null +++ b/packages/docs/v2/best-practices/speed-optimization.mdx @@ -0,0 +1,352 @@ +--- +title: Speed Optimization +sidebarTitle: Speed Optimization +description: Optimize Stagehand performance for faster automation and reduced latency +--- + +Stagehand performance depends on several factors: DOM processing speed, LLM inference time, browser operations, and network latency. This guide provides proven strategies to maximize automation speed. + +## Quick Performance Wins + +### 1. Plan Ahead with Observe + + +Use a single `observe()` call to plan multiple actions, then execute them efficiently: + + +```typescript TypeScript +// Instead of sequential operations with multiple LLM calls +await page.act("Fill name field"); // LLM call #1 +await page.act("Fill email field"); // LLM call #2 +await page.act("Select country dropdown"); // LLM call #3 + +// Use single observe to plan all form fields - one LLM call +const formFields = await page.observe("Find all form fields to fill"); + +// Execute all actions without LLM inference +for (const field of formFields) { + await page.act(field); // No LLM calls! +} +``` +```python Python +import asyncio + +# Instead of sequential operations with multiple LLM calls +await page.act("Fill name field") # LLM call #1 +await page.act("Fill email field") # LLM call #2 +await page.act("Select country dropdown") # LLM call #3 + +# Use single observe to plan all form fields - one LLM call +form_fields = await page.observe("Find all form fields to fill") + +# Execute all actions without LLM inference +for field in form_fields: + await page.act(field) # No LLM calls! + +``` + + + +**Performance Tip**: Acting on `observe` results avoids LLM inference entirely. 
This approach is 2-3x faster than direct `act()` calls and is the recommended pattern for multi-step workflows. + + + + Learn advanced caching patterns and cache invalidation strategies + + +### 2. Optimize DOM Processing + +Reduce DOM complexity before Stagehand processes the page: + + +```typescript TypeScript +// Remove heavy elements that slow down processing +await page.evaluate(() => { + // Remove video elements + document.querySelectorAll('video, iframe').forEach(el => el.remove()); + + // Hide complex animations + document.querySelectorAll('[style*="animation"]').forEach(el => { + (el as HTMLElement).style.animation = 'none'; + }); +}); + +// Then perform Stagehand operations +await page.act("Click the submit button"); +``` +```python Python +# Remove heavy elements that slow down processing +await page.evaluate(""" +() => { + // Remove video elements + document.querySelectorAll('video, iframe').forEach(el => el.remove()); + + // Hide complex animations + document.querySelectorAll('[style*="animation"]').forEach(el => { + el.style.animation = 'none'; + }); +} +""") + +# Then perform Stagehand operations +await page.act("Click the submit button") +``` + + +### 3. 
Set Appropriate Timeouts + +Use shorter timeouts for simple operations and longer ones for complex page loads: + + +```typescript TypeScript +// Simple actions - reduce action timeout +await page.act({ + instruction: "Click the login button", + actTimeout: 5000 // Default is 30000ms, reduce for simple clicks +}); + +// Complex page loads - optimize navigation +await page.goto("https://heavy-spa.com", { + waitUntil: "domcontentloaded", // Don't wait for all resources + timeout: 15000 // Shorter than default 30s +}); +``` +```python Python +# Simple actions - reduce action timeout +await page.act("Click button", act_timeout=5000) + + +# Complex page loads - optimize navigation +await page.goto("https://heavy-spa.com", + wait_until="domcontentloaded", + timeout=15000 +) +``` + + +## Advanced Performance Strategies + + +### Smart Model Selection + +Use faster models for simple tasks, premium models only when needed: + + +```typescript TypeScript +class SpeedOptimizedStagehand { + private fastModel: Stagehand; + private premiumModel: Stagehand; + + async smartAct(page: Page, prompt: string, complexity: 'simple' | 'complex') { + const model = complexity === 'simple' ? 
this.fastModel : this.premiumModel; + return await model.page.act(prompt); + } +} + +// Use fast model for simple clicks/forms +await stagehand.smartAct(page, "Click submit", 'simple'); + +// Use premium model for complex reasoning +await stagehand.smartAct(page, "Find the cheapest flight option", 'complex'); +``` +```python Python +class SpeedOptimizedStagehand: + def __init__(self): + self.fast_model = Stagehand(model_name="fast-model") + self.premium_model = Stagehand(model_name="premium-model") + + async def smart_act(self, page, prompt: str, complexity: str): + model = self.fast_model if complexity == 'simple' else self.premium_model + return await model.page.act(prompt) + +# Use fast model for simple clicks/forms +await stagehand.smart_act(page, "Click submit", 'simple') + +# Use premium model for complex reasoning +await stagehand.smart_act(page, "Find the cheapest flight option", 'complex') +``` + + + + Compare model performance and costs + + +### Page Load Optimization + +Skip unnecessary resources during page loads: + + +```typescript TypeScript +// Block heavy resources globally +await context.route('**/*', (route) => { + const resourceType = route.request().resourceType(); + if (['image', 'font', 'media'].includes(resourceType)) { + route.abort(); + } else { + route.continue(); + } +}); + +// Use faster navigation +await page.goto(url, { + waitUntil: 'domcontentloaded', // Don't wait for images/fonts + timeout: 10000 +}); +``` +```python Python +# Block heavy resources globally +async def handle_route(route): + resource_type = route.request.resource_type + if resource_type in ['image', 'font', 'media']: + await route.abort() + else: + await route.continue_() + +await context.route('**/*', handle_route) + +# Use faster navigation +await page.goto(url, + wait_until='domcontentloaded', # Don't wait for images/fonts + timeout=10000 +) +``` + + + Balance speed with cost considerations + + +## Performance Monitoring and Benchmarking + +Track performance 
metrics and measure optimization impact: + +### Performance Tracking + + +```typescript TypeScript +class PerformanceTracker { + private speedMetrics: Map<string, number[]> = new Map(); + + async timedAct(page: Page, prompt: string): Promise<ActResult> { + const start = Date.now(); + const result = await page.act(prompt); + const duration = Date.now() - start; + + if (!this.speedMetrics.has(prompt)) { + this.speedMetrics.set(prompt, []); + } + this.speedMetrics.get(prompt)!.push(duration); + + console.log(`Action "${prompt}" took ${duration}ms`); + return result; + } + + getAverageTime(prompt: string): number { + const times = this.speedMetrics.get(prompt) || []; + return times.reduce((a, b) => a + b, 0) / times.length; + } +} +``` +```python Python +import time +from collections import defaultdict + +class PerformanceTracker: + def __init__(self): + self.speed_metrics = defaultdict(list) + + async def timed_act(self, page, prompt: str): + start = time.time() + result = await page.act(prompt) + duration = (time.time() - start) * 1000 # Convert to ms + + self.speed_metrics[prompt].append(duration) + print(f'Action "{prompt}" took {duration:.0f}ms') + return result + + def get_average_time(self, prompt: str) -> float: + times = self.speed_metrics[prompt] + return sum(times) / len(times) if times else 0 +``` + + +Example Output: +``` +Action "Fill form" took 1000ms +Action "Click submit" took 2000ms +Action "Confirm submission" took 5000ms +``` + +### Before vs After Benchmarking + + +```typescript TypeScript +// Before optimization +console.time("workflow"); +await page.act("Fill form"); +await page.act("Click submit"); +await page.act("Confirm submission"); +console.timeEnd("workflow"); // 8000ms + +// After optimization with observe planning +console.time("workflow-optimized"); +const workflowActions = await page.observe("Find form, submit, and confirm elements"); + +// Execute actions sequentially to avoid conflicts +for (const action of workflowActions) { + await page.act(action); +} 
+console.timeEnd("workflow-optimized"); // 500ms +``` +```python Python +import time + +# Before optimization +start = time.time() +await page.act("Fill form") +await page.act("Click submit") +await page.act("Confirm submission") +print(f"Workflow took {(time.time() - start) * 1000:.0f}ms") # 8000ms + +# After optimization with observe planning +start = time.time() +workflow_actions = await page.observe("Find form, submit, and confirm elements") + +# Execute actions sequentially to avoid conflicts +for action in workflow_actions: + await page.act(action) +print(f"Optimized workflow took {(time.time() - start) * 1000:.0f}ms") # 500ms +``` + + +Example Output: +``` +Workflow took 8000ms +Optimized workflow took 500ms +``` + + + + Set up comprehensive performance monitoring + + + + +## Related Resources + + + + Advanced caching patterns for maximum performance + + + + Balance speed improvements with cost considerations + + + + Optimize Browserbase settings for speed + + + + Choose the right model for speed vs accuracy + + \ No newline at end of file diff --git a/packages/docs/v2/best-practices/usecase-observe.mdx b/packages/docs/v2/best-practices/usecase-observe.mdx new file mode 100644 index 000000000..ebe8a1180 --- /dev/null +++ b/packages/docs/v2/best-practices/usecase-observe.mdx @@ -0,0 +1,104 @@ +--- +sidebarTitle: Use Cases +--- + +## Real-World Use Cases + +### E-commerce Product Discovery + +```typescript +// Discover product interaction elements +const productActions = await page.observe({ + instruction: "Find add to cart buttons, size selectors, and product images" +}); + +// Categorize actions by type +const cartButtons = productActions.filter(a => + a.description.toLowerCase().includes('cart') +); +const sizeOptions = productActions.filter(a => + a.description.toLowerCase().includes('size') +); + +// Execute purchase workflow +if (sizeOptions.length > 0) { + await page.act(sizeOptions[0]); // Select size first +} +if (cartButtons.length > 0) { + await 
page.act(cartButtons[0]); // Then add to cart +} +``` + +### Form Handling & Validation + +```typescript +// Analyze form structure before filling +const formElements = await page.observe({ + instruction: "Find form fields, validation messages, and submit buttons" +}); + +// Check for required fields +const requiredFields = formElements.filter(e => + e.description.includes('required') || e.description.includes('*') +); + +console.log(`Found ${requiredFields.length} required fields to complete`); + +// Fill form systematically +for (const field of requiredFields) { + await page.act(field); + // Add appropriate input based on field type +} +``` + +### Dynamic Content & SPA Navigation + +```typescript +// Wait for and discover dynamically loaded content +await page.waitForLoadState('networkidle'); + +const dynamicElements = await page.observe({ + instruction: "Find newly loaded content, infinite scroll triggers, or loading indicators", + domSettleTimeoutMs: 15000 // Wait longer for dynamic content +}); + +// Handle infinite scroll +const scrollTriggers = dynamicElements.filter(e => + e.description.toLowerCase().includes('load more') || + e.description.toLowerCase().includes('scroll') +); + +if (scrollTriggers.length > 0) { + await page.act(scrollTriggers[0]); + // Recursively observe new content + const newContent = await page.observe("Find additional items"); +} +``` + +### Multi-Step Workflow Planning + +```typescript +// Plan entire checkout flow upfront +async function planCheckoutWorkflow() { + // Step 1: Cart page analysis + await page.goto('/cart'); + const cartActions = await page.observe("Find checkout and cart modification options"); + + // Step 2: Checkout page analysis + const checkoutButton = cartActions.find(a => a.description.includes('checkout')); + if (checkoutButton) await page.act(checkoutButton); + + const checkoutActions = await page.observe("Find payment forms and shipping options"); + + // Step 3: Plan execution order + const shippingFields = 
checkoutActions.filter(a => a.description.includes('shipping')); + const paymentFields = checkoutActions.filter(a => a.description.includes('payment')); + const submitButton = checkoutActions.find(a => a.description.includes('complete order')); + + return { shippingFields, paymentFields, submitButton }; +} + +// Execute planned workflow +const workflow = await planCheckoutWorkflow(); +// Fill shipping → payment → submit +``` diff --git a/packages/docs/v2/best-practices/user-data.mdx b/packages/docs/v2/best-practices/user-data.mdx new file mode 100644 index 000000000..90e3ac029 --- /dev/null +++ b/packages/docs/v2/best-practices/user-data.mdx @@ -0,0 +1,56 @@ +--- +title: User Data Directory +sidebarTitle: User Data +description: Persist browser data between sessions +--- + +### User Data Directory + +Persist browser data between sessions using a custom user data directory: + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +// For Browserbase sessions +const stagehand = new Stagehand({ + env: "BROWSERBASE", + browserbaseSessionCreateParams: { + userDataDir: "/path/to/user/data/directory", + }, +}); + +// For Local sessions +const localStagehand = new Stagehand({ + env: "LOCAL", + localBrowserLaunchOptions: { + userDataDir: "./browser-data", + }, +}); + +await stagehand.init(); +console.log("Session ID:", stagehand.sessionId); +``` +```python Python +from stagehand import Stagehand + +# For Browserbase sessions +stagehand = Stagehand( + env="BROWSERBASE", + browserbase_session_create_params={ + "user_data_dir": "/path/to/user/data/directory", + }, +) + +# For Local sessions +local_stagehand = Stagehand( + env="LOCAL", + local_browser_launch_options={ + "user_data_dir": "./browser-data", + }, +) + +await stagehand.init() +print(f"Session ID: {stagehand.session_id}") +``` + \ No newline at end of file diff --git a/packages/docs/v2/best-practices/using-multiple-tabs.mdx b/packages/docs/v2/best-practices/using-multiple-tabs.mdx new 
file mode 100644 index 000000000..9cbda7254 --- /dev/null +++ b/packages/docs/v2/best-practices/using-multiple-tabs.mdx @@ -0,0 +1,141 @@ +--- +title: 'Using Multiple Tabs' +description: 'Act on multiple tabs with Stagehand' +--- + +Many modern web applications open new tabs when users click certain buttons or links. Without proper multitab support, automation scripts break when expected content appears in a new tab rather than the current one. Stagehand's multitab capabilities ensure your automations work seamlessly across multitab workflows. + +## The Stagehand Page + +Stagehand automatically adapts to multitab workflows. The `stagehand.page` object always points to the most recently opened or active tab, ensuring your automations continue working even when new tabs are created. + +This means you can continue using familiar patterns: + + +```typescript TypeScript +const page = stagehand.page; +await page.goto("https://example.com"); +await page.act("click the button that opens a new tab"); +// page now automatically points to the new tab +await page.extract("get data from new tab"); +``` + +```python Python +page = stagehand.page +await page.goto("https://example.com") +await page.act("click the button that opens a new tab") +# page now automatically points to the new tab +await page.extract("get data from new tab") +``` + + + +**Important**: [Stagehand Agent](/v2/basics/agent) will always operate on the `stagehand.page`. If you need an agent to work across specific tabs, you'll need to manage page switching manually. 
+ + +## Manual Page Management + +For more control or multitab workflows, you can manage multiple tabs explicitly: + + +```typescript TypeScript +// Create a second page +await stagehand.context.newPage(); +const pages = stagehand.context.pages(); + +const githubPage = pages[0]; +const pythonPage = pages[1]; + +// Navigate each page to different repositories +await githubPage.goto("https://github.com/browserbase/stagehand"); +await pythonPage.goto("https://github.com/browserbase/stagehand-python"); + +// Extract data from both pages simultaneously +const [stagehandStars, stagehandPythonStars] = await Promise.all([ + githubPage.extract("extract the repository stars"), + pythonPage.extract("extract the repository stars") +]); + +console.log(`Stagehand stars: ${stagehandStars}`); +console.log(`Stagehand-Python stars: ${stagehandPythonStars}`); +``` + +```python Python +# Create a second page +await stagehand.context.new_page() +pages = stagehand.context.pages() + +github_page = pages[0] +python_page = pages[1] + +# Navigate each page to different repositories +await github_page.goto("https://github.com/browserbase/stagehand") +await python_page.goto("https://github.com/browserbase/stagehand-python") + +# Extract data from both pages +stagehand_stars = await github_page.extract("extract the repository stars") +stagehand_python_stars = await python_page.extract("extract the repository stars") + +print(f"Stagehand stars: {stagehand_stars}") +print(f"Stagehand-Python stars: {stagehand_python_stars}") +``` + + +## Handling Tab Events + +You can also listen for tab events to control what happens when new tabs are opened: + + +```typescript TypeScript +const page = stagehand.page; +await page.goto("https://browserbase.github.io/stagehand-eval-sites/sites/five-tab/"); + +// close the new tab after it's opened +page.on("popup", async () => { + const newPage = stagehand.context.pages()[1]; + await newPage.close(); +}); + +await page.act("click the button to open the other 
page"); + +const page_number = await page.extract("extract the page number"); +console.log(`You're on page ${page_number}`); +``` + +```python Python +page = stagehand.page +await page.goto("https://browserbase.github.io/stagehand-eval-sites/sites/five-tab/") + +# Close the new tab after it's opened +async def handle_popup(): + new_page = stagehand.context.pages()[1] + await new_page.close() + +page.on("popup", handle_popup) + +await page.act("click the button to open the other page") + +page_number = await page.extract("extract the page number") +print(f"You're on page {page_number}") +``` + + +## Next Steps + + + + Use `Agent` to autonomously execute multi-step tasks and complex workflows. + + + + Learn best practices for interacting with elements inside iframes. + + + + Manage browser contexts and sessions for complex automation scenarios. + + + + Handle errors gracefully and debug automation issues effectively. + + \ No newline at end of file diff --git a/packages/docs/v2/best-practices/working-with-iframes.mdx b/packages/docs/v2/best-practices/working-with-iframes.mdx new file mode 100644 index 000000000..a12d1a99b --- /dev/null +++ b/packages/docs/v2/best-practices/working-with-iframes.mdx @@ -0,0 +1,90 @@ +--- +title: Working with iframes +--- + +### What is an iframe? + +Iframes embed other pages within your current page. Sites use them for consent banners, payment widgets, chat bubbles, and third-party content. +Elements inside iframes exist in a separate context from the main page. + +### Enable iframe support + +Set `iframes: true` in your `act()`, `observe()`, and `extract()` commands.
+ + +```typescript TypeScript +// Act within iframes +await page.act({ action: "click the accept cookies button", iframes: true }); + +// Observe within iframes +const results = await page.observe({ + instruction: "Find the primary action button", + iframes: true, +}); + +// Extract from iframes +const data = await page.extract({ + instruction: "Extract the product price from the payment widget", + schema: z.object({ + price: z.string(), + }), + iframes: true, +}); +``` + +```python Python +# Act within iframes +await page.act( + "click the accept cookies button", + iframes=True +) + +# Observe within iframes +results = await page.observe({ + "instruction": "Find the primary action button", + "iframes": True, +}) + +# Extract from iframes +data = await page.extract({ + "instruction": "Extract the product price from the payment widget", + "schema": { + "type": "object", + "properties": { + "price": {"type": "string"} + } + }, + "iframes": True, +}) +``` + + +### Tips + +- Iframes can increase processing time. For best performance, use the iframe option only when necessary. +- When you are unsure whether an element will be in an iframe, you can verify the presence of iframes in Stagehand logs. +- If an element intermittently fails to be found, it may be inside a lazy‑loaded iframe. Add small waits between steps or re‑run your action. + + +You can enable experimental features (like Shadow DOM support) via your Stagehand configuration. See the [configuration guide](/v2/configuration/browser). + + +## Next steps + + + + Use `observe()` to plan precise, single-step actions before executing them. + + + + Use `extract()` with a data schema to pull clean, typed data from any page. + + + + Speed up repeated automations by caching actions. + + + + Learn how to perform single-step actions reliably with `act()`. 
+ + \ No newline at end of file diff --git a/packages/docs/v2/configuration/browser.mdx b/packages/docs/v2/configuration/browser.mdx new file mode 100644 index 000000000..501a80f40 --- /dev/null +++ b/packages/docs/v2/configuration/browser.mdx @@ -0,0 +1,433 @@ +--- +title: Browser +sidebarTitle: Browser +description: Configure Stagehand on Browserbase or locally +--- + +Stagehand supports two primary environments: + +- **Browserbase** - Cloud-managed browser infrastructure optimized for production web automation at scale +- **Local** - Run browsers directly on your machine for development and debugging + +## Browserbase Environment + +Browserbase provides managed cloud browser infrastructure optimized for web automation at scale. It offers advanced features like stealth mode, proxy support, and persistent contexts. + + + Discover the power of cloud-managed browser infrastructure with Browserbase. + + +### Environment Variables + +Before getting started, set up the required environment variables: + + +```bash .env +BROWSERBASE_API_KEY=your_api_key_here +BROWSERBASE_PROJECT_ID=your_project_id_here +``` + + + +Get your API key and Project ID from the [Browserbase Dashboard](https://browserbase.com/overview) + + +### Using Stagehand with Browserbase + +#### Basic Setup + +The simplest way to get started is with default settings: + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", +}); + +await stagehand.init(); +``` +```python Python +import os +from stagehand import Stagehand + +stagehand = Stagehand( + env="BROWSERBASE", +) + +await stagehand.init() +``` + + +#### Advanced Configuration + +Configure browser settings, proxy support, and other session parameters: + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + // Optional: API Key and Project ID will be pulled directly from your 
environment + apiKey: process.env.BROWSERBASE_API_KEY, + projectId: process.env.BROWSERBASE_PROJECT_ID, + browserbaseSessionCreateParams: { + proxies: true, + region: "us-west-2", + browserSettings: { + viewport: { width: 1920, height: 1080 }, + blockAds: true, + }, + }, +}); + +await stagehand.init(); +console.log("Session ID:", stagehand.sessionId); +``` +```python Python +import os +from stagehand import Stagehand + +stagehand = Stagehand( + env="BROWSERBASE", + # Optional: API Key and Project ID will be pulled directly from your environment + api_key=os.getenv("BROWSERBASE_API_KEY"), + project_id=os.getenv("BROWSERBASE_PROJECT_ID"), + browserbase_session_create_params={ + "proxies": True, + "region": "us-west-2", + "browser_settings": { + "viewport": {"width": 1920, "height": 1080}, + "block_ads": True, + }, + }, +) +``` + + + + + ```typescript TypeScript + const stagehand = new Stagehand({ + env: "BROWSERBASE", + apiKey: process.env.BROWSERBASE_API_KEY, + projectId: process.env.BROWSERBASE_PROJECT_ID, + browserbaseSessionCreateParams: { + projectId: process.env.BROWSERBASE_PROJECT_ID!, + proxies: true, + region: "us-west-2", + timeout: 3600, // 1 hour session timeout + keepAlive: true, // Available on Startup plan + browserSettings: { + advancedStealth: false, // this is a Scale Plan feature - reach out to support@browserbase.com to enable + blockAds: true, + solveCaptchas: true, + recordSession: false, + viewport: { + width: 1920, + height: 1080, + }, + fingerprint: { + browsers: ["chrome", "edge"], + devices: ["desktop"], + operatingSystems: ["windows", "macos"], + locales: ["en-US", "en-GB"], + httpVersion: 2, + }, + }, + userMetadata: { + userId: "automation-user-123", + environment: "production", + }, + }, + }); + ``` + ```python Python + stagehand = Stagehand( + env="BROWSERBASE", + api_key=os.getenv("BROWSERBASE_API_KEY"), + project_id=os.getenv("BROWSERBASE_PROJECT_ID"), + browserbase_session_create_params={ + "project_id": 
os.getenv("BROWSERBASE_PROJECT_ID"), + "proxies": True, + "region": "us-west-2", + "timeout": 3600, # 1 hour session timeout + "keep_alive": True, # Available on Startup plan + "browser_settings": { + "advanced_stealth": False, # this is a Scale Plan feature - reach out to support@browserbase.com to enable + "block_ads": True, + "solve_captchas": True, + "record_session": False, + "viewport": { + "width": 1920, + "height": 1080, + }, + "fingerprint": { + "browsers": ["chrome", "edge"], + "devices": ["desktop"], + "operating_systems": ["windows", "macos"], + "locales": ["en-US", "en-GB"], + "http_version": 2, + }, + }, + "user_metadata": { + "user_id": "automation-user-123", + "environment": "production", + }, + }, + ) + ``` + + + +#### Initialization Result +After calling `stagehand.init()`, the method returns configuration information about the initialized session: + + +```typescript TypeScript +const result = await stagehand.init(); +console.log(result); +``` +```python Python +result = await stagehand.init() +print(result) +``` + + +The returned object contains: +```Example +{ + debugUrl: 'https://www.browserbase.com/devtools/inspector.html?wss=connect.browserbase.com/debug/f8a21b4a-6fa1-4ab9-9007-fbfe61dc14f0/devtools/page/5474B0E0510C5B6E629BEB06E799CD70?debug=true', + sessionUrl: 'https://www.browserbase.com/sessions/f8a21b4a-6fa1-4ab9-9007-fbfe61dc14f0', + sessionId: 'f8a21b4a-6fa1-4ab9-9007-fbfe61dc14f0' +} +``` + + + +**Open the Browserbase [session live view](https://docs.browserbase.com/features/session-live-view)** to include a human-in-the-loop. + + + +**Open the [session replay](https://docs.browserbase.com/features/session-replay)** to see the full session recording. + + + +**Unique identifier** for the [Browserbase session](https://docs.browserbase.com/introduction/what-is-browserbase). This is used to identify the session in the Browserbase dashboard and to connect to the session. 
+ + + +### Alternative: Browserbase SDK + +If you prefer to manage sessions directly, you can use the Browserbase SDK: + + +```typescript TypeScript +import { Browserbase } from "@browserbasehq/sdk"; + +const bb = new Browserbase({ + apiKey: process.env.BROWSERBASE_API_KEY! +}); + +const session = await bb.sessions.create({ + projectId: process.env.BROWSERBASE_PROJECT_ID!, + // Add configuration options here +}); +``` +```python Python +from browserbase import Browserbase + +bb = Browserbase(api_key=os.environ["BROWSERBASE_API_KEY"]) + +session = bb.sessions.create( + project_id=os.environ["BROWSERBASE_PROJECT_ID"], + # Add configuration options here +) +``` + + +#### Connecting to an Existing Session + +Connect to a previously created Browserbase session using its session ID: + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + browserbaseSessionID: "existing-session-uuid-here", +}); + +await stagehand.init(); +console.log("Resumed Session ID:", stagehand.sessionId); +``` +```python Python +import os +from stagehand import Stagehand + +stagehand = Stagehand( + env="BROWSERBASE", + browserbase_session_id="existing-session-uuid-here", +) + +await stagehand.init() +print(f"Resumed Session ID: {stagehand.session_id}") +``` + + +## Local Environment + +The local environment runs browsers directly on your machine, providing full control over browser instances and configurations. Ideal for development, debugging, and scenarios requiring custom browser setups. 
+ +### Environment Comparison + +| Feature | Browserbase | Local | +| --- | --- | --- | +| **Scalability** | High (cloud-managed) | Limited (local resources) | +| **Stealth Features** | Advanced fingerprinting | Basic stealth | +| **Proxy Support** | Built-in residential proxies | Manual configuration | +| **Session Persistence** | Cloud context storage | File-based user data | +| **Geographic Distribution** | Multi-region deployment | Single machine | +| **Debugging** | Session recordings & logs | Direct DevTools access | +| **Setup Complexity** | Environment variables only | Browser installation required | +| **Cost** | Usage-based pricing | Infrastructure & maintenance | +| **Best For** | Production, scale, compliance | Development, debugging | + +### Basic Local Setup + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "LOCAL" +}); + +await stagehand.init(); +console.log("Session ID:", stagehand.sessionId); +``` +```python Python +from stagehand import Stagehand + +stagehand = Stagehand( + env="LOCAL" +) + +await stagehand.init() +print(f"Session ID: {stagehand.session_id}") +``` + + +### Advanced Local Configuration + +Customize browser launch options for local development: + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "LOCAL", + localBrowserLaunchOptions: { + headless: false, // Show browser window + devtools: true, // Open developer tools + viewport: { width: 1280, height: 720 }, + executablePath: '/opt/google/chrome/chrome', // Custom Chrome path + args: [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-web-security', + '--allow-running-insecure-content', + ], + env: { + NODE_ENV: "development", + DEBUG: "true", + }, + }, +}); + +await stagehand.init(); +``` +```python Python +from stagehand import Stagehand + +stagehand = Stagehand( + env="LOCAL", + headless=False, # Show browser window 
+ local_browser_launch_options={ + "devtools": True, # Open developer tools + "viewport": {"width": 1280, "height": 720}, + "executable_path": "/opt/google/chrome/chrome", # Custom Chrome path + "args": [ + "--no-sandbox", + "--disable-setuid-sandbox", + "--disable-web-security", + "--allow-running-insecure-content", + ], + "env": { + "NODE_ENV": "development", + "DEBUG": "true", + }, + }, +) + +await stagehand.init() +``` + + +### Connecting to your local browser + +Connect to your existing local Chrome/Chromium browser instead of launching a new one. This lets you automate your normal browser with all your existing tabs, extensions and settings. + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "LOCAL", + localBrowserLaunchOptions: { + cdpUrl: 'http://localhost:9222' + } +}); + +await stagehand.init(); +``` +```python Python +from stagehand import Stagehand + +stagehand = Stagehand( + env="LOCAL", + local_browser_launch_options={ + "cdp_url": "http://localhost:9222" + } +) + +await stagehand.init() +``` + + +## Troubleshooting + +### Common Issues + + + +- Verify your `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID` are set correctly +- Check that your API key has the necessary permissions +- Ensure your Browserbase account has sufficient credits + + + +- Install Chrome or Chromium on your system +- Set the correct `executablePath` for your Chrome installation +- Check that required dependencies are installed (Linux: `libnss3-dev libatk-bridge2.0-dev libgtk-3-dev libxss1 libasound2`) + + + +- Increase session timeout in `browserbaseSessionCreateParams.timeout` +- Use `keepAlive: true` for long-running sessions +- Monitor session usage to avoid unexpected terminations + + \ No newline at end of file diff --git a/packages/docs/v2/configuration/evals.mdx b/packages/docs/v2/configuration/evals.mdx new file mode 100644 index 000000000..71199218d --- /dev/null +++
b/packages/docs/v2/configuration/evals.mdx @@ -0,0 +1,301 @@ +--- +title: Evaluations & Metrics +sidebarTitle: Evaluations +description: Monitor performance, optimize costs, and evaluate LLM effectiveness +--- + +Evaluations help you understand how well your automation performs, which models work best for your use cases, and how to optimize for cost and reliability. This guide covers both monitoring your own workflows and running comprehensive evaluations. + +## Why Evaluations Matter + +- **Performance Optimization**: Identify which models and settings work best for your specific automation tasks +- **Cost Control**: Track token usage and inference time to optimize spending +- **Reliability**: Measure success rates and identify failure patterns +- **Model Selection**: Compare different LLMs on real-world tasks to make informed decisions + + + View real-time performance comparisons across different LLMs on the [Stagehand Evals Dashboard](https://www.stagehand.dev/evals) + + +## Comprehensive Evaluations + +Evaluations help you systematically test and improve your automation workflows. Stagehand provides both built-in evaluations and tools to create your own. + +We have 2 types of evals: +1. **Deterministic Evals** - These include unit tests, integration tests, and E2E tests that can be run without any LLM inference. +2. **LLM-based Evals** - These are evals that test the underlying functionality of Stagehand's AI primitives. + + +### Evals CLI +![Evals CLI](/media/evals-cli.png) + + +To run evals, you'll need to clone the [Stagehand repo](https://github.com/browserbase/stagehand) and set up the CLI. + +We recommend using [Braintrust](https://www.braintrust.dev/docs/) to help visualize evals results and metrics. + + +The Stagehand CLI provides a powerful interface for running evaluations. You can run specific evals, categories, or external benchmarks with customizable settings. + +Evals are grouped into: +1. 
**Act Evals** - These are evals that test the functionality of the `act` method. +2. **Extract Evals** - These are evals that test the functionality of the `extract` method. +3. **Observe Evals** - These are evals that test the functionality of the `observe` method. +4. **Combination Evals** - These are evals that test the functionality of the `act`, `extract`, and `observe` methods together. +5. **Experimental Evals** - These are experimental custom evals that test the functionality of the stagehand primitives. +6. **Agent Evals** - These are evals that test the functionality of `agent`. +7. **(NEW) External Benchmarks** - Run external benchmarks like WebBench, GAIA, WebVoyager, OnlineMind2Web, and OSWorld. + +#### Installation + + + +```bash +# From the stagehand root directory +pnpm install +``` + + + +```bash +pnpm run build:cli +``` + + + +```bash +evals help +``` + + + +#### CLI Commands and Options + +##### Basic Commands + +```bash +# Run all evals +evals run all + +# Run specific category +evals run act +evals run extract +evals run observe +evals run agent + +# Run specific eval +evals run extract/extract_text + +# List available evals +evals list +evals list --detailed + +# Configure defaults +evals config +evals config set env browserbase +evals config set trials 5 +``` + +##### Command Options + +- **`-e, --env`**: Environment (`local` or `browserbase`) +- **`-t, --trials`**: Number of trials per eval (default: 3) +- **`-c, --concurrency`**: Max parallel sessions (default: 10) +- **`-m, --model`**: Model override +- **`-p, --provider`**: Provider override +- **`--api`**: Use Stagehand API instead of SDK + +##### Running External Benchmarks + +The CLI supports several industry-standard benchmarks: + +```bash +# WebBench with filters +evals run benchmark:webbench -l 10 -f difficulty=easy -f category=READ + +# GAIA benchmark +evals run b:gaia -s 100 -l 25 -f level=1 + +# WebVoyager +evals run b:webvoyager -l 50 + +# OnlineMind2Web +evals run 
b:onlineMind2Web + +# OSWorld +evals run b:osworld -f source=Mind2Web +``` + +#### Configuration Files + +You can view the specific evals in [`evals/tasks`](https://github.com/browserbase/stagehand/tree/v2/evals/tasks). Each eval is grouped into eval categories based on [`evals/evals.config.json`](https://github.com/browserbase/stagehand/blob/main/evals/evals.config.json). + + +#### Viewing eval results +![Eval results](/images/evals.png) + +Eval results are viewable on Braintrust. You can view the results of a specific eval by going to the Braintrust URL specified in the terminal when you run `npm run evals`. + +By default, each eval will run five times per model. The "Exact Match" column shows the percentage of times the eval was correct. The "Error Rate" column shows the percentage of times the eval errored out. + +You can use the Braintrust UI to filter by model/eval and aggregate results across all evals. + +### Deterministic Evals + +To run deterministic evals, you can run `npm run e2e` from within the Stagehand repo. This will test the functionality of Playwright within Stagehand to make sure it's working as expected. + +These tests are in [`evals/deterministic`](https://github.com/browserbase/stagehand/tree/v2/evals/deterministic) and test on both Browserbase browsers and local headless Chromium browsers. 
+ +## Creating Custom Evaluations + +### Step-by-Step Guide + + + +Create a new file in `evals/tasks/your-eval.ts`: + +```typescript +import { EvalTask } from '../types'; + +export const customEvalTask: EvalTask = { + name: 'custom_task_name', + description: 'Test specific automation workflow', + + // Test setup + setup: async ({ page }) => { + await page.goto('https://example.com'); + }, + + // The actual test + task: async ({ stagehand, page }) => { + // Your automation logic + await page.act({ action: 'click the login button' }); + const result = await page.extract({ + instruction: 'Get the user name', + schema: { username: 'string' } + }); + return result; + }, + + // Validation + validate: (result, expected) => { + return result.username === expected.username; + }, + + // Test cases + testCases: [ + { + input: { /* test input */ }, + expected: { username: 'john_doe' } + } + ], + + // Evaluation criteria + scoring: { + exactMatch: true, + timeout: 30000, + retries: 2 + } +}; +``` + + + +Update `evals/evals.config.json`: + +```json +{ + "categories": { + "custom": ["custom_task_name"], + "existing_category": ["custom_task_name"] + } +} +``` + + + +```bash +# Test your custom evaluation +evals run custom_task_name + +# Run the entire custom category +evals run custom + +# Run with specific settings +evals run custom_task_name -e browserbase -t 5 -m gpt-4o +``` + + + + +## Best Practices for Custom Evals + + + +- **Atomic**: Each test should validate one specific capability +- **Deterministic**: Tests should produce consistent results +- **Realistic**: Use real-world scenarios and websites +- **Measurable**: Define clear success/failure criteria + + + +- **Parallel Execution**: Design tests to run independently +- **Resource Management**: Clean up after each test +- **Timeout Handling**: Set appropriate timeouts for operations +- **Error Recovery**: Handle failures gracefully + + + +- **Ground Truth**: Establish reliable expected outcomes +- **Edge Cases**: Test 
boundary conditions and error scenarios +- **Statistical Significance**: Run multiple iterations for reliability +- **Version Control**: Track changes to test cases over time + + + +### Troubleshooting Evaluations + + +**Symptoms**: Tests fail with timeout errors + +**Solutions**: +- Increase timeout in `taskConfig.ts` +- Use faster models (Gemini 2.5 Flash, GPT-4o Mini) +- Optimize test scenarios to be less complex +- Check network connectivity to LLM providers + + + +**Symptoms**: Same test passes/fails randomly + +**Solutions**: +- Set temperature to 0 for deterministic outputs +- Increase repetitions for statistical significance +- Use more capable models for complex tasks +- Check for dynamic website content affecting tests + + + +**Symptoms**: Token usage exceeding budget + +**Solutions**: +- Use cost-effective models (Gemini 2.0 Flash, GPT-4o Mini) +- Reduce repetitions for initial testing +- Focus on specific evaluation categories +- Use local browser environment to reduce Browserbase costs + + + +**Symptoms**: Results not uploading to dashboard + +**Solutions**: +- Check Braintrust API key configuration +- Verify internet connectivity +- Update Braintrust SDK to latest version +- Check project permissions in Braintrust dashboard + + \ No newline at end of file diff --git a/packages/docs/v2/configuration/logging.mdx b/packages/docs/v2/configuration/logging.mdx new file mode 100644 index 000000000..978541ae6 --- /dev/null +++ b/packages/docs/v2/configuration/logging.mdx @@ -0,0 +1,299 @@ +--- +title: Logging & Debugging +sidebarTitle: Logging +description: Set up logging, debugging, and error tracking for Stagehand workflows +--- + +Stagehand provides comprehensive logging capabilities to help you debug automation workflows, track execution, and diagnose issues. Configure logging levels, structured output, and debugging tools for both development and production environments. 
+ +## Logging Configuration + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", // or "LOCAL" + verbose: 1, // 0 = errors only, 1 = info, 2 = debug +}); +``` + +```python Python +from stagehand import Stagehand + +stagehand = Stagehand( + env="BROWSERBASE", # or "LOCAL" + verbose=1, # 0 = errors only, 1 = info, 2 = debug +) +``` + + +### Verbose Levels + +- **Level 0**: Errors only - minimal output for production +- **Level 1**: Info - includes successful operations and important events +- **Level 2**: Debug - comprehensive logging including internal operations + +## Structured Logging + +### Log Line Format + +Each log entry contains structured information: + + +```typescript TypeScript +interface LogLine { + category: 'browser' | 'action' | 'llm' | 'error' | 'stagehand' | 'cache'; + message: string; + level: 0 | 1 | 2; // error | info | debug + timestamp: string; + auxiliary?: { + executionTime?: { value: string; unit: string }; + sessionId?: string; + url?: string; + [key: string]: any; + }; +} +``` + +```python Python +# Log line structure in Python +{ + "category": "browser" | "action" | "llm" | "error" | "stagehand" | "cache", + "message": str, + "level": 0 | 1 | 2, # error | info | debug + "timestamp": str, + "auxiliary": { + "execution_time": {"value": str, "unit": str}, + "session_id": str, + "url": str, + # ... 
other context data + } +} +``` + + +### Custom Logger + + +```typescript TypeScript +class AdvancedLogger { + private logFile?: string; + + constructor(logFile?: string) { + this.logFile = logFile; + } + + log = (logLine: any) => { + const timestamp = new Date().toISOString(); + const colors = { + browser: '\x1b[34m', // blue + action: '\x1b[32m', // green + llm: '\x1b[35m', // magenta + error: '\x1b[31m', // red + stagehand: '\x1b[36m', // cyan + cache: '\x1b[33m', // yellow + }; + + const color = colors[logLine.category] || '\x1b[0m'; + const reset = '\x1b[0m'; + + // Console output with colors + console.log(`${color}[${logLine.category}]${reset} ${logLine.message}`); + + // Log execution time if available + if (logLine.auxiliary?.executionTime) { + console.log(` ${logLine.auxiliary.executionTime.value}${logLine.auxiliary.executionTime.unit}`); + } + + // Log additional context + if (logLine.auxiliary && Object.keys(logLine.auxiliary).length > 0) { + console.log(' Context:', JSON.stringify(logLine.auxiliary, null, 2)); + } + + // File logging (optional) + if (this.logFile) { + const logEntry = { + timestamp, + ...logLine + }; + require('fs').appendFileSync(this.logFile, JSON.stringify(logEntry) + '\n'); + } + } +} + +// Usage +const logger = new AdvancedLogger('./automation.log'); +const stagehand = new Stagehand({ + env: "BROWSERBASE", + verbose: 2, + logger: logger.log +}); +``` + +```python Python +import json +import os +from datetime import datetime +from typing import Dict, Any, Optional + +class AdvancedLogger: + def __init__(self, log_file: Optional[str] = None): + self.log_file = log_file + + def log(self, log_line: Dict[str, Any]): + timestamp = datetime.now().isoformat() + colors = { + 'browser': '\033[34m', # blue + 'action': '\033[32m', # green + 'llm': '\033[35m', # magenta + 'error': '\033[31m', # red + 'stagehand': '\033[36m', # cyan + 'cache': '\033[33m', # yellow + } + + color = colors.get(log_line.get('category', ''), '\033[0m') + reset = 
'\033[0m' + + # Console output with colors + print(f"{color}[{log_line.get('category')}]{reset} {log_line.get('message')}") + + # Log execution time if available + if log_line.get('auxiliary', {}).get('execution_time'): + exec_time = log_line['auxiliary']['execution_time'] + print(f"{exec_time['value']}{exec_time['unit']}") + + # Log additional context + auxiliary = log_line.get('auxiliary', {}) + if auxiliary and len(auxiliary) > 0: + print(' Context:', json.dumps(auxiliary, indent=2)) + + # File logging (optional) + if self.log_file: + log_entry = { + 'timestamp': timestamp, + **log_line + } + with open(self.log_file, 'a') as f: + f.write(json.dumps(log_entry) + '\n') + +# Usage +logger = AdvancedLogger('./automation.log') +stagehand = Stagehand( + env="BROWSERBASE", + verbose=2, + logger=logger.log +) +``` + + +## Detailed Logging Features + +### LLM Inference Logging + +Enable detailed logging of all LLM interactions: + + +```typescript TypeScript +const stagehand = new Stagehand({ + env: "BROWSERBASE", + logInferenceToFile: true, // Creates inference_summary/ directory + verbose: 2 +}); +``` + +```python Python +stagehand = Stagehand( + env="BROWSERBASE", + log_inference_to_file=True, # Creates inference_summary/ directory + verbose=2 +) +``` + + +The `inference_summary/` directory structure: +``` +inference_summary/ +├── act_summary/ +│ ├── 20240329_080446068.json +│ ├── 20240329_080447019.json +│ └── act_summary.json +├── extract_summary/ +│ ├── 20240329_081205123.json +│ └── extract_summary.json +└── observe_summary/ + ├── 20240329_081634891.json + └── observe_summary.json +``` + +## Log Analysis & Debugging + +### Common Log Patterns + + + ```json + { + "category": "action", + "message": "act completed successfully", + "level": 1, + "auxiliary": { + "executionTime": {"value": "1250", "unit": "ms"}, + "url": "https://example.com", + "sessionId": "session-123" + } + } + ``` + + + ```json + { + "category": "llm", + "message": "inference completed", + "level": 
1, + "auxiliary": { + "model": "gpt-4o", + "tokens": {"prompt": 3451, "completion": 45}, + "executionTime": {"value": "951", "unit": "ms"} + } + } + ``` + + + ```json + { + "category": "action", + "message": "action failed: element not found", + "level": 0, + "auxiliary": { + "selector": "button[data-testid='submit']", + "url": "https://example.com/form", + "sessionId": "session-123" + } + } + ``` + + + +## Best Practices + + + +- Use `verbose: 2` with visual debugging +- Enable browser DevTools for element inspection +- Use `logInferenceToFile: true` to capture LLM decisions +- Implement structured logging early + + + +- Use `verbose: 1` to balance visibility with performance +- Implement error tracking and alerting +- Use structured JSON logging +- Monitor session success rates and execution times + + + +- Never log credentials or sensitive data +- Implement log retention policies +- Secure log files and dashboards + + \ No newline at end of file diff --git a/packages/docs/v2/configuration/models.mdx b/packages/docs/v2/configuration/models.mdx new file mode 100644 index 000000000..b2874c336 --- /dev/null +++ b/packages/docs/v2/configuration/models.mdx @@ -0,0 +1,305 @@ +--- +title: Models +sidebarTitle: Models +description: Enhance Stagehand with LLMs for optimal performance, cost, and reliability +--- + +Stagehand uses Large Language Models (LLMs) to understand web pages, plan actions, and interact with complex interfaces. The choice of LLM significantly impacts your automation's accuracy, speed, and cost. + + +Find more details about how to choose the right model on our Model Evaluation page. 
+ + +## Why LLM Choice Matters + +- **Accuracy**: Better models provide more reliable element detection and action planning +- **Speed**: Faster models reduce automation latency +- **Cost**: Different providers offer varying pricing structures +- **Reliability**: Structured output support ensures consistent automation behavior + + +Find more details about how to choose the right model on our [Model Evaluation](https://www.stagehand.dev/evals) page. + + + +Small models on **Ollama** struggle with consistent structured outputs. While technically supported, we don't recommend them for production Stagehand workflows. + + +## Environment Variables Setup + +Set up your API keys before configuring Stagehand: + + +```bash .env +# Choose one or more providers +OPENAI_API_KEY=your_openai_key_here +ANTHROPIC_API_KEY=your_anthropic_key_here +GOOGLE_API_KEY=your_google_key_here +GROQ_API_KEY=your_groq_key_here +``` + + +## Supported Providers + +Stagehand supports major LLM providers with structured output capabilities: + +### Production-Ready Providers + +| Provider | Best Models | Strengths | Use Case | +|----------|-------------|-----------|----------| +| **OpenAI** | `gpt-4.1`, `gpt-4.1-mini` | High accuracy, reliable | Production, complex sites | +| **Anthropic** | `claude-3-7-sonnet-latest` | Excellent reasoning | Complex automation tasks | +| **Google** | `gemini-2.5-flash`, `gemini-2.5-pro` | Fast, cost-effective | High-volume automation | + +### Additional Providers + + +- **Groq** - `llama-3.3-70b-versatile` (Good for speed critical applications) +- **xAI** - `grok-beta` (Good for complex reasoning) +- **Azure** - Enterprise OpenAI deployment +- **Cerebras** - High-speed inference +- **TogetherAI** - Open-source models +- **Mistral** - `mixtral-8x7b-32768` (European option) +- **DeepSeek** - Cost-effective alternative +- **Perplexity** - Real-time web data +- **Ollama** - Local deployment (limited accuracy) +- **Run any model included in AI SDK** - Find supported 
models in the [Vercel AI SDK](https://sdk.vercel.ai/providers/ai-sdk-providers) (Follow the guide + [here](#vercel-ai-sdk) to get started.) + + +## Basic Configuration + +### Model Name Format + +Stagehand uses the format `provider/model-name` for model specification. + +**Examples:** +- OpenAI: `openai/gpt-4.1` +- Anthropic: `anthropic/claude-3-7-sonnet-latest` +- Google: `google/gemini-2.5-flash` (Recommended) + +### Quick Start Examples + + + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + modelName: "google/gemini-2.5-flash", + modelClientOptions: { + apiKey: process.env.GOOGLE_API_KEY, + }, +}); +``` +```python Python +import os +from stagehand import Stagehand + +stagehand = Stagehand( + model_name="google/gemini-2.5-flash", + model_api_key=os.getenv("GOOGLE_API_KEY") +) +``` + + + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + modelName: "openai/gpt-4.1", + modelClientOptions: { + apiKey: process.env.OPENAI_API_KEY, + }, +}); +``` +```python Python +import os +from stagehand import Stagehand + +stagehand = Stagehand( + model_name="openai/gpt-4.1", + model_api_key=os.getenv("OPENAI_API_KEY") +) +``` + + + + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + modelName: "anthropic/claude-3-7-sonnet-latest", + modelClientOptions: { + apiKey: process.env.ANTHROPIC_API_KEY, + }, +}); +``` +```python Python +import os +from stagehand import Stagehand + +stagehand = Stagehand( + model_name="anthropic/claude-3-7-sonnet-latest", + model_api_key=os.getenv("ANTHROPIC_API_KEY") +) +``` + + + + +## Custom LLM Integration + + +Custom LLMs are currently only supported in TypeScript. + + +Integrate any LLM with Stagehand using custom clients. The only requirement is **structured output support** for consistent automation behavior. 
+ +### Vercel AI SDK +The [Vercel AI SDK](https://sdk.vercel.ai/providers/ai-sdk-providers) is a popular library for interacting with LLMs. You can use any of the providers supported by the Vercel AI SDK to create a client for your model, **as long as they support structured outputs**. + +Vercel AI SDK supports providers for OpenAI, Anthropic, and Google, along with support for **Amazon Bedrock** and **Azure OpenAI**. + +To get started, you'll need to install the `ai` package and the provider you want to use. For example, to use Amazon Bedrock, you'll need to install the `@ai-sdk/amazon-bedrock` package. + +You'll also need to use the [Vercel AI SDK external client](https://github.com/browserbase/stagehand/blob/v2/examples/external_clients/aisdk.ts) as a template to create a client for your model. + + + + ```bash + npm install ai @ai-sdk/amazon-bedrock + ``` + + + + ```bash + pnpm install ai @ai-sdk/amazon-bedrock + ``` + + + + ```bash + yarn add ai @ai-sdk/amazon-bedrock + ``` + + + +To get started, you can use the [Vercel AI SDK external client](https://github.com/browserbase/stagehand/blob/84f810b4631291307a32a47addad7e26e9c1deb3/examples/external_clients/aisdk.ts) as a template to create a client for your model. + +```ts +// Install/import the provider you want to use. +// For example, to use OpenAI, import `openai` from @ai-sdk/openai +import { bedrock } from "@ai-sdk/amazon-bedrock"; +import { AISdkClient } from "./external_clients/aisdk"; + +const stagehand = new Stagehand({ + llmClient: new AISdkClient({ + model: bedrock("anthropic.claude-3-7-sonnet-20250219-v1:0"), + }), +}); +``` + +## Troubleshooting + +### Common Issues + + + +**Error**: `Model does not support structured outputs` + +**Solution**: +Use models that support function calling/structured outputs. 
The minimum requirements are: + +- Model must support JSON/structured outputs +- Model must have strong reasoning capabilities +- Model must be able to handle complex instructions + +For each provider, use their latest models that meet these requirements. Some examples: + +- **OpenAI**: GPT-4 series or newer +- **Anthropic**: Claude 3 series or newer +- **Google**: Gemini 2 series or newer +- **Other providers**: Latest models with structured output support + +**Note**: Avoid base language models without structured output capabilities or fine-tuning for instruction following. When in doubt, check our [Model Evaluation](https://www.stagehand.dev/evals) page for up-to-date recommendations. + + + +**Error**: `Invalid API key` or `Unauthorized` + +**Solution**: +- Verify your environment variables are set correctly +- Check API key permissions and quotas +- Ensure you're using the correct API key for the provider +- For Anthropic, make sure you have access to the Claude API + + + +**Symptoms**: Actions work sometimes but fail other times + +**Causes & Solutions**: +- **Weak models**: Use more capable models - check our [Model Evaluation](https://www.stagehand.dev/evals) page for current recommendations +- **High temperature**: Set temperature to 0 for deterministic outputs +- **Complex pages**: Switch to models with higher accuracy scores on our [Model Evaluation](https://www.stagehand.dev/evals) page +- **Rate limits**: Implement retry logic with exponential backoff +- **Context limits**: Reduce page complexity or use models with larger context windows +- **Prompt clarity**: Ensure your automation instructions are clear and specific + + + +**Issue**: Automation takes too long to respond + +**Solutions**: +- **Use fast models**: Choose models optimized for speed + - Any model with < 1s response time + - Models with "fast" or "flash" variants +- **Optimize settings**: + - Use `verbose: 0` to minimize token usage + - Set temperature to 0 for fastest processing + - Keep 
max tokens as low as possible +- **Consider local deployment**: Local models can provide lowest latency +- **Batch operations**: Group multiple actions when possible + + + +**Issue**: LLM usage costs are too high + +**Cost Optimization Strategies**: +1. **Switch to cost-effective models**: + - Check our [Model Evaluation](https://www.stagehand.dev/evals) page for current cost-performance benchmarks + - Choose models with lower cost per token that still meet accuracy requirements + - Consider models optimized for speed to reduce total runtime costs +2. **Optimize token usage**: + - Set `verbose: 0` to reduce logging overhead + - Use concise prompts and limit response length +3. **Smart model selection**: Start with cheaper models, fallback to premium ones only when needed +4. **Cache responses**: Implement LLM response caching for repeated automation patterns +5. **Monitor usage**: Set up billing alerts and track costs per automation run +6. **Batch processing**: Process multiple similar tasks together + + + +### Next Steps + + + See our Model Evaluation page + + + + Evaluate performance on your specific use cases in our Model Evaluation guide + + + + Monitor token usage and set alerts using our Observability tools + + + + Store successful patterns using our Caching Guide + + \ No newline at end of file diff --git a/packages/docs/v2/configuration/observability.mdx b/packages/docs/v2/configuration/observability.mdx new file mode 100644 index 000000000..085506d98 --- /dev/null +++ b/packages/docs/v2/configuration/observability.mdx @@ -0,0 +1,532 @@ +--- +title: Observability +sidebarTitle: Observability +description: Track Stagehand automation with session visibility and analytics +--- + +Stagehand provides powerful observability features to help you monitor, track performance, and analyze your browser automation workflows. Focus on session monitoring, resource usage, and operational insights for both Browserbase and local environments. 
+ +## Browserbase Session Monitoring + +When running on Browserbase, you gain access to comprehensive cloud-based monitoring and session management through the Browserbase API and dashboard. + +
+ Browserbase Session Observability +
+ +### Live Session Visibility + +Browserbase provides real-time visibility into your automation sessions: + +**Session Dashboard Features** +- Real-time browser screen recording and replay +- Network request monitoring with detailed timing +- JavaScript console logs and error tracking +- CPU and memory usage metrics +- Session status and duration tracking + +**Session Management & API Access** + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; +import { Browserbase } from "@browserbasehq/sdk"; + +const browserbase = new Browserbase({ + apiKey: process.env.BROWSERBASE_API_KEY, +}); + +const stagehand = new Stagehand({ + env: "BROWSERBASE" +}); + +await stagehand.init(); + +const sessionInfo = await browserbase.sessions.retrieve(stagehand.sessionId); + +console.log("Session status:", sessionInfo.status); +console.log("Session region:", sessionInfo.region); +console.log("CPU usage:", sessionInfo.avgCpuUsage); +console.log("Memory usage:", sessionInfo.memoryUsage); +console.log("Proxy bytes:", sessionInfo.proxyBytes); +``` + +```python Python +import os +from stagehand import Stagehand +from browserbase import Browserbase + +browserbase = Browserbase( + api_key=os.getenv("BROWSERBASE_API_KEY"), +) + +stagehand = Stagehand( + env="BROWSERBASE", +) + +await stagehand.init() + +session_info = browserbase.sessions.retrieve(stagehand.session_id) + +print(f"Session status: {session_info['status']}") +print(f"Session region: {session_info['region']}") +print(f"CPU usage: {session_info['avgCpuUsage']}") +print(f"Memory usage: {session_info['memoryUsage']}") +print(f"Proxy bytes: {session_info['proxyBytes']}") +``` + + +### Session Analytics & Insights + + + + Monitor live session status, resource usage, and geographic distribution. Scale and manage concurrent sessions with real-time insights. + + + + Review complete session recordings with frame-by-frame playback. Analyze network requests and debug browser interactions visually. 
+ + + + Programmatically access session data, automate lifecycle management, and integrate with monitoring systems through our API. + + + + Track resource consumption, session duration, and API usage. Get detailed breakdowns of costs and utilization across your automation. + + + +### Session Monitoring & Filtering + +Query and monitor sessions by status and metadata: + + +```typescript TypeScript +import { Browserbase } from "@browserbasehq/sdk"; + +const browserbase = new Browserbase({ + apiKey: process.env.BROWSERBASE_API_KEY, +}); + +// List sessions with filtering +async function getFilteredSessions() { + const sessions = await browserbase.sessions.list({ + status: 'RUNNING' + }); + + return sessions.map(session => ({ + id: session.id, + status: session.status, // RUNNING, COMPLETED, ERROR, TIMED_OUT + startedAt: session.startedAt, + endedAt: session.endedAt, + region: session.region, + avgCpuUsage: session.avgCpuUsage, + memoryUsage: session.memoryUsage, + proxyBytes: session.proxyBytes, + userMetadata: session.userMetadata + })); +} + +// Query sessions by metadata +async function querySessionsByMetadata(query: string) { + const sessions = await browserbase.sessions.list({ + q: query + }); + + return sessions; +} +``` + +```python Python +import os +from browserbase import Browserbase + +browserbase = Browserbase( + api_key=os.getenv("BROWSERBASE_API_KEY"), +) + +def get_filtered_sessions(): + sessions = browserbase.sessions.list(status="RUNNING") + + return [{ + 'id': session['id'], + 'status': session['status'], # RUNNING, COMPLETED, ERROR, TIMED_OUT + 'started_at': session['startedAt'], + 'ended_at': session['endedAt'], + 'region': session['region'], + 'avg_cpu_usage': session['avgCpuUsage'], + 'memory_usage': session['memoryUsage'], + 'proxy_bytes': session['proxyBytes'], + 'user_metadata': session['userMetadata'] + } for session in sessions] + +def query_sessions_by_metadata(query): + sessions = browserbase.sessions.list(q=query) + + return sessions +``` 
+ + +## Local Environment Monitoring + +For local development, Stagehand provides performance monitoring and resource tracking capabilities directly on your machine. + +### Performance Tracking + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "LOCAL", + verbose: 1, // Monitor performance without debug noise +}); + +// Track local automation metrics +const startTime = Date.now(); +const initialMetrics = stagehand.metrics; + +// ... perform automation tasks + +const finalMetrics = stagehand.metrics; +const executionTime = Date.now() - startTime; + +console.log('Local Performance Summary:', { + executionTime: `${executionTime}ms`, + totalTokens: finalMetrics.totalPromptTokens + finalMetrics.totalCompletionTokens, + averageResponseTime: finalMetrics.totalInferenceTimeMs / 3, // Assuming 3 operations + tokensPerSecond: (finalMetrics.totalPromptTokens + finalMetrics.totalCompletionTokens) / (executionTime / 1000) +}); +``` + +```python Python +from stagehand import Stagehand +import time + +stagehand = Stagehand( + env="LOCAL", + verbose=1, # Monitor performance without debug noise +) + +# Track local automation metrics +start_time = time.time() +initial_metrics = stagehand.metrics + +# ... 
perform automation tasks + +final_metrics = stagehand.metrics +execution_time = (time.time() - start_time) * 1000 # Convert to ms + +print('Local Performance Summary:', { + 'execution_time': f"{execution_time:.0f}ms", + 'total_tokens': final_metrics['total_prompt_tokens'] + final_metrics['total_completion_tokens'], + 'average_response_time': final_metrics['total_inference_time_ms'] / 3, # Assuming 3 operations + 'tokens_per_second': (final_metrics['total_prompt_tokens'] + final_metrics['total_completion_tokens']) / (execution_time / 1000) +}) +``` + + +## Resource Usage Monitoring + +When running locally, monitor system resource usage and browser performance: + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; +import * as os from 'os'; +import { performance } from 'perf_hooks'; + +class LocalResourceMonitor { + private cpuUsage: number[] = []; + private memoryUsage: number[] = []; + + startMonitoring() { + const interval = setInterval(() => { + // Track system resources + const memUsage = process.memoryUsage(); + this.memoryUsage.push(memUsage.heapUsed / 1024 / 1024); // MB + + // Track CPU (simplified) + const loadAvg = os.loadavg()[0]; + this.cpuUsage.push(loadAvg); + }, 1000); + + return interval; + } + + getResourceSummary() { + return { + avgMemoryUsage: this.memoryUsage.reduce((a, b) => a + b, 0) / this.memoryUsage.length, + peakMemoryUsage: Math.max(...this.memoryUsage), + avgCpuLoad: this.cpuUsage.reduce((a, b) => a + b, 0) / this.cpuUsage.length, + totalDataPoints: this.cpuUsage.length + }; + } +} + +const monitor = new LocalResourceMonitor(); +const interval = monitor.startMonitoring(); + +const stagehand = new Stagehand({ env: "LOCAL" }); + +// ... 
run automation + +clearInterval(interval); +console.log('Resource Usage:', monitor.getResourceSummary()); +``` + +```python Python +import psutil +import time +from typing import List +from stagehand import Stagehand + +class LocalResourceMonitor: + def __init__(self): + self.cpu_usage: List[float] = [] + self.memory_usage: List[float] = [] + self.monitoring = False + + def start_monitoring(self): + self.monitoring = True + import threading + + def monitor_resources(): + while self.monitoring: + # Track CPU and memory usage + cpu_percent = psutil.cpu_percent(interval=1) + memory_info = psutil.virtual_memory() + + self.cpu_usage.append(cpu_percent) + self.memory_usage.append(memory_info.percent) + + time.sleep(1) + + thread = threading.Thread(target=monitor_resources) + thread.daemon = True + thread.start() + return thread + + def stop_monitoring(self): + self.monitoring = False + + def get_resource_summary(self): + if not self.cpu_usage or not self.memory_usage: + return {'error': 'No monitoring data collected'} + + return { + 'avg_cpu_usage': sum(self.cpu_usage) / len(self.cpu_usage), + 'peak_cpu_usage': max(self.cpu_usage), + 'avg_memory_usage': sum(self.memory_usage) / len(self.memory_usage), + 'peak_memory_usage': max(self.memory_usage), + 'total_data_points': len(self.cpu_usage) + } + +monitor = LocalResourceMonitor() +monitor.start_monitoring() + +stagehand = Stagehand(env="LOCAL") + +# ... run automation + +monitor.stop_monitoring() +print('Resource Usage:', monitor.get_resource_summary()) +``` + + + + + Monitor token usage, costs, and speed. Set up automated alerting for critical failures. Implement cost tracking across different environments. Use session analytics to optimize automation workflows. 
+ + + +## Real-Time Metrics & Monitoring + +### Basic Usage Tracking + +Monitor your automation's resource usage in real-time with `stagehand.metrics`: + + +```typescript TypeScript +// Get current metrics +console.log(stagehand.metrics); + +// Monitor during automation +const startTime = Date.now(); +const initialMetrics = stagehand.metrics; + +// ... perform automation tasks + +const finalMetrics = stagehand.metrics; +const executionTime = Date.now() - startTime; + +console.log('Automation Summary:', { + totalTokens: finalMetrics.totalPromptTokens + finalMetrics.totalCompletionTokens, + totalCost: calculateCost(finalMetrics), + executionTime, + efficiency: (finalMetrics.totalPromptTokens + finalMetrics.totalCompletionTokens) / executionTime +}); +``` + +```python Python +# Get current metrics +print(stagehand.metrics) + +# Monitor during automation +import time +start_time = time.time() +initial_metrics = stagehand.metrics + +# ... perform automation tasks + +final_metrics = stagehand.metrics +execution_time = (time.time() - start_time) * 1000 # Convert to ms + +print('Automation Summary:', { + 'total_tokens': final_metrics['total_prompt_tokens'] + final_metrics['total_completion_tokens'], + 'total_cost': calculate_cost(final_metrics), + 'execution_time': execution_time, + 'efficiency': (final_metrics['total_prompt_tokens'] + final_metrics['total_completion_tokens']) / execution_time +}) +``` + + +### Understanding Metrics Data + +The metrics object provides detailed breakdown by Stagehand operation: + + +```typescript TypeScript +{ + actPromptTokens: 4011, + actCompletionTokens: 51, + actInferenceTimeMs: 1688, + + extractPromptTokens: 4200, + extractCompletionTokens: 243, + extractInferenceTimeMs: 4297, + + observePromptTokens: 347, + observeCompletionTokens: 43, + observeInferenceTimeMs: 903, + + totalPromptTokens: 8558, + totalCompletionTokens: 337, + totalInferenceTimeMs: 6888 +} +``` + +```python Python +{ + "act_prompt_tokens": 4011, + 
"act_completion_tokens": 51, + "act_inference_time_ms": 1688, + + "extract_prompt_tokens": 4200, + "extract_completion_tokens": 243, + "extract_inference_time_ms": 4297, + + "observe_prompt_tokens": 347, + "observe_completion_tokens": 43, + "observe_inference_time_ms": 903, + + "total_prompt_tokens": 8558, + "total_completion_tokens": 337, + "total_inference_time_ms": 6888 +} +``` + + +### Log Inference to File + +You can also log inference to a file by setting `logInferenceToFile` to `true`. This will create a directory called `inference_summary` in your project's root directory. + +```typescript TypeScript +const stagehand = new Stagehand({ + logInferenceToFile: true, +}); +``` + +```python Python +stagehand = Stagehand( + log_inference_to_file=True, +) +``` + +The `inference_summary` directory provides granular analysis data: +``` +inference_summary/ +├── act_summary/ +│ ├── {timestamp}.json +│ ├── {timestamp}.json +│ └── ... +│ └── act_summary.json +├── extract_summary/ +│ ├── {timestamp}.json +│ ├── {timestamp}.json +│ └── ... +│ └── extract_summary.json +├── observe_summary/ +│ ├── {timestamp}.json +│ ├── {timestamp}.json +│ └── ... +│ └── observe_summary.json +``` + +### Log File Structure + +Each operation creates detailed logs for analysis: +```typescript +{ + "act_summary": [ + { + "act_inference_type": "act", + "timestamp": "20250329_080446068", + "LLM_input_file": "20250329_080446068_act_call.txt", + "LLM_output_file": "20250329_080447019_act_response.txt", + "prompt_tokens": 3451, + "completion_tokens": 45, + "inference_time_ms": 951 + }, + ... 
+ ], +} +``` + + +## Best Practices + + + +- Track session success rates and failure patterns +- Monitor resource usage and scaling requirements +- Set up automated alerting for critical failures +- Implement cost tracking across different environments +- Use session analytics to optimize automation workflows + + + +- Compare Browserbase vs local execution times +- Monitor token usage and inference costs across models +- Track geographic performance differences +- Identify bottlenecks in automation workflows +- Optimize for cost-effectiveness and speed + + + +- Track session distribution across regions +- Monitor concurrent session limits and scaling +- Analyze failure patterns and common error scenarios +- Use session recordings for root cause analysis +- Implement custom metadata for workflow categorization + + + +- Integrate session APIs with monitoring dashboards +- Set up automated notifications for session failures +- Track SLA compliance and performance benchmarks +- Monitor resource costs and usage patterns +- Use analytics data for capacity planning and optimization + + + +For detailed logging and debugging capabilities, see [Logging](/v2/configuration/logging). \ No newline at end of file diff --git a/packages/docs/v2/first-steps/ai-rules.mdx b/packages/docs/v2/first-steps/ai-rules.mdx new file mode 100644 index 000000000..fd2b33785 --- /dev/null +++ b/packages/docs/v2/first-steps/ai-rules.mdx @@ -0,0 +1,616 @@ +--- +title: AI Rules +description: Using AI to write Stagehand code faster and better. +--- + +You're likely using AI to write code, and there's a **right and wrong way to do it.** This page is a collection of rules, configs, and copy‑paste snippets to allow your AI agents/assistants to write performant Stagehand code as fast as possible. + +## Quickstart + + + + Configure Browserbase (Stagehand), Context7, DeepWiki, and Stagehand Docs in your MCP client. 
+ + + Drop in `.cursorrules` and `claude.md` so AI agents/assistants always emit Stagehand patterns. + + + +## Using MCP Servers + +MCP (Model Context Protocol) servers act as intermediaries that connect AI systems to external data sources and tools. These servers enable your coding assistant to access real-time information, execute tasks, and retrieve structured data to enhance code generation accuracy. + +The following **MCP servers** provide specialized access to Stagehand documentation and related resources: + + +Provides semantic search across documentation and codebase context. Context7 enables AI assistants to find relevant code patterns, examples, and implementation details from your project history. It maintains contextual understanding of your development workflow and can surface related solutions from previous work. + +**Installation:** +```json +{ + "mcpServers": { + "context7": { + "command": "npx", + "args": ["-y", "@upstash/context7-mcp"] + } + } +} +``` + + + +Offers deep indexing of GitHub repositories and documentation. DeepWiki allows AI agents to understand project architecture, API references, and best practices from the entire Stagehand ecosystem. It provides comprehensive knowledge about repository structure, code relationships, and development patterns. + +**Installation:** +```json +{ + "mcpServers": { + "deepwiki": { + "url": "https://mcp.deepwiki.com/mcp" + } + } +} +``` + + + +Direct access to official Stagehand documentation. This MCP server provides AI assistants with up-to-date API references, configuration options, and usage examples for accurate code generation. Mintlify auto-generates this server from the official docs, ensuring your AI assistant always has the latest information. 
+ +**Usage:** +```json +{ + "mcpServers": { + "stagehand-docs": { + "url": "https://docs.stagehand.dev/mcp" + } + } +} +``` + + +**How MCP Servers Enhance Your Development:** +- **Real-time Documentation Access**: AI assistants can query the latest Stagehand docs, examples, and best practices +- **Context-Aware Code Generation**: Servers provide relevant code patterns and configurations based on your specific use case +- **Reduced Integration Overhead**: Standardized protocol eliminates the need for custom integrations with each documentation source +- **Enhanced Accuracy**: AI agents receive structured, up-to-date information rather than relying on potentially outdated training data + + + +**Prompting tip:** +Explicitly ask your coding agent/assistant to use these MCP servers to fetch relevant information from the docs so they have better context and know how to write proper Stagehand code. + +e.g. **"Use the stagehand-docs MCP to fetch the act/observe guidelines, then generate code that follows them. Prefer cached observe results."** + + + +## Editor rule files (copy‑paste) + +Drop these in `.cursorrules`, `.windsurfrules`, `claude.md`, or any agent rule framework: + + + +``````md +# Stagehand Project + +This is a project that uses [Stagehand](https://github.com/browserbase/stagehand), which amplifies Playwright with AI-powered `act`, `extract`, and `observe` methods added to the Page class.
+ +`Stagehand` is a class that provides configuration and browser automation capabilities with: +- `stagehand.page`: A StagehandPage object (extends Playwright Page) +- `stagehand.context`: A StagehandContext object (extends Playwright BrowserContext) +- `stagehand.agent()`: Create AI-powered agents for autonomous multi-step workflows +- `stagehand.init()`: Initialize the browser session +- `stagehand.close()`: Clean up resources + +`Page` extends Playwright's Page class with AI-powered methods: +- `act()`: Perform actions on web elements using natural language +- `extract()`: Extract structured data from pages using schemas +- `observe()`: Plan actions and get selectors before executing + +`Agent` provides autonomous Computer Use Agent capabilities: +- `execute()`: Perform complex multi-step tasks using natural language instructions + +`Context` extends Playwright's BrowserContext class for browser session management. + +Use the following rules to write code for this project. + +- To plan an instruction like "click the sign in button", use Stagehand `observe` to get the action to execute. + +```typescript +const results = await page.observe("Click the sign in button"); +``` + +You can also pass in the following params: + +```typescript +await page.observe({ + instruction: "the instruction to execute", + returnAction: true +}); +``` + +- The result of `observe` is an array of `ObserveResult` objects that can directly be used as params for `act` like this: + ```typescript + const results = await page.observe({ + instruction: "the instruction to execute", + returnAction: true, // return the action to execute + }); + + await page.act(results[0]); + ``` + +- When writing code that needs to extract data from the page, use Stagehand `extract`. 
Explicitly pass the following params by default: + +```typescript +const { someValue } = await page.extract({ + instruction: "the instruction to execute", + schema: z.object({ + someValue: z.string(), + }), // The schema to extract +}); +``` + +## Initialize + +```typescript +import { Stagehand, Page, BrowserContext } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE" +}); + +await stagehand.init(); + +const page = stagehand.page; // Playwright Page with act, extract, and observe methods + +const context = stagehand.context; // Playwright BrowserContext +``` +### Configuration Options +```typescript +const StagehandConfig = { + env: "BROWSERBASE" | "LOCAL", // Environment to run in + apiKey: process.env.BROWSERBASE_API_KEY, // Browserbase API key + projectId: process.env.BROWSERBASE_PROJECT_ID, // Browserbase project ID + debugDom: true, // Enable DOM debugging features + headless: false, // Run browser in headless mode + domSettleTimeoutMs: 30_000, // Timeout for DOM to settle + enableCaching: true, // Enable action caching + modelName: "gpt-4o", // AI model to use + modelClientOptions: { + apiKey: process.env.OPENAI_API_KEY, // OpenAI API key + }, +}; +``` +## Act + +You can act directly with string instructions: + +```typescript +await page.act("Click the sign in button"); +``` + +Use variables for dynamic form filling: + +```typescript +await page.act({ + action: `Enter the following information: + Name: %name% + Email: %email% + Phone: %phone%`, + variables: { + name: "John Doe", + email: "john@example.com", + phone: "+1-555-0123" + } +}); +``` + +**Best Practices:** +- Cache the results of `observe` to avoid unexpected DOM changes +- Keep actions atomic and specific (e.g., "Click the sign in button" not "Sign in to the website") +- Use variable substitution for dynamic data entry + +Act `action` should be as atomic and specific as possible, i.e. "Click the sign in button" or "Type 'hello' into the search input". 
+AVOID actions that are more than one step, i.e. "Order me pizza" or "Send an email to Paul asking him to call me". + +## Extract + +### Simple String Extraction + +```typescript +const signInButtonText = await page.extract("extract the sign in button text"); +``` + +### Structured Extraction with Schema (Recommended) + +Always use Zod schemas for structured data extraction: + +```typescript +import { z } from "zod/v3"; + +const data = await page.extract({ + instruction: "extract the sign in button text", + schema: z.object({ + text: z.string(), + }), +}); +``` + +### Array Extraction + +To extract multiple items, wrap the array in a single object: + +```typescript +const data = await page.extract({ + instruction: "extract the text inside all buttons", + schema: z.object({ + buttons: z.array(z.string()), + }) +}); +``` + +### Complex Object Extraction + +For more complex data structures: + +```typescript +const productData = await page.extract({ + instruction: "extract product information from this page", + schema: z.object({ + title: z.string(), + price: z.number(), + description: z.string(), + features: z.array(z.string()), + availability: z.boolean(), + }), +}); +``` + +### Schema Validation + +```typescript +import { validateZodSchema } from "./utils.js"; +import { z } from "zod/v3"; + +const schema = z.object({ name: z.string() }); +const isValid = validateZodSchema(schema, { name: "John" }); // true +``` + +## Agent System + +Stagehand provides an Agent System for autonomous web browsing using Computer Use Agents (CUA). Agents execute multi-step workflows using natural language instructions. 
+ +### Creating Agents + +```typescript +// Basic agent (default) +const agent = stagehand.agent(); + +// OpenAI agent +const agent = stagehand.agent({ + provider: "openai", + model: "computer-use-preview", + instructions: "You are a helpful assistant that can use a web browser.", + options: { + apiKey: process.env.OPENAI_API_KEY + } +}); + +// Anthropic agent +const agent = stagehand.agent({ + provider: "anthropic", + model: "claude-sonnet-4-20250514", + instructions: "You are a helpful assistant that can use a web browser.", + options: { + apiKey: process.env.ANTHROPIC_API_KEY + } +}); +``` +### Agent Execution +```typescript +// Simple task +const result = await agent.execute("Extract the title from this webpage"); + +// Complex multi-step task +const result = await agent.execute({ + instruction: "Apply for the first engineer position with mock data", + maxSteps: 20, + autoScreenshot: true +}); +``` + +### Best Practices +- Be specific with instructions: `"Fill out the contact form with name 'John Doe' and submit it"` +- Break down complex tasks into smaller steps +- Use error handling with try/catch blocks +- Combine agents for navigation with traditional methods for precise data extraction + +```typescript +// Good: Specific instructions +await agent.execute("Navigate to products page and filter by 'Electronics'"); + +// Avoid: Vague instructions +await agent.execute("Do some stuff on this page"); +``` + +## Project Structure Best Practices + +- Store configurations in `stagehand.config.ts` +- Use environment variables for API keys (see `.env.example`) +- Implement main automation logic in functions that accept `{ page, context, stagehand }` +- Use TypeScript with proper imports from `@browserbasehq/stagehand` +`````` + + + + + +``````md +# Stagehand Python Project + +This is a project that uses [Stagehand Python](https://github.com/browserbase/stagehand-python), which provides AI-powered browser automation with `act`, `extract`, and `observe` methods. 
+ +`Stagehand` is a class that provides configuration and browser automation capabilities with: +- `stagehand.page`: A StagehandPage object (extends Playwright Page) +- `stagehand.context`: A StagehandContext object (extends Playwright BrowserContext) +- `stagehand.agent()`: Create AI-powered agents for autonomous multi-step workflows +- `stagehand.init()`: Initialize the browser session +- `stagehand.close()`: Clean up resources + +`Page` extends Playwright's Page class with AI-powered methods: +- `act()`: Perform actions on web elements using natural language +- `extract()`: Extract structured data from pages using schemas +- `observe()`: Plan actions and get selectors before executing + +`Agent` provides autonomous Computer Use Agent capabilities: +- `execute()`: Perform complex multi-step tasks using natural language instructions + +Use the following rules to write code for this project. + +- To plan an instruction like "click the sign in button", use Stagehand `observe` to get the action to execute. + +```python +results = await page.observe("Click the sign in button") +``` + +You can also pass in the following params: + +```python +await page.observe( + instruction="the instruction to execute", + draw_overlay=True # Show visual overlay on observed elements +) +``` + +- The result of `observe` is a list of `ObserveResult` objects that can directly be used as params for `act` like this: + ```python + results = await page.observe("Click the sign in button") + await page.act(results[0]) + ``` +- When writing code that needs to extract data from the page, use Stagehand `extract`. 
Use Pydantic models for schemas: + +```python +from pydantic import BaseModel + +class ExtractedData(BaseModel): + some_value: str + +result = await page.extract( + instruction="the instruction to execute", + schema=ExtractedData +) +``` + +## Initialize + +```python +from stagehand import Stagehand, StagehandConfig +import asyncio +import os +from dotenv import load_dotenv + +load_dotenv() + +async def main(): + config = StagehandConfig( + env="BROWSERBASE", # or "LOCAL" + api_key=os.getenv("BROWSERBASE_API_KEY"), + project_id=os.getenv("BROWSERBASE_PROJECT_ID"), + model_name="google/gemini-2.5-flash-preview-05-20", + model_api_key=os.getenv("MODEL_API_KEY"), + ) + + # Recommended: Use as async context manager + async with Stagehand(config) as stagehand: + page = stagehand.page + # Your automation code here + + # Alternative: Manual initialization + stagehand = Stagehand(config) + await stagehand.init() + page = stagehand.page + # Your automation code here + await stagehand.close() + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### Configuration Options + +Key configuration options in `StagehandConfig`: + +```python +config = StagehandConfig( + env="BROWSERBASE", # or "LOCAL" + api_key=os.getenv("BROWSERBASE_API_KEY"), + project_id=os.getenv("BROWSERBASE_PROJECT_ID"), + model_name="google/gemini-2.5-flash-preview-05-20", + model_api_key=os.getenv("MODEL_API_KEY"), + verbose=1, # 0=minimal, 1=medium, 2=detailed + dom_settle_timeout_ms=30000, + self_heal=True, # Enable self-healing functionality +) +``` + +## Act + +You can act directly with string instructions: + +```python +await page.act("Click the sign in button") +``` + +Use variables for dynamic form filling: + +```python +await page.act( + "Enter the following information: Name: John Doe, Email: john@example.com" +) +``` + +**Best Practices:** +- Cache the results of `observe` to avoid unexpected DOM changes +- Keep actions atomic and specific (e.g., "Click the sign in button" not "Sign in to the 
website") +- Use specific, descriptive instructions + +Act `action` should be as atomic and specific as possible, i.e. "Click the sign in button" or "Type 'hello' into the search input". +AVOID actions that are more than one step, i.e. "Order me pizza" or "Send an email to Paul asking him to call me". + +## Extract + +### Simple String Extraction +```python +sign_in_button_text = await page.extract("extract the sign in button text") +``` + +### Structured Extraction with Schema (Recommended) +Always use Pydantic models for structured data extraction: + +```python +from pydantic import BaseModel, Field +from typing import List + +class ButtonData(BaseModel): + text: str = Field(..., description="Button text content") + +data = await page.extract( + instruction="extract the sign in button text", + schema=ButtonData +) +``` + +### Array Extraction +For arrays, use List types: + +```python +from pydantic import BaseModel, Field +from typing import List + +class ButtonsData(BaseModel): + buttons: List[str] = Field(..., description="List of button texts") + +data = await page.extract( + instruction="extract the text inside all buttons", + schema=ButtonsData +) +``` + +### Complex Object Extraction +For more complex data structures: + +```python +from pydantic import BaseModel, Field +from typing import List + +class Company(BaseModel): + name: str = Field(..., description="Company name") + description: str = Field(..., description="Brief company description") + +class Companies(BaseModel): + companies: List[Company] = Field(..., description="List of companies") + +companies_data = await page.extract( + "Extract names and descriptions of 5 companies", + schema=Companies +) +``` + +## Agent System + +Stagehand provides an Agent System for autonomous web browsing using Computer Use Agents (CUA). 
+ +### Creating Agents + +```python +# Basic agent (uses default model) +agent = stagehand.agent() + +# OpenAI agent +agent = stagehand.agent( + model="computer-use-preview", + instructions="You are a helpful web navigation assistant.", + options={"apiKey": os.getenv("OPENAI_API_KEY")} +) + +# Anthropic agent +agent = stagehand.agent( + model="claude-sonnet-4-20250514", + instructions="You are a helpful web navigation assistant.", + options={"apiKey": os.getenv("ANTHROPIC_API_KEY")} +) +``` + +### Agent Execution + +```python +# Simple task +result = await agent.execute("Play a game of 2048") + +# Complex multi-step task with options +result = await agent.execute( + instruction="Apply for the first engineer position with mock data", + max_steps=20, + auto_screenshot=True, + wait_between_actions=1000 # milliseconds +) +``` + +**Best Practices:** +- Be specific with instructions: `"Fill out the contact form with name 'John Doe' and submit it"` +- Break down complex tasks into smaller steps +- Use error handling with try/except blocks +- Combine agents for navigation with traditional methods for precise data extraction + +```python +# Good: Specific instructions +await agent.execute("Navigate to products page and filter by 'Electronics'") + +# Avoid: Vague instructions +await agent.execute("Do some stuff on this page") +``` + +## Project Structure Best Practices + +- Store configurations in environment variables or config files +- Use async/await patterns consistently +- Implement main automation logic in async functions +- Use async context managers for resource management +- Use type hints and Pydantic models for data validation +- Handle exceptions appropriately with try/except blocks +`````` + + + +## Security notes + +- Do not embed secrets in docs or rule files; use env vars in MCP configs. +- Avoid broad actions that may trigger unintended navigation; prefer `observe` first. 
+ +## Resources/references + +- Context7 MCP (Upstash) + - https://github.com/upstash/context7 +- DeepWiki MCP + - https://mcp.deepwiki.com/ +- Stagehand Docs MCP (Mintlify) + - https://docs.stagehand.dev/mcp diff --git a/packages/docs/v2/first-steps/installation.mdx b/packages/docs/v2/first-steps/installation.mdx new file mode 100644 index 000000000..eb5efe29b --- /dev/null +++ b/packages/docs/v2/first-steps/installation.mdx @@ -0,0 +1,198 @@ +--- +title: Installation +description: Integrate Stagehand into an existing project. +--- + +Install Stagehand in your current app with the TypeScript or Python SDK. + + +For TypeScript/Node.js: We highly recommend using the Node.js runtime environment to run Stagehand scripts, as opposed to newer alternatives like Deno or Bun. + +**Bun does not support Stagehand** since it doesn't support [Playwright](https://github.com/search?q=repo:oven-sh/bun+playwright&type=issues). + +For Python: We require Python 3.9+ and recommend using [uv](https://docs.astral.sh/uv/) to manage your virtual environment. + + + + + +### Install dependencies + + +```bash npm +npm install @browserbasehq/stagehand playwright zod +``` + +```bash pnpm +pnpm add @browserbasehq/stagehand playwright zod +``` + +```bash yarn +yarn add @browserbasehq/stagehand playwright zod +``` + + + +If you plan to run locally, install browsers once: `npx playwright install`. +For cloud browser sessions, skip this. + + +### Configure environment + +Set environment variables (or a `.env` via your framework): + + +```bash Bash +OPENAI_API_KEY=your_api_key +BROWSERBASE_API_KEY=your_api_key +BROWSERBASE_PROJECT_ID=your_project_id +``` + + +### Use in your codebase + +Add Stagehand where you need browser automation. 
+ + +```typescript TypeScript +import "dotenv/config"; +import { Stagehand } from "@browserbasehq/stagehand"; +import { z } from "zod/v3"; + +async function main() { + const stagehand = new Stagehand({ + env: "BROWSERBASE" + }); + + await stagehand.init(); + const page = stagehand.page; + + await page.goto("https://example.com"); + + // Act on the page + await page.act("Click the sign in button"); + + // Extract structured data + const { title } = await page.extract({ + instruction: "extract the page title", + schema: z.object({ + title: z.string(), + }), + }); + + console.log(title); + await stagehand.close(); +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); +``` + + + + + + +### Add dependencies + + + +```bash uv +uv add stagehand +``` + +```bash pip +pip install stagehand +``` + + + +### Configure environment + +Set environment variables (or a `.env` via your framework): + + +```bash Bash +MODEL_API_KEY=your_api_key +BROWSERBASE_API_KEY=your_api_key +BROWSERBASE_PROJECT_ID=your_project_id +``` + + +### Use in your codebase + + +```python Python +import os +import asyncio +from stagehand import Stagehand + +async def main(): + stagehand = Stagehand( + env="BROWSERBASE", + model_api_key=os.getenv("MODEL_API_KEY") + ) + await stagehand.init() + page = stagehand.page + + await page.goto("https://example.com") + + # Act on the page + await page.act("Click the sign in button") + + # Extract structured data + result = await page.extract({ + "instruction": "extract the page title", + "schema": { + "title": { + "type": "string" + } + } + }) + + print(result["title"]) + await stagehand.close() + +if __name__ == "__main__": + asyncio.run(main()) +``` + + + + + + +## Next steps + + + + Environment, Browserbase vs Local, logging, timeouts, LLM customization + + + Perform precise actions with natural language + + + Typed data extraction with Zod schemas + + + Discover elements and suggested actions + + \ No newline at end of file diff --git 
a/packages/docs/v2/first-steps/introduction.mdx b/packages/docs/v2/first-steps/introduction.mdx new file mode 100644 index 000000000..1615eb9a0 --- /dev/null +++ b/packages/docs/v2/first-steps/introduction.mdx @@ -0,0 +1,138 @@ +--- +title: Introducing Stagehand +sidebarTitle: Introduction +description: Developers use Stagehand to reliably automate the web. +--- + +Stagehand is a browser automation framework used to control web browsers with natural language and code. By combining the power of AI with the precision of code, Stagehand makes web automation flexible, maintainable, and actually reliable. + +## The Problem with Browser Automation + +Traditional frameworks like Playwright and Puppeteer force you to write brittle scripts that break with every UI change. Web agents promise to solve this with AI, but leave you at the mercy of unpredictable behavior. + +**You're stuck between two bad options:** +- **Too brittle**: Traditional selectors break when websites change +- **Too agentic**: AI agents are unpredictable and impossible to debug + +## Enter Stagehand + +Stagehand gives you the best of both worlds through four powerful primitives that let you choose exactly how much AI to use: + + + + Execute actions using natural language + + + Pull structured data with schemas + + + Discover available actions on any page + + + Automate entire workflows autonomously + + + + +```typescript TypeScript +// Act - Execute natural language actions +await page.act("click the login button"); + +// Extract - Pull structured data +const { price } = await page.extract({ + schema: z.object({ price: z.number() }) +}); + +// Observe - Discover available actions +const actions = await page.observe("find submit buttons"); + +// Agent - Automate entire workflows +const agent = stagehand.agent({ + provider: "anthropic", + model: "claude-sonnet-4-20250514", + options: { + apiKey: process.env.ANTHROPIC_API_KEY, + }, +}) +await agent.execute("apply for this job"); +``` +```python Python +# 
Act - Execute natural language actions +await page.act("click the login button") + +# Extract - Pull structured data +result = await page.extract( + schema={"price": float} +) + +# Observe - Discover available actions +actions = await page.observe("find submit buttons") + +# Agent - Automate entire workflows +await agent.execute("apply for this job") +``` + + + +## Why Developers Choose Stagehand + +- **Precise Control**: Mix AI-powered actions with deterministic code. You decide exactly how much AI to use. + +- **Actually Repeatable**: Save and replay actions exactly. No more "it worked on my machine" with browser automations. + +- **Maintainable at Scale**: One script can automate multiple websites. When sites change, your automations adapt. + +- **Composable Tools**: Choose your level of automation with Act, Extract, Observe, and Agent. + +## Built for Modern Development +Stagehand is designed for developers building production browser automations and AI agents that need reliable web access. + + + + Use any Playwright API alongside Stagehand. You're never locked into our abstractions. + + + First-class support for both ecosystems with type safety and IDE autocomplete. + + + Compatible with all Chromium-based browsers: Chrome, Edge, Arc, Brave, and more. + + + Created and maintained by the team behind enterprise browser infrastructure. + + + +## Get Started in 60 Seconds + + **Pro tip**: For best results, we recommend using Stagehand with [Browserbase](https://www.browserbase.com) for reliable cloud browser infrastructure. 
+ + + + Build your first automation in under a minute + + + Generate Stagehand scripts with AI + + + See real-world automation examples + + + Get help from the community + + diff --git a/packages/docs/v2/first-steps/quickstart.mdx b/packages/docs/v2/first-steps/quickstart.mdx new file mode 100644 index 000000000..ff32a2ea6 --- /dev/null +++ b/packages/docs/v2/first-steps/quickstart.mdx @@ -0,0 +1,125 @@ +--- +title: Quickstart +description: 'Stagehand allows you to build web automations with natural language and code.' +--- + +If this is your **first time using Stagehand**, you should try [Director](https://director.ai) first. It's an agent that allows you to build Stagehand workflows using natural language. You can also try Stagehand using our [MCP server](/integrations/mcp/introduction). + +Otherwise, the quickest way to start with Stagehand is with our CLI. It scaffolds a ready‑to‑run Stagehand app with sensible defaults and an example script. + + +This quickstart is for **TypeScript**. For **Python**, see the [installation guide](/v2/first-steps/installation). + + +## 1) Create a sample project + + +```bash Bash +npx create-browser-app +``` + + +## 2) Run it + +Follow the CLI prompts to enter the project directory and add your API keys. Then run the example script. + + +```bash Bash +cd my-stagehand-app # Enter the project directory +cp .env.example .env # Add your API keys +npm start # Run the example script +``` + + +## 3) Use Stagehand (act, extract, observe) + +The scaffold includes an `index.ts` file that contains the example script.
Here's what it looks like: + + +```typescript TypeScript +import "dotenv/config"; +import { Stagehand } from "@browserbasehq/stagehand"; + +async function main() { + const stagehand = new Stagehand({ + env: "BROWSERBASE" + }); + + await stagehand.init(); + + console.log(`Stagehand Session Started`); + console.log(`Watch live: https://browserbase.com/sessions/${stagehand.browserbaseSessionID}`); + + const page = stagehand.page; + + await page.goto("https://stagehand.dev"); + + const extractResult = await page.extract("Extract the value proposition from the page."); + console.log(`Extract result:\n`, extractResult); + + const actResult = await page.act("Click the 'Evals' button."); + console.log(`Act result:\n`, actResult); + + const observeResult = await page.observe("What can I click on this page?"); + console.log(`Observe result:\n`, observeResult); + + const agent = await stagehand.agent({ + instructions: "You're a helpful assistant that can control a web browser.", + }); + + const agentResult = await agent.execute("What is the most accurate model to use in Stagehand?"); + console.log(`Agent result:\n`, agentResult); + + await stagehand.close(); +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); + +``` + + + +To use, set provider keys in `.env` (e.g., `OPENAI_API_KEY`). For cloud browsers, add `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID`. + + +## Next steps + +Learn about the Stagehand primitives: act, extract, observe, and agent. 
+ + + + Perform actions on web pages with natural language + + + + Get structured data with Zod schemas + + + + Discover available elements and actions + + + + Autonomous multi-step browser workflows + + + diff --git a/packages/docs/v2/integrations/crew-ai/configuration.mdx b/packages/docs/v2/integrations/crew-ai/configuration.mdx new file mode 100644 index 000000000..7b8e7cc20 --- /dev/null +++ b/packages/docs/v2/integrations/crew-ai/configuration.mdx @@ -0,0 +1,170 @@ +--- +title: "Use CrewAI to Automate Browser Tasks" +sidebarTitle: Configuration +description: "Create intelligent agents that can interact with websites and automate browser tasks using natural language instructions" +--- + +This guide walks you through setting up CrewAI with Browserbase to create agents that can perform web automation tasks using natural language instructions. + +## Step 1: Install Dependencies + +Install the required packages for CrewAI and Stagehand integration: + +```bash +pip install stagehand-py crewai crewai-tools +``` + +## Step 2: Configure Environment Variables + +You'll need the following credentials: + +1. **Browserbase API Key and Project ID**: Get these from your [Browserbase dashboard](https://www.browserbase.com/) +2.
**LLM API Key**: Get an API key from [OpenAI](https://platform.openai.com/api-keys) or [Anthropic](https://console.anthropic.com/) + +Store your API keys securely as environment variables: + +```bash +BROWSERBASE_API_KEY="your-browserbase-api-key" +BROWSERBASE_PROJECT_ID="your-browserbase-project-id" +OPENAI_API_KEY="your-openai-api-key" +ANTHROPIC_API_KEY="your-anthropic-api-key" +``` + +## Step 3: Create Your First Agent + +Create a Python script with a basic CrewAI agent: + +```python +import os +from crewai import Agent, Task, Crew +from crewai_tools import StagehandTool +from stagehand.schemas import AvailableModel + +# Get API keys from environment +browserbase_api_key = os.environ.get("BROWSERBASE_API_KEY") +browserbase_project_id = os.environ.get("BROWSERBASE_PROJECT_ID") +model_api_key = os.environ.get("OPENAI_API_KEY") # or ANTHROPIC_API_KEY + +# Initialize the StagehandTool +stagehand_tool = StagehandTool( + api_key=browserbase_api_key, + project_id=browserbase_project_id, + model_api_key=model_api_key, + model_name=AvailableModel.GPT_4O, # or AvailableModel.CLAUDE_3_7_SONNET_LATEST +) + +# Create an agent with the tool +researcher = Agent( + role="Web Researcher", + goal="Find and summarize information from websites", + backstory="I'm an expert at finding information online.", + verbose=True, + tools=[stagehand_tool], +) +``` + +## Step 4: Create and Run a Task + +Define a task for your agent and execute it: + +```python +# Create a task that uses the tool +research_task = Task( + description="Go to https://www.example.com and tell me what you see on the homepage.", + agent=researcher, +) + +# Run the crew +crew = Crew( + agents=[researcher], + tasks=[research_task], + verbose=True, +) + +try: + result = crew.kickoff() + print(result) +finally: + # Clean up resources + stagehand_tool.close() +``` + +## Step 5: Run Your Script + +Execute your Python script: + +```bash +python your_crew_script.py +``` + +## Advanced Configuration + +Customize the 
StagehandTool behavior with additional parameters: + +```python +stagehand_tool = StagehandTool( + api_key=browserbase_api_key, + project_id=browserbase_project_id, + model_api_key=model_api_key, + model_name=AvailableModel.CLAUDE_3_7_SONNET_LATEST, + dom_settle_timeout_ms=5000, # Wait longer for DOM to settle + headless=True, # Run browser in headless mode + self_heal=True, # Attempt to recover from errors + wait_for_captcha_solves=True, # Wait for CAPTCHA solving + verbose=1, # Control logging verbosity (0-3) +) +``` + +## Example Tasks + + + + ```python + form_task = Task( + description=""" + Submit a contact form: + 1. Go to https://example.com/contact + 2. Fill out the form with name 'John Doe', email 'john@example.com' + 3. Submit and confirm success + """, + agent=researcher, + ) + ``` + + + ```python + extraction_task = Task( + description=""" + Extract product information: + 1. Go to the products page + 2. Extract all product names, prices, and descriptions + 3. Format as structured data + """, + agent=researcher, + ) + ``` + + + ```python + navigation_task = Task( + description=""" + Navigate and analyze: + 1. Start at homepage + 2. Navigate to products section + 3. Filter by 'Electronics' category + 4. Find and extract details of highest-rated product + """, + agent=researcher, + ) + ``` + + + + + + Dive into the CrewAI documentation to learn more about its capabilities and integrations. + + + Access the Browserbase documentation for comprehensive guides and resources. 
+ + \ No newline at end of file diff --git a/packages/docs/v2/integrations/crew-ai/introduction.mdx b/packages/docs/v2/integrations/crew-ai/introduction.mdx new file mode 100644 index 000000000..a8c5bfd8d --- /dev/null +++ b/packages/docs/v2/integrations/crew-ai/introduction.mdx @@ -0,0 +1,35 @@ +--- +title: "CrewAI Introduction" +sidebarTitle: Introduction +description: "Automate browser tasks using natural language instructions with CrewAI" +--- + +## Overview + +This guide shows you how to use CrewAI with Browserbase to create intelligent agents that can automate web interactions. By the end of this guide, you'll know how to: + +- Set up CrewAI with the StagehandTool +- Create agents that can interact with websites +- Automate browser tasks using natural language instructions +- Extract structured data from web pages + +## When You'd Use This + +The CrewAI integration is perfect for scenarios where you need intelligent web automation: + +- **Research automation**: Have agents research information across multiple websites +- **Data collection**: Extract structured data from e-commerce sites, job boards, or news sites +- **Form automation**: Automatically fill out and submit forms based on specific criteria +- **Multi-step workflows**: Execute complex browser workflows that require decision-making + +The StagehandTool wraps the Stagehand Python SDK to provide CrewAI agents with the ability to control a real web browser and interact with websites using three core primitives: + +1. **Act**: Perform actions like clicking, typing, or navigating +2. **Extract**: Extract structured data from web pages +3. 
**Observe**: Identify and analyze elements on the page + + + + Learn how to configure and use the StagehandTool with CrewAI agents for web automation tasks + + \ No newline at end of file diff --git a/packages/docs/v2/integrations/langchain/configuration.mdx b/packages/docs/v2/integrations/langchain/configuration.mdx new file mode 100644 index 000000000..6c3dd4b61 --- /dev/null +++ b/packages/docs/v2/integrations/langchain/configuration.mdx @@ -0,0 +1,242 @@ +--- +title: "LangChain JS Configuration" +sidebarTitle: Configuration +description: "Set up Stagehand with LangChain JS to create intelligent web automation agents" +--- + +This guide walks you through integrating Stagehand with LangChain JS to build powerful web automation workflows using natural language instructions. + +## Step 1: Install Dependencies + +Install the required packages for LangChain JS and Stagehand integration: + +```bash +npm install @langchain/langgraph @langchain/community @langchain/core @browserbasehq/stagehand +``` + +## Step 2: Configure Environment Variables + +For remote browser automation, set up your Browserbase credentials: + +```bash +BROWSERBASE_API_KEY="your-browserbase-api-key" +BROWSERBASE_PROJECT_ID="your-browserbase-project-id" +``` + +## Step 3: Create a Stagehand Instance + +Initialize Stagehand with your preferred configuration: + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; + +// For local development +const stagehand = new Stagehand({ + env: "LOCAL", + verbose: 2, + enableCaching: false, +}); + +// For production with Browserbase +const stagehand = new Stagehand({ + env: "BROWSERBASE", + verbose: 1, + enableCaching: true, +}); +``` + +## Step 4: Generate the StagehandToolkit + +Create the toolkit that provides LangChain-compatible tools: + +```typescript +import { StagehandToolkit } from '@langchain/community/agents/toolkits/stagehand'; + +const stagehandToolkit = await StagehandToolkit.fromStagehand(stagehand); +``` + +## Step 5: Use 
Individual Tools + +The toolkit provides four specialized tools for web automation: + +### Available Tools + +- **stagehand_navigate**: Navigate to specific URLs +- **stagehand_act**: Perform browser actions (clicking, typing, etc.) +- **stagehand_extract**: Extract structured data using schemas +- **stagehand_observe**: Analyze page elements and possible actions + +### Basic Tool Usage + +```typescript +import { z } from "zod"; + +// Navigate to a website +const navigateTool = stagehandToolkit.tools.find( + (t) => t.name === "stagehand_navigate" +); +await navigateTool.invoke("https://www.google.com"); + +// Perform an action +const actionTool = stagehandToolkit.tools.find( + (t) => t.name === "stagehand_act" +); +await actionTool.invoke('Search for "OpenAI"'); + +// Observe the page +const observeTool = stagehandToolkit.tools.find( + (t) => t.name === "stagehand_observe" +); +const result = await observeTool.invoke( + "What actions can be performed on the current page?" +); +console.log(JSON.parse(result)); + +// Extract structured data +const extractTool = stagehandToolkit.tools.find( + (t) => t.name === "stagehand_extract" +); +const extractResult = await extractTool.invoke({ + instruction: "Extract the main heading and description", + schema: z.object({ + heading: z.string(), + description: z.string(), + }), +}); +console.log(extractResult); +``` + +## Step 6: Build LangGraph Agents + +Integrate with LangGraph for complex automation workflows: + +```typescript +import { createReactAgent } from "@langchain/langgraph/prebuilt"; + +// Create an LLM +const llm = new ChatOpenAI({ + model: "gpt-4", + temperature: 0, +}); + +// Create an agent with Stagehand tools +const agent = createReactAgent({ + llm, + tools: stagehandToolkit.tools, +}); + +// Execute a complex workflow +const result = await agent.invoke({ + messages: [ + { + role: "user", + content: "Go to example.com, find the contact form, and extract all the form fields" + } + ] +}); +``` + +## Advanced 
Configuration + +### Custom Stagehand Configuration + +```typescript +const stagehand = new Stagehand({ + env: "BROWSERBASE", + verbose: 2, + enableCaching: true, + headless: true, + domSettleTimeoutMs: 5000, +}); +``` + +### Error Handling + +```typescript +try { + const result = await agent.invoke({ + messages: [{ role: "user", content: "Navigate to invalid-url.com" }] + }); +} catch (error) { + console.error("Automation failed:", error); +} finally { + // Clean up resources + await stagehand.close(); +} +``` + +## Example Workflows + + + + ```typescript + const extractionAgent = createReactAgent({ + llm, + tools: stagehandToolkit.tools, + }); + + const result = await extractionAgent.invoke({ + messages: [{ + role: "user", + content: ` + Go to news-website.com and extract: + 1. All article headlines + 2. Publication dates + 3. Author names + Format as structured JSON + ` + }] + }); + ``` + + + ```typescript + const formAgent = createReactAgent({ + llm, + tools: stagehandToolkit.tools, + }); + + const result = await formAgent.invoke({ + messages: [{ + role: "user", + content: ` + Navigate to contact-form.com and: + 1. Fill out the contact form with: + - Name: John Doe + - Email: john@example.com + - Message: Inquiry about services + 2. Submit the form + 3. Confirm submission success + ` + }] + }); + ``` + + + ```typescript + const researchAgent = createReactAgent({ + llm, + tools: stagehandToolkit.tools, + }); + + const result = await researchAgent.invoke({ + messages: [{ + role: "user", + content: ` + Research product pricing by: + 1. Visit competitor1.com and extract pricing info + 2. Visit competitor2.com and extract pricing info + 3. Compare features and prices + 4. 
Provide summary analysis + ` + }] + }); + ``` + + + + + + Official LangChain JS documentation for the Stagehand integration + + \ No newline at end of file diff --git a/packages/docs/v2/integrations/langchain/introduction.mdx b/packages/docs/v2/integrations/langchain/introduction.mdx new file mode 100644 index 000000000..b0d7b9341 --- /dev/null +++ b/packages/docs/v2/integrations/langchain/introduction.mdx @@ -0,0 +1,29 @@ +--- +title: "LangChain JS Introduction" +sidebarTitle: Introduction +description: "Integrate Stagehand with LangChain JS for intelligent web automation" +--- + +## Overview + +This guide shows you how to use Stagehand with LangChain JS to create intelligent agents that can automate web interactions. By the end of this guide, you'll know how to: + +- Set up the StagehandToolkit with LangChain JS +- Create agents that can navigate and interact with websites +- Extract structured data using natural language instructions +- Build complex automation workflows with LangGraph + +## When You'd Use This + +The LangChain JS integration is perfect for scenarios where you need intelligent web automation with advanced reasoning: + +- **AI-driven research**: Create agents that can research information across multiple websites and synthesize findings +- **Dynamic form filling**: Automatically fill out complex forms based on contextual requirements +- **Data extraction workflows**: Extract and transform data from multiple sources with intelligent navigation +- **Multi-step web processes**: Execute complex browser workflows that require decision-making and adaptation + + + + Learn how to set up and configure the StagehandToolkit with LangChain JS agents + + \ No newline at end of file diff --git a/packages/docs/v2/integrations/mcp/configuration.mdx b/packages/docs/v2/integrations/mcp/configuration.mdx new file mode 100644 index 000000000..58535d977 --- /dev/null +++ b/packages/docs/v2/integrations/mcp/configuration.mdx @@ -0,0 +1,436 @@ +--- +title: "Browserbase 
MCP Server Configuration" +sidebarTitle: "Configuration" +description: "Configure your browser automation with command-line flags, environment variables, and advanced options" +--- + +## Configuration Overview + +The Browserbase MCP server supports extensive configuration options through command-line flags and environment variables. Configure browser behavior, proxy settings, stealth modes, model selection, and more to customize your browser automation workflows. + + +Command-line flags are only available when running the server locally (`npx @browserbasehq/mcp-server-browserbase` with flags or local development setup). + + +## Environment Variables + +Configure the essential Browserbase credentials and optional debugging settings: + + + +Your Browserbase API key for authentication + + + +Your Browserbase project ID + + + + +## Command-Line Flags + +### Available Flags + +| Flag | Description | +|------|-------------| +| `--proxies` | Enable Browserbase proxies for the session | +| `--advancedStealth` | Enable Browserbase Advanced Stealth (Scale Plan only) | +| `--keepAlive` | Enable Browserbase Keep Alive Session | +| `--contextId ` | Specify a Browserbase Context ID to use | +| `--persist [boolean]` | Whether to persist the Browserbase context (default: true) | +| `--port ` | Port to listen on for HTTP/SHTTP transport | +| `--host ` | Host to bind server to (default: localhost, use 0.0.0.0 for all interfaces) | +| `--cookies [json]` | JSON array of cookies to inject into the browser | +| `--browserWidth ` | Browser viewport width (default: 1024) | +| `--browserHeight ` | Browser viewport height (default: 768) | +| `--modelName ` | The model to use for Stagehand (default: gemini-2.0-flash) | +| `--modelApiKey ` | API key for the custom model provider (required when using custom models) | +| `--experimental` | Enable experimental features (default: false) | + +## Configuration Examples + +### Basic Configuration + + + + + + +```json Direct SHTTP +{ + "mcpServers": 
{ + "browserbase": { + "url": "your-smithery-url.com" + } + } +} +``` + + +When using our remote hosted server, we provide the LLM costs for Gemini, the [best performing model](https://www.stagehand.dev/evals) in [Stagehand](https://www.stagehand.dev). + + + + +```json +{ + "mcpServers": { + "browserbase": { + "command": "npx", + "args": ["@browserbasehq/mcp-server-browserbase"], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key" + } + } + } +} +``` + + + +```json +{ + "mcpServers": { + "browserbase": { + "command": "node", + "args": ["/path/to/mcp-server-browserbase/cli.js"], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key" + } + } + } +} +``` + + + +```bash +# Start server +node cli.js --port 8931 +``` + +```json +{ + "mcpServers": { + "browserbase": { + "url": "http://localhost:8931/mcp", + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key" + } + } + } +} +``` + + + +### Advanced Features + + + +Enable Browserbase proxies for IP rotation and geo-location testing. + + +[Learn more about Browserbase Proxies](https://docs.browserbase.com/features/proxies) + + +```json +{ + "mcpServers": { + "browserbase": { + "command": "npx", + "args": ["@browserbasehq/mcp-server-browserbase", "--proxies"], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key" + } + } + } +} +``` + + + +Enable advanced anti-detection features for enhanced stealth browsing. + + +[Learn more about Advanced Stealth](https://docs.browserbase.com/features/stealth-mode#advanced-stealth-mode) + +**Note:** Advanced Stealth is only available for Scale Plan users. 
+ + +```json +{ + "mcpServers": { + "browserbase": { + "command": "npx", + "args": ["@browserbasehq/mcp-server-browserbase", "--advancedStealth"], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key" + } + } + } +} +``` + + + +Use persistent browser contexts to maintain authentication and state across sessions. + + +[Learn more about Browserbase Contexts](https://docs.browserbase.com/features/contexts) + + +```json +{ + "mcpServers": { + "browserbase": { + "command": "npx", + "args": ["@browserbasehq/mcp-server-browserbase", "--contextId", "your_context_id"], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id" + } + } + } +} +``` + + + +### Browser Customization + + + +Customize browser window dimensions. Default is 1024x768. Recommended aspect ratios: 16:9. + +```json +{ + "mcpServers": { + "browserbase": { + "command": "npx", + "args": [ + "@browserbasehq/mcp-server-browserbase", + "--browserWidth", "1920", + "--browserHeight", "1080" + ], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key" + } + } + } +} +``` + +**Common Resolutions:** +- Desktop: 1920x1080, 1280x720, 1024x768 +- Mobile: 375x667 (iPhone), 360x640 (Android) +- Tablet: 768x1024 (iPad) + + + +Inject session cookies for authentication. Useful when persistent contexts don't handle session cookies. + + +Cookies must be in [Playwright Cookie format](https://playwright.dev/docs/api/class-browsercontext#browser-context-cookies). 
+ + +```json +{ + "mcpServers": { + "browserbase": { + "command": "npx", + "args": [ + "@browserbasehq/mcp-server-browserbase", + "--cookies", + "[{\"name\": \"session\", \"value\": \"abc123\", \"domain\": \".example.com\", \"path\": \"/\", \"httpOnly\": true, \"secure\": true}]" + ], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key" + } + } + } +} +``` + + + +## Model Configuration + +Configure AI models for enhanced browser automation. Stagehand defaults to Google's Gemini 2.0 Flash but supports multiple providers. + + +When using any custom model (non-default), you must provide your own API key for that model provider using the `--modelApiKey` flag. + + + + +**Google Gemini** (Default) +- `google/gemini-2.0-flash` (default) +- `google/gemini-1.5-pro` +- `google/gemini-1.5-flash` + +**OpenAI** +- `openai/gpt-4o` +- `openai/gpt-4o-mini` +- `openai/o1-mini` +- `openai/o1-preview` +- `openai/o3-mini` + +**Anthropic Claude** +- `anthropic/claude-3-5-sonnet-latest` +- `anthropic/claude-3-7-sonnet-latest` + +[View full list of supported models](https://docs.stagehand.dev/examples/custom_llms#supported-llms) + + + + +```json OpenAI GPT-4o +{ + "mcpServers": { + "browserbase": { + "command": "npx", + "args": [ + "@browserbasehq/mcp-server-browserbase", + "--modelName", "openai/gpt-4o", + "--modelApiKey", "your_openai_api_key" + ], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id" + } + } + } +} +``` + +```json Claude Sonnet +{ + "mcpServers": { + "browserbase": { + "command": "npx", + "args": [ + "@browserbasehq/mcp-server-browserbase", + "--modelName", "anthropic/claude-3-5-sonnet-latest", + "--modelApiKey", "your_anthropic_api_key" + ], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id" + } + } + } +} +``` + + + + +## Development Configuration + + + +Enable detailed logging for 
troubleshooting and development. + +```json +{ + "mcpServers": { + "browserbase": { + "command": "npx", + "args": ["@browserbasehq/mcp-server-browserbase"], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key", + "DEBUG": "true" + } + } + } +} +``` + + + +Configure custom host and port for SHTTP transport. + +```json +{ + "mcpServers": { + "browserbase": { + "command": "npx", + "args": [ + "@browserbasehq/mcp-server-browserbase", + "--host", "0.0.0.0", + "--port", "8080" + ], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key" + } + } + } +} +``` + + + +## Best Practices + + +- Use appropriate viewport sizes for your use case +- Enable proxies only when needed for geo-location +- Choose efficient models (Gemini Flash for speed, GPT-4o for accuracy) +- Reuse contexts for authentication persistence + + + +- Store API keys securely in environment variables +- Use Advanced Stealth for sensitive operations +- Implement proper session management +- Rotate cookies and contexts regularly + + + +- Enable debug mode during development +- Use context persistence for faster iteration +- Test with different viewport sizes +- Monitor session usage and quotas + + + +- Use NPM installation for reliability +- Configure appropriate timeouts +- Implement error handling and retries +- Monitor performance and resource usage + + +## Further Reading + + + +Complete platform documentation + + + +AI-powered browser automation + + + +Get help from our team + + \ No newline at end of file diff --git a/packages/docs/v2/integrations/mcp/introduction.mdx b/packages/docs/v2/integrations/mcp/introduction.mdx new file mode 100644 index 000000000..9938fb982 --- /dev/null +++ b/packages/docs/v2/integrations/mcp/introduction.mdx @@ -0,0 +1,235 @@ +--- +title: "Browserbase MCP Server" +sidebarTitle: "Introduction" +description: 
"AI-powered browser automation through Model Context Protocol integration with Stagehand" +--- + +## Overview + +The Browserbase MCP Server brings powerful browser automation capabilities to Claude through the Model Context Protocol (MCP). Built on top of [Stagehand](https://docs.stagehand.dev/), this integration provides AI-powered web automation using natural language commands. + + +This server enables Claude to control browsers, navigate websites, interact with web elements, and extract data—all through simple conversational commands. + + + +**Two Ways to Use MCP:** +1. **Browserbase MCP Server**: Provides browser automation tools to Claude Desktop and other MCP clients +2. **MCP Integrations in Stagehand**: Add external tools (like Exa search, Supabase) to Stagehand agents + +This documentation covers the Browserbase MCP Server. For using MCP integrations within Stagehand agents, see [MCP Integrations](/best-practices/mcp-integrations). + + +## Key Features + + + +Control browsers using plain English commands like "click the login button" or "fill out the contact form" + + + +Navigate, click, and fill forms with ease + + + +Extract structured data from any website automatically + + + + +Run multiple browser sessions simultaneously for complex workflows + + + +Capture and analyze webpage screenshots programmatically + + + +Handle authentication and session persistence across interactions + + + +## Core Benefits + + + + + +No need to learn complex selectors or automation syntax. Simply describe what you want to do in natural language. + + + +Get started in minutes with our NPM package or our remote hosted URL. + + + +Stagehand's AI understands web page context and can adapt to different layouts and designs. + + + + + + + +Navigate, click, type, scroll, and interact with any web element. + + + +Extract structured information from complex web pages automatically. + + + +Maintain authentication states and cookies across multiple interactions. 
+ + + +Run parallel browser instances for complex workflows. + + + + + + + +Built on Browserbase's cloud browser platform for consistent performance. + + + +Handle multiple concurrent sessions and high-volume automation tasks. + + + +Stealth mode, proxy support, and advanced anti-detection capabilities. + + + +Detailed session recordings and debugging information. + + + + + +## Use Cases + + + + + +Track product prices, availability, and competitor information + + + +Gather data from multiple sources for analysis and reporting + + + +Collect articles, posts, and media from various websites + + + +Extract contact information and business data from directories + + + + + + + +Create comprehensive test suites for web applications + + + +Test functionality across different browser environments + + + +Simulate real user interactions and workflows + + + +Track page load times and user experience metrics + + + + + + + +Automatically fill and submit complex web forms + + + +Extract data and generate automated reports + + + +Schedule posts and monitor engagement across platforms + + + +Automate repetitive web-based business processes + + + + + +## Session Management + + +The Browserbase MCP Server supports both single and multi-session architectures to accommodate different automation needs. + + + + +**Traditional Approach** +- One active browser session at a time +- Simpler for basic automation tasks +- Automatic session lifecycle management +- Ideal for sequential workflows + + + +**Advanced Parallel Processing** +- Multiple independent browser sessions +- Each session maintains separate state +- Parallel execution capabilities +- Perfect for complex workflows + + + +## Getting Started + + + +Choose from NPM installation, remote hosted URL, or local development based on your needs. + + + +Set up your Browserbase API credentials in the MCP configuration. +Get your API keys from the [Browserbase Dashboard](https://www.browserbase.com/overview). 
+ + + +Begin using natural language commands to control browsers through Claude. + + + + +Ready to get started? Check out our [Setup Guide](/integrations/mcp/setup) for detailed installation instructions. + + +## Further Reading + + + +Get started with installation and configuration + + + +Explore all available automation tools + + + +Customize your browser automation setup + + \ No newline at end of file diff --git a/packages/docs/v2/integrations/mcp/setup.mdx b/packages/docs/v2/integrations/mcp/setup.mdx new file mode 100644 index 000000000..8219e74a6 --- /dev/null +++ b/packages/docs/v2/integrations/mcp/setup.mdx @@ -0,0 +1,197 @@ +--- +title: "Browserbase MCP Server Setup" +sidebarTitle: "Setup" +description: "Add the Browserbase MCP Server to Claude" +--- + +## Quick Installation + + +One-click installation directly in Cursor with pre-configured settings + + +We support multiple transport methods for our MCP server: STDIO and SHTTP. We recommend using SHTTP with our remote hosted URL to take advantage of the server at full capacity. + +## Prerequisites + + + +Get your Browserbase API key and project ID from the [Browserbase Dashboard](https://www.browserbase.com/overview). + + +Browserbase API Key and Project ID settings + + +Then copy your API Key and Project ID directly from the input. + + + +## Installation Methods + + + + +Go to [smithery.ai](https://smithery.ai/server/@browserbasehq/mcp-browserbase) and enter your API keys and configuration to get a remote hosted URL. + +![Smithery](../../images/mcp/smithery.jpg) + + +```json Smithery +{ + "mcpServers": { + "browserbase": { + "url": "your-smithery-url.com" + } + } +} +``` + + +When using our remote hosted server, we provide the LLM costs for Gemini, the [best performing model](https://www.stagehand.dev/evals) in [Stagehand](https://www.stagehand.dev). + + + + +The easiest way to get started locally is using our NPM package. 
+ + +If you would like to use a different model, you have to pass the model name and keys in the args. More info [here](https://docs.browserbase.com/integrations/mcp/configuration). + + + + +Go into your MCP Config JSON and add the Browserbase Server: + + +```json Claude Desktop +{ + "mcpServers": { + "browserbase": { + "command": "npx", + "args": ["@browserbasehq/mcp-server-browserbase"], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key" + } + } + } +} +``` + + + + + +That's it! Reload your MCP client and Claude will be able to use Browserbase. + + + + + + +For local development or customization, you can run the server locally. + + + +```bash +# Clone the Repo +git clone https://github.com/browserbase/mcp-server-browserbase.git +cd mcp-server-browserbase + +# Install the dependencies and build the project +npm install && npm run build +``` + + + +You can run locally using either STDIO or Streamable HTTP (SHTTP). + + + +Add the following to your MCP Config JSON file: + +```json +{ + "mcpServers": { + "browserbase": { + "command": "node", + "args": ["/path/to/mcp-server-browserbase/cli.js"], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key" + } + } + } +} +``` + + + +First, run the server: + +```bash +node cli.js --port 8931 +``` + +Then add this to your MCP Config JSON file: + +```json +{ + "mcpServers": { + "browserbase": { + "url": "http://localhost:8931/mcp", + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key" + } + } + } +} +``` + + + + + + +Reload your MCP client and you should be good to go! + + + + + + +## Verify Installation + + + +Restart/refresh your Claude Client app and you should see the tools available by clicking the 🔨 icon. 
+ + + +Get started using our MCP Server by asking Claude to navigate to any page and see your Browserbase Browser in action on the [dashboard](https://www.browserbase.com/sessions). + + +Try asking Claude: "Navigate to google.com and take a screenshot" + + + + +## Further Reading + + + +Learn more about the MCP protocol + + + +Explore Browserbase features and capabilities + + + +Get help from our support team + + \ No newline at end of file diff --git a/packages/docs/v2/integrations/mcp/tools.mdx b/packages/docs/v2/integrations/mcp/tools.mdx new file mode 100644 index 000000000..953518f43 --- /dev/null +++ b/packages/docs/v2/integrations/mcp/tools.mdx @@ -0,0 +1,241 @@ +--- +title: "Browserbase MCP Server Tools" +sidebarTitle: "Tools" +description: "This guide covers the specialized tools available in the Browserbase MCP server for browser automation and interaction." +--- + +## Overview + +The Browserbase MCP server provides comprehensive tools for browser automation and session management. These tools allow you to perform actions like navigating pages, capturing screenshots, manipulating cookies, and managing multiple browser sessions simultaneously. + +## Core Browser Automation Tools + +These are the primary tools for modern web automation using natural language commands. 
+ + +Navigate to any URL in the browser + + + The URL to navigate to + + + + +Perform an action on the web page using natural language + + + The action to perform (e.g., "click the login button", "fill form field") + + + + + +Extract all text content from the current page (filters out CSS and JavaScript) + +No input parameters required + + + +Observe and find actionable elements on the web page + + + Specific instruction for observation (e.g., "find the login button", "locate search form") + + + + + +Capture a PNG screenshot of the current page + +No input parameters required + + + Base-64 encoded PNG data + + + + +Get the current URL of the browser page + +No input parameters required + + + Complete URL including protocol, domain, path, and any query parameters or fragments + + + + +Get current URLs of all active browser sessions + +No input parameters required + + + Mapping of session IDs to their current URLs in JSON format + + + +## Single Session Management + +Traditional approach with one active browser session. Simpler for basic automation tasks and automatically manages the active session. + + +Create or reuse a cloud browser session using Browserbase with fully initialized Stagehand + + + Optional session ID to use/reuse. If not provided, creates new session + + + + + +Close the current Browserbase session, disconnect the browser, and cleanup Stagehand instance + +No input parameters required + + + +## Multi-Session Management + +Advanced approach with multiple parallel browser sessions for complex automation workflows. Each session maintains independent state, cookies, and browser context. 
+ +### Session Lifecycle Management + + +Create a new independent Stagehand browser session with full web automation capabilities + + + Human-readable name for tracking (e.g., 'login-flow', 'data-scraping') + + + + +List all currently active Stagehand browser sessions with detailed metadata + +No input parameters required + + + + +Close and clean up a specific Stagehand browser session + + + Exact session ID to close (cannot be undone) + + + +### Session-Specific Automation Tools + +All core browser automation tools are available with session-specific variants: + + +Navigate to a URL in a specific browser session + + + The session ID to use + + + + The URL to navigate to + + + + +Perform an action in a specific browser session using natural language + + + The session ID to use + + + + The action to perform + + + + + +Extract structured information from a specific browser session + + + The session ID to use + + + + What to extract from the page + + + + +Observe and find actionable elements in a specific browser session + + + The session ID to use + + + + What to observe (e.g., "find the login button") + + + + Whether to return the action to perform + + + + +Get the current URL of a specific browser session + + + The session ID to use + + + + Complete URL including protocol, domain, path, and any query parameters or fragments + + + +### Multi-Session Use Cases + + + + Run multiple scraping sessions simultaneously across different websites + + + + Compare user flows across different browser sessions with varying configurations + + + + Perform coordinated actions across multiple websites or applications + + + + Keep fallback sessions ready in case primary sessions encounter issues + + + +## Resources + + + The server provides access to screenshot resources with URI-based access. 
+ + example: + ``` + screenshot://screenshot-name-of-the-screenshot + ``` + + + +## Further Reading + + + +Learn more about the MCP protocol + + + +Explore Stagehand's AI-powered browser automation + + + +Get help from our support team + + \ No newline at end of file diff --git a/packages/docs/v2/integrations/vercel/configuration.mdx b/packages/docs/v2/integrations/vercel/configuration.mdx new file mode 100644 index 000000000..83e2318ce --- /dev/null +++ b/packages/docs/v2/integrations/vercel/configuration.mdx @@ -0,0 +1,165 @@ +--- +title: Use Stagehand in Next.js +sidebarTitle: Configuration +description: Next.js is a popular framework for developing web-based applications in production. It powers Stagehand apps like [Director](https://director.ai), [Brainrot](https://brainrot.run) and [Open Operator](https://operator.browserbase.com). +--- + + + Clone our [GitHub repo](https://github.com/browserbase/stagehand-nextjs-quickstart) to get started with Stagehand in Next.js. + + +## Add Stagehand to an existing Next.js project +If you'd like to add Stagehand to an existing Next.js project, you can do so by installing the dependencies: + + + ```bash + npm install @browserbasehq/stagehand @browserbasehq/sdk playwright zod + ``` + + + + ```bash + pnpm add @browserbasehq/stagehand @browserbasehq/sdk playwright zod + ``` + + + + ```bash + yarn add @browserbasehq/stagehand @browserbasehq/sdk playwright zod + ``` + + + +### Write a server action +Next, let's define our `main` function as a server action in `app/stagehand/main.ts`. This file will have the following three functions: + +1. **`main`: Run the main Stagehand script** +2. **`runStagehand`: Initialize and run the `main` function** +3. **`startBBSSession`: Start a Browserbase session** + +```ts app/stagehand/main.ts +// 🤘 Welcome to Stagehand! +// This file is from the [Stagehand docs](https://docs.stagehand.dev/sections/examples/nextjs). 
+ +"use server"; + +import { Stagehand } from "@browserbasehq/stagehand"; +import { z } from "zod/v3"; +import { Browserbase } from "@browserbasehq/sdk"; + +/** + * Run the main Stagehand script + */ +async function main(stagehand: Stagehand) { + // You can use the `page` instance to write any Playwright code + // For more info: https://playwright.dev/docs/pom + const page = stagehand.page; + + // In this example, we'll get the title of the Stagehand quickstart page + await page.goto("https://docs.stagehand.dev/"); + await page.act("click the quickstart link"); + const { title } = await page.extract({ + instruction: "extract the main heading of the page", + schema: z.object({ + title: z.string(), + }), + }); + + return title; +} + +/** + * Initialize and run the main() function + */ +export async function runStagehand(sessionId?: string) { + const stagehand = new Stagehand({ + env: "BROWSERBASE", + apiKey: process.env.BROWSERBASE_API_KEY, + projectId: process.env.BROWSERBASE_PROJECT_ID, + verbose: 1, + logger: console.log, + browserbaseSessionID: sessionId, + disablePino: true, + }); + await stagehand.init(); + await main(stagehand); + await stagehand.close(); +} + +/** + * Start a Browserbase session + */ +export async function startBBSSession() { + const browserbase = new Browserbase(); + const session = await browserbase.sessions.create({ + projectId: process.env.BROWSERBASE_PROJECT_ID!, + }); + const debugUrl = await browserbase.sessions.debug(session.id); + return { + sessionId: session.id, + debugUrl: debugUrl.debuggerFullscreenUrl, + }; +} +``` + +### Create a client component +Next, let's create a client component that will start a Browserbase session and run the `main` function with the server actions we just defined. We'll first create a Browserbase session and embed the session in an iframe before running the `main` function. 
+ +```tsx app/components/stagehandEmbed.tsx +"use client"; + +import { useCallback, useState } from "react"; +import { runStagehand, startBBSSession } from "@/app/stagehand/main"; + +export function StagehandEmbed() { + const [sessionId, setSessionId] = useState(null); + const [debugUrl, setDebugUrl] = useState(null); + + const startSession = useCallback(async () => { + const { sessionId, debugUrl } = await startBBSSession(); + setSessionId(sessionId); + setDebugUrl(debugUrl); + await runStagehand(sessionId); + }, []); + + return ( +
+ {!sessionId && } + {sessionId && debugUrl && ( + +You might've heard of [Gemini Computer Use](https://blog.google/technology/google-deepmind/gemini-computer-use-model/), [Claude Computer Use](https://www.anthropic.com/news/3-5-models-and-computer-use), or [OpenAI's Computer Using Agent](https://openai.com/index/computer-using-agent/). + +These are powerful tools that can convert natural language into actions on the computer. However, you'd otherwise need to write your own code to convert these actions into Playwright commands. + +Stagehand not only handles the execution of Computer Use outputs, but also lets you hot-swap between Google, OpenAI, and Anthropic models with one line of code. You can find more information on the performance of different computer use models by visiting our [evals page](https://www.stagehand.dev/agent-evals). + +## How to use a Computer Use Agent in Stagehand + +Stagehand lets you use Computer Use Agents with one line of code: + + +**Deprecation Notice:** The `cua: true` option is deprecated and will be removed in a future version. Use `mode: "cua"` instead. + + + +**IMPORTANT! Configure your browser dimensions** + +Computer Use Agents will often return XY-coordinates to click on the screen, so you'll need to configure your browser dimensions. + +If not specified, the default browser dimensions are 1288 x 711. You can also configure the browser dimensions in the `browserbaseSessionCreateParams` or `localBrowserLaunchOptions` options. 
+ + + +### Configuring browser dimensions + +Browser configuration differs by environment: + + + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + model: "google/gemini-2.5-flash", + + browserbaseSessionCreateParams: { + projectId: process.env.BROWSERBASE_PROJECT_ID!, + browserSettings: { + blockAds: true, + viewport: { + width: 1288, + height: 711, + }, + }, + }, +}); + +await stagehand.init(); +``` + + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "LOCAL", + localBrowserLaunchOptions: { + headless: false, + viewport: { + width: 1288, + height: 711, + }, + } +}); + +await stagehand.init(); +``` + + + +### Direct your Computer Use Agent + +Call `execute` on the agent to assign a task to the agent. + + +```typescript Google +await page.goto("https://www.google.com/"); +const agent = stagehand.agent({ + mode: "cua", + model: { + modelName: "google/gemini-2.5-computer-use-preview-10-2025", + apiKey: process.env.GOOGLE_GENERATIVE_AI_API_KEY + }, + systemPrompt: "You are a helpful assistant...", +}); + +await agent.execute({ + instruction: "Go to Hacker News and find the most controversial post from today, then read the top 3 comments and summarize the debate.", + maxSteps: 20, + highlightCursor: true +}) +``` + +```typescript OpenAI +await page.goto("https://www.google.com/"); +const agent = stagehand.agent({ + mode: "cua", + model: { + modelName: "openai/computer-use-preview", + apiKey: process.env.OPENAI_API_KEY + }, + systemPrompt: "You are a helpful assistant...", +}); + +await agent.execute({ + instruction: "Go to Hacker News and find the most controversial post from today, then read the top 3 comments and summarize the debate.", + maxSteps: 20, + highlightCursor: true +}) +``` +```typescript Anthropic +await page.goto("https://www.google.com/"); +const agent = stagehand.agent({ + mode: "cua", + model: { + modelName: 
"anthropic/claude-sonnet-4-20250514", + apiKey: process.env.ANTHROPIC_API_KEY + }, + systemPrompt: "You are a helpful assistant...", +}); + +await agent.execute({ + instruction: "Go to Hacker News and find the most controversial post from today, then read the top 3 comments and summarize the debate.", + maxSteps: 20, + highlightCursor: true +}) +``` + + +You can define the maximum number of steps the agent can take with `maxSteps`: + +```typescript +await agent.execute({ + instructions: "Apply for a library card at the San Francisco Public Library", + maxSteps: 10, +}); +``` + +### Select Your Computer Use Model + +Stagehand supports computer use models from Google, Anthropic, and OpenAI. You can find all supported models on the [models page](/v3/configuration/models#agent-models-with-cua-support). + + + +```typescript +const agent = stagehand.agent({ + mode: "cua", + model: "google/gemini-2.5-computer-use-preview-10-2025", + // GOOGLE_GENERATIVE_AI_API_KEY is auto-loaded - set in your .env +}); +``` + + +```typescript +const agent = stagehand.agent({ + mode: "cua", + model: "anthropic/claude-sonnet-4-20250514", + // ANTHROPIC_API_KEY is auto-loaded - set in your .env +}); +``` + + +```typescript +const agent = stagehand.agent({ + mode: "cua", + model: "openai/computer-use-preview", + // OPENAI_API_KEY is auto-loaded - set in your .env +}); +``` + + + +View or run the example templates [here](https://www.browserbase.com/templates?category=Computer+Use+Agents) diff --git a/packages/docs/v3/best-practices/cost-optimization.mdx b/packages/docs/v3/best-practices/cost-optimization.mdx new file mode 100644 index 000000000..cc373447f --- /dev/null +++ b/packages/docs/v3/best-practices/cost-optimization.mdx @@ -0,0 +1,183 @@ +--- +title: Cost Optimization +sidebarTitle: Cost Optimization +description: Minimize costs while maintaining automation performance +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +Cost optimization in Stagehand involves balancing 
LLM inference costs and browser infrastructure costs. This guide provides practical strategies to reduce your automation expenses. + +## Quick Wins + +Start with these simple optimizations that can reduce costs: + +### 1. Use the Right Model for the Job + +We don't recommend using larger, more premium models for simple tasks. See our [evaluation results](https://stagehand.dev/evals) for model performance and cost comparisons across different task types. + + + + Choose the right LLM for your budget and accuracy requirements + + + See how different models perform on different tasks + + + +### 2. Implement Caching + +Enable automatic action caching to eliminate redundant LLM calls. Simply specify a `cacheDir` when initializing Stagehand: + +```typescript +const stagehand = new Stagehand({ + env: "BROWSERBASE", + cacheDir: "action-cache", // Enable automatic caching +}); + +await stagehand.init(); + +// First run: uses LLM inference and caches +// Subsequent runs: reuses cached action (no LLM cost) +await stagehand.act("Click the sign in button"); +``` + + + + Learn how to organize caches and manage cache directories + + + +### 3. Optimize Browser Sessions + +Reuse sessions when possible and set appropriate timeouts. 
See [Browser Configuration](/configuration/browser) for details: + +```typescript +const stagehand = new Stagehand({ + env: "BROWSERBASE", + browserbaseSessionCreateParams: { + timeout: 1800, // 30 minutes instead of default 1 hour + keepAlive: true, // Keep session alive between tasks + } +}); +``` + + + + Optimize Browserbase infrastructure costs and session management + + + +## Advanced Strategies + +### Intelligent Model Switching + +Try cheaper models first and fall back to more expensive ones only when a task fails: + +```typescript +// Use models from least to most expensive based on task complexity +// See stagehand.dev/evals for performance comparisons +async function smartAct(prompt: string) { + const models = ["google/gemini-2.5-flash", "openai/gpt-4o"]; + + for (const model of models) { + const stagehand = new Stagehand({ + env: "LOCAL", + model: model + }); + try { + await stagehand.init(); + const [action] = await stagehand.observe(prompt); + await stagehand.act(action); + return; + } catch (error) { + console.log(`${model} failed, trying the next model...`); + } finally { + await stagehand.close(); + } + } +} +``` + +### Session Pooling + +Reuse browser sessions across multiple tasks: + +```typescript +class SessionManager { + private sessions = new Map<string, Stagehand>(); + + async getSession(taskType: string): Promise<Stagehand> { + if (this.sessions.has(taskType)) { + return this.sessions.get(taskType)!; + } + + const stagehand = new Stagehand({ env: "BROWSERBASE" }); + await stagehand.init(); + this.sessions.set(taskType, stagehand); + return stagehand; + } +} +``` + +## Cost Monitoring + +Track your spending to identify optimization opportunities. 
See our [Observability Guide](/configuration/observability) for detailed metrics: + +```typescript +// Monitor token usage +const metrics = await stagehand.metrics; +console.log(`Total tokens: ${metrics.totalPromptTokens + metrics.totalCompletionTokens}`); +console.log(`Estimated cost: $${(metrics.totalPromptTokens + metrics.totalCompletionTokens) * 0.00001}`); +``` + + + + Monitor usage patterns and track costs in real-time + + + +## Budget Controls + +Set spending limits to prevent unexpected costs: + +```typescript +class BudgetGuard { + private dailySpend = 0; + private maxDailyBudget: number; + + constructor(maxDailyBudget: number = 25) { + this.maxDailyBudget = maxDailyBudget; + } + + checkBudget(estimatedCost: number): void { + if (this.dailySpend + estimatedCost > this.maxDailyBudget) { + throw new Error(`Daily budget exceeded: $${this.maxDailyBudget}`); + } + this.dailySpend += estimatedCost; + } +} +``` + + +## Related Resources + + + + Choose the right LLM for your budget and accuracy requirements + + + + Reduce costs with smart action caching and observe patterns + + + + Monitor usage patterns and track costs in real-time + + + + Optimize Browserbase infrastructure costs and session management + + \ No newline at end of file diff --git a/packages/docs/v3/best-practices/deployments.mdx b/packages/docs/v3/best-practices/deployments.mdx new file mode 100644 index 000000000..9d1174a70 --- /dev/null +++ b/packages/docs/v3/best-practices/deployments.mdx @@ -0,0 +1,240 @@ +--- +title: 'Deploying Stagehand' +description: 'Deploy your AI agents and automations to the cloud' +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + + +**🌟 Preview: Browser Functions** - Deploy your web automation code directly on Browserbase with browser functions. Scale your `act()` automations in the cloud with zero infrastructure setup. Reach out to hello@browserbase.com to get beta access. 
+ + +## Deploy on Vercel + +Securely run Stagehand on Browserbase inside a Vercel Function. This guide shows a minimal, production-safe HTTP endpoint you can call directly or on a schedule. + +### 1. Install Vercel CLI + +To download and install Vercel CLI, run one of the following commands: + + +```bash pnpm +pnpm i -g vercel +``` +```bash yarn +yarn global add vercel +``` +```bash npm +npm i -g vercel +``` +```bash bun +bun add -g vercel +``` + + +### 2. Project layout + +```text +your-project/ + api/ + run.ts + package.json + tsconfig.json + vercel.json +``` + +Create the structure with: + +```bash +mkdir -p api +touch api/run.ts package.json vercel.json tsconfig.json +``` + +### 3. `api/run.ts` (Node.js runtime) + +```typescript +// api/run.ts +import type { VercelRequest, VercelResponse } from "@vercel/node"; +import { Stagehand } from "@browserbasehq/stagehand"; +import { z } from "zod/v3"; + +export default async function handler(req: VercelRequest, res: VercelResponse): Promise { + try { + const stagehand = new Stagehand({ + env: "BROWSERBASE", + apiKey: process.env.BROWSERBASE_API_KEY!, + projectId: process.env.BROWSERBASE_PROJECT_ID!, + disablePino: true, + model: { + modelName: "google/gemini-2.5-flash", + apiKey: process.env.GOOGLE_API_KEY!, + }, + // optional session params + browserbaseSessionCreateParams: { + projectId: process.env.BROWSERBASE_PROJECT_ID!, + region: "us-west-2", + browserSettings: { + blockAds: true, + }, + }, + }); + + await stagehand.init(); + const page = stagehand.context.pages()[0]; + + await page.goto("https://www.stagehand.dev/"); + await stagehand.act("click the evals button"); + + const fastestModel = await stagehand.extract("extract the fastest model", z.string()); + + await stagehand.close(); + + res.status(200).json({ ok: true, data: fastestModel }); + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + res.status(500).json({ ok: false, error: msg }); + } +} +``` + +### 4. 
`package.json` + +```json +{ + "name": "bb-stagehand-on-vercel", + "private": true, + "type": "module", + "engines": { "node": ">=18" }, + "dependencies": { + "@browserbasehq/stagehand": "^3.0.0" + }, + "devDependencies": { + "@types/node": "^20.12.12", + "@vercel/node": "^3.2.20", + "typescript": "^5.2.2" + } +} +``` + +### 5. `tsconfig.json` + +```json +{ + "compilerOptions": { + "target": "ES2022", + "module": "ES2022", + "moduleResolution": "node", + "outDir": ".vercel/output/functions", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "types": ["node"] + }, + "include": ["api/**/*.ts"] +} +``` + +### 6. `vercel.json` + +```json +{ + "$schema": "https://openapi.vercel.sh/vercel.json", + "functions": { + "api/run.ts": { + "maxDuration": 60 + } + } +} +``` + +See Vercel's [configuring functions](https://vercel.com/docs/functions/configuring-functions) docs for more details. + +### 7. Link your project + +Link your local folder to a Vercel project before configuring environment variables: + +```bash +# authenticate if needed +vercel login + +# link the current directory to a Vercel project (interactive) +vercel link +``` + +### 8. Environment variables + +Do not commit `.env` in production. Add variables via Vercel CLI: + +```bash +vercel env add BROWSERBASE_API_KEY +vercel env add BROWSERBASE_PROJECT_ID +# (and your model key if needed) +vercel env add GOOGLE_API_KEY +``` + +See also: [Browser Environment](/configuration/environment) for details on required variables. + +### 9. Test locally + +Replicate the Vercel environment locally to exercise your Function before deploying. Run from the project root. + +```bash +# ensure dependencies are installed +npm install + +# start the local Vercel dev server +vercel dev --listen 5005 +``` + +### 10. 
Deploy + +```bash +vercel +vercel --prod +``` + +### Execute the function + +#### Configure Protection Bypass for Automation + +Before invoking the production URL, create a Protection Bypass for Automation: + +1. Generate a 32-character secret (you can use `openssl rand -hex 16`) +2. Go to your project in Vercel +3. Navigate to Settings → Deployment Protection +4. Add the secret to "Protection Bypass for Automation" + +Then invoke the function with the bypass header: + +```bash +curl -X POST \ + -H "x-vercel-protection-bypass: " \ + https:///api/run +``` + +### Optional: Cron on Vercel + +Hit the same endpoint on a schedule by extending `vercel.json`: + +```json +{ + "$schema": "https://openapi.vercel.sh/vercel.json", + "functions": { + "api/run.ts": { + "maxDuration": 60 + } + }, + "crons": [ + { "path": "/api/run", "schedule": "0 * * * *" } + ] +} +``` + +### Features +- **No local browsers needed** with `env: "BROWSERBASE"`. [Browserbase](https://www.browserbase.com/) provides the browsers. +- **Fast functionality**: Offload browser work to Browserbase and return JSON promptly. +- **Long-running tasks**: Raise `maxDuration` and/or consider Edge runtime limits depending on plan. + diff --git a/packages/docs/v3/best-practices/deterministic-agent.mdx b/packages/docs/v3/best-practices/deterministic-agent.mdx new file mode 100644 index 000000000..8f0a05b43 --- /dev/null +++ b/packages/docs/v3/best-practices/deterministic-agent.mdx @@ -0,0 +1,573 @@ +--- +title: Deterministic Agent Scripts +sidebarTitle: Deterministic Agent +description: Use auto-caching to convert agent workflows into fast, deterministic scripts +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +Agent workflows are powerful for exploring and automating complex tasks, but they can be slow and non-deterministic. This guide shows you how to use Stagehand's built-in auto-caching to convert agent-discovered workflows into fast, deterministic scripts that run 10-100x faster. 
+ +## Why Use Auto-Caching with Agent? + + + + Cached agent workflows run 10-100x faster by skipping LLM inference on subsequent runs + + + Eliminate repeated LLM calls—first run uses inference, subsequent runs use cache + + + Cached actions are deterministic and more predictable than fresh agent exploration + + + Works automatically—just specify `cacheDir` and Stagehand handles everything + + + +## How Auto-Caching Works + +When you specify a `cacheDir`: + +1. **First run**: Agent explores and executes workflow using LLM inference +2. **Actions cached**: All actions are automatically saved to local cache +3. **Subsequent runs**: Same workflow reuses cached actions (no LLM calls) +4. **Performance**: 10-100x faster execution, zero LLM tokens + +The cache key is automatically generated based on: +- Agent instruction +- Start URL +- Agent execution options +- Agent configuration + +## Basic Auto-Caching with Agent + +Simply add `cacheDir` when initializing Stagehand: + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; + +// Enable auto-caching +const stagehand = new Stagehand({ + env: "BROWSERBASE", + cacheDir: "agent-cache" // Automatic caching enabled +}); + +await stagehand.init(); +const page = stagehand.context.pages()[0]; + +await page.goto("https://example.com"); + +const agent = stagehand.agent({ + mode: "cua", + model: { + modelName: "google/gemini-2.5-computer-use-preview-10-2025", + apiKey: process.env.GOOGLE_GENERATIVE_AI_API_KEY + }, + systemPrompt: "You are a helpful assistant that can use a web browser.", +}); + +// First run: Uses LLM inference (~20-30 seconds, ~50,000 tokens) +// Subsequent runs: Uses cached actions (~2-3 seconds, 0 tokens) +const result = await agent.execute({ + instruction: "Find the login form, fill in username 'demo' and password 'test123', then click submit", + maxSteps: 10 +}); + +console.log("Completed:", result.success); +console.log("Actions taken:", result.actions.length); + +await stagehand.close(); 
+``` + +That's it! The second time you run this script, it will reuse the cached agent actions automatically. + +## Organizing Caches by Workflow + +Use descriptive cache directories for different workflows: + +```typescript +// Login workflow +const loginStagehand = new Stagehand({ + env: "BROWSERBASE", + cacheDir: "cache/login-workflow" +}); + +// Checkout workflow +const checkoutStagehand = new Stagehand({ + env: "BROWSERBASE", + cacheDir: "cache/checkout-workflow" +}); + +// Data extraction workflow +const extractStagehand = new Stagehand({ + env: "BROWSERBASE", + cacheDir: "cache/extraction-workflow" +}); +``` + +## Complete Example: First vs Subsequent Runs + +### First Run (Exploration Mode) + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + cacheDir: "cache/github-search" // Enable caching +}); + +await stagehand.init(); +const page = stagehand.context.pages()[0]; + +await page.goto("https://github.com"); + +const agent = stagehand.agent({ + mode: "cua", + model: { + modelName: "google/gemini-2.5-computer-use-preview-10-2025", + apiKey: process.env.GOOGLE_GENERATIVE_AI_API_KEY + }, + systemPrompt: "You are a helpful assistant that can use a web browser.", +}); + +console.log("First run: Exploring with agent..."); +const startTime = Date.now(); + +const result = await agent.execute({ + instruction: "Search for 'stagehand' and click the first repository result", + maxSteps: 10 +}); + +const duration = Date.now() - startTime; +console.log(`First run completed in ${duration}ms`); +console.log(`Actions: ${result.actions.length}`); +console.log(`Status: ${result.success}`); + +await stagehand.close(); + +// Output (example): +// First run completed in 25000ms +// Actions: 8 +// Status: true +``` + +### Subsequent Runs (Cached Mode) + +Run the **exact same script** again: + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + 
env: "BROWSERBASE", + cacheDir: "cache/github-search" // Same cache directory +}); + +await stagehand.init(); +const page = stagehand.context.pages()[0]; + +await page.goto("https://github.com"); + +const agent = stagehand.agent({ + mode: "cua", + model: { + modelName: "google/gemini-2.5-computer-use-preview-10-2025", + apiKey: process.env.GOOGLE_GENERATIVE_AI_API_KEY + }, + systemPrompt: "You are a helpful assistant that can use a web browser.", +}); + +console.log("Subsequent run: Using cached actions..."); +const startTime = Date.now(); + +const result = await agent.execute({ + instruction: "Search for 'stagehand' and click the first repository result", + maxSteps: 10 +}); + +const duration = Date.now() - startTime; +console.log(`Subsequent run completed in ${duration}ms`); +console.log(`Actions: ${result.actions.length}`); +console.log(`Status: ${result.success}`); + +await stagehand.close(); + +// Output (example): +// Subsequent run completed in 2500ms ← 10x faster! +// Actions: 8 +// Status: true +``` + +## Using History for Analysis + +While caching handles execution automatically, you can still use `stagehand.history` to analyze what happened: + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; +import fs from "fs/promises"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + cacheDir: "cache/workflow" +}); + +await stagehand.init(); +const page = stagehand.context.pages()[0]; + +await page.goto("https://example.com"); + +const agent = stagehand.agent({ + mode: "cua", + model: { + modelName: "google/gemini-2.5-computer-use-preview-10-2025", + apiKey: process.env.GOOGLE_GENERATIVE_AI_API_KEY + }, + systemPrompt: "You are a helpful assistant that can use a web browser.", +}); + +await agent.execute({ + instruction: "Complete the login process", + maxSteps: 10 +}); + +// Analyze what the agent did +const history = await stagehand.history; + +console.log(`\nWorkflow Analysis:`); +console.log(`Total operations: ${history.length}`); 
+ +const agentOps = history.filter(e => e.method === 'agent'); +const actOps = history.filter(e => e.method === 'act'); +const navOps = history.filter(e => e.method === 'navigate'); + +console.log(`- Agent executions: ${agentOps.length}`); +console.log(`- Act operations: ${actOps.length}`); +console.log(`- Navigate operations: ${navOps.length}`); + +// Save for documentation +await fs.writeFile( + 'workflow-analysis.json', + JSON.stringify(history, null, 2) +); + +await stagehand.close(); +``` + +## Cache Management + +### Clear Cache When Site Changes + +If the website structure changes, clear the cache to force fresh exploration: + +```typescript +import { rmSync } from 'fs'; + +// Clear specific workflow cache +rmSync('cache/login-workflow', { recursive: true, force: true }); + +// Then run with fresh exploration +const stagehand = new Stagehand({ + env: "BROWSERBASE", + cacheDir: "cache/login-workflow" // Will rebuild cache +}); +``` + +### Programmatic Cache Control + +```typescript +import { rmSync, existsSync, statSync } from 'fs'; + +function clearCacheIfNeeded(cacheDir: string, maxAge: number = 7 * 24 * 60 * 60 * 1000) { + if (!existsSync(cacheDir)) { + return; // No cache to clear + } + + const stats = statSync(cacheDir); + const age = Date.now() - stats.mtimeMs; + + if (age > maxAge) { + console.log(`Cache older than ${maxAge}ms, clearing...`); + rmSync(cacheDir, { recursive: true, force: true }); + } +} + +// Clear cache if older than 7 days +clearCacheIfNeeded('cache/workflow'); + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + cacheDir: "cache/workflow" +}); +``` + +## Advanced Patterns + +### Fallback to Fresh Exploration + +Combine caching with fallback for resilience: + +```typescript +async function executeWithFallback() { + const stagehand = new Stagehand({ + env: "BROWSERBASE", + cacheDir: "cache/workflow", + selfHeal: true // Enable self-healing + }); + + await stagehand.init(); + const page = stagehand.context.pages()[0]; + + await 
page.goto("https://example.com"); + + const agent = stagehand.agent({ + model: "anthropic/claude-sonnet-4-20250514" + }); + + try { + // Try with cache + const result = await agent.execute({ + instruction: "Complete the checkout process", + maxSteps: 15 + }); + + console.log("Execution successful:", result.success); + } catch (error) { + console.error("Cached workflow failed:", error); + + // Clear cache and retry with fresh exploration + rmSync('cache/workflow', { recursive: true, force: true }); + + console.log("Retrying with fresh exploration..."); + const retryResult = await agent.execute({ + instruction: "Complete the checkout process", + maxSteps: 15 + }); + + console.log("Retry successful:", retryResult.success); + } + + await stagehand.close(); +} +``` + +### Version Control for Caches + +Commit cache directories to ensure consistent behavior across environments: + +```gitignore +# .gitignore + +# Commit cache directories for deterministic CI/CD +!cache/ +!cache/**/*.json +``` + +```typescript +// CI/CD pipeline will use pre-generated cache +const stagehand = new Stagehand({ + env: "BROWSERBASE", + cacheDir: "cache/production-workflow" // Committed to repo +}); +``` + +## Best Practices + + + + +Organize caches by workflow or feature: + +```typescript +// Good: descriptive cache names +cacheDir: "cache/user-registration" +cacheDir: "cache/product-search" +cacheDir: "cache/checkout-flow" + +// Avoid: generic names +cacheDir: "cache" +cacheDir: "my-cache" +``` + + + +Implement a strategy for refreshing caches: + +```typescript +// Option 1: Time-based invalidation +if (isCacheOlderThan('cache/workflow', 7)) { + clearCache('cache/workflow'); +} + +// Option 2: Version-based invalidation +const CACHE_VERSION = 'v2'; +const cacheDir = `cache/workflow-${CACHE_VERSION}`; + +// Option 3: Manual invalidation flag +if (process.env.CLEAR_CACHE === 'true') { + clearCache('cache/workflow'); +} +``` + + + +Always test cached workflows in staging before production: + 
+```typescript +const env = process.env.NODE_ENV === 'production' ? 'production' : 'staging'; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + cacheDir: `cache/${env}-workflow` +}); +``` + + + +Track cache usage for optimization: + +```typescript +const cacheHit = existsSync('cache/workflow') && + statSync('cache/workflow').mtimeMs < Date.now(); + +if (cacheHit) { + console.log("Cache hit - using cached workflow"); +} else { + console.log("Cache miss - exploring with agent"); +} + +// Log metrics +metrics.recordCacheHit(cacheHit); +``` + + + + +## Performance Comparison + +**Without Caching (Every Run):** +```typescript +const stagehand = new Stagehand({ env: "BROWSERBASE" }); +// No cacheDir specified + +const result = await agent.execute({ + instruction: "Complete workflow", + maxSteps: 10 +}); + +// Every run: ~20-30 seconds, ~50,000 tokens +``` + +**With Auto-Caching (First Run):** +```typescript +const stagehand = new Stagehand({ + env: "BROWSERBASE", + cacheDir: "cache/workflow" +}); + +const result = await agent.execute({ + instruction: "Complete workflow", + maxSteps: 10 +}); + +// First run: ~20-30 seconds, ~50,000 tokens (cached for next time) +``` + +**With Auto-Caching (Subsequent Runs):** +```typescript +const stagehand = new Stagehand({ + env: "BROWSERBASE", + cacheDir: "cache/workflow" // Reuses cache +}); + +const result = await agent.execute({ + instruction: "Complete workflow", + maxSteps: 10 +}); + +// Subsequent runs: ~2-3 seconds, 0 tokens ← 10-100x faster! +``` + + +Cached agent workflows run **10-100x faster** and consume **zero LLM tokens** on subsequent runs. The first run pays the exploration cost, every run after is nearly instant. 
+ + +## Troubleshooting + + + +**Problem**: Workflow still slow on subsequent runs + +**Solutions**: +- Verify `cacheDir` path is correct and consistent across runs +- Ensure instruction, URL, and agent config are identical +- Check file permissions on cache directory +- Look for cache hit/miss logs in verbose mode + +```typescript +const stagehand = new Stagehand({ + env: "BROWSERBASE", + cacheDir: "cache/workflow", + verbose: 2 // Enable debug logs +}); +``` + + + +**Problem**: Cached actions fail on subsequent runs + +**Solutions**: +- Website may have changed—clear cache to re-explore +- Enable self-healing to adapt to minor changes +- Implement fallback logic to retry with fresh exploration + +```typescript +const stagehand = new Stagehand({ + env: "BROWSERBASE", + cacheDir: "cache/workflow", + selfHeal: true // Adapt to changes +}); +``` + + + +**Problem**: Cache directories growing uncontrolled + +**Solutions**: +- Use version prefixes for cache directories +- Implement automatic cleanup of old caches +- Share cache directories for similar workflows + +```typescript +// Versioned caches +const CACHE_VERSION = '2024-01'; +const cacheDir = `cache/workflow-${CACHE_VERSION}`; + +// Cleanup old versions +rmSync('cache/workflow-2023-12', { recursive: true, force: true }); +``` + + + +## Next Steps + + + + Learn more about agent capabilities and configuration + + + + Complete guide to auto-caching with act() and agent() + + + + Monitor and track history and metrics + + + + Additional techniques for faster automation + + diff --git a/packages/docs/v3/best-practices/history.mdx b/packages/docs/v3/best-practices/history.mdx new file mode 100644 index 000000000..f174ae37a --- /dev/null +++ b/packages/docs/v3/best-practices/history.mdx @@ -0,0 +1,189 @@ +--- +title: History Tracking +sidebarTitle: History Tracking +description: Track and analyze Stagehand operations with the history API +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +The history API 
captures every Stagehand operation for debugging, auditing, and workflow analysis. + +## Basic Usage + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ env: "BROWSERBASE" }); +await stagehand.init(); +const page = stagehand.context.pages()[0]; + +await page.goto("https://example.com"); +await stagehand.act("click login button"); + +// Get complete history +const history = await stagehand.history; + +console.log(`Total operations: ${history.length}`); +history.forEach((entry, i) => { + console.log(`${i + 1}. ${entry.method} at ${entry.timestamp}`); +}); + +await stagehand.close(); +``` + +## History Entry Structure + +```typescript +interface HistoryEntry { + method: "act" | "extract" | "observe" | "navigate" | "agent"; + parameters: unknown; // Input parameters + result: unknown; // Output/result + timestamp: string; // ISO 8601 timestamp +} +``` + +## Common Use Cases + +### Debugging Failures + +```typescript +try { + await stagehand.act("click login button"); +} catch (error) { + const history = await stagehand.history; + + history.forEach((entry, i) => { + const status = entry.result && 'error' in entry.result ? "FAILED" : "SUCCESS"; + console.log(`${i + 1}. 
${status} - ${entry.method}`); + }); +} +``` + +### Analyzing Timing + +```typescript +const history = await stagehand.history; + +const timings = history.map((entry, i) => { + if (i === 0) return null; + const duration = new Date(entry.timestamp).getTime() - + new Date(history[i - 1].timestamp).getTime(); + return { operation: entry.method, duration }; +}).filter(Boolean); + +console.log("Slowest operations:", + timings.sort((a, b) => b.duration - a.duration).slice(0, 3) +); +``` + +### Operation Statistics + +```typescript +const history = await stagehand.history; + +const stats = history.reduce((acc, entry) => { + acc[entry.method] = (acc[entry.method] || 0) + 1; + return acc; +}, {} as Record); + +console.log("Operations:", stats); +// { act: 5, extract: 2, observe: 3, navigate: 1 } +``` + +### Saving History + +```typescript +import fs from "fs/promises"; + +const history = await stagehand.history; +const metrics = await stagehand.metrics; + +await fs.writeFile( + `workflow-report.json`, + JSON.stringify({ + history, + totalOps: history.length, + totalTokens: metrics.totalPromptTokens + metrics.totalCompletionTokens + }, null, 2) +); +``` + +## Filtering by Operation Type + +```typescript +const history = await stagehand.history; + +const actions = history.filter(e => e.method === 'act'); +const extractions = history.filter(e => e.method === 'extract'); +const agentOps = history.filter(e => e.method === 'agent'); + +console.log(`Actions: ${actions.length}`); +console.log(`Extractions: ${extractions.length}`); +console.log(`Agent executions: ${agentOps.length}`); +``` + +## Combining with Metrics + +```typescript +const history = await stagehand.history; +const metrics = await stagehand.metrics; + +const report = { + totalOps: history.length, + successful: history.filter(e => !e.result || !('error' in e.result)).length, + failed: history.filter(e => e.result && 'error' in e.result).length, + totalTokens: metrics.totalPromptTokens + 
metrics.totalCompletionTokens, + avgTimePerOp: `${(metrics.totalInferenceTimeMs / history.length).toFixed(0)}ms` +}; + +console.log(report); +``` + + + Learn more about metrics, logging, and monitoring + + +## What's Tracked? + +Only Stagehand methods are tracked in history: + +```typescript +// Tracked +await stagehand.act("click button"); // ✓ +await stagehand.extract({ instruction: "..." }); // ✓ +await stagehand.observe("find elements"); // ✓ +await page.goto("https://example.com"); // ✓ + +// Not tracked +await page.locator("button").click(); // ✗ Native Playwright +await page.click("button"); // ✗ Native Playwright +``` + +## Best Practices + +- **Save history for critical workflows** - Maintain audit trails for production +- **Inspect history when debugging** - Check the last operations to identify failures +- **Analyze timing periodically** - Find slow operations and optimize +- **Combine with metrics** - Get complete visibility into performance and cost + +## Next Steps + + + + Build fast, cached agent workflows + + + + Combine history with metrics + + + + Speed up workflows with caching + + + + Configure detailed execution traces + + diff --git a/packages/docs/v3/best-practices/mcp-integrations.mdx b/packages/docs/v3/best-practices/mcp-integrations.mdx new file mode 100644 index 000000000..ed44fbaaa --- /dev/null +++ b/packages/docs/v3/best-practices/mcp-integrations.mdx @@ -0,0 +1,273 @@ +--- +title: "MCP Integrations" +description: "Using Model Context Protocol (MCP) integrations to enhance agent capabilities" +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +## What are MCP Integrations? + +MCP (Model Context Protocol) integrations allow you to connect your Stagehand agents to external tools, APIs, and services. This enables agents to perform actions beyond browser automation, such as web search, database operations, and API calls. 
+ + +MCP integrations make your agents more powerful by combining browser automation with external capabilities. The agent can intelligently decide when to use browser actions versus external tools. + + +## Connection Options + +There are two options for connecting to MCP servers: + +1. **Pass a URL directly** - The simplest approach for quick setup +2. **Create a connection first** - Gives you more control over the connection + + +MCP client support is currently only available in TypeScript. + + +## Passing a URL + +The simplest way to add MCP integrations is by providing server URLs directly in the agent configuration: + +```typescript +const agent = stagehand.agent({ + provider: "openai", + model: "computer-use-preview", + integrations: [ + `https://mcp.exa.ai/mcp?exaApiKey=${process.env.EXA_API_KEY}`, + ], + systemPrompt: `You have access to web search through Exa. Use it to find current information before browsing.`, + options: { + apiKey: process.env.OPENAI_API_KEY, + }, +}); + +await agent.execute("Search for the best headphones of 2025 and go through checkout for the top recommendation"); +``` + +## Creating a Connection First + +Alternatively, you can establish MCP connections first and then pass the client objects: + +```typescript +import { connectToMCPServer } from "@browserbasehq/stagehand"; + +// Connect to MCP server +const supabaseClient = await connectToMCPServer( + `https://server.smithery.ai/@supabase-community/supabase-mcp/mcp?api_key=${process.env.SMITHERY_API_KEY}` +); + +// You can also pass the config to start a local MCP server +const notionClient = await connectToMCPServer({ + command: "npx", + args: ["-y", "@notionhq/notion-mcp-server"], + env: { + NOTION_TOKEN: process.env.NOTION_TOKEN, + }, +}); + +// Use the connected client +const agent = stagehand.agent({ + provider: "openai", + model: "computer-use-preview", + integrations: [supabaseClient, notionClient], + systemPrompt: `You can interact with Supabase databases and Notion. 
Use these tools to store and retrieve data.`, + options: { + apiKey: process.env.OPENAI_API_KEY, + }, +}); + +await agent.execute("Search for restaurants in New Brunswick, NJ and save the first result to the database"); +``` + + + +## Multiple Integrations + +You can combine multiple MCP integrations in a single agent: + +```typescript +const databaseClient = await connectToMCPServer(/* database config */); + +const agent = stagehand.agent({ + integrations: [ + `https://search-service.example.com/mcp?apiKey=${process.env.SEARCH_API_KEY}`, + databaseClient + ], + systemPrompt: `You have access to external tools for search and data storage. Use these tools strategically to complete tasks efficiently.` +}); +``` + +## Best Practices + +### Choose the Right Connection Approach + + +**When to use:** +- Simple setup requirements +- Standard API configurations +- Getting started quickly + +**Benefits:** +- Minimal code required +- Automatic connection handling +- Easy to configure + + + +**When to use:** +- Custom connection options +- Connection reuse across agents +- Advanced error handling + +**Benefits:** +- Full control over connections +- Better error handling +- Connection pooling capabilities + + + +### Environment Variables + +Always use environment variables for API keys and sensitive information: + +```bash +# .env file +SEARCH_API_KEY=your_search_service_key +MCP_SERVICE_API_KEY=your_mcp_service_key +OPENAI_API_KEY=your_openai_key +DATABASE_URL=your_database_url +DATABASE_API_KEY=your_database_key +``` + +### Instructions Best Practices + +Provide clear instructions about available tools: + + + +```typescript +systemPrompt: `You have access to: +1. Web search tools - Use to find current information +2. Database tools - Use to store/retrieve data +3. Browser automation - Use for web interactions + +Always search for current information before making decisions. 
+Store important data for later reference.` +``` + + + +```typescript +systemPrompt: "You can search and save data." +``` + + + +### Error Handling + +Implement proper error handling for MCP connections: + +```typescript +try { + const client = await connectToMCPServer(serverUrl); + + const agent = stagehand.agent({ + integrations: [client], + // ... other config + }); + + const result = await agent.execute(instruction); +} catch (error) { + console.error("MCP integration failed:", error); + // Handle fallback behavior +} +``` + +## Troubleshooting + + + +**Problem:** MCP server connections timing out + +**Solutions:** +- Verify server URLs are correct and accessible +- Check network connectivity +- Ensure API keys are valid and have proper permissions +- Try connecting to servers individually to isolate issues + + + +**Problem:** Agent not using available MCP tools + +**Solutions:** +- Make instructions more specific about when to use tools +- Ensure API keys are properly configured +- Check that the MCP server supports the expected tools +- Verify tool descriptions are clear and actionable + + + +**Problem:** API key or authentication failures + +**Solutions:** +- Verify all required environment variables are set +- Check API key validity and permissions +- Ensure URLs include necessary authentication parameters +- Test MCP connections independently before using in agents + + + +## Examples + +### Web Search + Browser Automation +```typescript +const agent = stagehand.agent({ + integrations: [`https://mcp.exa.ai/mcp?exaApiKey=${process.env.EXA_API_KEY}`], + systemPrompt: `First search for current information, then use the browser to complete tasks based on what you find.` +}); + +await agent.execute("Find the best laptop deals for 2025 and navigate to purchase the top recommendation"); +``` + +### Data Extraction + Storage +```typescript +const supabaseClient = await connectToMCPServer(/* config */); + +const agent = stagehand.agent({ + integrations: 
[supabaseClient], + systemPrompt: `Extract data from websites and store it using available database tools.` +}); + +await agent.execute("Extract all restaurant information from this directory and save it to the database"); +``` + +### Multi-tool Workflow +```typescript +const agent = stagehand.agent({ + integrations: [ + `https://mcp.exa.ai/mcp?exaApiKey=${process.env.EXA_API_KEY}`, + supabaseClient + ], + systemPrompt: `Use all available tools strategically: search for current info, browse websites, and store important data.` +}); + +await agent.execute("Research competitor pricing, compare with our site, and store the analysis"); +``` + +## Further Reading + + + + Learn the fundamentals of Stagehand agents + + + + Set up your own MCP server + + + + Create custom MCP tools + + diff --git a/packages/docs/v3/best-practices/prompting-best-practices.mdx b/packages/docs/v3/best-practices/prompting-best-practices.mdx new file mode 100644 index 000000000..174518881 --- /dev/null +++ b/packages/docs/v3/best-practices/prompting-best-practices.mdx @@ -0,0 +1,249 @@ +--- +title: Prompting Best Practices +description: "Write effective prompts for reliable Stagehand automation" +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +Good prompts make Stagehand reliable. Bad prompts cause failures. Here's how to write prompts that work consistently. + +## Act Method + +Use `act()` for single actions on web pages. Each action should be focused and clear. + +```typescript +// Good - Single, specific actions +await stagehand.act("click the 'Add to Cart' button"); +await stagehand.act("type 'user@example.com' into the email field"); + +// Bad - Multiple actions combined +await stagehand.act("fill out the form and submit it"); +await stagehand.act("login with credentials and navigate to dashboard"); +``` + +### Use Element Types, Not Colors + +Describe elements by their type and function rather than visual attributes like color. 
+ +```typescript +// Good - Element types and descriptive text +await stagehand.act("click the 'Sign In' button"); +await stagehand.act("type into the email input field"); + +// Bad - Color-based descriptions +await stagehand.act("click the blue button"); +await stagehand.act("type into the white input"); +``` + +### Use Descriptive Language + +```typescript +// Good - Clear element identification +await stagehand.act("click the 'Next' button at the bottom of the form"); +await stagehand.act("type into the search bar at the top of the page"); + +// Bad - Vague descriptions +await stagehand.act("click next"); +await stagehand.act("type into search"); +``` + +### Choose the Right Action Verbs + +- **Click** for buttons, links, checkboxes +- **Type** for text inputs +- **Select** for dropdowns +- **Check/uncheck** for checkboxes +- **Upload** for file inputs + +```typescript +// Good +await stagehand.act("click the submit button"); +await stagehand.act("select 'Option 1' from dropdown"); + +// Bad +await stagehand.act("click submit"); +await stagehand.act("choose option 1"); +``` + +### Protect Sensitive Data + +Variables keep sensitive information out of prompts and logs. + +```typescript +// Use variables for sensitive data +await stagehand.act("type %username% into the email field", { + variables: { username: "user@example.com" } +}); + +await stagehand.act("type %password% into the password field", { + variables: { password: process.env.USER_PASSWORD } +}); +``` + + +Set `verbose: 0` in your Stagehand config to prevent secrets from appearing in logs. + + +## Extract Method + +Use `extract()` to pull structured data from pages. Define clear schemas and provide context. + +### Schema Best Practices + +Use descriptive field names, correct types, and detailed descriptions. Field descriptions provide context that helps the model understand exactly what to extract. 
+ +```typescript +// Good - Descriptive names, correct types, and helpful descriptions +const productData = await stagehand.extract( + "Extract product information", + z.object({ + productTitle: z.string().describe("The main product name displayed on the page"), + priceInDollars: z.number().describe("Current selling price as a number, without currency symbol"), + isInStock: z.boolean().describe("Whether the product is available for purchase") + }) +); + +// Bad - Generic names, wrong types, no descriptions +const data = await stagehand.extract( + "Get product details", + z.object({ + name: z.string(), // Too generic, no context + price: z.string(), // Should be number + stock: z.string() // Should be boolean, no context + }) +); +``` + +### Use Proper URL Types + +Specify URL types with `z.string().url()` to tell Stagehand to extract URLs. + +```typescript +// Good - Tells Stagehand to extract URLs +const links = await stagehand.extract( + "Extract navigation links", + z.array(z.object({ + text: z.string(), + url: z.string().url() // Required for URL extraction + })) +); + +// Single URL extraction +const contactUrl = await stagehand.extract( + "extract the contact page URL", + z.string().url() +); +``` + +## Observe Method + +Use `observe()` to discover actionable elements before acting on them. + +### Check Elements First + +Verify elements exist before taking action to avoid errors. 
+ +```typescript +// Check for elements first +const loginButtons = await stagehand.observe("Find the login button"); + +if (loginButtons.length > 0) { + await stagehand.act(loginButtons[0]); +} else { + console.log("No login button found"); +} +``` + +### Be Specific About Element Types + +```typescript +// Good - Specific element types +const submitButtons = await stagehand.observe("Find submit button in the form"); +const dropdowns = await stagehand.observe("Find the state dropdown menu"); + +// Bad - Too vague +const elements = await stagehand.observe("Find submit stuff"); +const things = await stagehand.observe("Find state selection"); +``` + +## Agent Method + +Use `agent()` for complex, multi-step workflows. Provide detailed instructions and set appropriate limits. + +### Navigate First + +Don't include navigation in agent tasks. Handle it separately. + +```typescript +// Good - Navigate first +await page.goto('https://amazon.com'); +await agent.execute('Search for wireless headphones under $100 and add the best rated one to cart'); + +// Bad - Navigation in task +await agent.execute('Go to Amazon, search for headphones, and add one to cart'); +``` + +### Be Highly Specific + +Detailed instructions lead to better results. + +```typescript +// Good - Detailed instructions +await agent.execute({ + instruction: "Find Italian restaurants in Brooklyn that are open after 10pm, have outdoor seating, and are rated 4+ stars. Save the top 3 results.", + maxSteps: 25 +}); + +// Bad - Vague instructions +await agent.execute("Find some good restaurants"); +``` + +### Set Appropriate Step Limits + +Match step limits to task complexity. 
+ +```typescript +// Simple task - fewer steps +await agent.execute({ + instruction: "Subscribe to the newsletter with email 'user@example.com'", + maxSteps: 10 +}); + +// Complex task - more steps +await agent.execute({ + instruction: "Research and compare 5 project management tools with pricing and features", + maxSteps: 50 +}); +``` + +### Include Success Criteria + +Tell the agent how to know when it's done. + +```typescript +// Good - Clear success criteria +await agent.execute({ + instruction: "Add 3 smartphone cases to cart and confirm the cart shows exactly 3 items with total price", + maxSteps: 20 +}); + +// Bad - No validation +await agent.execute("Add some items to cart"); +``` + +## Common Mistakes to Avoid + +- **Combining multiple actions** - Keep each `act()` call to one action +- **Using vague descriptions** - Be specific about which elements to interact with +- **Exposing sensitive data** - Always use variables for credentials +- **Skipping validation** - Check results before proceeding + +## Testing Your Prompts + +1. **Start simple** - Test basic functionality first +2. **Add complexity gradually** - Build up to complex workflows +3. **Monitor results** - Use logging to understand what's happening +4. **Iterate based on failures** - Refine prompts when they don't work +Remember: Good prompting is iterative. When in doubt, be more specific rather than less. 
\ No newline at end of file diff --git a/packages/docs/v3/best-practices/speed-optimization.mdx b/packages/docs/v3/best-practices/speed-optimization.mdx new file mode 100644 index 000000000..c77e08aa6 --- /dev/null +++ b/packages/docs/v3/best-practices/speed-optimization.mdx @@ -0,0 +1,171 @@ +--- +title: Speed Optimization +sidebarTitle: Speed Optimization +description: Optimize Stagehand performance for faster automation and reduced latency +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +Stagehand performance depends on several factors: DOM processing speed, LLM inference time, browser operations, and network latency. This guide provides proven strategies to maximize automation speed. + +## Quick Performance Wins + +### 1. Plan Ahead with Observe + + +Use a single `observe()` call to plan multiple actions, then execute them efficiently: + +```typescript +// Instead of sequential operations with multiple LLM calls +await stagehand.act("Fill name field"); // LLM call #1 +await stagehand.act("Fill email field"); // LLM call #2 +await stagehand.act("Select country dropdown"); // LLM call #3 + +// Use single observe to plan all form fields - one LLM call +const formFields = await stagehand.observe("Find all form fields to fill"); + +// Execute all actions without LLM inference +for (const field of formFields) { + await stagehand.act(field); // No LLM calls! +} +``` + + +**Performance Tip**: Acting on `observe` results avoids LLM inference entirely. This approach is 2-3x faster than direct `act()` calls and is the recommended pattern for multi-step workflows. + + + + Learn advanced caching patterns and cache invalidation strategies + + +### 2. 
Optimize DOM Processing + +Reduce DOM complexity before Stagehand processes the page: + +```typescript +// Remove heavy elements that slow down processing +await page.evaluate(() => { + // Remove video elements + document.querySelectorAll('video, iframe').forEach(el => el.remove()); + + // Hide complex animations + document.querySelectorAll('[style*="animation"]').forEach(el => { + (el as HTMLElement).style.animation = 'none'; + }); +}); + +// Then perform Stagehand operations +await stagehand.act("Click the submit button"); +``` + +### 3. Set Appropriate Timeouts + +Use shorter timeouts for simple operations and longer ones for complex page loads: + +```typescript +// Simple actions - reduce action timeout +await stagehand.act("Click the login button", { + timeout: 5000 // Default is 30000ms, reduce for simple clicks +}); + +// Complex page loads - optimize navigation +const page = stagehand.context.pages()[0]; +await page.goto("https://heavy-spa.com", { + waitUntil: "domcontentloaded", // Don't wait for all resources + timeout: 15000 // Shorter than default 30s +}); +``` + +## Performance Monitoring and Benchmarking + +Track performance metrics and measure optimization impact: + +### Performance Tracking + +```typescript +class PerformanceTracker { + private speedMetrics: Map<string, number[]> = new Map(); + + async timedAct(page: Page, prompt: string): Promise<ActResult> { + const start = Date.now(); + const result = await stagehand.act(prompt); + const duration = Date.now() - start; + + if (!this.speedMetrics.has(prompt)) { + this.speedMetrics.set(prompt, []); + } + this.speedMetrics.get(prompt)!.push(duration); + + console.log(`Action "${prompt}" took ${duration}ms`); + return result; + } + + getAverageTime(prompt: string): number { + const times = this.speedMetrics.get(prompt) || []; + return times.reduce((a, b) => a + b, 0) / times.length; + } +} +``` + +Example Output: +``` +Action "Fill form" took 1000ms +Action "Click submit" took 2000ms +Action "Confirm submission" took 5000ms +``` 
+ +### Before vs After Benchmarking + +```typescript +// Before optimization +console.time("workflow"); +await stagehand.act("Fill form"); +await stagehand.act("Click submit"); +await stagehand.act("Confirm submission"); +console.timeEnd("workflow"); // 8000ms + +// After optimization with observe planning +console.time("workflow-optimized"); +const workflowActions = await stagehand.observe("Find form, submit, and confirm elements"); + +// Execute actions sequentially to avoid conflicts +for (const action of workflowActions) { + await stagehand.act(action); +} +console.timeEnd("workflow-optimized"); // 500ms +``` + +Example Output: +``` +Workflow took 8000ms +Optimized workflow took 500ms +``` + + + + Set up comprehensive performance monitoring + + + + +## Related Resources + + + + Advanced caching patterns for maximum performance + + + + Balance speed improvements with cost considerations + + + + Optimize Browserbase settings for speed + + + + Choose the right model for speed vs accuracy + + \ No newline at end of file diff --git a/packages/docs/v3/best-practices/usecase-observe.mdx b/packages/docs/v3/best-practices/usecase-observe.mdx new file mode 100644 index 000000000..a1c4590a2 --- /dev/null +++ b/packages/docs/v3/best-practices/usecase-observe.mdx @@ -0,0 +1,108 @@ +--- +sidebarTitle: Use Cases +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +## Real-World Use Cases + +### E-commerce Product Discovery + +```typescript +// Discover product interaction elements +const productActions = await stagehand.observe({ + instruction: "Find add to cart buttons, size selectors, and product images" +}); + +// Categorize actions by type +const cartButtons = productActions.filter(a => + a.description.toLowerCase().includes('cart') +); +const sizeOptions = productActions.filter(a => + a.description.toLowerCase().includes('size') +); + +// Execute purchase workflow +if (sizeOptions.length > 0) { + await stagehand.act(sizeOptions[0]); // Select size first +} 
+if (cartButtons.length > 0) { + await stagehand.act(cartButtons[0]); // Then add to cart +} +``` + +### Form Handling & Validation + +```typescript +// Analyze form structure before filling +const formElements = await stagehand.observe({ + instruction: "Find form fields, validation messages, and submit buttons" +}); + +// Check for required fields +const requiredFields = formElements.filter(e => + e.description.includes('required') || e.description.includes('*') +); + +console.log(`Found ${requiredFields.length} required fields to complete`); + +// Fill form systematically +for (const field of requiredFields) { + await stagehand.act(field); + // Add appropriate input based on field type +} +``` + +### Dynamic Content & SPA Navigation + +```typescript +// Wait for and discover dynamically loaded content +await page.waitForLoadState('networkidle'); + +const dynamicElements = await stagehand.observe({ + instruction: "Find newly loaded content, infinite scroll triggers, or loading indicators", + domSettleTimeoutMs: 15000 // Wait longer for dynamic content +}); + +// Handle infinite scroll +const scrollTriggers = dynamicElements.filter(e => + e.description.toLowerCase().includes('load more') || + e.description.toLowerCase().includes('scroll') +); + +if (scrollTriggers.length > 0) { + await stagehand.act(scrollTriggers[0]); + // Recursively observe new content + const newContent = await stagehand.observe("Find additional items"); +} +``` + +### Multi-Step Workflow Planning + +```typescript +// Plan entire checkout flow upfront +async function planCheckoutWorkflow() { + // Step 1: Cart page analysis + await page.goto('/cart'); + const cartActions = await stagehand.observe("Find checkout and cart modification options"); + + // Step 2: Checkout page analysis + const checkoutButton = cartActions.find(a => a.description.includes('checkout')); + if (checkoutButton) await stagehand.act(checkoutButton); + + const checkoutActions = await stagehand.observe("Find payment forms and 
shipping options"); + + // Step 3: Plan execution order + const shippingFields = checkoutActions.filter(a => a.description.includes('shipping')); + const paymentFields = checkoutActions.filter(a => a.description.includes('payment')); + const submitButton = checkoutActions.find(a => a.description.includes('complete order')); + + return { shippingFields, paymentFields, submitButton }; +} + +// Execute planned workflow +const workflow = await planCheckoutWorkflow(); +// Fill shipping → payment → submit +``` diff --git a/packages/docs/v3/best-practices/user-data.mdx b/packages/docs/v3/best-practices/user-data.mdx new file mode 100644 index 000000000..0b6799632 --- /dev/null +++ b/packages/docs/v3/best-practices/user-data.mdx @@ -0,0 +1,35 @@ +--- +title: User Data Directory +sidebarTitle: User Data +description: Persist browser data between sessions +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + +### User Data Directory + +Persist browser data between sessions using a custom user data directory: + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; + +// For Browserbase sessions +const stagehand = new Stagehand({ + env: "BROWSERBASE", + browserbaseSessionCreateParams: { + userDataDir: "/path/to/user/data/directory", + }, +}); + +// For Local sessions +const localStagehand = new Stagehand({ + env: "LOCAL", + localBrowserLaunchOptions: { + userDataDir: "./browser-data", + }, +}); + +await stagehand.init(); +console.log("Session ID:", stagehand.sessionId); +``` \ No newline at end of file diff --git a/packages/docs/v3/best-practices/using-multiple-tabs.mdx b/packages/docs/v3/best-practices/using-multiple-tabs.mdx new file mode 100644 index 000000000..28fee3280 --- /dev/null +++ b/packages/docs/v3/best-practices/using-multiple-tabs.mdx @@ -0,0 +1,74 @@ +--- +title: 'Using Multiple Tabs' +description: 'Act on multiple tabs with Stagehand' +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +Many modern web applications open 
new tabs when users click certain buttons or links. Without proper multitab support, automation scripts break when expected content appears in a new tab rather than the current one. Stagehand's multitab capabilities ensure your automations work seamlessly across multitab workflows. + +## The Stagehand Page + +Stagehand automatically adapts to multitab workflows. The active page (accessed via `context.activePage()`) always points to the most recently opened or active tab, ensuring your automations continue working even when new tabs are created. + +This means you can continue using familiar patterns: + +```typescript +const page = stagehand.context.pages()[0]; +await page.goto("https://example.com"); +await stagehand.act("click the button that opens a new tab"); +// the active page now automatically points to the new tab +await stagehand.extract("get data from new tab"); +``` + + +**Important**: [Stagehand Agent](/v3/basics/agent) will always operate on the active page. If you need an agent to work across specific tabs, you'll need to manage page switching manually. 
+ + +## Manual Page Management + +For more control or multitab workflows, you can manage multiple tabs explicitly: + +```typescript +// Create a second page +await stagehand.context.newPage(); +const pages = stagehand.context.pages(); + +const githubPage = pages[0]; +const pythonPage = pages[1]; + +// Navigate each page to different repositories +await githubPage.goto("https://github.com/browserbase/stagehand"); +await pythonPage.goto("https://github.com/browserbase/stagehand-python"); + +// Extract data from both pages simultaneously +const [stagehandStars, stagehandPythonStars] = await Promise.all([ + stagehand.extract("extract the repository stars", { page: githubPage }), + stagehand.extract("extract the repository stars", { page: pythonPage }) +]); + +console.log(`Stagehand stars: ${stagehandStars}`); +console.log(`Stagehand-Python stars: ${stagehandPythonStars}`); +``` + +## Next Steps + + + + Use `Agent` to autonomously execute multi-step tasks and complex workflows. + + + + Learn best practices for interacting with elements inside iframes. + + + + Manage browser contexts and sessions for complex automation scenarios. + + + + Handle errors gracefully and debug automation issues effectively. 
+ + \ No newline at end of file diff --git a/packages/docs/v3/configuration/browser.mdx b/packages/docs/v3/configuration/browser.mdx new file mode 100644 index 000000000..f3c3916d2 --- /dev/null +++ b/packages/docs/v3/configuration/browser.mdx @@ -0,0 +1,282 @@ +--- +title: Browser +sidebarTitle: Browser +description: Configure Stagehand on Browserbase or locally +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +Stagehand supports two primary environments: + +- **Browserbase** - Cloud-managed browser infrastructure optimized for production web automation at scale +- **Local** - Run browsers directly on your machine for development and debugging + +## Browserbase Environment + +Browserbase provides managed cloud browser infrastructure optimized for web automation at scale. It offers advanced features like stealth mode, proxy support, and persistent contexts. + + + Discover the power of cloud-managed browser infrastructure with Browserbase. + + +### Environment Variables + +Before getting started, set up the required environment variables: + + +```bash .env +BROWSERBASE_API_KEY=your_api_key_here +BROWSERBASE_PROJECT_ID=your_project_id_here +``` + + + +Get your API key and Project ID from the [Browserbase Dashboard](https://browserbase.com/overview) + + +### Using Stagehand with Browserbase + +#### Basic Setup + +The simplest way to get started is with default settings: + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", +}); + +await stagehand.init(); +``` + +#### Advanced Configuration + +Configure browser settings, proxy support, and other session parameters: +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + // Optional: API Key and Project ID will be pulled directly from your environment + apiKey: process.env.BROWSERBASE_API_KEY, + projectId: process.env.BROWSERBASE_PROJECT_ID, + 
browserbaseSessionCreateParams: { + proxies: true, + region: "us-west-2", + browserSettings: { + viewport: { width: 1920, height: 1080 }, + blockAds: true, + }, + }, +}); + +await stagehand.init(); +console.log("Session ID:", stagehand.sessionId); +``` + + + ```typescript +const stagehand = new Stagehand({ + env: "BROWSERBASE", + apiKey: process.env.BROWSERBASE_API_KEY, + projectId: process.env.BROWSERBASE_PROJECT_ID, + browserbaseSessionCreateParams: { + projectId: process.env.BROWSERBASE_PROJECT_ID!, + proxies: true, + region: "us-west-2", + timeout: 3600, // 1 hour session timeout + keepAlive: true, // Available on Startup plan + browserSettings: { + advancedStealth: false, // this is a Scale Plan feature - reach out to support@browserbase.com to enable + blockAds: true, + solveCaptchas: true, + recordSession: false, + viewport: { + width: 1920, + height: 1080, + }, + }, + userMetadata: { + userId: "automation-user-123", + environment: "production", + }, + }, + }); + ``` + + +### Alternative: Browserbase SDK + +If you prefer to manage sessions directly, you can use the Browserbase SDK: + +```typescript +import { Browserbase } from "@browserbasehq/sdk"; + +const bb = new Browserbase({ + apiKey: process.env.BROWSERBASE_API_KEY! +}); + +const session = await bb.sessions.create({ + projectId: process.env.BROWSERBASE_PROJECT_ID!, + // Add configuration options here +}); +``` + +#### Connecting to an Existing Session + +Connect to a previously created Browserbase session using its session ID: + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + browserbaseSessionID: "existing-session-uuid-here", +}); + +await stagehand.init(); +console.log("Resumed Session ID:", stagehand.sessionId); +``` + +## Local Environment + +The local environment runs browsers directly on your machine, providing full control over browser instances and configurations. 
Ideal for development, debugging, and scenarios requiring custom browser setups. + +### Environment Comparison + +| Feature | Browserbase | Local | +| --- | --- | --- | +| **Scalability** | High (cloud-managed) | Limited (local resources) | +| **Stealth Features** | Advanced fingerprinting | Basic stealth | +| **Proxy Support** | Built-in residential proxies | Manual configuration | +| **Session Persistence** | Cloud context storage | File-based user data | +| **Geographic Distribution** | Multi-region deployment | Single machine | +| **Debugging** | Session recordings & logs | Direct DevTools access | +| **Setup Complexity** | Environment variables only | Browser installation required | +| **Cost** | Usage-based pricing | Infrastructure & maintenance | +| **Best For** | Production, scale, compliance | Development, debugging | + +### Basic Local Setup + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "LOCAL" +}); + +await stagehand.init(); +console.log("Session ID:", stagehand.sessionId); +``` + +### Advanced Local Configuration + +Customize browser launch options for local development: + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "LOCAL", + localBrowserLaunchOptions: { + headless: false, // Show browser window + devtools: true, // Open developer tools + viewport: { width: 1280, height: 720 }, + executablePath: '/opt/google/chrome/chrome', // Custom Chrome path + args: [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-web-security', + '--allow-running-insecure-content', + ], + userDataDir: './chrome-user-data', // Persist browser data + preserveUserDataDir: true, // Keep data after closing + chromiumSandbox: false, // Disable sandbox (adds --no-sandbox) + ignoreHTTPSErrors: true, // Ignore certificate errors + locale: 'en-US', // Set browser language + deviceScaleFactor: 1.0, // Display scaling + proxy: { + server: 
'http://proxy.example.com:8080', + username: 'user', + password: 'pass' + }, + downloadsPath: './downloads', // Download directory + acceptDownloads: true, // Allow downloads + connectTimeoutMs: 30000, // Connection timeout + }, +}); + +await stagehand.init(); +``` + +## Advanced Configuration + +### DOM Settle Timeout + +Configure how long Stagehand waits for the DOM to stabilize before taking actions. + +```typescript +const stagehand = new Stagehand({ + env: "BROWSERBASE", + domSettleTimeout: 3000 // Wait up to 3 seconds for DOM to settle +}); +``` + +#### What is DOM Settling? + +DOM settling ensures that: +- **Animations complete** before interacting with elements +- **Lazy-loaded content** has time to appear +- **JavaScript updates** finish before actions are taken +- **Dynamic content** is fully rendered + +#### When to Adjust + +Increase `domSettleTimeout` for pages with: +- Heavy animations or transitions +- Lazy-loading or infinite scroll +- Dynamic JavaScript frameworks (React, Vue, Angular) +- Complex single-page applications + +```typescript +// For fast, static pages +const stagehand = new Stagehand({ + env: "BROWSERBASE", + domSettleTimeout: 500 // Minimal wait +}); + +// For dynamic, animated pages +const stagehand = new Stagehand({ + env: "BROWSERBASE", + domSettleTimeout: 5000 // Longer wait for stability +}); +``` + + +Setting `domSettleTimeout` too low may cause actions to fail on elements that aren't ready. Setting it too high increases execution time unnecessarily. 
+ + +## Troubleshooting + + + +- Verify your `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID` are set correctly +- Check that your API key has the necessary permissions +- Ensure your Browserbase account has sufficient credits + + + +- Install Chrome or Chromium on your system +- Set the correct `executablePath` for your Chrome installation +- Check that required dependencies are installed (Linux: `libnss3-dev libatk-bridge2.0-dev libgtk-3-dev libxss1 libasound2`) + + + +- Increase session timeout in `browserbaseSessionCreateParams.timeout` +- Use `keepAlive: true` for long-running sessions +- Monitor session usage to avoid unexpected terminations + + \ No newline at end of file diff --git a/packages/docs/v3/configuration/logging.mdx b/packages/docs/v3/configuration/logging.mdx new file mode 100644 index 000000000..13ac581cc --- /dev/null +++ b/packages/docs/v3/configuration/logging.mdx @@ -0,0 +1,641 @@ +--- +title: Logging +sidebarTitle: Logging +description: Set up logging, debugging, and error tracking for Stagehand workflows +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +Stagehand provides comprehensive logging capabilities to help you debug automation workflows, track execution, and diagnose issues. Configure logging levels, structured output, and debugging tools for both development and production environments. + +## Quick Start + +Choose your logging setup based on your environment: + + +```typescript Development +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "LOCAL", + verbose: 2, // Full debug output + // restOfYourConfiguration... 
+}); +``` + +```typescript Production +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + verbose: 1, // Standard logging - less noise + disablePino: true, // Disable default console logging - no console spam + // logger: yourProductionLogger, // Send to observability platform like Sentry or DataDog + // restOfYourConfiguration... +}); +``` + +```typescript Testing +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "LOCAL", + verbose: 1, + // Pino automatically disabled in test environments - no worker thread issues + // logger: yourTestLogger, // Send to test logging framework like Jest + // restOfYourConfiguration... +}); +``` + + +--- + +## Operational Logging + +Real-time event logging during automation execution. + +### Verbosity Level + +Control how much detail you see in logs: + + + +**Use for:** Development, debugging specific issues + +```typescript +const stagehand = new Stagehand({ + verbose: 2, // Maximum detail + // restOfYourConfiguration... +}); +``` + + + +``` +[12:34:56] DEBUG: Capturing DOM snapshot +[12:34:57] DEBUG: DOM contains 847 elements +[12:34:58] DEBUG: LLM inference started +[12:34:59] DEBUG: LLM response: {"selector": "#btn-submit", "method": "click"} +[12:35:00] INFO: act completed successfully +``` + + + + + +**Use for:** Standard operations, staging, production + +```typescript +const stagehand = new Stagehand({ + verbose: 1, // Default level + // restOfYourConfiguration... +}); +``` + + + + +``` +[12:34:56] INFO: act started +[12:35:00] INFO: act completed successfully +[12:35:01] INFO: extract started +[12:35:03] INFO: extract completed +``` + + + + + +**Use for:** Production with external monitoring, minimal noise + +```typescript +const stagehand = new Stagehand({ + verbose: 0, // Errors only + // restOfYourConfiguration... 
+}); +``` + + + +``` +[12:35:05] ERROR: act failed: element not found +[12:35:10] ERROR: navigation timeout exceeded +``` + + + + + +--- + +### Log Destinations + +Logs can be sent to different destinations, including your console and external observability platforms: + + + +Fast, structured, colorized JSON logger with console output. + +**When to use:** Development, staging, or production without external observability; can manage multiple Stagehand instances + +```typescript +// Enabled by default - Pino handles console output automatically +const stagehand = new Stagehand({ + verbose: 1, + // restOfYourConfiguration... +}); +``` + + +- `process.env.NODE_ENV === "test"` +- `process.env.JEST_WORKER_ID !== undefined` (Jest tests) +- `process.env.PLAYWRIGHT_TEST_BASE_DIR !== undefined` (Playwright tests) +- `process.env.CI === "true"` (CI/CD environments) + +**Why auto-disable?** Pino uses worker threads for pretty-printing, which can cause issues in test runners. + + + + +Simple console.log/error output. + +**When to use:** Automatically activated in tests, or when `disablePino: true` without setting an external logger + +```typescript +const stagehand = new Stagehand({ + verbose: 1, + disablePino: true, // Set to true automatically when a test is detected + // restOfYourConfiguration... +}); +``` + + +- `process.env.NODE_ENV === "test"` +- `process.env.JEST_WORKER_ID !== undefined` (Jest tests) +- `process.env.PLAYWRIGHT_TEST_BASE_DIR !== undefined` (Playwright tests) +- `process.env.CI === "true"` (CI/CD environments) + +**Why auto-disable?** Pino uses worker threads for pretty-printing, which can cause issues in test runners. + + + +Your custom logging function to receive all logs. Works independently of Pino - receives logs regardless of Pino setting. + +**When to use:** Development, debugging, or when you don't need querying +capabilities. 
+ + + + +```typescript +// Simple logger without parsing (for basic console output) +const simpleLogger = (logLine: LogLine) => { + console.log(`[${logLine.level}] ${logLine.message}`); + + // Optional: log raw auxiliary data + if (logLine.auxiliary) { + console.log(' Context:', logLine.auxiliary); + } +}; +``` + + + +Then pass the logger in your Stagehand instance: + +```typescript +const stagehand = new Stagehand({ + env: "BROWSERBASE", + verbose: 1, + logger: simpleLogger, + disablePino: true, // Avoid duplicate processing + // restOfYourConfiguration... +}) +``` + + + + + + +Your custom logging function to receive all logs. Works independently of Pino - receives logs regardless of Pino setting. + +**When to use:** Production with DataDog, Sentry, CloudWatch, or custom observability platforms for centralized monitoring and error alerting. Here are examples using Sentry and DataDog: + + + + + + + + + +```typescript +import * as Sentry from "@sentry/node"; + +const productionLogger = (logLine: LogLine) => { + // Send errors to Sentry + if (logLine.level === 0) { + Sentry.captureMessage(logLine.message, { + level: 'error', + extra: parseAuxiliary(logLine.auxiliary), + }); + } +} + +// Helper to parse auxiliary data to be flat, numeric, and filterable +function parseAuxiliary(aux?: LogLine['auxiliary']): Record<string, unknown> { + if (!aux) return {}; + const parsed: Record<string, unknown> = {}; + for (const [key, entry] of Object.entries(aux)) { + parsed[key] = entry.type === 'object' + ? JSON.parse(entry.value) + : entry.value; + } + return parsed; +} +``` + + + + +```typescript +import { datadogLogs } from "@datadog/browser-logs"; + +const productionLogger = (logLine: LogLine) => { + // Send all logs to DataDog + datadogLogs.logger.log(logLine.message, { + status: logLine.level === 0 ?
'error' : 'info', + service: 'stagehand-automation', + category: logLine.category, + ...parseAuxiliary(logLine.auxiliary), + }); +} + +// Helper to parse auxiliary data to be flat, numeric, and filterable +function parseAuxiliary(aux?: LogLine['auxiliary']): Record<string, unknown> { + if (!aux) return {}; + const parsed: Record<string, unknown> = {}; + for (const [key, entry] of Object.entries(aux)) { + parsed[key] = entry.type === 'object' + ? JSON.parse(entry.value) + : entry.value; + } + return parsed; +} +``` + + + + + + + +```typescript +const stagehand = new Stagehand({ + env: "BROWSERBASE", + verbose: 1, + logger: productionLogger, + disablePino: true, // Avoid duplicate processing + // restOfYourConfiguration... +}) +``` + + + + + + + +--- + +## File-Based Session Logging + +Enable detailed file-based logging for all Stagehand operations by setting a config directory. This creates comprehensive logs for `agent.execute`, `act`, `observe`, `extract`, CDP events, and LLM requests/responses. + +### Setup + +Add to your shell configuration (`~/.zshrc`, `~/.bashrc`, etc.): + +```bash +export BROWSERBASE_CONFIG_DIR=~/.config/browserbase +``` + +Then reload your shell or run `source ~/.zshrc`. + +### Usage + +Run your Stagehand script as normal: + +```bash +tsx run_some_script_that_imports_stagehand.ts +``` + +Logs are written to `~/.config/browserbase/sessions/<session-id>/` with a `latest` symlink pointing to the most recent session.
+ +### Viewing Logs + + + +Follow all logs as they happen: + +```bash +tail -f ~/.config/browserbase/sessions/latest/*.log +``` + +Or watch specific log types: + +```bash +# LLM requests and responses only +tail -f ~/.config/browserbase/sessions/latest/llm_events.log + +# CDP (Chrome DevTools Protocol) events only +tail -f ~/.config/browserbase/sessions/latest/cdp_events.log +``` + + + +View unified output sorted by timestamp: + +```bash +cat ~/.config/browserbase/sessions/latest/*.log | sort +``` + + + +Browse previous session logs: + +```bash +ls ~/.config/browserbase/sessions/ +# Output: 2025-01-06_14-30-45_abc123 2025-01-06_15-45-12_def456 latest + +cat ~/.config/browserbase/sessions/2025-01-06_14-30-45_abc123/*.log | sort +``` + + + +### Log Files + +Each session directory contains: + +| File | Contents | +|------|----------| +| `llm_events.log` | LLM requests and responses for act, extract, observe, and agent operations | +| `cdp_events.log` | Chrome DevTools Protocol calls and events | +| `stagehand.log` | General Stagehand operations and state changes | + + +This is especially useful for debugging agent workflows where you need to trace the full sequence of LLM decisions, browser actions, and CDP interactions. + + +--- + +## LLM Inference Debugging + + +**Development only** - Creates large files and contains page content. Do not use in production. + + +Save complete LLM request/response dumps to disk for offline analysis. See exactly what DOM was sent to the LLM and why it chose the wrong element. 
+ +```typescript +const stagehand = new Stagehand({ + env: "LOCAL", + verbose: 2, + logInferenceToFile: true, // Writes files to ./inference_summary/ +}); +``` + +Creates timestamped files for each LLM call: + +``` +./inference_summary/ +├── act_summary/ +│ ├── act_summary.json # Aggregate metrics +│ ├── 20250127_123456_act_call.txt # LLM request +│ ├── 20250127_123456_act_response.txt # LLM response +│ ├── 20250127_123501_act_call.txt +│ └── 20250127_123501_act_response.txt +├── extract_summary/ +│ ├── extract_summary.json +│ ├── 20250127_123510_extract_call.txt +│ ├── 20250127_123510_extract_response.txt +│ ├── 20250127_123511_metadata_call.txt +│ └── 20250127_123511_metadata_response.txt +└── observe_summary/ + ├── observe_summary.json + └── ... +``` + +**File Types:** + + + +Contains the complete LLM request: + +```json +{ + "modelCall": "act", + "messages": [ + { + "role": "system", + "content": "You are a browser automation assistant. You have access to these actions:\n- click\n- type\n- scroll\n..." 
+ }, + { + "role": "user", + "content": "Click the sign in button\n\nDOM:\n\n \n \n \n \n" + } + ] +} +``` + + + +Contains the LLM output: + +```json +{ + "modelResponse": "act", + "rawResponse": { + "selector": "#btn-1", + "method": "click", + "reasoning": "Found sign in button with ID btn-1" + } +} +``` + + + +Aggregates all calls with metrics: + +```json +{ + "act_summary": [ + { + "act_inference_type": "act", + "timestamp": "20250127_123456", + "LLM_input_file": "20250127_123456_act_call.txt", + "LLM_output_file": "20250127_123456_act_response.txt", + "prompt_tokens": 3451, + "completion_tokens": 45, + "inference_time_ms": 951 + }, + { + "act_inference_type": "act", + "timestamp": "20250127_123501", + "LLM_input_file": "20250127_123501_act_call.txt", + "LLM_output_file": "20250127_123501_act_response.txt", + "prompt_tokens": 2890, + "completion_tokens": 38, + "inference_time_ms": 823 + } + ] +} +``` + + + +--- + +## Reference + +### Logging Configuration + +All logging options are passed to the Stagehand constructor: + +```typescript +const stagehand = new Stagehand({ + // ... your other configurations (env, model, etc.) 
+ + // Logging options: + verbose?: 0 | 1 | 2; // Log level (default: 1) + logger?: (line: LogLine) => void; // External logger function + disablePino?: boolean; // Disable Pino backend (default: false) + logInferenceToFile?: boolean; // Save LLM requests to disk (default: false) +}); +``` + +| Option | Default | Description | +|--------|---------|-------------| +| `verbose` | `1` | Log level: `0` = errors only, `1` = info, `2` = debug | +| `logger` | `undefined` | Custom logger function for external platforms | +| `disablePino` | `false` | Disable Pino (auto `true` in tests) | +| `logInferenceToFile` | `false` | Save LLM requests to disk (default: false) | + +### Log Structure + +Each log entry follows a structured format: + +```typescript +interface LogLine { + message: string; // "act completed successfully" + level?: 0 | 1 | 2; // error | info | debug + category?: string; // "action", "llm", "browser", "cache" + timestamp?: string; // ISO 8601 timestamp + auxiliary?: { // Additional structured metadata + [key: string]: { + value: string; // Serialized value + type: "object" | "string" | "integer" | "float" | "boolean"; + }; + }; +} +``` + + + + + +```json +{ + "category": "action", + "message": "act completed successfully", + "level": 1, + "timestamp": "2025-01-27T12:35:00.123Z", + "auxiliary": { + "selector": { + "value": "#btn-submit", + "type": "string" + }, + "executionTime": { + "value": "1250", + "type": "integer" + } + } +} +``` + + + +```json +{ + "category": "llm", + "message": "inference completed", + "level": 1, + "timestamp": "2025-01-27T12:34:58.456Z", + "auxiliary": { + "model": { + "value": "gpt-4o", + "type": "string" + }, + "promptTokens": { + "value": "3451", + "type": "integer" + }, + "completionTokens": { + "value": "45", + "type": "integer" + } + } +} +``` + + + +```json +{ + "category": "action", + "message": "action failed: element not found", + "level": 0, + "timestamp": "2025-01-27T12:35:05.789Z", + "auxiliary": { + "selector": { + 
"value": "#missing-btn", + "type": "string" + }, + "url": { + "value": "https://example.com/form", + "type": "string" + } + } +} +``` + + + + + +--- + +## Next Steps + +Now that you have logging configured, explore additional debugging and monitoring tools in [the Observability guide](/v3/configuration/observability): + + + +Track all LLM operations (act, extract, observe, agent) with parameters, results, and timestamps. Perfect for debugging sequences and replaying workflows. + + + +Monitor token usage and performance in real-time. Track costs per operation, identify expensive calls, and optimize resource usage. + + + +Save complete LLM request/response dumps to disk. See exactly what DOM was sent to the LLM and why it made specific decisions. + + + +Watch your automation visually with session recordings, network monitoring, and real-time browser inspection (Browserbase only). + + diff --git a/packages/docs/v3/configuration/models.mdx b/packages/docs/v3/configuration/models.mdx new file mode 100644 index 000000000..fe0f745e9 --- /dev/null +++ b/packages/docs/v3/configuration/models.mdx @@ -0,0 +1,828 @@ +--- +title: Models +sidebarTitle: Models +description: Use any LLM model with Stagehand for optimal performance +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +Understand web pages, plan actions, and interact with complex interfaces with Google, OpenAI, Anthropic, xAI, DeepSeek, Perplexity, Azure, Ollama, or any other LLM model from [the Vercel AI SDK](https://sdk.vercel.ai/providers). + +--- + +## Configuration Setup + +### Quick Start + + + Set your API key in `.env` and Stagehand handles the rest. No explicit + configuration needed! 
+ + +Get started with Google Gemini (recommended for speed and cost): + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + model: "google/gemini-2.5-flash" + // API key auto-loads from GOOGLE_GENERATIVE_AI_API_KEY - set in your .env +}); + +await stagehand.init(); + +``` + + + +--- + +### First Class Models + +Use any model from the following supported providers. + + + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + model: "google/gemini-2.5-flash" + // API key auto-loads from GOOGLE_GENERATIVE_AI_API_KEY - set in your .env +}); + +await stagehand.init(); +``` + + +[View all supported Google models →](https://ai.google.dev/gemini-api/docs/models) + + + + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + model: "anthropic/claude-haiku-4-5" + // API key auto-loads from ANTHROPIC_API_KEY - set in your .env +}); + +await stagehand.init(); + +``` + +[View all supported Anthropic models →](https://docs.anthropic.com/en/docs/models-overview) + + + + + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + model: "openai/gpt-5" + // API key auto-loads from OPENAI_API_KEY - set in your .env +}); + +await stagehand.init(); +``` + + +[View all supported OpenAI models →](https://platform.openai.com/docs/models) + + + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + model: "azure/gpt-5" + // API key auto-loads from AZURE_API_KEY - set in your .env +}); + +await stagehand.init(); + +``` + +[View all supported Azure models →](https://ai.azure.com/catalog) + + + + +```typescript TypeScript +import { Stagehand } from 
"@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + model: "cerebras/llama-4-scout" + // API key auto-loads from CEREBRAS_API_KEY - set in your .env +}); + +await stagehand.init(); +``` + + +[View all supported Cerebras models →](https://inference-docs.cerebras.ai/models/overview) + + + + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + model: "deepseek/deepseek-chat" + // API key auto-loads from DEEPSEEK_API_KEY - set in your .env +}); + +await stagehand.init(); + +``` + +[View all supported DeepSeek models →](https://api-docs.deepseek.com/quick_start/pricing) + + + + + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + model: "groq/llama-3.1-8b-instant" + // API key auto-loads from GROQ_API_KEY - set in your .env +}); + +await stagehand.init(); +``` + + +[View all supported Groq models →](https://console.groq.com/docs/models) + + + + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + model: "mistral/codestral-2508" + // API key auto-loads from MISTRAL_API_KEY - set in your .env +}); + +await stagehand.init(); + +``` + +[View all supported Mistral models →](https://docs.mistral.ai/getting-started/models) + + + + + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + model: "ollama/llama3.2" + // No API key required +}); + +await stagehand.init(); +``` + + +[View all supported Ollama models →](https://ollama.com/library) + + + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + model: "perplexity/sonar-reasoning" + // API key auto-loads from PERPLEXITY_API_KEY - set in your 
.env +}); + +await stagehand.init(); + +``` + +[View all supported Perplexity models →](https://docs.perplexity.ai/getting-started/models) + + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + model: "togetherai/Qwen/Qwen3-235B-A22B-Instruct-2507-tput" + // API key auto-loads from TOGETHER_AI_API_KEY - set in your .env +}); + +await stagehand.init(); +``` + + +[View all supported TogetherAI models →](https://www.together.ai/models) + + + + + +```typescript TypeScript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + model: "xai/grok-4-fast-reasoning" + // API key auto-loads from XAI_API_KEY - set in your .env +}); + +await stagehand.init(); + +``` + + +[View all supported xAI models →](https://docs.x.ai/docs/models) + + + + +--- + +### Custom Models + +Amazon Bedrock, Cohere, all [first class models](/v3/configuration/models#first-class-models), and any model from [the Vercel AI SDK](https://sdk.vercel.ai/providers) are supported. + +Use this configuration for custom endpoints and custom retry or caching logic. + +We'll use Amazon Bedrock and Google as examples below. + + + + + + +Install the Vercel AI SDK for your provider.
+ + + +```bash +npm install @ai-sdk/amazon-bedrock +``` + + + +```bash +pnpm add @ai-sdk/amazon-bedrock +``` + + +```bash +yarn add @ai-sdk/amazon-bedrock +``` + + +```bash +bun add @ai-sdk/amazon-bedrock +``` + + + + + +```typescript +import { createAmazonBedrock } from '@ai-sdk/amazon-bedrock'; +import { AISdkClient } from '@browserbasehq/stagehand'; + +const bedrockProvider = createAmazonBedrock({ + region: 'us-east-1', + accessKeyId: 'xxxxxxxxx', + secretAccessKey: 'xxxxxxxxx', + sessionToken: 'xxxxxxxxx', +}); + +const bedrockClient = new AISdkClient({ + model: bedrockProvider("amazon/nova-pro-latest"), +}); + +``` + + + +```typescript +const stagehand = new Stagehand({ + env: "BROWSERBASE", + llmClient: bedrockClient +}); + +await stagehand.init(); +``` + + + + + + + + +Install the Vercel AI SDK for your provider. + + + +```bash +npm install @ai-sdk/google +``` + + +```bash +pnpm add @ai-sdk/google +``` + + +```bash +yarn add @ai-sdk/google +``` + + +```bash +bun add @ai-sdk/google +``` + + + + + +```typescript +import { createGoogleGenerativeAI } from '@ai-sdk/google'; +import { AISdkClient } from '@browserbasehq/stagehand'; + +const googleProvider = createGoogleGenerativeAI({ + apiKey: process.env.GEMINI_API_KEY, +}); + +const googleClient = new AISdkClient({ + model: googleProvider("gemini-2.5-flash"), +}); + +``` + + + +```typescript +const stagehand = new Stagehand({ + env: "BROWSERBASE", + llmClient: googleClient +}); + +await stagehand.init(); +``` + + + + + + +To implement a custom model, follow the steps for the provider you are using. See the Amazon Bedrock and Google examples above. All supported providers and models are in [the Vercel AI SDK](https://sdk.vercel.ai/providers). + + + +Install the Vercel AI SDK for your provider.
+ + +```typescript +import { createProvider } from '@ai-sdk/provider'; +import { AISdkClient } from '@browserbasehq/stagehand'; + +const provider = createProvider({ + apiKey: 'xxxxxxxxx', +}); + +const providerClient = new AISdkClient({ + model: provider("model/name"), +}); + +``` + + +```typescript +const stagehand = new Stagehand({ + env: "BROWSERBASE", + llmClient: providerClient +}); + +await stagehand.init(); +``` + + + + + + + +--- + +## Choose a Model + +Different models excel at different tasks. Consider speed, accuracy, and cost for your use case. + + + Find detailed model comparisons and recommendations on our Model Evaluation page. + + +**Quick Recommendations** + +| Use Case | Recommended Model | Why | +| ------------------------- | ------------------------------------ | ------------------------------ | +| **Production** | `google/gemini-2.5-flash` | Fast, accurate, cost-effective | +| **Intelligence** | `google/gemini-3-pro-preview` | Best accuracy on hard tasks | +| **Speed** | `google/gemini-2.5-flash` | Fastest response times | +| **Cost** | `google/gemini-2.5-flash` | Best value per token | +| **Local/offline** | `ollama/qwen3` | No API costs, full control | + + +--- + +## Advanced Options + +### Agent Models (with CUA Support) + +**Default** + +The Stagehand agent by default uses the same model passed to Stagehand. All models ([first class](/v3/configuration/models#first-class-models) and [custom](/v3/configuration/models#custom-models)) are supported. Here's an example with Gemini: + +```typescript +const stagehand = new Stagehand({ + env: "BROWSERBASE", + model: "google/gemini-2.5-flash", + // GOOGLE_GENERATIVE_AI_API_KEY is auto-loaded from .env + // ... 
other stagehand options +}); + +// Agent will use google/gemini-2.5-flash +const agent = stagehand.agent(); +``` + +**Override (with CUA support)** + +However, the stagehand agent also accepts a `model` parameter, which accepts any [first class](/v3/configuration/models#first-class-models) model, including [computer use agents (CUA)](/v3/configuration/models#agent-models-with-cua-support). This is useful when you'd like the agent to use a different model than the one passed to Stagehand. + + + To use a CUA model, you must pass the `mode: "cua"` parameter to the `agent()` method. If a non-CUA model is used, whether specified in Stagehand or overridden in the `agent()` method, an error will be thrown. + + + +**Deprecation Notice:** The `cua: true` option is deprecated and will be removed in a future version. Use `mode: "cua"` instead. + + + + +```typescript +const agent = stagehand.agent({ + mode: "cua", + model: "google/gemini-2.5-computer-use-preview-10-2025", + // GOOGLE_GENERATIVE_AI_API_KEY is auto-loaded from .env + // ... other agent options +}); +``` + + +```typescript +const agent = stagehand.agent({ + mode: "cua", + model: "anthropic/claude-3-7-sonnet-latest", + // ANTHROPIC_API_KEY is auto-loaded from .env + // ... other agent options +}); +``` + + +```typescript +const agent = stagehand.agent({ + mode: "cua", + model: "openai/computer-use-preview", + // OPENAI_API_KEY is auto-loaded from .env + // ... other agent options +}); +``` + + +All [first class models](/v3/configuration/models#first-class-models) are supported. Here's an example with Gemini: + +```typescript +const agent = stagehand.agent({ + model: "google/gemini-2.5-pro", + // GOOGLE_GENERATIVE_AI_API_KEY is auto-loaded from .env + // ... 
other agent options +}); +``` + + + + +| Provider | Model | +| -------- | ----- | +| Google | `google/gemini-2.5-computer-use-preview-10-2025` | +| Anthropic | `anthropic/claude-3-7-sonnet-latest` | +| Anthropic | `anthropic/claude-haiku-4-5-20251001` | +| Anthropic | `anthropic/claude-sonnet-4-20250514` | +| Anthropic | `anthropic/claude-sonnet-4-5-20250929` | +| OpenAI | `openai/computer-use-preview` | +| OpenAI | `openai/computer-use-preview-2025-03-11` | + + + + For overriding the agent API key, using a corporate proxy, adding provider-specific options, or other advanced use cases, the agent model can also take the form of an object. To learn more, see the [Agent Reference](/v3/references/agent). + +--- + +### Custom Endpoints + +Use a custom endpoint when you need to target Azure OpenAI deployments or other enterprise deployments. + + + + +For OpenAI, you can pass configuration directly through the `model` parameter, without using `llmClient`: + +```typescript +const stagehand = new Stagehand({ + env: "BROWSERBASE", + model: { + modelName: "openai/gpt-5", + apiKey: process.env.OPENAI_API_KEY, + baseURL: "https://custom-openai-endpoint.com/v1" + } +}); +``` + + + + + +For Anthropic, you can pass configuration directly through the `model` parameter, without using `llmClient`: + +```typescript +const stagehand = new Stagehand({ + env: "BROWSERBASE", + model: { + modelName: "anthropic/claude-haiku-4-5", + apiKey: process.env.ANTHROPIC_API_KEY, + baseURL: "https://custom-anthropic-endpoint.com", + }, +}); +``` + + + +For all other providers, use `llmClient`.
Here's an example with Hugging Face: + +```typescript +// pnpm add @ai-sdk/huggingface + +import { createHuggingFace } from "@ai-sdk/huggingface"; +import { AISdkClient } from "@browserbasehq/stagehand"; + +const huggingFaceProvider = createHuggingFace({ + apiKey: process.env.HUGGINGFACE_API_KEY, + baseURL: "https://custom-huggingface-endpoint.com", +}); + +const huggingFaceClient = new AISdkClient({ + model: huggingFaceProvider("meta-llama/Llama-3.1-8B-Instruct"), +}); + +const stagehand = new Stagehand({ + env: "BROWSERBASE", + llmClient: huggingFaceClient, +}); +``` + + + + +--- + +### Extending the AI SDK Client + +For advanced use cases like custom retries or caching logic, you can extend the `AISdkClient`: + +```typescript +import { AISdkClient } from "@browserbasehq/stagehand"; + +class CustomRetryClient extends AISdkClient { + async createChatCompletion(options) { + let retries = 3; + while (retries > 0) { + try { + return await super.createChatCompletion(options); + } catch (error) { + retries--; + if (retries === 0) throw error; + await new Promise((r) => setTimeout(r, 1000 * (4 - retries))); + } + } + } +} +``` + + + Need custom caching? Consider using the built-in [caching + feature](/v3/best-practices/caching). + + +--- + +### Legacy Model Format + + +**Recommendation:** Use `provider/model` format.
Example: +- `model: "openai/gpt-4o"` (recommended) +- `model: "gpt-4o"` (legacy) + + + +The following models work without the `provider/` prefix in the model parameter as part of legacy support: + + + + +- `gemini-2.5-flash-preview-04-17` +- `gemini-2.5-pro-preview-03-25` +- `gemini-2.0-flash` +- `gemini-2.0-flash-lite` +- `gemini-1.5-flash` +- `gemini-1.5-flash-8b` +- `gemini-1.5-pro` + + + + +- `claude-3-7-sonnet-latest` +- `claude-3-7-sonnet-20250219` +- `claude-3-5-sonnet-latest` +- `claude-3-5-sonnet-20241022` +- `claude-3-5-sonnet-20240620` + + + +- `gpt-4o` +- `gpt-4o-mini` +- `o1` +- `o1-mini` +- `o3` +- `o3-mini` +- `gpt-4.1` +- `gpt-4.1-mini` +- `gpt-4.1-nano` +- `o4-mini` +- `gpt-4.5-preview` +- `gpt-4o-2024-08-06` +- `o1-preview` + + + + +- `cerebras-llama-3.3-70b` +- `cerebras-llama-3.1-8b` + + + + +- `groq-llama-3.3-70b-versatile` +- `groq-llama-3.3-70b-specdec` +- `moonshotai/kimi-k2-instruct` + + + + +--- + +## Troubleshooting + + + +**Error:** `API key not found` + +**Solutions:** + +- Check `.env` file has the correct variable name for the provider you are using +- Ensure environment variables are loaded (use `dotenv`) +- Restart your application after updating `.env` file + +| Provider | Environment Variable | +| ---------- | ------------------------------ | +| Google | `GOOGLE_GENERATIVE_AI_API_KEY` or `GEMINI_API_KEY` | +| Anthropic | `ANTHROPIC_API_KEY` | +| OpenAI | `OPENAI_API_KEY` | +| Azure | `AZURE_API_KEY` | +| Cerebras | `CEREBRAS_API_KEY` | +| DeepSeek | `DEEPSEEK_API_KEY` | +| Groq | `GROQ_API_KEY` | +| Mistral | `MISTRAL_API_KEY` | +| Ollama | None (local) | +| Perplexity | `PERPLEXITY_API_KEY` | +| TogetherAI | `TOGETHER_AI_API_KEY` | +| xAI | `XAI_API_KEY` | + + + + +**Error:** `Unsupported model` + +**Solutions:** + +- Use the `provider/model` format: `openai/gpt-5` +- Verify the model name exists in the provider's documentation +- Check model name is spelled correctly +- Ensure your Model API key can access the model + + + 
+**Error:** `Model does not support structured outputs` + +**Solutions:** + +- Check our [Model Evaluation page](https://www.stagehand.dev/evals) for recommended models + + + +**Symptoms:** Automation is expensive or slow + +**Solutions:** + +- Switch to cost-effective models (check [evals](https://www.stagehand.dev/evals) for comparisons) +- Use faster models for simple tasks, powerful ones for complex tasks +- Implement [caching](/v3/best-practices/caching) for repeated patterns + + +Python is now supported in Stagehand v3! The Python SDK uses a BYOB (Bring Your Own Browser) architecture. + +**Solutions:** + +- See the [Python SDK documentation](/v3/sdk/python) for installation and usage +- Check the [Python migration guide](/v3/migrations/python) if upgrading from v2 + + + +### Need Help? Contact Support + +Can't find a solution? Have a question? Reach out to our support team: + + + Email us at support@browserbase.com + + +--- + +## Next Steps + + + + Learn how to prompt LLMs for optimal results + + + Test which models work best for your specific use case + + + + Cache responses to reduce costs and improve speed + + + Reduce LLM spending with caching and smart model selection + + diff --git a/packages/docs/v3/configuration/observability.mdx b/packages/docs/v3/configuration/observability.mdx new file mode 100644 index 000000000..3f8c07750 --- /dev/null +++ b/packages/docs/v3/configuration/observability.mdx @@ -0,0 +1,369 @@ +--- +title: Observability +sidebarTitle: Observability +description: Track Stagehand automation with session visibility and analytics +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +Stagehand provides powerful observability features to help you monitor, track performance, and analyze your browser automation workflows. Focus on session monitoring, resource usage, and operational insights for both Browserbase and local environments. 
+ +## Browserbase Session Monitoring + +When running on Browserbase, you gain access to comprehensive cloud-based monitoring and session management through the Browserbase API and dashboard. + +
+ Browserbase Session Observability +
+ +### Live Session Visibility + +Browserbase provides real-time visibility into your automation sessions: + +**Session Dashboard Features** +- Real-time browser screen recording and replay +- Network request monitoring with detailed timing +- JavaScript console logs and error tracking +- CPU and memory usage metrics +- Session status and duration tracking + +**Session Management & API Access** +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; +import { Browserbase } from "@browserbasehq/sdk"; + +const browserbase = new Browserbase({ + apiKey: process.env.BROWSERBASE_API_KEY, +}); + +const stagehand = new Stagehand({ + env: "BROWSERBASE" +}); + +await stagehand.init(); + +const sessionInfo = await browserbase.sessions.retrieve(stagehand.sessionId); + +console.log("Session status:", sessionInfo.status); +console.log("Session region:", sessionInfo.region); +console.log("CPU usage:", sessionInfo.avgCpuUsage); +console.log("Memory usage:", sessionInfo.memoryUsage); +console.log("Proxy bytes:", sessionInfo.proxyBytes); +``` + +### Session Analytics & Insights + + + + Monitor live session status, resource usage, and geographic distribution. Scale and manage concurrent sessions with real-time insights. + + + + Review complete session recordings with frame-by-frame playback. Analyze network requests and debug browser interactions visually. + + + + Programmatically access session data, automate lifecycle management, and integrate with monitoring systems through our API. + + + + Track resource consumption, session duration, and API usage. Get detailed breakdowns of costs and utilization across your automation. 
+ + + +### Session Monitoring & Filtering + +Query and monitor sessions by status and metadata: + +```typescript +import { Browserbase } from "@browserbasehq/sdk"; + +const browserbase = new Browserbase({ + apiKey: process.env.BROWSERBASE_API_KEY, +}); + +// List sessions with filtering +async function getFilteredSessions() { + const sessions = await browserbase.sessions.list({ + status: 'RUNNING' + }); + + return sessions.map(session => ({ + id: session.id, + status: session.status, // RUNNING, COMPLETED, ERROR, TIMED_OUT + startedAt: session.startedAt, + endedAt: session.endedAt, + region: session.region, + avgCpuUsage: session.avgCpuUsage, + memoryUsage: session.memoryUsage, + proxyBytes: session.proxyBytes, + userMetadata: session.userMetadata + })); +} + +// Query sessions by metadata +async function querySessionsByMetadata(query: string) { + const sessions = await browserbase.sessions.list({ + q: query + }); + + return sessions; +} +``` + +## Local Environment Monitoring + +For local development, Stagehand provides performance monitoring and resource tracking capabilities directly on your machine. + +### Performance Tracking + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "LOCAL", + verbose: 1, // Monitor performance without debug noise +}); + +await stagehand.init(); + +// Track local automation metrics +const startTime = Date.now(); +const initialMetrics = await stagehand.metrics; + +// ... 
perform automation tasks +const page = stagehand.context.pages()[0]; +await page.goto("https://example.com"); +await stagehand.act("click button"); +await stagehand.extract({ instruction: "get data", schema: DataSchema }); + +const finalMetrics = await stagehand.metrics; +const executionTime = Date.now() - startTime; + +console.log('Local Performance Summary:', { + executionTime: `${executionTime}ms`, + totalTokens: finalMetrics.totalPromptTokens + finalMetrics.totalCompletionTokens, + totalInferenceTime: `${finalMetrics.totalInferenceTimeMs}ms`, + tokensPerSecond: ((finalMetrics.totalPromptTokens + finalMetrics.totalCompletionTokens) / (executionTime / 1000)).toFixed(2) +}); +``` + +## Resource Usage Monitoring + +When running locally, monitor system resource usage and browser performance: + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; +import * as os from 'os'; +import { performance } from 'perf_hooks'; + +class LocalResourceMonitor { + private cpuUsage: number[] = []; + private memoryUsage: number[] = []; + + startMonitoring() { + const interval = setInterval(() => { + // Track system resources + const memUsage = process.memoryUsage(); + this.memoryUsage.push(memUsage.heapUsed / 1024 / 1024); // MB + + // Track CPU (simplified) + const loadAvg = os.loadavg()[0]; + this.cpuUsage.push(loadAvg); + }, 1000); + + return interval; + } + + getResourceSummary() { + return { + avgMemoryUsage: this.memoryUsage.reduce((a, b) => a + b, 0) / this.memoryUsage.length, + peakMemoryUsage: Math.max(...this.memoryUsage), + avgCpuLoad: this.cpuUsage.reduce((a, b) => a + b, 0) / this.cpuUsage.length, + totalDataPoints: this.cpuUsage.length + }; + } +} + +const monitor = new LocalResourceMonitor(); +const interval = monitor.startMonitoring(); + +const stagehand = new Stagehand({ env: "LOCAL" }); + +// ... 
run automation + +clearInterval(interval); +console.log('Resource Usage:', monitor.getResourceSummary()); +``` + + + + Monitor token usage, costs, and speed. Set up automated alerting for critical failures. Implement cost tracking across different environments. Use session analytics to optimize automation workflows. + + + +## Real-Time Metrics & Monitoring + +### Basic Usage Tracking + +Monitor your automation's resource usage in real-time with `stagehand.metrics`: + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ env: "BROWSERBASE" }); +await stagehand.init(); + +// Metrics are async in V3 +const metrics = await stagehand.metrics; +console.log(metrics); + +// Monitor during automation +const startTime = Date.now(); +const initialMetrics = await stagehand.metrics; + +// ... perform automation tasks +const page = stagehand.context.pages()[0]; +await page.goto("https://example.com"); +await stagehand.act("click the login button"); +const data = await stagehand.extract({ + instruction: "extract user info", + schema: UserSchema +}); + +const finalMetrics = await stagehand.metrics; +const executionTime = Date.now() - startTime; + +console.log('Automation Summary:', { + totalTokens: finalMetrics.totalPromptTokens + finalMetrics.totalCompletionTokens, + executionTime: `${executionTime}ms`, + avgInferenceTime: `${finalMetrics.totalInferenceTimeMs / 3}ms`, +}); +``` + +### Understanding Metrics Data + +The metrics object provides detailed breakdown by Stagehand operation: + +```typescript +interface StagehandMetrics { + // Act operation metrics + actPromptTokens: number; + actCompletionTokens: number; + actReasoningTokens: number; + actCachedInputTokens: number; + actInferenceTimeMs: number; + + // Extract operation metrics + extractPromptTokens: number; + extractCompletionTokens: number; + extractReasoningTokens: number; + extractCachedInputTokens: number; + extractInferenceTimeMs: number; + + // Observe operation 
metrics + observePromptTokens: number; + observeCompletionTokens: number; + observeReasoningTokens: number; + observeCachedInputTokens: number; + observeInferenceTimeMs: number; + + // Agent operation metrics + agentPromptTokens: number; + agentCompletionTokens: number; + agentReasoningTokens: number; + agentCachedInputTokens: number; + agentInferenceTimeMs: number; + + // Cumulative totals + totalPromptTokens: number; + totalCompletionTokens: number; + totalReasoningTokens: number; + totalCachedInputTokens: number; + totalInferenceTimeMs: number; +} +``` + +**Example metrics output:** + +```typescript +const metrics = await stagehand.metrics; +console.log(metrics); + +// { +// actPromptTokens: 4011, +// actCompletionTokens: 51, +// actReasoningTokens: 12, +// actCachedInputTokens: 0, +// actInferenceTimeMs: 1688, +// extractPromptTokens: 4200, +// extractCompletionTokens: 243, +// extractReasoningTokens: 18, +// extractCachedInputTokens: 0, +// extractInferenceTimeMs: 4297, +// observePromptTokens: 347, +// observeCompletionTokens: 43, +// observeReasoningTokens: 5, +// observeCachedInputTokens: 0, +// observeInferenceTimeMs: 903, +// agentPromptTokens: 0, +// agentCompletionTokens: 0, +// agentReasoningTokens: 0, +// agentCachedInputTokens: 0, +// agentInferenceTimeMs: 0, +// totalPromptTokens: 8558, +// totalCompletionTokens: 337, +// totalReasoningTokens: 35, +// totalCachedInputTokens: 0, +// totalInferenceTimeMs: 6888 +// } +``` + +## Best Practices + + + +- Track session success rates and failure patterns +- Monitor resource usage and scaling requirements +- Set up automated alerting for critical failures +- Implement cost tracking across different environments +- Use session analytics to optimize automation workflows + + + +- Compare Browserbase vs local execution times +- Monitor token usage and inference costs across models +- Track geographic performance differences +- Identify bottlenecks in automation workflows +- Optimize for cost-effectiveness and 
speed + + + +- Track session distribution across regions +- Monitor concurrent session limits and scaling +- Analyze failure patterns and common error scenarios +- Use session recordings for root cause analysis +- Implement custom metadata for workflow categorization + + + +- Integrate session APIs with monitoring dashboards +- Set up automated notifications for session failures +- Track SLA compliance and performance benchmarks +- Monitor resource costs and usage patterns +- Use analytics data for capacity planning and optimization + + + +## Next Steps + + + + Track all LLM operations with parameters, results, and timestamps for debugging. + + + Configure logging levels, custom loggers, and file-based session logging. + + diff --git a/packages/docs/v3/first-steps/ai-rules.mdx b/packages/docs/v3/first-steps/ai-rules.mdx new file mode 100644 index 000000000..1a9da6f5b --- /dev/null +++ b/packages/docs/v3/first-steps/ai-rules.mdx @@ -0,0 +1,466 @@ +--- +title: AI Rules +description: Using AI to write Stagehand code faster and better. +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +You're likely using AI to write code, and there's a **right and wrong way to do it.** This page is a collection of rules, configs, and copy‑paste snippets to allow your AI agents/assistants to write performant Stagehand code as fast as possible. + +## Quickstart + + + + Configure Browserbase (Stagehand), Context7, DeepWiki, and Stagehand Docs in your MCP client. + + + Drop in `.cursorrules` and `claude.md` so AI agents/assistants always emit Stagehand patterns. + + + +## Using MCP Servers + +MCP (Model Context Protocol) servers act as intermediaries that connect AI systems to external data sources and tools. These servers enable your coding assistant to access real-time information, execute tasks, and retrieve structured data to enhance code generation accuracy.
+ +The following **MCP servers** provide specialized access to Stagehand documentation and related resources: + + +Provides semantic search across documentation and codebase context. Context7 enables AI assistants to find relevant code patterns, examples, and implementation details from your project history. It maintains contextual understanding of your development workflow and can surface related solutions from previous work. + +**Installation:** +```json +{ + "mcpServers": { + "context7": { + "command": "npx", + "args": ["-y", "@upstash/context7-mcp"] + } + } +} +``` + + + +Offers deep indexing of GitHub repositories and documentation. DeepWiki allows AI agents to understand project architecture, API references, and best practices from the entire Stagehand ecosystem. It provides comprehensive knowledge about repository structure, code relationships, and development patterns. + +**Installation:** +```json +{ + "mcpServers": { + "deepwiki": { + "url": "https://mcp.deepwiki.com/mcp" + } + } +} +``` + + + +Direct access to official Stagehand documentation. This MCP server provides AI assistants with up-to-date API references, configuration options, and usage examples for accurate code generation. Mintlify auto-generates this server from the official docs, ensuring your AI assistant always has the latest information. 
+ +**Usage:** +```json +{ + "mcpServers": { + "stagehand-docs": { + "url": "https://docs.stagehand.dev/mcp" + } + } +} +``` + + +**How MCP Servers Enhance Your Development:** +- **Real-time Documentation Access**: AI assistants can query the latest Stagehand docs, examples, and best practices +- **Context-Aware Code Generation**: Servers provide relevant code patterns and configurations based on your specific use case +- **Reduced Integration Overhead**: Standardized protocol eliminates the need for custom integrations with each documentation source +- **Enhanced Accuracy**: AI agents receive structured, up-to-date information rather than relying on potentially outdated training data + + +**Prompting tip:** +Explicitly ask your coding agent/assistant to use these MCP servers to fetch relevant information from the docs so they have better context and know how to write proper Stagehand code. + +For example: **"Use the stagehand-docs MCP to fetch the act/observe guidelines, then generate code that follows them. Prefer cached observe results."** + + +## Editor rule files (copy‑paste) + +Drop these in `.cursorrules`, `.windsurfrules`, `claude.md`, or any agent rule framework: + + + +``````md +# Stagehand Project + +This is a project that uses Stagehand V3, a browser automation framework with AI-powered `act`, `extract`, `observe`, and `agent` methods. + +The main class can be imported as `Stagehand` from `@browserbasehq/stagehand`.
+ +**Key Classes:** + +- `Stagehand`: Main orchestrator class providing `act`, `extract`, `observe`, and `agent` methods +- `context`: A `V3Context` object that manages browser contexts and pages +- `page`: Individual page objects accessed via `stagehand.context.pages()[i]` or created with `stagehand.context.newPage()` + +## Initialize + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; + +const stagehand = new Stagehand({ + env: "LOCAL", // or "BROWSERBASE" + verbose: 2, // 0, 1, or 2 + model: "openai/gpt-4.1-mini", // or any supported model +}); + +await stagehand.init(); + +// Access the browser context and pages +const page = stagehand.context.pages()[0]; +const context = stagehand.context; + +// Create new pages if needed +const page2 = await stagehand.context.newPage(); +``` + +## Act + +Actions are called on the `stagehand` instance (not the page). Use atomic, specific instructions: + +```typescript +// Act on the current active page +await stagehand.act("click the sign in button"); + +// Act on a specific page (when you need to target a page that isn't currently active) +await stagehand.act("click the sign in button", { page: page2 }); +``` + +**Important:** Act instructions should be atomic and specific: + +- ✅ Good: "Click the sign in button" or "Type 'hello' into the search input" +- ❌ Bad: "Order me pizza" or "Type in the search bar and hit enter" (multi-step) + +### Observe + Act Pattern (Recommended) + +Cache the results of `observe` to avoid unexpected DOM changes: + +```typescript +const instruction = "Click the sign in button"; + +// Get candidate actions +const actions = await stagehand.observe(instruction); + +// Execute the first action +await stagehand.act(actions[0]); +``` + +To target a specific page: + +```typescript +const actions = await stagehand.observe("select blue as the favorite color", { + page: page2, +}); +await stagehand.act(actions[0], { page: page2 }); +``` + +## Extract + +Extract data from pages using 
natural language instructions. The `extract` method is called on the `stagehand` instance. + +### Basic Extraction (with schema) + +```typescript +import { z } from "zod/v3"; + +// Extract with explicit schema +const data = await stagehand.extract( + "extract all apartment listings with prices and addresses", + z.object({ + listings: z.array( + z.object({ + price: z.string(), + address: z.string(), + }), + ), + }), +); + +console.log(data.listings); +``` + +### Simple Extraction (without schema) + +```typescript +// Extract returns a default object with 'extraction' field +const result = await stagehand.extract("extract the sign in button text"); + +console.log(result); +// Output: { extraction: "Sign in" } + +// Or destructure directly +const { extraction } = await stagehand.extract( + "extract the sign in button text", +); +console.log(extraction); // "Sign in" +``` + +### Targeted Extraction + +Extract data from a specific element using a selector: + +```typescript +const reason = await stagehand.extract( + "extract the reason why script injection fails", + z.string(), + { selector: "/html/body/div[2]/div[3]/iframe/html/body/p[2]" }, +); +``` + +### URL Extraction + +When extracting links or URLs, use `z.string().url()`: + +```typescript +const { links } = await stagehand.extract( + "extract all navigation links", + z.object({ + links: z.array(z.string().url()), + }), +); +``` + +### Extracting from a Specific Page + +```typescript +// Extract from a specific page (when you need to target a page that isn't currently active) +const data = await stagehand.extract( + "extract the placeholder text on the name field", + { page: page2 }, +); +``` + +## Observe + +Plan actions before executing them. 
Returns an array of candidate actions: + +```typescript +// Get candidate actions on the current active page +const [action] = await stagehand.observe("Click the sign in button"); + +// Execute the action +await stagehand.act(action); +``` + +Observing on a specific page: + +```typescript +// Target a specific page (when you need to target a page that isn't currently active) +const actions = await stagehand.observe("find the next page button", { + page: page2, +}); +await stagehand.act(actions[0], { page: page2 }); +``` + +## Agent + +Use the `agent` method to autonomously execute complex, multi-step tasks. + +### Basic Agent Usage + +```typescript +const page = stagehand.context.pages()[0]; +await page.goto("https://www.google.com"); + +const agent = stagehand.agent({ + model: "google/gemini-2.0-flash", + executionModel: "google/gemini-2.0-flash", +}); + +const result = await agent.execute({ + instruction: "Search for the stock price of NVDA", + maxSteps: 20, +}); + +console.log(result.message); +``` + +### Computer Use Agent (CUA) + +For more advanced scenarios using computer-use models: + +```typescript +const agent = stagehand.agent({ + mode: "cua", // Enable Computer Use Agent mode + model: "anthropic/claude-sonnet-4-20250514", + // or "google/gemini-2.5-computer-use-preview-10-2025" + systemPrompt: `You are a helpful assistant that can use a web browser. 
+ Do not ask follow up questions, the user will trust your judgement.`, +}); + +await agent.execute({ + instruction: "Apply for a library card at the San Francisco Public Library", + maxSteps: 30, +}); +``` + +### Agent with Custom Model Configuration + +```typescript +const agent = stagehand.agent({ + mode: "cua", + model: { + modelName: "google/gemini-2.5-computer-use-preview-10-2025", + apiKey: process.env.GEMINI_API_KEY, + }, + systemPrompt: `You are a helpful assistant.`, +}); +``` + +### Agent with Integrations (MCP/External Tools) + +```typescript +const agent = stagehand.agent({ + integrations: [`https://mcp.exa.ai/mcp?exaApiKey=${process.env.EXA_API_KEY}`], + systemPrompt: `You have access to the Exa search tool.`, +}); +``` + +## Advanced Features + +### DeepLocator (XPath Targeting) + +Target specific elements across shadow DOM and iframes: + +```typescript +await page + .deepLocator("/html/body/div[2]/div[3]/iframe/html/body/p") + .highlight({ + durationMs: 5000, + contentColor: { r: 255, g: 0, b: 0 }, + }); +``` + +### Multi-Page Workflows + +```typescript +const page1 = stagehand.context.pages()[0]; +await page1.goto("https://example.com"); + +const page2 = await stagehand.context.newPage(); +await page2.goto("https://example2.com"); + +// Act/extract/observe operate on the current active page by default +// Pass { page } option to target a specific page +await stagehand.act("click button", { page: page1 }); +await stagehand.extract("get title", { page: page2 }); +``` +`````` + + + + + +``````md +# Stagehand Python Project + +This is a project that uses [Stagehand Python](https://github.com/browserbase/stagehand-python), which provides AI-powered browser automation with `act`, `extract`, and `observe` methods. 
+ +`Stagehand` is a class that provides configuration and browser automation capabilities with: +- Pages accessed via `stagehand.context.pages()` or `stagehand.context.activePage()` +- `stagehand.context`: A StagehandContext object (extends Playwright BrowserContext) +- `stagehand.agent()`: Create AI-powered agents for autonomous multi-step workflows +- `stagehand.init()`: Initialize the browser session +- `stagehand.close()`: Clean up resources + +`Page` extends Playwright's Page class with AI-powered methods: +- `act()`: Perform actions on web elements using natural language +- `extract()`: Extract structured data from pages using schemas +- `observe()`: Plan actions and get selectors before executing + +`Agent` provides autonomous Computer Use Agent capabilities: +- `execute()`: Perform complex multi-step tasks using natural language instructions + +Use the following rules to write code for this project. + +- To plan an instruction like "click the sign in button", use Stagehand `observe` to get the action to execute. + +You can also pass in the following params: + +- The result of `observe` is a list of `ObserveResult` objects that can directly be used as params for `act` like this: + +- When writing code that needs to extract data from the page, use Stagehand `extract`. Use Pydantic models for schemas: + +## Initialize + +### Configuration Options + +Key configuration options in `StagehandConfig`: + +## Act + +You can act directly with string instructions: + +Use variables for dynamic form filling: + +**Best Practices:** +- Cache the results of `observe` to avoid unexpected DOM changes +- Keep actions atomic and specific (e.g., "Click the sign in button" not "Sign in to the website") +- Use specific, descriptive instructions + +Act `action` should be as atomic and specific as possible, i.e. "Click the sign in button" or "Type 'hello' into the search input". +AVOID actions that are more than one step, i.e. 
"Order me pizza" or "Send an email to Paul asking him to call me". + +## Extract + +### Simple String Extraction + +### Structured Extraction with Schema (Recommended) +Always use Pydantic models for structured data extraction: + +### Array Extraction +For arrays, use List types: + +### Complex Object Extraction +For more complex data structures: + +## Agent System + +Stagehand provides an Agent System for autonomous web browsing using Computer Use Agents (CUA). + +### Creating Agents + +### Agent Execution + +**Best Practices:** +- Be specific with instructions: `"Fill out the contact form with name 'John Doe' and submit it"` +- Break down complex tasks into smaller steps +- Use error handling with try/except blocks +- Combine agents for navigation with traditional methods for precise data extraction + +## Project Structure Best Practices + +- Store configurations in environment variables or config files +- Use async/await patterns consistently +- Implement main automation logic in async functions +- Use async context managers for resource management +- Use type hints and Pydantic models for data validation +- Handle exceptions appropriately with try/except blocks +`````` + + + +## Security notes + +- Do not embed secrets in docs or rule files; use env vars in MCP configs. +- Avoid broad actions that may trigger unintended navigation; prefer `observe` first. + +## Resources/references + +- Context7 MCP (Upstash) + - https://github.com/upstash/context7 +- DeepWiki MCP + - https://mcp.deepwiki.com/ +- Stagehand Docs MCP (Mintlify) + - https://docs.stagehand.dev/mcp diff --git a/packages/docs/v3/first-steps/installation.mdx b/packages/docs/v3/first-steps/installation.mdx new file mode 100644 index 000000000..ce8d12663 --- /dev/null +++ b/packages/docs/v3/first-steps/installation.mdx @@ -0,0 +1,134 @@ +--- +title: Installation +description: Integrate Stagehand into an existing project. 
+--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +Install Stagehand in your current app with the TypeScript SDK. + + +We recommend using the Node.js runtime environment to run Stagehand scripts. + +**Bun is now supported** as long as you do not integrate Stagehand with Playwright. Playwright is not compatible with Bun. + + + + + +### Install dependencies + + +```bash npm +npm install @browserbasehq/stagehand +``` + +```bash pnpm +pnpm add @browserbasehq/stagehand +``` + +```bash yarn +yarn add @browserbasehq/stagehand +``` +```bash bun icon="sparkles" +bun add @browserbasehq/stagehand +``` + + + +If you plan to run locally, you need to have [Chrome](https://www.google.com/chrome/) installed on your machine. For cloud browser sessions, skip this. + + +### Configure environment + +Set environment variables (or a `.env` via your framework): + + +```bash Bash +OPENAI_API_KEY=your_api_key +BROWSERBASE_API_KEY=your_api_key +BROWSERBASE_PROJECT_ID=your_project_id +``` + + +### Use in your codebase + +Add Stagehand where you need browser automation. + +```typescript +import "dotenv/config"; +import { Stagehand } from "@browserbasehq/stagehand"; +import { z } from "zod/v3"; + +async function main() { + const stagehand = new Stagehand({ + env: "BROWSERBASE" + }); + + await stagehand.init(); + const page = stagehand.context.pages()[0]; + + await page.goto("https://example.com"); + + // Act on the page + await stagehand.act("Click the learn more button"); + + // Extract structured data + const description = await stagehand.extract("extract the description", z.string()); + + console.log(description); + await stagehand.close(); +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); +``` + + + + + + +For Python and other language SDKs, use the **language selector** in the top left corner of the sidebar to view the SDK documentation for your language. 
+ + + + + + +## Next steps + + + + Environment, Browserbase vs Local, logging, timeouts, LLM customization + + + Perform precise actions with natural language + + + Typed data extraction with Zod schemas + + + Discover elements and suggested actions + + \ No newline at end of file diff --git a/packages/docs/v3/first-steps/introduction.mdx b/packages/docs/v3/first-steps/introduction.mdx new file mode 100644 index 000000000..0d96e3fbc --- /dev/null +++ b/packages/docs/v3/first-steps/introduction.mdx @@ -0,0 +1,117 @@ +--- +title: Introducing Stagehand +sidebarTitle: Introduction +description: Developers use Stagehand to reliably automate the web. +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +Stagehand is a browser automation framework used to control web browsers with natural language and code. By combining the power of AI with the precision of code, Stagehand makes web automation flexible, maintainable, and actually reliable. + +## The Problem with Browser Automation + +Traditional frameworks like Playwright and Puppeteer force you to write brittle scripts that break with every UI change. Web agents promise to solve this with AI, but leave you at the mercy of unpredictable behavior. 
+ +**You're stuck between two bad options:** +- **Too brittle**: Traditional selectors break when websites change +- **Too agentic**: AI agents are unpredictable and impossible to debug + +## Enter Stagehand + +Stagehand gives you the best of both worlds through four powerful primitives that let you choose exactly how much AI to use: + + + + Execute actions using natural language + + + Pull structured data with schemas + + + Discover available actions on any page + + + Automate entire workflows autonomously + + + +```typescript +// Act - Execute natural language actions +await stagehand.act("click the login button"); + +// Extract - Pull structured data +const price = await stagehand.extract( + "extract the price", + z.number() +); + +// Observe - Discover available actions +const actions = await stagehand.observe("find submit buttons"); + +// Agent - Automate entire workflows +const agent = stagehand.agent({ + mode: "cua", + model: "google/gemini-2.5-computer-use-preview-10-2025", +}); +await agent.execute("apply for this job"); +``` + + +## Why Developers Choose Stagehand + +- **Precise Control**: Mix AI-powered actions with deterministic code. You decide exactly how much AI to use. + +- **Actually Repeatable**: Save and replay actions exactly. No more "it worked on my machine" with browser automations. + +- **Maintainable at Scale**: One script can automate multiple websites. When sites change, your automations adapt. + +- **Composable Tools**: Choose your level of automation with Act, Extract, Observe, and Agent. + +## Built for Modern Development +Stagehand is designed for developers building production browser automations and AI agents that need reliable web access. + + + + Compatible with all Chromium-based browsers: Chrome, Edge, Arc, Brave, and more. + + + Created and maintained by the team behind enterprise browser infrastructure. 
+ + + +## Get Started in 60 Seconds + + **Pro tip**: For best results, we recommend using Stagehand with [Browserbase](https://www.browserbase.com) for reliable cloud browser infrastructure. + + + + Build your first automation in under a minute + + + Generate Stagehand scripts with AI + + + See real-world automation examples + + + Get help from the community + + diff --git a/packages/docs/v3/first-steps/quickstart.mdx b/packages/docs/v3/first-steps/quickstart.mdx new file mode 100644 index 000000000..b2561d002 --- /dev/null +++ b/packages/docs/v3/first-steps/quickstart.mdx @@ -0,0 +1,130 @@ +--- +title: Quickstart +description: 'Stagehand allows you to build web automations with natural language and code.' +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +If this is your **first time using Stagehand**, you should try [Director](https://director.ai) first. It's an agent that allows you to build Stagehand workflows using natural language. You can also try Stagehand using our [MCP server](/integrations/mcp/introduction). + +Otherwise, the quickest way to start with Stagehand is with our CLI. It scaffolds a ready‑to‑run Stagehand app with sensible defaults and an example script. + + +This quickstart is for **TypeScript**. For other languages, change the language selector in the top left corner. + + +## 1) Create a sample project + + +```bash Bash +npx create-browser-app +``` + + +## 2) Run it + +Follow the CLI prompts to enter the project directory and add your API keys. Then run the example script. + + +```bash Bash +cd my-stagehand-app # Enter the project directory +cp .env.example .env # Add your API keys +npm start # Run the example script +``` + + +## 3) Use Stagehand (act, extract, observe) + +The scaffold includes an index.ts file that contains the example script. 
Here's what it looks like: + + +```typescript TypeScript +import "dotenv/config"; +import { Stagehand } from "@browserbasehq/stagehand"; + +async function main() { + const stagehand = new Stagehand({ + env: "BROWSERBASE" + }); + + await stagehand.init(); + + console.log(`Stagehand Session Started`); + console.log(`Watch live: https://browserbase.com/sessions/${stagehand.browserbaseSessionID}`); + + const page = stagehand.context.pages()[0]; + + await page.goto("https://stagehand.dev"); + + const extractResult = await stagehand.extract("Extract the value proposition from the page."); + console.log(`Extract result:\n`, extractResult); + + await stagehand.act("Click the 'Evals' button."); + + const observeResult = await stagehand.observe("What can I click on this page?"); + console.log(`Observe result:\n`, observeResult); + + const agent = stagehand.agent({ + mode: "cua", + model: "google/gemini-2.5-computer-use-preview-10-2025", + systemPrompt: "You're a helpful assistant that can control a web browser.", + }); + + const agentResult = await agent.execute("What is the most accurate model to use in Stagehand?"); + console.log(`Agent result:\n`, agentResult); + + await stagehand.close(); +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); + +``` + + + +To use, set provider keys in `.env` (e.g., `OPENAI_API_KEY`). For cloud browsers, add `BROWSERBASE_API_KEY` and `BROWSERBASE_PROJECT_ID`. + + +## Next steps + +Learn about the Stagehand primitives: act, extract, observe, and agent. 
+ + + + Perform actions on web pages with natural language + + + + Get structured data with Zod schemas + + + + Discover available elements and actions + + + + Autonomous multi-step browser workflows + + + diff --git a/packages/docs/v3/integrations/crew-ai/configuration.mdx b/packages/docs/v3/integrations/crew-ai/configuration.mdx new file mode 100644 index 000000000..418afa95b --- /dev/null +++ b/packages/docs/v3/integrations/crew-ai/configuration.mdx @@ -0,0 +1,174 @@ +--- +title: "Use CrewAI to Automate Browser Tasks" +sidebarTitle: Configuration +description: "Create intelligent agents that can interact with websites and automate browser tasks using natural language instructions" +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +This guide walks you through setting up CrewAI with Browserbase to create agents that can perform web automation tasks using natural language instructions. + +## Step 1: Install Dependencies + +Install the required packages for CrewAI and Stagehand integration: + +```bash +pip install stagehand crewai crewai-tools +``` + +## Step 2: Configure Environment Variables + +You'll need API keys from two services: + +1. **Browserbase API Key and Project ID**: Get these from your [Browserbase dashboard](https://www.browserbase.com/) +2. 
**LLM API Key**: Get an API key from [OpenAI](https://platform.openai.com/api-keys) or [Anthropic](https://console.anthropic.com/) + +Store your API keys securely as environment variables: + +```bash +BROWSERBASE_API_KEY="your-browserbase-api-key" +BROWSERBASE_PROJECT_ID="your-browserbase-project-id" +OPENAI_API_KEY="your-openai-api-key" +ANTHROPIC_API_KEY="your-anthropic-api-key" +``` + +## Step 3: Create Your First Agent + +Create a Python script with a basic CrewAI agent: + +```python +import os +from crewai import Agent, Task, Crew +from crewai_tools import StagehandTool +from stagehand.schemas import AvailableModel + +# Get API keys from environment +browserbase_api_key = os.environ.get("BROWSERBASE_API_KEY") +browserbase_project_id = os.environ.get("BROWSERBASE_PROJECT_ID") +model_api_key = os.environ.get("OPENAI_API_KEY") # or ANTHROPIC_API_KEY + +# Initialize the StagehandTool +stagehand_tool = StagehandTool( + api_key=browserbase_api_key, + project_id=browserbase_project_id, + model_api_key=model_api_key, + model_name=AvailableModel.GPT_4O, # or AvailableModel.CLAUDE_3_7_SONNET_LATEST +) + +# Create an agent with the tool +researcher = Agent( + role="Web Researcher", + goal="Find and summarize information from websites", + backstory="I'm an expert at finding information online.", + verbose=True, + tools=[stagehand_tool], +) +``` + +## Step 4: Create and Run a Task + +Define a task for your agent and execute it: + +```python +# Create a task that uses the tool +research_task = Task( + description="Go to https://www.example.com and tell me what you see on the homepage.", + agent=researcher, +) + +# Run the crew +crew = Crew( + agents=[researcher], + tasks=[research_task], + verbose=True, +) + +try: + result = crew.kickoff() + print(result) +finally: + # Clean up resources + stagehand_tool.close() +``` + +## Step 5: Run Your Script + +Execute your Python script: + +```bash +python your_crew_script.py +``` + +## Advanced Configuration + +Customize the 
StagehandTool behavior with additional parameters: + +```python +stagehand_tool = StagehandTool( + api_key=browserbase_api_key, + project_id=browserbase_project_id, + model_api_key=model_api_key, + model_name=AvailableModel.CLAUDE_3_7_SONNET_LATEST, + dom_settle_timeout_ms=5000, # Wait longer for DOM to settle + headless=True, # Run browser in headless mode + self_heal=True, # Attempt to recover from errors + wait_for_captcha_solves=True, # Wait for CAPTCHA solving + verbose=1, # Control logging verbosity (0-3) +) +``` + +## Example Tasks + + + + ```python + form_task = Task( + description=""" + Submit a contact form: + 1. Go to https://example.com/contact + 2. Fill out the form with name 'John Doe', email 'john@example.com' + 3. Submit and confirm success + """, + agent=researcher, + ) + ``` + + + ```python + extraction_task = Task( + description=""" + Extract product information: + 1. Go to the products page + 2. Extract all product names, prices, and descriptions + 3. Format as structured data + """, + agent=researcher, + ) + ``` + + + ```python + navigation_task = Task( + description=""" + Navigate and analyze: + 1. Start at homepage + 2. Navigate to products section + 3. Filter by 'Electronics' category + 4. Find and extract details of highest-rated product + """, + agent=researcher, + ) + ``` + + + + + + Dive into the CrewAI documentation to learn more about its capabilities and integrations. + + + Access the Browserbase documentation for comprehensive guides and resources. 
+ + \ No newline at end of file diff --git a/packages/docs/v3/integrations/crew-ai/introduction.mdx b/packages/docs/v3/integrations/crew-ai/introduction.mdx new file mode 100644 index 000000000..811823d3d --- /dev/null +++ b/packages/docs/v3/integrations/crew-ai/introduction.mdx @@ -0,0 +1,39 @@ +--- +title: "CrewAI Introduction" +sidebarTitle: Introduction +description: "Automate browser tasks using natural language instructions with CrewAI" +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +## Overview + +This guide shows you how to use CrewAI with Browserbase to create intelligent agents that can automate web interactions. By the end of this guide, you'll know how to: + +- Set up CrewAI with the StagehandTool +- Create agents that can interact with websites +- Automate browser tasks using natural language instructions +- Extract structured data from web pages + +## When You'd Use This + +The CrewAI integration is perfect for scenarios where you need intelligent web automation: + +- **Research automation**: Have agents research information across multiple websites +- **Data collection**: Extract structured data from e-commerce sites, job boards, or news sites +- **Form automation**: Automatically fill out and submit forms based on specific criteria +- **Multi-step workflows**: Execute complex browser workflows that require decision-making + +The StagehandTool wraps the Stagehand Python SDK to provide CrewAI agents with the ability to control a real web browser and interact with websites using three core primitives: + +1. **Act**: Perform actions like clicking, typing, or navigating +2. **Extract**: Extract structured data from web pages +3. 
**Observe**: Identify and analyze elements on the page + + + + Learn how to configure and use the StagehandTool with CrewAI agents for web automation tasks + + \ No newline at end of file diff --git a/packages/docs/v3/integrations/langchain/configuration.mdx b/packages/docs/v3/integrations/langchain/configuration.mdx new file mode 100644 index 000000000..64f57e58a --- /dev/null +++ b/packages/docs/v3/integrations/langchain/configuration.mdx @@ -0,0 +1,246 @@ +--- +title: "LangChain JS Configuration" +sidebarTitle: Configuration +description: "Set up Stagehand with LangChain JS to create intelligent web automation agents" +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +This guide walks you through integrating Stagehand with LangChain JS to build powerful web automation workflows using natural language instructions. + +## Step 1: Install Dependencies + +Install the required packages for LangChain JS and Stagehand integration: + +```bash +npm install @langchain/langgraph @langchain/community @langchain/core @browserbasehq/stagehand +``` + +## Step 2: Configure Environment Variables + +For remote browser automation, set up your Browserbase credentials: + +```bash +BROWSERBASE_API_KEY="your-browserbase-api-key" +BROWSERBASE_PROJECT_ID="your-browserbase-project-id" +``` + +## Step 3: Create a Stagehand Instance + +Initialize Stagehand with your preferred configuration: + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; + +// For local development +const stagehand = new Stagehand({ + env: "LOCAL", + verbose: 2, + enableCaching: false, +}); + +// For production with Browserbase +const stagehand = new Stagehand({ + env: "BROWSERBASE", + verbose: 1, + enableCaching: true, +}); +``` + +## Step 4: Generate the StagehandToolkit + +Create the toolkit that provides LangChain-compatible tools: + +```typescript +import { StagehandToolkit } from '@langchain/community/agents/toolkits/stagehand'; + +const stagehandToolkit = await 
StagehandToolkit.fromStagehand(stagehand); +``` + +## Step 5: Use Individual Tools + +The toolkit provides four specialized tools for web automation: + +### Available Tools + +- **stagehand_navigate**: Navigate to specific URLs +- **stagehand_act**: Perform browser actions (clicking, typing, etc.) +- **stagehand_extract**: Extract structured data using schemas +- **stagehand_observe**: Analyze page elements and possible actions + +### Basic Tool Usage + +```typescript +import { z } from "zod"; + +// Navigate to a website +const navigateTool = stagehandToolkit.tools.find( + (t) => t.name === "stagehand_navigate" +); +await navigateTool.invoke("https://www.google.com"); + +// Perform an action +const actionTool = stagehandToolkit.tools.find( + (t) => t.name === "stagehand_act" +); +await actionTool.invoke('Search for "OpenAI"'); + +// Observe the page +const observeTool = stagehandToolkit.tools.find( + (t) => t.name === "stagehand_observe" +); +const result = await observeTool.invoke( + "What actions can be performed on the current page?" 
+); +console.log(JSON.parse(result)); + +// Extract structured data +const extractTool = stagehandToolkit.tools.find( + (t) => t.name === "stagehand_extract" +); +const extractResult = await extractTool.invoke({ + instruction: "Extract the main heading and description", + schema: z.object({ + heading: z.string(), + description: z.string(), + }), +}); +console.log(extractResult); +``` + +## Step 6: Build LangGraph Agents + +Integrate with LangGraph for complex automation workflows: + +```typescript +import { createReactAgent } from "@langchain/langgraph/prebuilt"; + +// Create an LLM +const llm = new ChatOpenAI({ + model: "gpt-4", + temperature: 0, +}); + +// Create an agent with Stagehand tools +const agent = createReactAgent({ + llm, + tools: stagehandToolkit.tools, +}); + +// Execute a complex workflow +const result = await agent.invoke({ + messages: [ + { + role: "user", + content: "Go to example.com, find the contact form, and extract all the form fields" + } + ] +}); +``` + +## Advanced Configuration + +### Custom Stagehand Configuration + +```typescript +const stagehand = new Stagehand({ + env: "BROWSERBASE", + verbose: 2, + enableCaching: true, + headless: true, + domSettleTimeoutMs: 5000, +}); +``` + +### Error Handling + +```typescript +try { + const result = await agent.invoke({ + messages: [{ role: "user", content: "Navigate to invalid-url.com" }] + }); +} catch (error) { + console.error("Automation failed:", error); +} finally { + // Clean up resources + await stagehand.close(); +} +``` + +## Example Workflows + + + + ```typescript + const extractionAgent = createReactAgent({ + llm, + tools: stagehandToolkit.tools, + }); + + const result = await extractionAgent.invoke({ + messages: [{ + role: "user", + content: ` + Go to news-website.com and extract: + 1. All article headlines + 2. Publication dates + 3. 
Author names + Format as structured JSON + ` + }] + }); + ``` + + + ```typescript + const formAgent = createReactAgent({ + llm, + tools: stagehandToolkit.tools, + }); + + const result = await formAgent.invoke({ + messages: [{ + role: "user", + content: ` + Navigate to contact-form.com and: + 1. Fill out the contact form with: + - Name: John Doe + - Email: john@example.com + - Message: Inquiry about services + 2. Submit the form + 3. Confirm submission success + ` + }] + }); + ``` + + + ```typescript + const researchAgent = createReactAgent({ + llm, + tools: stagehandToolkit.tools, + }); + + const result = await researchAgent.invoke({ + messages: [{ + role: "user", + content: ` + Research product pricing by: + 1. Visit competitor1.com and extract pricing info + 2. Visit competitor2.com and extract pricing info + 3. Compare features and prices + 4. Provide summary analysis + ` + }] + }); + ``` + + + + + + Official LangChain JS documentation for the Stagehand integration + + \ No newline at end of file diff --git a/packages/docs/v3/integrations/langchain/introduction.mdx b/packages/docs/v3/integrations/langchain/introduction.mdx new file mode 100644 index 000000000..4c3941d4b --- /dev/null +++ b/packages/docs/v3/integrations/langchain/introduction.mdx @@ -0,0 +1,33 @@ +--- +title: "LangChain JS Introduction" +sidebarTitle: Introduction +description: "Integrate Stagehand with LangChain JS for intelligent web automation" +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +## Overview + +This guide shows you how to use Stagehand with LangChain JS to create intelligent agents that can automate web interactions. 
By the end of this guide, you'll know how to: + +- Set up the StagehandToolkit with LangChain JS +- Create agents that can navigate and interact with websites +- Extract structured data using natural language instructions +- Build complex automation workflows with LangGraph + +## When You'd Use This + +The LangChain JS integration is perfect for scenarios where you need intelligent web automation with advanced reasoning: + +- **AI-driven research**: Create agents that can research information across multiple websites and synthesize findings +- **Dynamic form filling**: Automatically fill out complex forms based on contextual requirements +- **Data extraction workflows**: Extract and transform data from multiple sources with intelligent navigation +- **Multi-step web processes**: Execute complex browser workflows that require decision-making and adaptation + + + + Learn how to set up and configure the StagehandToolkit with LangChain JS agents + + \ No newline at end of file diff --git a/packages/docs/v3/integrations/mcp/configuration.mdx b/packages/docs/v3/integrations/mcp/configuration.mdx new file mode 100644 index 000000000..efaa43f86 --- /dev/null +++ b/packages/docs/v3/integrations/mcp/configuration.mdx @@ -0,0 +1,391 @@ +--- +title: "Browserbase MCP Server Configuration" +sidebarTitle: "Configuration" +description: "Configure your browser automation with command-line flags, environment variables, and advanced options" +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +## Configuration Overview + +The Browserbase MCP server supports extensive configuration options through command-line flags and environment variables. Configure browser behavior, proxy settings, stealth modes, model selection, and more to customize your browser automation workflows. + + +Command-line flags are only available when running the server locally (`npx @browserbasehq/mcp-server-browserbase` with flags or local development setup). 
+ + +## Environment Variables + +Configure the essential Browserbase credentials and optional debugging settings: + + + +Your Browserbase API key for authentication + + + +Your Browserbase project ID + + + + +## Command-Line Flags + +### Available Flags + +| Flag | Description | +|------|-------------| +| `--proxies` | Enable Browserbase proxies for the session | +| `--advancedStealth` | Enable Browserbase Advanced Stealth (Scale Plan only) | +| `--keepAlive` | Enable Browserbase Keep Alive Session | +| `--contextId ` | Specify a Browserbase Context ID to use | +| `--persist [boolean]` | Whether to persist the Browserbase context (default: true) | +| `--port ` | Port to listen on for HTTP/SHTTP transport | +| `--host ` | Host to bind server to (default: localhost, use 0.0.0.0 for all interfaces) | +| `--browserWidth ` | Browser viewport width (default: 1024) | +| `--browserHeight ` | Browser viewport height (default: 768) | +| `--modelName ` | The model to use for Stagehand (default: gemini-2.0-flash) | +| `--modelApiKey ` | API key for the custom model provider (required when using custom models) | +| `--experimental` | Enable experimental features (default: false) | + +## Configuration Examples + +### Basic Configuration + + + + + + +```json Direct SHTTP +{ + "mcpServers": { + "browserbase": { + "url": "your-smithery-url.com" + } + } +} +``` + + +When using our remote hosted server, we provide the LLM costs for Gemini, the [best performing model](https://www.stagehand.dev/evals) in [Stagehand](https://www.stagehand.dev). 
+ + + + +```json +{ + "mcpServers": { + "browserbase": { + "command": "npx", + "args": ["@browserbasehq/mcp-server-browserbase"], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key" + } + } + } +} +``` + + + +```json +{ + "mcpServers": { + "browserbase": { + "command": "node", + "args": ["/path/to/mcp-server-browserbase/cli.js"], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key" + } + } + } +} +``` + + + +```bash +# Start server +node cli.js --port 8931 +``` + +```json +{ + "mcpServers": { + "browserbase": { + "url": "http://localhost:8931/mcp", + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key" + } + } + } +} +``` + + + +### Advanced Features + + + +Enable Browserbase proxies for IP rotation and geo-location testing. + + +[Learn more about Browserbase Proxies](https://docs.browserbase.com/features/proxies) + + +```json +{ + "mcpServers": { + "browserbase": { + "command": "npx", + "args": ["@browserbasehq/mcp-server-browserbase", "--proxies"], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key" + } + } + } +} +``` + + + +Enable advanced anti-detection features for enhanced stealth browsing. + + +[Learn more about Advanced Stealth](https://docs.browserbase.com/features/stealth-mode#advanced-stealth-mode) + +**Note:** Advanced Stealth is only available for Scale Plan users. 
+ + +```json +{ + "mcpServers": { + "browserbase": { + "command": "npx", + "args": ["@browserbasehq/mcp-server-browserbase", "--advancedStealth"], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key" + } + } + } +} +``` + + + +Use persistent browser contexts to maintain authentication and state across sessions. + + +[Learn more about Browserbase Contexts](https://docs.browserbase.com/features/contexts) + + +```json +{ + "mcpServers": { + "browserbase": { + "command": "npx", + "args": ["@browserbasehq/mcp-server-browserbase", "--contextId", "your_context_id"], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id" + } + } + } +} +``` + + + +### Browser Customization + + + +Customize browser window dimensions. Default is 1024x768. Recommended aspect ratio: 16:9. + +```json +{ + "mcpServers": { + "browserbase": { + "command": "npx", + "args": [ + "@browserbasehq/mcp-server-browserbase", + "--browserWidth", "1920", + "--browserHeight", "1080" + ], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key" + } + } + } +} +``` + +**Common Resolutions:** +- Desktop: 1920x1080, 1280x720, 1024x768 +- Mobile: 375x667 (iPhone), 360x640 (Android) +- Tablet: 768x1024 (iPad) + + + + +## Model Configuration + +Configure AI models for enhanced browser automation. Stagehand defaults to Google's Gemini 2.0 Flash but supports multiple providers. + + +When using any custom model (non-default), you must provide your own API key for that model provider using the `--modelApiKey` flag. 
+ + + + +**Google Gemini** (Default) +- `gemini-2.0-flash` (default) +- `gemini-2.5-pro` +- `gemini-2.5-flash` +- `gemini-2.5-flash-lite` + +**OpenAI** +- `gpt-5-2025-08-07` +- `gpt-4.1-2025-04-14` +- `gpt-4o` +- `gpt-4o-mini` + +**Anthropic Claude** +- `claude-sonnet-4-5` +- `claude-haiku-4-5` + +[View full list of supported models](https://docs.stagehand.dev/v3/configuration/models#models) + + + + +```json OpenAI GPT-4o +{ + "mcpServers": { + "browserbase": { + "command": "npx", + "args": [ + "@browserbasehq/mcp-server-browserbase", + "--modelName", "gpt-4o", + "--modelApiKey", "your_openai_api_key" + ], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id" + } + } + } +} +``` + +```json Claude Sonnet +{ + "mcpServers": { + "browserbase": { + "command": "npx", + "args": [ + "@browserbasehq/mcp-server-browserbase", + "--modelName", "claude-sonnet-4-5", + "--modelApiKey", "your_anthropic_api_key" + ], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id" + } + } + } +} +``` + + + + +## Development Configuration + + + +Configure custom host and port for SHTTP transport. 
+ +```json +{ + "mcpServers": { + "browserbase": { + "command": "npx", + "args": [ + "@browserbasehq/mcp-server-browserbase", + "--host", "0.0.0.0", + "--port", "8080" + ], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key" + } + } + } +} +``` + + + +## Best Practices + + +- Use appropriate viewport sizes for your use case +- Enable proxies only when needed for geo-location +- Choose efficient models (Gemini Flash for speed, GPT-4o for accuracy) +- Reuse contexts for authentication persistence + + + +- Store API keys securely in environment variables +- Use Advanced Stealth for sensitive operations +- Implement proper session management +- Rotate cookies and contexts regularly + + + +- Enable debug mode during development +- Use context persistence for faster iteration +- Test with different viewport sizes +- Monitor session usage and quotas + + + +- Use NPM installation for reliability +- Configure appropriate timeouts +- Implement error handling and retries +- Monitor performance and resource usage + + +## Further Reading + + + +Complete platform documentation + + + +AI-powered browser automation + + + +Get help from our team + + \ No newline at end of file diff --git a/packages/docs/v3/integrations/mcp/introduction.mdx b/packages/docs/v3/integrations/mcp/introduction.mdx new file mode 100644 index 000000000..6c969bdf9 --- /dev/null +++ b/packages/docs/v3/integrations/mcp/introduction.mdx @@ -0,0 +1,192 @@ +--- +title: "Browserbase MCP Server" +sidebarTitle: "Introduction" +description: "AI-powered browser automation through Model Context Protocol integration with Stagehand" +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +## Overview + +The Browserbase MCP Server brings powerful browser automation capabilities to Claude through the Model Context Protocol (MCP). 
Built on top of [Stagehand](https://docs.stagehand.dev/), this integration provides AI-powered web automation using natural language commands. + + +This server enables Claude to control browsers, navigate websites, interact with web elements, and extract data—all through simple conversational commands. + + +## Key Features + + + +Control browsers using plain English commands like "click the login button" or "fill out the contact form" + + + +Navigate, click, and fill forms with ease + + + +Extract structured data from any website automatically + + + + +Capture and analyze webpage screenshots programmatically + + + + +## Core Benefits + + + + + +No need to learn complex selectors or automation syntax. Simply describe what you want to do in natural language. + + + +Get started in minutes with our NPM package or our remote hosted URL. + + + +Stagehand's AI understands web page context and can adapt to different layouts and designs. + + + + + + + +Navigate, click, type, scroll, and interact with any web element. + + + +Extract structured information from complex web pages automatically. + + + +Maintain authentication states and context across multiple interactions. + + + + + + + +Built on Browserbase's cloud browser platform for consistent performance. + + + +Handle multiple concurrent sessions and high-volume automation tasks. + + + +Stealth mode, proxy support, and advanced anti-detection capabilities. + + + +Detailed session recordings and debugging information. 
+ + + + + +## Use Cases + + + + + +Track product prices, availability, and competitor information + + + +Gather data from multiple sources for analysis and reporting + + + +Collect articles, posts, and media from various websites + + + +Extract contact information and business data from directories + + + + + + + +Create comprehensive test suites for web applications + + + +Simulate real user interactions and workflows + + + +Track page load times and user experience metrics + + + + + + + +Automatically fill and submit complex web forms + + + +Extract data and generate automated reports + + + +Schedule posts and monitor engagement across platforms + + + +Automate repetitive web-based business processes + + + + + +## Getting Started + + + +Choose from NPM installation, remote hosted URL, or local development based on your needs. + + + +Set up your Browserbase API credentials in the MCP configuration. +Get your API keys from the [Browserbase Dashboard](https://www.browserbase.com/overview). + + + +Begin using natural language commands to control browsers through Claude. + + + + +Ready to get started? Check out our [Setup Guide](/integrations/mcp/setup) for detailed installation instructions. 
+ + +## Further Reading + + + +Get started with installation and configuration + + + +Explore all available automation tools + + + +Customize your browser automation setup + + \ No newline at end of file diff --git a/packages/docs/v3/integrations/mcp/setup.mdx b/packages/docs/v3/integrations/mcp/setup.mdx new file mode 100644 index 000000000..3d8c17ea0 --- /dev/null +++ b/packages/docs/v3/integrations/mcp/setup.mdx @@ -0,0 +1,201 @@ +--- +title: "Browserbase MCP Server Setup" +sidebarTitle: "Setup" +description: "Add the Browserbase MCP Server to Claude" +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +## Quick Installation + + +One-click installation directly in Cursor with pre-configured settings + + +We support multiple transport methods for our MCP server: STDIO and SHTTP. We recommend using SHTTP with our remote hosted URL to take advantage of the server at full capacity. + +## Prerequisites + + + +Get your Browserbase API key and project ID from the [Browserbase Dashboard](https://www.browserbase.com/overview). + + +Browserbase API Key and Project ID settings + + +Then copy your API Key and Project ID directly from the input. + + + +## Installation Methods + + + + +Go to [smithery.ai](https://smithery.ai/server/@browserbasehq/mcp-browserbase) and enter your API keys and configuration to get a remote hosted URL. + +![Smithery](../../../images/mcp/smithery.jpg) + + +```json Smithery +{ + "mcpServers": { + "browserbase": { + "url": "your-smithery-url.com" + } + } +} +``` + + +When using our remote hosted server, we cover the LLM costs for Gemini, the [best performing model](https://www.stagehand.dev/evals) in [Stagehand](https://www.stagehand.dev). + + + + +The easiest way to get started locally is using our NPM package. + + +If you would like to use a different model, you have to pass the model name and keys in the args. More info [here](https://docs.browserbase.com/integrations/mcp/configuration). 
+ + + + +Go into your MCP Config JSON and add the Browserbase Server: + + +```json Claude Desktop +{ + "mcpServers": { + "browserbase": { + "command": "npx", + "args": ["@browserbasehq/mcp-server-browserbase"], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key" + } + } + } +} +``` + + + + + +That's it! Reload your MCP client and Claude will be able to use Browserbase. + + + + + + +For local development or customization, you can run the server locally. + + + +```bash +# Clone the Repo +git clone https://github.com/browserbase/mcp-server-browserbase.git +cd mcp-server-browserbase + +# Install the dependencies and build the project +npm install && npm run build +``` + + + +You can run locally using either STDIO or Streamable HTTP (SHTTP). + + + +Add the following to your MCP Config JSON file: + +```json +{ + "mcpServers": { + "browserbase": { + "command": "node", + "args": ["/path/to/mcp-server-browserbase/cli.js"], + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key" + } + } + } +} +``` + + + +First, run the server: + +```bash +node cli.js --port 8931 +``` + +Then add this to your MCP Config JSON file: + +```json +{ + "mcpServers": { + "browserbase": { + "url": "http://localhost:8931/mcp", + "env": { + "BROWSERBASE_API_KEY": "your_api_key", + "BROWSERBASE_PROJECT_ID": "your_project_id", + "GEMINI_API_KEY": "your_gemini_api_key" + } + } + } +} +``` + + + + + + +Reload your MCP client and you should be good to go! + + + + + + +## Verify Installation + + + +Restart/refresh your Claude Client app and you should see the tools available by clicking the 🔨 icon. + + + +Get started using our MCP Server by asking Claude to navigate to any page and see your Browserbase Browser in action on the [dashboard](https://www.browserbase.com/sessions). 
+ + +Try asking Claude: "Navigate to google.com and take a screenshot" + + + + +## Further Reading + + + +Learn more about the MCP protocol + + + +Explore Browserbase features and capabilities + + + +Get help from our support team + + \ No newline at end of file diff --git a/packages/docs/v3/integrations/mcp/tools.mdx b/packages/docs/v3/integrations/mcp/tools.mdx new file mode 100644 index 000000000..dcd014280 --- /dev/null +++ b/packages/docs/v3/integrations/mcp/tools.mdx @@ -0,0 +1,121 @@ +--- +title: "Browserbase MCP Server Tools" +sidebarTitle: "Tools" +description: "This guide covers the specialized tools available in the Browserbase MCP server for browser automation and interaction." +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +## Overview + +The Browserbase MCP server provides comprehensive tools for browser automation and session management. These tools allow you to perform actions like navigating pages, capturing screenshots, manipulating cookies, and managing multiple browser sessions simultaneously. + +## Core Browser Automation Tools + +These are the primary tools for modern web automation using natural language commands. 
+ + +Navigate to any URL in the browser + + + The URL to navigate to + + + + +Perform an action on the web page using natural language + + + The action to perform (e.g., "click the login button", "fill form field") + + + + + +Extract all text content from the current page (filters out CSS and JavaScript) + +No input parameters required + + + Extracted text content from the current page + + + + +Observe and find actionable elements on the web page + + + Specific instruction for observation (e.g., "find the login button", "locate search form") + + + + + +Capture a PNG screenshot of the current page + +No input parameters required + + + Base64-encoded PNG data + + + + +Get the current URL of the browser page + +No input parameters required + + + Complete URL including protocol, domain, path, and any query parameters or fragments + + + +## Session Management + +Manage your browser session lifecycle with create and close operations. + + +Create or reuse a cloud browser session using Browserbase with a fully initialized Stagehand instance + + + Optional session ID to use/reuse. If not provided, a new session is created + + + + + +Close the current Browserbase session, disconnect the browser, and clean up the Stagehand instance + +No input parameters required + + + +## Resources + + + The server exposes screenshot resources through URI-based access. 
+ + example: + ``` + screenshot://screenshot-name-of-the-screenshot + ``` + + + +## Further Reading + + + +Learn more about the MCP protocol + + + +Explore Stagehand's AI-powered browser automation + + + +Get help from our support team + + \ No newline at end of file diff --git a/packages/docs/v3/integrations/playwright.mdx b/packages/docs/v3/integrations/playwright.mdx new file mode 100644 index 000000000..7a4d2f4d1 --- /dev/null +++ b/packages/docs/v3/integrations/playwright.mdx @@ -0,0 +1,207 @@ +--- +title: Playwright +description: Use Stagehand with Playwright for browser automation +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +## Overview + +Stagehand v3 can work seamlessly with Playwright, allowing you to use Playwright's `Page` objects directly with Stagehand's AI-powered methods like `act()`, `extract()`, and `observe()`. + +## Installation + +First, install both Stagehand and Playwright: + +```bash +npm install @browserbasehq/stagehand playwright-core +``` + +## Quickstart + +### Basic Setup + +Connect Playwright to Stagehand's browser instance using Chrome DevTools Protocol (CDP): + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; +import { chromium } from "playwright-core"; + +const stagehand = new Stagehand({ + env: "BROWSERBASE", // or "LOCAL" + model: "openai/gpt-5", +}); + +await stagehand.init(); + +// Connect Playwright to Stagehand's browser +const browser = await chromium.connectOverCDP({ + wsEndpoint: stagehand.connectURL(), +}); + +const pwContext = browser.contexts()[0]; +const pwPage = pwContext.pages()[0]; +``` + +### Using Playwright Pages with Stagehand + +Once connected, you can use Playwright's `Page` objects with Stagehand's AI-powered methods: + +```typescript +// Navigate using Playwright +await pwPage.goto("https://example.com"); + +// Use Stagehand's AI methods with the Playwright page +await stagehand.act("click the login button", { page: pwPage }); + +const data = await stagehand.extract( 
+ "extract the article title", + z.object({ title: z.string() }), + { page: pwPage } +); +``` + +## Multi-Page Example + +Stagehand works great with multiple Playwright pages: + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; +import { chromium } from "playwright-core"; +import { z } from "zod/v3"; + +// Initialize Stagehand +const stagehand = new Stagehand({ + env: "BROWSERBASE", + model: "openai/gpt-5", +}); + +await stagehand.init(); + +// Connect Playwright +const browser = await chromium.connectOverCDP({ + wsEndpoint: stagehand.connectURL(), +}); + +const pwContext = browser.contexts()[0]; +const pwPage1 = pwContext.pages()[0]; + +// Create a second page +const pwPage2 = await pwContext.newPage(); + +// Navigate both pages +await pwPage1.goto("https://docs.stagehand.dev/first-steps/introduction"); +await pwPage2.goto("https://docs.stagehand.dev/configuration/observability"); + +// Extract data from both pages concurrently +const [page1Data, page2Data] = await Promise.all([ + stagehand.extract( + "extract the names of the four stagehand primitives", + z.array(z.string()), + { page: pwPage1 } + ), + stagehand.extract( + "extract the list of session dashboard features", + z.array(z.string()), + { page: pwPage2 } + ), +]); + +console.log("Page 1 primitives:", page1Data); +console.log("Page 2 features:", page2Data); +``` + +## Complete Example + +Here's a full working example: + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; +import { chromium } from "playwright-core"; +import { z } from "zod/v3"; + +async function main() { + // Initialize Stagehand + const stagehand = new Stagehand({ + env: "BROWSERBASE", + model: "openai/gpt-5", + verbose: 1, + }); + + await stagehand.init(); + console.log("Stagehand initialized"); + + // Connect Playwright to Stagehand's browser + const browser = await chromium.connectOverCDP({ + wsEndpoint: stagehand.connectURL(), + }); + + const pwContext = browser.contexts()[0]; + const pwPage = 
pwContext.pages()[0]; + + // Navigate and interact + await pwPage.goto("https://example.com"); + + // Use Stagehand's AI methods + const actions = await stagehand.observe("find the main heading", { + page: pwPage, + }); + + console.log("Found actions:", actions); + + // Extract data + const heading = await stagehand.extract( + "extract the main heading text", + z.object({ heading: z.string() }), + { page: pwPage } + ); + + console.log("Heading:", heading); + + // Cleanup + await stagehand.close(); +} + +main(); +``` + +## Key Points + +- **Connect via CDP**: Use `chromium.connectOverCDP()` with `stagehand.connectURL()` as the WebSocket endpoint +- **Pass the page**: Always pass the Playwright `page` object to Stagehand methods using the `{ page }` option +- **Multi-page support**: Create multiple pages with `pwContext.newPage()` and pass them to Stagehand methods +- **Concurrent operations**: Use `Promise.all()` to run multiple Stagehand operations in parallel across different pages + +## Environment Variables + +When using Browserbase, set your credentials: + +```bash +BROWSERBASE_API_KEY=your_api_key +BROWSERBASE_PROJECT_ID=your_project_id +``` + +For OpenAI (or other providers): + +```bash +OPENAI_API_KEY=your_api_key +``` + +## Next Steps + + + + Automate entire workflows + + + Execute actions on web pages + + + Extract structured data from pages + + + Observe and find elements on pages + + diff --git a/packages/docs/v3/integrations/puppeteer.mdx b/packages/docs/v3/integrations/puppeteer.mdx new file mode 100644 index 000000000..b6c6df42b --- /dev/null +++ b/packages/docs/v3/integrations/puppeteer.mdx @@ -0,0 +1,181 @@ +--- +title: Puppeteer +description: Use Stagehand with Puppeteer for browser automation +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +## Overview + +Stagehand v3 can work seamlessly with Puppeteer, allowing you to use Puppeteer's `Page` objects directly with Stagehand's AI-powered methods like `act()`, `extract()`, and 
`observe()`. + +## Installation + +First, install both Stagehand and Puppeteer: + +```bash +npm install @browserbasehq/stagehand puppeteer-core +``` + +## Quickstart + +### Basic Setup + +Connect Puppeteer to Stagehand's browser instance: + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; +import puppeteer from "puppeteer-core"; + +const stagehand = new Stagehand({ + env: "LOCAL", // or "BROWSERBASE" + model: "openai/gpt-5", +}); + +await stagehand.init(); + +// Connect Puppeteer to Stagehand's browser +const browser = await puppeteer.connect({ + browserWSEndpoint: stagehand.connectURL(), + defaultViewport: null, +}); + +const pages = await browser.pages(); +const ppPage = pages[0]; +``` + +### Using Puppeteer Pages with Stagehand + +Once connected, you can use Puppeteer's `Page` objects with Stagehand's AI-powered methods: + +```typescript +// Navigate using Puppeteer +await ppPage.goto("https://example.com"); + +// Use Stagehand's AI methods with the Puppeteer page +await stagehand.act("click the sign in button", { page: ppPage }); + +const data = await stagehand.extract( + "extract the page title", + z.object({ title: z.string() }), + { page: ppPage } +); +``` + +## Advanced: Multi-Page Usage + +Create and manage multiple Puppeteer pages with Stagehand: + +```typescript +import { Stagehand } from "@browserbasehq/stagehand"; +import puppeteer from "puppeteer-core"; +import { z } from "zod/v3"; + +async function multiPageExample() { + const stagehand = new Stagehand({ + env: "BROWSERBASE", + model: "openai/gpt-5", + }); + + await stagehand.init(); + + // Connect Puppeteer + const browser = await puppeteer.connect({ + browserWSEndpoint: stagehand.connectURL(), + defaultViewport: null, + }); + + // Get the first page + const pages = await browser.pages(); + const ppPage1 = pages[0]; + + // Create a second page + const ppPage2 = await browser.newPage(); + + // Navigate both pages + await ppPage1.goto("https://example.com"); + await 
ppPage2.goto("https://another-site.com"); + + // Use Stagehand on different pages + await stagehand.act("click the button", { page: ppPage1 }); + + const data = await stagehand.extract( + "extract the title", + z.object({ title: z.string() }), + { page: ppPage2 } + ); + + console.log("Extracted from page 2:", data); + + await stagehand.close(); +} +``` + +## Observe + Act Pattern + +The recommended pattern for reliable automation: + +```typescript +// Step 1: Observe to find candidate actions +const actions = await stagehand.observe( + "find the submit button", + { page: ppPage } +); + +// Step 2: Execute the first action +if (actions.length > 0) { + await stagehand.act(actions[0], { page: ppPage }); +} +``` + +This pattern helps avoid problems caused by DOM changes between observation and action execution. + +## Key Points + +- **Connect via WebSocket**: Use `puppeteer.connect()` with `stagehand.connectURL()` as the `browserWSEndpoint` +- **Pass the page**: Always pass the Puppeteer `page` object to Stagehand methods using the `{ page }` option +- **Disable viewport**: Set `defaultViewport: null` to use Stagehand's viewport settings +- **Multi-page support**: Create multiple pages with `browser.newPage()` and pass them to Stagehand methods + +## Environment Variables + +When using Browserbase, set your credentials: + +```bash +BROWSERBASE_API_KEY=your_api_key +BROWSERBASE_PROJECT_ID=your_project_id +``` + +For OpenAI (or other providers): + +```bash +OPENAI_API_KEY=your_api_key +``` + +## Comparison: Stagehand Native vs Puppeteer + +| Feature | Stagehand Native | With Puppeteer | +|---------|------------------|----------------| +| **Setup** | Simple - use `stagehand.context.pages()` | Requires `puppeteer.connect()` | +| **Page Access** | `stagehand.context.pages()[0]` | `await browser.pages()` | +| **AI Methods** | `stagehand.act("click")` | `stagehand.act("click", { page: ppPage })` | +| **Best For** | Pure Stagehand workflows | Existing Puppeteer codebases | + +## Next Steps + + + 
+ Automate entire workflows + + + Execute actions on web pages + + + Extract structured data from pages + + + Observe and find elements on pages + + diff --git a/packages/docs/v3/integrations/selenium.mdx b/packages/docs/v3/integrations/selenium.mdx new file mode 100644 index 000000000..528321485 --- /dev/null +++ b/packages/docs/v3/integrations/selenium.mdx @@ -0,0 +1,132 @@ +--- +title: Selenium +description: Use Stagehand with Selenium to operate the same browser in tandem +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + +## Overview + +Stagehand v3 can work alongside Selenium WebDriver, allowing both tools to operate on the same browser session simultaneously. This enables you to combine Stagehand's AI-powered automation with Selenium's precise element interactions. + + +**Browserbase Only**: This integration requires Browserbase. It does not work with `env: "LOCAL"` because Selenium needs a remote WebDriver endpoint. + + +## Installation + +Install Stagehand, Selenium, and the Browserbase SDK: + +```bash +npm install @browserbasehq/stagehand selenium-webdriver @browserbasehq/sdk +``` + +## Quickstart + +### Create Shared Session + +Use the Browserbase SDK to create a session that both tools can connect to: + +```typescript +import http from "http"; +import { Builder, Key } from "selenium-webdriver"; +import Browserbase from "@browserbasehq/sdk"; +import { Stagehand } from "@browserbasehq/stagehand"; + +const bb = new Browserbase({ + apiKey: process.env.BROWSERBASE_API_KEY, +}); + +// Create shared session +const session = await bb.sessions.create({ + projectId: process.env.BROWSERBASE_PROJECT_ID, +}); + +console.log("Session created:", session.id); +``` + +### Connect Stagehand + +Initialize Stagehand with the session ID: + +```typescript +const stagehand = new Stagehand({ + env: "BROWSERBASE", + browserbaseSessionID: session.id, + model: "openai/gpt-5", + verbose: 2, +}); + +await stagehand.init(); +``` + +### Connect Selenium + +Use a custom 
HTTP agent with the session's signing key: + +```typescript +// Create custom HTTP agent with signing key +const customHttpAgent = new http.Agent({}); +(customHttpAgent as any).addRequest = (req: any, options: any) => { + req.setHeader("x-bb-signing-key", session.signingKey); + (http.Agent.prototype as any).addRequest.call(customHttpAgent, req, options); +}; + +// Connect Selenium WebDriver +const driver = new Builder() + .forBrowser("chrome") + .usingHttpAgent(customHttpAgent) + .usingServer(session.seleniumRemoteUrl) + .build(); +``` + +### Use Both Tools Together + +Now both Stagehand and Selenium operate on the same browser: + +```typescript +// Navigate with Stagehand +const page = stagehand.context.pages()[0]; +await page.goto("https://www.google.com"); + +// Extract page content with Stagehand AI +const pageContent = await stagehand.extract(); +console.log("Page content:", pageContent); + +// Use Selenium for precise element interaction +const searchBox = await driver.findElement({ name: "q" }); +await searchBox.sendKeys("Browserbase automation"); +await searchBox.sendKeys(Key.RETURN); + +// Wait for results +await driver.sleep(2000); + +console.log("Search completed!"); +``` + +## Key Points + +- **Shared Session**: Both tools connect to the same Browserbase session +- **Signing Key**: Selenium requires the session's `signingKey` in HTTP headers +- **Remote URL**: Use `session.seleniumRemoteUrl` for Selenium's server endpoint +- **Concurrent Usage**: Both tools can operate on the browser simultaneously +- **Cleanup**: Close both Stagehand (`await stagehand.close()`) and Selenium (`await driver.quit()`) + +## Next Steps + + + + Automate entire workflows + + + Execute actions on web pages + + + Extract structured data from pages + + + Observe and find elements on pages + + diff --git a/packages/docs/v3/integrations/vercel/configuration.mdx b/packages/docs/v3/integrations/vercel/configuration.mdx new file mode 100644 index 000000000..d292a9c04 --- /dev/null 
+++ b/packages/docs/v3/integrations/vercel/configuration.mdx @@ -0,0 +1,247 @@ +--- +title: Use Stagehand in Next.js +sidebarTitle: Configuration +description: Next.js is a popular framework for developing web-based applications in production. It powers Stagehand apps like [Director](https://director.ai), [Brainrot](https://brainrot.run) and [Open Operator](https://operator.browserbase.com). +--- +import { V3Banner } from '/snippets/v3-banner.mdx'; + + + + + + Clone our [GitHub repo](https://github.com/browserbase/stagehand-nextjs-quickstart) to get started with Stagehand (v2) in Next.js. + + +## Add Stagehand to an existing Next.js project +If you'd like to start from scratch, you can run: + + + +```bash +npm create next-app@latest stagehand-nextjs --yes +cd stagehand-nextjs +``` + + +```bash +pnpm create next-app@latest stagehand-nextjs --yes +cd stagehand-nextjs +``` + + +```bash +yarn create next-app@latest stagehand-nextjs --yes +cd stagehand-nextjs +``` + + + +If you'd like to add Stagehand to an existing Next.js project, you can do so by installing the dependencies: + + + ```bash + npm install @browserbasehq/stagehand @browserbasehq/sdk playwright zod + ``` + + + + ```bash + pnpm add @browserbasehq/stagehand @browserbasehq/sdk playwright zod + ``` + + + + ```bash + yarn add @browserbasehq/stagehand @browserbasehq/sdk playwright zod + ``` + + + +### Add environment variables +Next, let's add the environment variables to a `.env` file. +```env +BROWSERBASE_API_KEY=your-browserbase-api-key +BROWSERBASE_PROJECT_ID=your-browserbase-project-id +OPENAI_API_KEY=your-openai-api-key +``` + +### Write a server action +Next, let's define our `main` function as a server action in `app/stagehand/main.ts`. This file will have the following three functions: + +1. **`main`: Run the main Stagehand script** +2. **`runStagehand`: Initialize and run the `main` function** +3. 
**`startBBSSession`: Start a Browserbase session** + +```ts app/stagehand/main.ts +// 🤘 Welcome to Stagehand! +// This file is from the [Stagehand docs](https://docs.stagehand.dev/sections/examples/nextjs). + +"use server"; + +import { Stagehand } from "@browserbasehq/stagehand"; +import { z } from "zod/v3"; +import { Browserbase } from "@browserbasehq/sdk"; + +/** + * Run the main Stagehand script + */ +async function main(stagehand: Stagehand) { + // You can use the `page` instance to write any Playwright code + // For more info: https://playwright.dev/docs/pom + const page = stagehand.context.activePage(); + + // In this example, we'll get the title of the Stagehand quickstart page + await page?.goto("https://docs.stagehand.dev/"); + await stagehand.act("click the quickstart link"); + const { title } = await stagehand.extract( + "extract the main heading of the page", + z.object({ + title: z.string(), + }), + ); + + return title; +} + +/** + * Initialize and run the main() function + */ +export async function runStagehand(sessionId?: string) { + const stagehand = new Stagehand({ + env: "BROWSERBASE", + apiKey: process.env.BROWSERBASE_API_KEY, + projectId: process.env.BROWSERBASE_PROJECT_ID, + verbose: 1, + logger: console.log, + browserbaseSessionID: sessionId, + disablePino: true, + }); + await stagehand.init(); + const result = await main(stagehand); + console.log(result); + await stagehand.close(); +} + +/** + * Start a Browserbase session + */ +export async function startBBSSession() { + const browserbase = new Browserbase(); + const session = await browserbase.sessions.create({ + projectId: process.env.BROWSERBASE_PROJECT_ID!, + }); + const debugUrl = await browserbase.sessions.debug(session.id); + return { + sessionId: session.id, + debugUrl: debugUrl.debuggerFullscreenUrl, + }; +} +``` + +### Create a client component +Next, let's create a client component that will start a Browserbase session and run the `main` function with the server actions we just 
defined. We'll first create a Browserbase session and embed the session in an iframe before running the `main` function. + +```tsx app/components/stagehandEmbed.tsx +"use client"; + +import { useCallback, useState } from "react"; +import { runStagehand, startBBSSession } from "@/app/stagehand/main"; + +export function StagehandEmbed() { + const [sessionId, setSessionId] = useState(null); + const [debugUrl, setDebugUrl] = useState(null); + + const startSession = useCallback(async () => { + const { sessionId, debugUrl } = await startBBSSession(); + setSessionId(sessionId); + setDebugUrl(debugUrl); + await runStagehand(sessionId); + }, []); + + return ( +
+ {!sessionId && } + {sessionId && debugUrl && ( +