Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,11 @@ jobs:
- run:
name: Verify all files are compressed
command: ./bin/assert-compressed.sh
- run:
name: Test content negotiation for markdown
command: |
export PATH="$PWD/bin:$PWD/buildpack/build/.heroku-buildpack-nginx/ruby/bin:$PATH"
./bin/assert-content-negotiation.sh
- run:
name: Test content request auth tokens
command: |
Expand Down
162 changes: 162 additions & 0 deletions bin/assert-content-negotiation.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
#!/bin/bash

# Content Negotiation Test Suite
# Verifies that nginx serves markdown or HTML based on Accept header

# Load shared helpers from the directory this script lives in.
# NOTE(review): start_nginx/stop_nginx are used by this script but not defined
# in it; presumably they are provided by nginx-utils.sh — confirm.
source "$(dirname "$0")/nginx-utils.sh"
# Always run stop_nginx when the script exits, whether tests pass or fail.
trap stop_nginx EXIT

# Strict mode is switched on only after the helpers have been sourced, so the
# sourcing itself is not subject to -e/-u/pipefail.
set -euo pipefail

# Disable auth for content negotiation tests
# NOTE(review): presumably read when the nginx config is rendered — confirm.
export ENABLE_BASIC_AUTH=false
export CONTENT_REQUEST_AUTH_TOKENS=""

# Set default port if not already set
# (run_test builds its request URLs as http://localhost:${PORT}<path>)
export PORT=${PORT:-3001}

# Test helper function
# Sends one request to the local nginx (http://localhost:$PORT) with
# "X-Forwarded-Proto: https" set, then asserts on the HTTP status and,
# optionally, on the format of the response body.
#
# Parameters:
#   $1: path            - URL path to test
#   $2: accept_header   - Accept header value (empty string for default)
#   $3: expected_status - Expected HTTP status code
#   $4: expected_format - "html", "markdown", or "any"
#   $5: test_name       - Human-readable test description
#   $6: user_agent      - Optional User-Agent string
# Globals:
#   PORT (read)
# Outputs:
#   Progress and result lines on stdout.
# Exits:
#   Terminates the whole script with status 1 on the first failed assertion
#   or when curl cannot complete the request.
run_test() {
  local path="$1"
  local accept_header="$2"
  local expected_status="$3"
  local expected_format="$4"
  local test_name="$5"
  local user_agent="${6:-}"

  echo "🧪 $test_name"

  # Build the curl invocation as an array rather than a string passed to eval,
  # so header and User-Agent values containing quotes, '$' or backticks reach
  # curl verbatim instead of being re-parsed by the shell.
  local -a curl_args=(--silent --header "X-Forwarded-Proto: https")

  if [ -n "$user_agent" ]; then
    curl_args+=(--user-agent "$user_agent")
  fi

  if [ -n "$accept_header" ]; then
    curl_args+=(--header "Accept: $accept_header")
  fi

  # Append metadata after the body. The status code goes last because it is
  # never empty: command substitution strips trailing newlines, so an empty
  # final field (e.g. a response without Content-Type) would shift the parse.
  curl_args+=(--write-out '\n%{content_type}\n%{http_code}')
  curl_args+=("http://localhost:${PORT}${path}")

  # Execute request and capture response + metadata. curl runs with --silent,
  # so report transport failures explicitly instead of dying without output.
  local response
  if ! response=$(curl "${curl_args[@]}"); then
    echo " ❌ Request to $path failed (curl could not complete the request)"
    exit 1
  fi

  # Parse response components with parameter expansion (no subprocesses, so
  # no SIGPIPE/pipefail hazards on large bodies).
  local newline=$'\n'
  local status="${response##*"$newline"}"
  local without_status="${response%"$newline"*}"
  local content_type="${without_status##*"$newline"}"
  local body="${without_status%"$newline"*}"

  # Assert status code
  if [ "$status" != "$expected_status" ]; then
    echo " ❌ Expected status $expected_status, got $status"
    exit 1
  fi

  # Verify content format
  if [ "$expected_format" = "markdown" ]; then
    # Check for markdown heading (first line should start with #)
    local first_line="${body%%"$newline"*}"
    if [[ "$first_line" != "#"* ]]; then
      echo " ❌ Expected markdown (starting with #), got: ${first_line:0:50}"
      exit 1
    fi

    # Verify Content-Type header (warning only, not fatal)
    if [[ "$content_type" != *"text/markdown"* ]]; then
      echo " ⚠️ Warning: Content-Type is '$content_type', expected 'text/markdown'"
    fi
  elif [ "$expected_format" = "html" ]; then
    # Check for the HTML doctype anywhere in the body
    if [[ "$body" != *"<!DOCTYPE html>"* ]]; then
      echo " ❌ Expected HTML (with DOCTYPE), but not found"
      exit 1
    fi
  fi
  # "any" format means we don't validate content

  echo " ✅ Passed (status: $status, format: $expected_format)"
}

# Main test suite
# Starts nginx, runs every content-negotiation case through run_test, and
# prints a summary. run_test exits the script on the first failure, so the
# summary is only reached when every case has passed.

# Number of cases that have passed so far. Counted at runtime so the summary
# cannot drift from the actual number of cases when tests are added or removed.
tests_passed=0

# Runs a single case via run_test (same arguments) and records the pass.
check() {
  run_test "$@"
  tests_passed=$((tests_passed + 1))
}

echo "================================"
echo "Content Negotiation Test Suite"
echo "================================"
echo

start_nginx

# Group 1: Basic Content Negotiation
echo "Group 1: Basic Content Negotiation"
echo "-----------------------------------"
check "/docs/channels" "" "200" "html" "Default serves HTML"
check "/docs/channels" "text/markdown" "200" "markdown" "Accept: text/markdown"
check "/docs/channels" "application/markdown" "200" "markdown" "Accept: application/markdown"
check "/docs/channels" "text/plain" "200" "markdown" "Accept: text/plain"
check "/docs/channels" "text/html" "200" "html" "Accept: text/html"
check "/docs/channels" "*/*" "200" "html" "Accept: */*"
echo

# Group 2: Browser Behavior
echo "Group 2: Browser Behavior"
echo "-------------------------"
check "/docs/channels" "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" "200" "html" "Browser Accept header"
check "/docs/channels" "text/html, text/markdown" "200" "html" "HTML prioritized when first"
echo

# Group 3: Direct Access
echo "Group 3: Direct Access"
echo "----------------------"
check "/docs/channels.md" "" "200" "markdown" "Direct .md access"
check "/docs/channels/index.html" "" "200" "html" "Direct index.html access"
echo

# Group 4: Path Variations
echo "Group 4: Path Variations"
echo "------------------------"
check "/docs/chat/connect" "text/markdown" "200" "markdown" "Non-index path"
check "/docs/api/realtime-sdk" "text/markdown" "200" "markdown" "Nested index path"
check "/docs/basics" "text/markdown" "200" "markdown" "Simple path"
echo

# Group 5: Edge Cases
echo "Group 5: Edge Cases"
echo "-------------------"
check "/docs/nonexistent" "" "404" "any" "404 when path missing"
check "/docs/nonexistent" "text/markdown" "404" "any" "404 with markdown Accept"
check "/llms.txt" "" "200" "any" "Non-docs paths unaffected"
echo

# Group 6: Bot Detection (User-Agent)
echo "Group 6: Bot Detection (User-Agent)"
echo "------------------------------------"
check "/docs/channels" "" "200" "markdown" "Claude-User bot gets markdown" "Claude-User/1.0"
check "/docs/channels" "" "200" "markdown" "ClaudeBot gets markdown" "Mozilla/5.0 (compatible; ClaudeBot/1.0)"
check "/docs/channels" "" "200" "markdown" "ChatGPT-User bot gets markdown" "ChatGPT-User"
check "/docs/channels" "" "200" "markdown" "GPTBot gets markdown" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0)"
check "/docs/channels" "" "200" "markdown" "PerplexityBot gets markdown" "PerplexityBot"
check "/docs/channels" "" "200" "html" "Regular browser gets HTML" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
echo

# Group 7: Combined Bot + Accept Header
echo "Group 7: Combined Bot + Accept Header"
echo "--------------------------------------"
check "/docs/channels" "text/html" "200" "markdown" "Bot overrides Accept: text/html" "Claude-User/1.0"
check "/docs/channels" "text/markdown" "200" "markdown" "Bot + markdown Accept both work" "GPTBot/1.0"
echo

echo "================================"
echo "✅ All ${tests_passed} tests passed!"
echo "================================"

# Exit explicitly with success
exit 0
1 change: 1 addition & 0 deletions config/mime.types
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ types {

text/mathml mml;
text/plain txt;
text/markdown md markdown;
text/vnd.sun.j2me.app-descriptor jad;
text/vnd.wap.wml wml;
text/x-component htc;
Expand Down
91 changes: 85 additions & 6 deletions config/nginx.conf.erb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ http {
gzip on;
gzip_comp_level 6;
gzip_min_length 512;
gzip_types text/plain text/css application/json application/javascript text/xml application/xml application/xml+rss font/woff font/woff2 image/svg+xml;
gzip_types text/plain text/markdown text/css application/json application/javascript text/xml application/xml application/xml+rss font/woff font/woff2 image/svg+xml;
gzip_vary on;
gzip_proxied any; # Heroku router sends Via header

Expand Down Expand Up @@ -62,6 +62,85 @@ http {
<% end %>
}

##
# CONTENT NEGOTIATION FOR MARKDOWN
# Serves markdown to LLM bots and clients that request it via Accept header

# Detect LLM bots by User-Agent
# Sets $is_llm_bot to 1 when the User-Agent header matches any pattern below,
# otherwise 0. Every pattern uses the "~*" prefix (case-insensitive regex) and
# none is anchored, so a pattern matches anywhere inside the header value.
map $http_user_agent $is_llm_bot {
default 0;

# Anthropic / Claude
"~*Claude-User" 1;
"~*ClaudeBot" 1;
"~*anthropic-ai" 1;

# OpenAI / ChatGPT
"~*ChatGPT-User" 1;
"~*GPTBot" 1;

# Perplexity
"~*PerplexityBot" 1;
"~*Perplexity-User" 1;

# Google AI
# NOTE(review): "Gemini" is a broad unanchored substring; confirm it cannot
# match User-Agents that should keep receiving HTML.
"~*Google-Extended" 1;
"~*GoogleOther" 1;
"~*Gemini" 1;

# Mistral AI
"~*MistralAI-User" 1;

# Meta / Facebook
"~*Meta-ExternalAgent" 1;

# Amazon
"~*Amazonbot" 1;

# ByteDance / TikTok
"~*Bytespider" 1;
}

# Detect markdown request via Accept header
# Sets $wants_markdown_via_accept to 1 when the Accept header asks for
# markdown (or plain text), otherwise 0.
# Matching precedence (per the nginx map module): exact strings are tried
# first, then regexes in the order they are listed, then "default". The order
# of the regex entries below is therefore significant.
map $http_accept $wants_markdown_via_accept {
default 0;

# Exact markdown MIME types
# (the whole header must equal one of these values)
"text/markdown" 1;
"application/markdown" 1;
"text/plain" 1;

# Browsers explicitly want HTML (check first before wildcard patterns)
# A header that starts with text/html yields 0 even if a markdown type is
# listed later in it, because this regex precedes the markdown regexes.
"~*^text/html" 0;

# Accept header contains markdown types
# (anywhere in the header, as long as it does not start with text/html)
"~*text/markdown" 1;
"~*application/markdown" 1;

# Wildcard gets HTML
# Plain string match on a header that is exactly "*/*"; same result as the
# default, listed to make the intent explicit.
"*/*" 0;
}

# Serve markdown if bot detected OR markdown requested via Accept header
# Combines: ${is_llm_bot}${wants_markdown_via_accept} → "00", "01", "10", or "11"
# The map key is the two flags concatenated into a two-character string;
# $docs_file_extension is ".md" when either flag is 1 and ".html" otherwise.
map "${is_llm_bot}${wants_markdown_via_accept}" $docs_file_extension {
# Fallback for any value not listed below
default ".html";

# If either variable is 1, serve markdown
"10" ".md"; # Bot detected, no markdown Accept
"01" ".md"; # No bot, markdown Accept
"11" ".md"; # Both
"00" ".html"; # Neither
}

# Translate extension to file path
# $docs_try_file is the candidate path handed to try_files:
#   ".html" → <request URI>/index.html
#   ".md"   → <request URI>.md
# No default entry is defined; $docs_file_extension only ever takes the two
# values listed here.
# NOTE(review): $request_uri is the original request URI including any query
# string, so a request with arguments would produce a candidate path that
# contains "?..." — confirm this is acceptable (the existing try_files
# directives use $request_uri the same way).
map $docs_file_extension $docs_try_file {
".html" "$request_uri/index.html";
".md" "$request_uri.md";
}

# / CONTENT NEGOTIATION FOR MARKDOWN

##
# CORS CONFIGURATION

Expand Down Expand Up @@ -231,10 +310,10 @@ http {
<% if content_request_protected %>
# Serve the file if it exists, otherwise try to authenticate
# (.html requests won't match here, they'll go to the @html_auth location)
try_files $request_uri @html_auth;
try_files $request_uri $docs_try_file @html_auth;
<% else %>
# Serve the file if it exists, try index.html for paths without a trailing slash, otherwise 404
try_files $request_uri $request_uri/index.html $request_uri/ =404;
# Serve the file if it exists, try content-negotiated file, then index.html, otherwise 404
try_files $request_uri $docs_try_file $request_uri/index.html $request_uri/ =404;
<% end %>
}

Expand All @@ -252,8 +331,8 @@ http {
<% end %>
}

# If the request is authenticated, break out of the location block and serve the file
try_files $request_uri.html $request_uri/index.html $request_uri/ =404;
# If the request is authenticated, try content-negotiated file first, then fallback to HTML
try_files $docs_try_file $request_uri.html $request_uri/index.html $request_uri/ =404;
}

# Don't serve files with the .html extension here, send them to the canonical location
Expand Down
70 changes: 70 additions & 0 deletions data/onPostBuild/__fixtures__/input.mdx
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
---
title: Test Fixture
meta_description: "This is a test description"
redirect_from:
- /old-path
languages:
- javascript
some_other_field: "should be removed"
---

import Something from '../component'
import {
MultiLine,
Import
} from 'module'

export const foo = 'bar';
export default SomeComponent;

{/* This is a JSX comment */}
{/*
Multi-line JSX comment
with multiple lines
*/}

## Basic heading

## Heading with anchor <a id="test-anchor"/>

### Nested heading <a name="nested"/>

<script type="application/ld+json">
{"@context": "https://schema.org"}
</script>

Regular content here.

## Links and images

- [Internal link](/docs/channels)
- [External link](https://example.com)
- [Hash link](#test-anchor)
- ![Relative image](../../../images/content/diagrams/test.png)
- ![Absolute image](/images/content/test.png)
- ![Direct image](images/content/test.png)

## Template variables

Use {{API_KEY}} and {{RANDOM_CHANNEL_NAME}} in your code.

## Code blocks

<Code>
```javascript
const channel = realtime.channels.get('{{RANDOM_CHANNEL_NAME}}');
```
</Code>

Here's a code block with anchors and scripts that should be preserved:
```html
<a id="preserve-this"/>
<script>console.log('preserve this too')</script>
{/* preserve JSX comments in code */}
```

## JSX Components

<Aside data-type='note'>
This component should be preserved as-is.
</Aside>
Loading