From 59f4abca59de11bed24a2ad03348417ef508f55c Mon Sep 17 00:00:00 2001 From: Kenneth Kalmer Date: Wed, 3 Dec 2025 12:15:48 +0000 Subject: [PATCH 1/5] Add MDX to Markdown transpilation during build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements automatic transpilation of MDX documentation files to Markdown format during the Gatsby build process. The transpiled Markdown files are generated in the public/docs/ directory alongside HTML output, making documentation accessible to LLMs and other text-based tools. Key features: - Transpiles all MDX files under src/pages/docs/ to .md format - Removes frontmatter except title (converted to # heading) - Removes import/export statements and script tags - Replaces template variables ({{API_KEY}}, {{RANDOM_CHANNEL_NAME}}) - Preserves JSX components and code blocks as-is - Smart path mapping: index.mdx โ†’ parent.md, file.mdx โ†’ file.md ๐Ÿค– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- data/onPostBuild/index.ts | 2 + data/onPostBuild/transpileMdxToMarkdown.ts | 441 +++++++++++++++++++++ 2 files changed, 443 insertions(+) create mode 100644 data/onPostBuild/transpileMdxToMarkdown.ts diff --git a/data/onPostBuild/index.ts b/data/onPostBuild/index.ts index b5738663c9..e8bc0eaad1 100644 --- a/data/onPostBuild/index.ts +++ b/data/onPostBuild/index.ts @@ -1,5 +1,6 @@ import { GatsbyNode, Reporter } from 'gatsby'; import { onPostBuild as llmstxt } from './llmstxt'; +import { onPostBuild as transpileMdxToMarkdown } from './transpileMdxToMarkdown'; import { onPostBuild as compressAssets } from './compressAssets'; import { validateRedirectFile, REDIRECT_FILE_PATH } from '../utils/validateRedirectFile'; @@ -33,5 +34,6 @@ export const onPostBuild: GatsbyNode['onPostBuild'] = async (args) => { // Run all onPostBuild functions in sequence await llmstxt(args); + await transpileMdxToMarkdown(args); await compressAssets(args); }; diff --git 
a/data/onPostBuild/transpileMdxToMarkdown.ts b/data/onPostBuild/transpileMdxToMarkdown.ts new file mode 100644 index 0000000000..7924513073 --- /dev/null +++ b/data/onPostBuild/transpileMdxToMarkdown.ts @@ -0,0 +1,441 @@ +import { GatsbyNode } from 'gatsby'; +import * as path from 'path'; +import * as fs from 'fs-extra'; +import frontMatter from 'front-matter'; + +const REPORTER_PREFIX = 'onPostBuild:transpileMdxToMarkdown'; + +interface MdxNode { + parent: { + relativeDirectory: string; + name: string; + absolutePath: string; + }; + internal: { + contentFilePath: string; + }; +} + +interface MdxQueryResult { + site: { + siteMetadata: { + siteUrl: string; + }; + }; + allMdx: { + nodes: MdxNode[]; + }; +} + +interface FrontMatterAttributes { + title?: string; + [key: string]: any; +} + +/** + * Remove import and export statements from content + * Handles both single-line and multi-line statements + */ +function removeImportExportStatements(content: string): string { + return content + // Remove import statements (single and multi-line) + .replace(/^import\s+[\s\S]*?from\s+['"][^'"]+['"];?\s*$/gm, '') + .replace(/^import\s+['"][^'"]+['"];?\s*$/gm, '') + // Remove export statements (single and multi-line) + .replace(/^export\s+\{[\s\S]*?\}\s*;?\s*$/gm, '') + .replace(/^export\s+\{[\s\S]*?\}\s+from\s+['"][^'"]+['"];?\s*$/gm, '') + .replace(/^export\s+(default|const|let|var|function|class)\s+.*$/gm, '') + // Clean up extra blank lines left behind + .replace(/\n\n\n+/g, '\n\n'); +} + +/** + * Remove script tags that are not inside code blocks + */ +function removeScriptTags(content: string): string { + // Split content into code block and non-code-block sections + const parts: Array<{ content: string; isCodeBlock: boolean }> = []; + const fenceRegex = /```[\s\S]*?```/g; + + let lastIndex = 0; + const matches = Array.from(content.matchAll(fenceRegex)); + + for (const match of matches) { + // Add content before code block + if (match.index !== undefined && match.index > 
lastIndex) { + parts.push({ + content: content.slice(lastIndex, match.index), + isCodeBlock: false, + }); + } + // Add code block itself + parts.push({ + content: match[0], + isCodeBlock: true, + }); + lastIndex = (match.index || 0) + match[0].length; + } + + // Add remaining content after last code block + if (lastIndex < content.length) { + parts.push({ + content: content.slice(lastIndex), + isCodeBlock: false, + }); + } + + // Remove script tags only from non-code-block parts + return parts + .map((part) => { + if (part.isCodeBlock) { + return part.content; // Preserve code blocks exactly + } + // Remove script tags with any attributes and their content + return part.content.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, ''); + }) + .join(''); +} + +/** + * Remove anchor tags that are used for HTML navigation + * Removes patterns like <a id="..."/>, <a name="..."/>, etc. + * Preserves actual links with href attributes + */ +function removeAnchorTags(content: string): string { + // Split content into code block and non-code-block sections + const parts: Array<{ content: string; isCodeBlock: boolean }> = []; + const fenceRegex = /```[\s\S]*?```/g; + + let lastIndex = 0; + const matches = Array.from(content.matchAll(fenceRegex)); + + for (const match of matches) { + if (match.index !== undefined && match.index > lastIndex) { + parts.push({ + content: content.slice(lastIndex, match.index), + isCodeBlock: false, + }); + } + parts.push({ + content: match[0], + isCodeBlock: true, + }); + lastIndex = (match.index || 0) + match[0].length; + } + + if (lastIndex < content.length) { + parts.push({ + content: content.slice(lastIndex), + isCodeBlock: false, + }); + } + + // Remove anchor tags only from non-code-block parts + return parts + .map((part) => { + if (part.isCodeBlock) { + return part.content; // Preserve code blocks exactly + } + + // Remove anchor tags from regular content + return part.content + .replace(/<a\s+id="[^"]*"\s*\/>/gi, '') + .replace(/<a\s+name="[^"]*"\s*\/>/gi, '') + .replace(/<a\s+id="[^"]*"\s*><\/a>/gi, '') + .replace(/<a\s+name="[^"]*"\s*><\/a>/gi, ''); + }) + 
.join(''); +} + +/** + * Remove JSX comments from content + * Removes patterns like: {slash-star comment star-slash} + * Preserves JSX comments in code blocks + */ +function removeJsxComments(content: string): string { + // Split content into code block and non-code-block sections + const parts: Array<{ content: string; isCodeBlock: boolean }> = []; + const fenceRegex = /```[\s\S]*?```/g; + + let lastIndex = 0; + const matches = Array.from(content.matchAll(fenceRegex)); + + for (const match of matches) { + if (match.index !== undefined && match.index > lastIndex) { + parts.push({ + content: content.slice(lastIndex, match.index), + isCodeBlock: false, + }); + } + parts.push({ + content: match[0], + isCodeBlock: true, + }); + lastIndex = (match.index || 0) + match[0].length; + } + + if (lastIndex < content.length) { + parts.push({ + content: content.slice(lastIndex), + isCodeBlock: false, + }); + } + + // Remove JSX comments only from non-code-block parts + return parts + .map((part) => { + if (part.isCodeBlock) { + return part.content; // Preserve code blocks exactly + } + // Remove JSX comments from regular content + return part.content.replace(/\{\/\*[\s\S]*?\*\/\}/g, ''); + }) + .join(''); +} + +/** + * Convert image paths to GitHub raw URLs + * Handles relative (../), absolute (/images/), and direct (images/) paths + */ +function convertImagePathsToGitHub(content: string): string { + const githubBaseUrl = 'https://raw.githubusercontent.com/ably/docs/main/src'; + + return content + // Handle relative paths: ../../../images/... + .replace( + /!\[([^\]]*)\]\(((?:\.\.\/)+)(images\/[^)]+)\)/g, + (match, altText, relativePath, imagePath) => { + return `![${altText}](${githubBaseUrl}/${imagePath})`; + } + ) + // Handle absolute paths: /images/... + .replace( + /!\[([^\]]*)\]\(\/(images\/[^)]+)\)/g, + (match, altText, imagePath) => { + return `![${altText}](${githubBaseUrl}/${imagePath})`; + } + ) + // Handle direct paths: images/... 
(no prefix) + .replace( + /!\[([^\]]*)\]\((images\/[^)]+)\)/g, + (match, altText, imagePath) => { + return `![${altText}](${githubBaseUrl}/${imagePath})`; + } + ); +} + +/** + * Convert relative URLs to absolute URLs using the main website domain + * Converts: [text](/docs/channels) โ†’ [text](https://ably.com/docs/channels) + * Preserves: External URLs (http://, https://), hash-only links (#anchor) + */ +function convertRelativeUrls(content: string, siteUrl: string): string { + const baseUrl = siteUrl.replace(/\/$/, ''); // Remove trailing slash + + // Match markdown links: [text](url) + // Only convert URLs that start with / (relative) and are not external URLs or hash-only + return content.replace( + /\[([^\]]+)\]\(([^)]+)\)/g, + (match, linkText, url) => { + // Don't convert external URLs + if (url.startsWith('http://') || url.startsWith('https://')) { + return match; + } + + // Don't convert hash-only anchors + if (url.startsWith('#')) { + return match; + } + + // Convert relative URLs (starting with /) + if (url.startsWith('/')) { + return `[${linkText}](${baseUrl}${url})`; + } + + // Keep other URLs as-is (relative paths without leading /) + return match; + } + ); +} + +/** + * Replace template variables with readable placeholders + */ +function replaceTemplateVariables(content: string): string { + return content + .replace(/{{API_KEY}}/g, 'your-api-key') + .replace(/{{RANDOM_CHANNEL_NAME}}/g, 'your-channel-name'); +} + +/** + * Calculate the output path for a markdown file based on its source location + */ +function calculateOutputPath(relativeDirectory: string, fileName: string): string { + // Remove 'docs' or 'docs/' prefix: "docs/channels" โ†’ "channels", "docs" โ†’ "" + const pathWithoutDocs = relativeDirectory.replace(/^docs\/?/, ''); + const pathParts = pathWithoutDocs.split('/').filter((p) => p); + + if (fileName === 'index') { + // Special case: top-level docs/index.mdx โ†’ public/docs.md + if (pathParts.length === 0) { + return 
path.join(process.cwd(), 'public', 'docs.md'); + } + + // index.mdx: use parent directory name + // docs/channels/index.mdx โ†’ public/docs/channels.md + const dirName = pathParts.pop(); // Remove and get last element + return path.join(process.cwd(), 'public', 'docs', ...pathParts, `${dirName}.md`); + } else { + // Regular file: use filename + // docs/chat/connect.mdx โ†’ public/docs/chat/connect.md + return path.join(process.cwd(), 'public', 'docs', ...pathParts, `${fileName}.md`); + } +} + +/** + * Transform MDX content to clean Markdown + */ +function transformMdxToMarkdown(sourceContent: string, siteUrl: string): { content: string; title: string } { + // Stage 1: Parse frontmatter + const parsed = frontMatter(sourceContent); + + if (!parsed.attributes.title) { + throw new Error('Missing title in frontmatter'); + } + + const title = parsed.attributes.title; + let content = parsed.body; + + // Stage 2: Remove import/export statements + content = removeImportExportStatements(content); + + // Stage 3: Remove script tags (not in code blocks) + content = removeScriptTags(content); + + // Stage 4: Remove anchor tags + content = removeAnchorTags(content); + + // Stage 5: Remove JSX comments + content = removeJsxComments(content); + + // Stage 6: Convert image paths to GitHub URLs + content = convertImagePathsToGitHub(content); + + // Stage 7: Convert relative URLs to absolute URLs + content = convertRelativeUrls(content, siteUrl); + + // Stage 8: Replace template variables + content = replaceTemplateVariables(content); + + // Stage 9: Prepend title as markdown heading + const finalContent = `# ${title}\n\n${content}`; + + return { content: finalContent, title }; +} + +/** + * Process a single MDX file + */ +async function processFile(node: MdxNode, siteUrl: string, reporter: any): Promise { + const sourcePath = node.internal.contentFilePath; + const relativeDirectory = node.parent.relativeDirectory; + const fileName = node.parent.name; + + // Read source MDX file + 
const sourceContent = await fs.readFile(sourcePath, 'utf-8'); + + // Transform MDX to Markdown + const { content } = transformMdxToMarkdown(sourceContent, siteUrl); + + // Calculate output path + const outputPath = calculateOutputPath(relativeDirectory, fileName); + + // Ensure output directory exists + await fs.ensureDir(path.dirname(outputPath)); + + // Write markdown file + await fs.writeFile(outputPath, content, 'utf-8'); + + reporter.verbose(`${REPORTER_PREFIX} Transpiled: ${sourcePath} -> ${outputPath}`); +} + +/** + * Main onPostBuild function + */ +export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter }) => { + const query = ` + query { + site { + siteMetadata { + siteUrl + } + } + allMdx { + nodes { + parent { + ... on File { + relativeDirectory + name + absolutePath + } + } + internal { + contentFilePath + } + } + } + } + `; + + const { data, errors } = await graphql(query); + + if (errors) { + reporter.panicOnBuild( + `${REPORTER_PREFIX} Error running GraphQL query: ${JSON.stringify(errors)}` + ); + return; + } + + if (!data) { + reporter.warn(`${REPORTER_PREFIX} No MDX nodes found`); + return; + } + + // Get siteUrl from GraphQL + const siteUrl = data.site.siteMetadata.siteUrl; + + // Filter to only docs directory + const mdxNodes = data.allMdx.nodes.filter((node) => { + return node.parent.relativeDirectory.startsWith('docs'); + }); + + reporter.info(`${REPORTER_PREFIX} Found ${mdxNodes.length} MDX files to transpile`); + + let successCount = 0; + let failureCount = 0; + + // Process each file + for (const node of mdxNodes) { + try { + await processFile(node, siteUrl, reporter); + successCount++; + } catch (error) { + const errorMessage = error instanceof Error ? 
error.message : String(error); + reporter.warn( + `${REPORTER_PREFIX} Failed to transpile ${node.internal.contentFilePath}: ${errorMessage}` + ); + failureCount++; + } + } + + // Report summary + if (failureCount > 0) { + reporter.warn( + `${REPORTER_PREFIX} Transpiled ${successCount} files, ${failureCount} failed` + ); + } else { + reporter.info( + `${REPORTER_PREFIX} Successfully transpiled ${successCount} MDX files to Markdown` + ); + } +}; From 6b8e86b3d2e825ad5838063d588cbadf3f7b6fc3 Mon Sep 17 00:00:00 2001 From: Kenneth Kalmer Date: Fri, 5 Dec 2025 17:25:57 +0000 Subject: [PATCH 2/5] Add tests for MDX to Markdown transpilation --- data/onPostBuild/__fixtures__/input.mdx | 70 +++++ .../transpileMdxToMarkdown.test.ts.snap | 55 ++++ .../transpileMdxToMarkdown.test.ts | 284 ++++++++++++++++++ data/onPostBuild/transpileMdxToMarkdown.ts | 64 +++- 4 files changed, 458 insertions(+), 15 deletions(-) create mode 100644 data/onPostBuild/__fixtures__/input.mdx create mode 100644 data/onPostBuild/__snapshots__/transpileMdxToMarkdown.test.ts.snap create mode 100644 data/onPostBuild/transpileMdxToMarkdown.test.ts diff --git a/data/onPostBuild/__fixtures__/input.mdx b/data/onPostBuild/__fixtures__/input.mdx new file mode 100644 index 0000000000..46fda6209b --- /dev/null +++ b/data/onPostBuild/__fixtures__/input.mdx @@ -0,0 +1,70 @@ +--- +title: Test Fixture +meta_description: "This is a test description" +redirect_from: + - /old-path +languages: + - javascript +some_other_field: "should be removed" +--- + +import Something from '../component' +import { + MultiLine, + Import +} from 'module' + +export const foo = 'bar'; +export default SomeComponent; + +{/* This is a JSX comment */} +{/* + Multi-line JSX comment + with multiple lines +*/} + +## Basic heading + +## Heading with anchor + +### Nested heading + + + +Regular content here. 
+ +## Links and images + +- [Internal link](/docs/channels) +- [External link](https://example.com) +- [Hash link](#test-anchor) +- ![Relative image](../../../images/content/diagrams/test.png) +- ![Absolute image](/images/content/test.png) +- ![Direct image](images/content/test.png) + +## Template variables + +Use {{API_KEY}} and {{RANDOM_CHANNEL_NAME}} in your code. + +## Code blocks + + +```javascript +const channel = realtime.channels.get('{{RANDOM_CHANNEL_NAME}}'); +``` + + +Here's a code block with anchors and scripts that should be preserved: +```html + + +{/* preserve JSX comments in code */} +``` + +## JSX Components + + diff --git a/data/onPostBuild/__snapshots__/transpileMdxToMarkdown.test.ts.snap b/data/onPostBuild/__snapshots__/transpileMdxToMarkdown.test.ts.snap new file mode 100644 index 0000000000..43dd0ac5bc --- /dev/null +++ b/data/onPostBuild/__snapshots__/transpileMdxToMarkdown.test.ts.snap @@ -0,0 +1,55 @@ +// Jest Snapshot v1, https://goo.gl/fbAQLP + +exports[`MDX to Markdown Transpilation Full transformation with fixture should transform comprehensive fixture correctly 1`] = ` +"# Test Fixture + + + + + + +## Basic heading + +## Heading with anchor + +### Nested heading + + + +Regular content here. + +## Links and images + +- [Internal link](http://localhost:3000/docs/channels) +- [External link](https://example.com) +- [Hash link](#test-anchor) +- ![Relative image](https://raw.githubusercontent.com/ably/docs/main/src/images/content/diagrams/test.png) +- ![Absolute image](https://raw.githubusercontent.com/ably/docs/main/src/images/content/test.png) +- ![Direct image](https://raw.githubusercontent.com/ably/docs/main/src/images/content/test.png) + +## Template variables + +Use your-api-key and your-channel-name in your code. 
+ +## Code blocks + + +\`\`\`javascript +const channel = realtime.channels.get('your-channel-name'); +\`\`\` + + +Here's a code block with anchors and scripts that should be preserved: +\`\`\`html + + +{/* preserve JSX comments in code */} +\`\`\` + +## JSX Components + + +" +`; diff --git a/data/onPostBuild/transpileMdxToMarkdown.test.ts b/data/onPostBuild/transpileMdxToMarkdown.test.ts new file mode 100644 index 0000000000..3648d593be --- /dev/null +++ b/data/onPostBuild/transpileMdxToMarkdown.test.ts @@ -0,0 +1,284 @@ +import { + transformMdxToMarkdown, + removeImportExportStatements, + removeScriptTags, + removeAnchorTags, + removeJsxComments, + convertImagePathsToGitHub, + convertRelativeUrls, + replaceTemplateVariables, + calculateOutputPath, +} from './transpileMdxToMarkdown'; +import * as fs from 'fs'; +import * as path from 'path'; + +describe('MDX to Markdown Transpilation', () => { + const siteUrl = 'http://localhost:3000'; + + describe('Full transformation with fixture', () => { + it('should transform comprehensive fixture correctly', () => { + const inputPath = path.join(__dirname, '__fixtures__', 'input.mdx'); + const input = fs.readFileSync(inputPath, 'utf-8'); + + const { content, title } = transformMdxToMarkdown(input, siteUrl); + + expect(title).toBe('Test Fixture'); + expect(content).toMatchSnapshot(); + }); + + it('should throw error when title is missing', () => { + const input = `--- +meta_description: "Test" +--- + +Content without title`; + + expect(() => { + transformMdxToMarkdown(input, siteUrl); + }).toThrow('Missing title in frontmatter'); + }); + }); + + describe('removeImportExportStatements', () => { + it('should remove single-line imports', () => { + const input = `import Foo from 'bar'\n\nContent here`; + const output = removeImportExportStatements(input); + expect(output).not.toContain('import'); + expect(output).toContain('Content here'); + }); + + it('should remove multi-line imports', () => { + const input = `import {\n Foo,\n 
Bar\n} from 'module';\n\nContent here`; + const output = removeImportExportStatements(input); + expect(output).not.toContain('import'); + expect(output).not.toContain('from'); + expect(output).toContain('Content here'); + }); + + it('should remove export default statements', () => { + const input = `export default SomeComponent;\n\nContent here`; + const output = removeImportExportStatements(input); + expect(output).not.toContain('export'); + expect(output).toContain('Content here'); + }); + + it('should remove export const statements', () => { + const input = `export const foo = 'bar';\n\nContent here`; + const output = removeImportExportStatements(input); + expect(output).not.toContain('export'); + expect(output).toContain('Content here'); + }); + + it('should remove multi-line export functions', () => { + const input = `export function foo() {\n return 'bar';\n}\n\nContent here`; + const output = removeImportExportStatements(input); + expect(output).not.toContain('export'); + expect(output).not.toContain('function foo'); + expect(output).toContain('Content here'); + }); + + it('should remove multi-line export classes', () => { + const input = `export class Foo {\n bar() {}\n}\n\nContent here`; + const output = removeImportExportStatements(input); + expect(output).not.toContain('export'); + expect(output).not.toContain('class Foo'); + expect(output).toContain('Content here'); + }); + }); + + describe('removeScriptTags', () => { + it('should remove script tags outside code blocks', () => { + const input = `Text before\n\nText after`; + const output = removeScriptTags(input); + expect(output).not.toContain('\nContent`; + const output = removeScriptTags(input); + expect(output).not.toContain('\n```'; + const output = removeScriptTags(input); + expect(output).toContain('\n```\nAfter'; + const output = removeScriptTags(input); + expect(output).toContain(''); + }); + }); + + describe('removeAnchorTags', () => { + it('should remove self-closing anchor tags', () => { + 
const input = '## Heading '; + const output = removeAnchorTags(input); + expect(output).toBe('## Heading '); + }); + + it('should remove anchor tags with name attribute', () => { + const input = '## Heading '; + const output = removeAnchorTags(input); + expect(output).toBe('## Heading '); + }); + + it('should remove empty anchor tags', () => { + const input = '## Heading '; + const output = removeAnchorTags(input); + expect(output).toBe('## Heading '); + }); + + it('should preserve anchor tags in code blocks', () => { + const input = '```html\n\n```'; + const output = removeAnchorTags(input); + expect(output).toContain(''); + }); + + it('should preserve link anchors with href', () => { + const input = '[Link text](http://example.com)'; + const output = removeAnchorTags(input); + expect(output).toBe('[Link text](http://example.com)'); + }); + }); + + describe('removeJsxComments', () => { + it('should remove single-line JSX comments', () => { + const input = 'Text {/* comment */} more text'; + const output = removeJsxComments(input); + expect(output).not.toContain('{/*'); + expect(output).not.toContain('*/}'); + expect(output).toContain('Text'); + expect(output).toContain('more text'); + }); + + it('should remove multi-line JSX comments', () => { + const input = 'Text {/*\n multi\n line\n*/} more'; + const output = removeJsxComments(input); + expect(output).not.toContain('{/*'); + expect(output).toContain('Text'); + expect(output).toContain('more'); + }); + + it('should preserve JSX comments in code blocks', () => { + const input = '```jsx\n{/* code comment */}\n```'; + const output = removeJsxComments(input); + expect(output).toContain('{/* code comment */}'); + }); + }); + + describe('convertImagePathsToGitHub', () => { + const githubBase = 'https://raw.githubusercontent.com/ably/docs/main/src'; + + it('should convert relative image paths', () => { + const input = '![Alt text](../../../images/content/diagrams/test.png)'; + const output = 
convertImagePathsToGitHub(input); + expect(output).toBe(`![Alt text](${githubBase}/images/content/diagrams/test.png)`); + }); + + it('should convert absolute image paths', () => { + const input = '![Alt text](/images/content/test.png)'; + const output = convertImagePathsToGitHub(input); + expect(output).toBe(`![Alt text](${githubBase}/images/content/test.png)`); + }); + + it('should convert direct image paths', () => { + const input = '![Alt text](images/content/test.png)'; + const output = convertImagePathsToGitHub(input); + expect(output).toBe(`![Alt text](${githubBase}/images/content/test.png)`); + }); + + it('should handle multiple images', () => { + const input = `![One](../images/a.png)\n![Two](/images/b.png)`; + const output = convertImagePathsToGitHub(input); + expect(output).toContain(`${githubBase}/images/a.png`); + expect(output).toContain(`${githubBase}/images/b.png`); + }); + }); + + describe('convertRelativeUrls', () => { + it('should convert relative URLs to absolute', () => { + const input = '[Link text](/docs/channels)'; + const output = convertRelativeUrls(input, siteUrl); + expect(output).toBe('[Link text](http://localhost:3000/docs/channels)'); + }); + + it('should preserve external URLs', () => { + const input = '[Link](https://example.com/page)'; + const output = convertRelativeUrls(input, siteUrl); + expect(output).toBe('[Link](https://example.com/page)'); + }); + + it('should preserve hash-only links', () => { + const input = '[Anchor](#section)'; + const output = convertRelativeUrls(input, siteUrl); + expect(output).toBe('[Anchor](#section)'); + }); + + it('should handle multiple links', () => { + const input = '[Internal](/docs/a) and [External](https://b.com) and [Hash](#c)'; + const output = convertRelativeUrls(input, siteUrl); + expect(output).toContain('[Internal](http://localhost:3000/docs/a)'); + expect(output).toContain('[External](https://b.com)'); + expect(output).toContain('[Hash](#c)'); + }); + }); + + 
describe('replaceTemplateVariables', () => { + it('should replace API_KEY', () => { + const input = 'Use {{API_KEY}} in your code'; + const output = replaceTemplateVariables(input); + expect(output).toBe('Use your-api-key in your code'); + }); + + it('should replace RANDOM_CHANNEL_NAME', () => { + const input = 'Channel: {{RANDOM_CHANNEL_NAME}}'; + const output = replaceTemplateVariables(input); + expect(output).toBe('Channel: your-channel-name'); + }); + + it('should replace multiple occurrences', () => { + const input = '{{API_KEY}} and {{RANDOM_CHANNEL_NAME}} and {{API_KEY}}'; + const output = replaceTemplateVariables(input); + expect(output).toBe('your-api-key and your-channel-name and your-api-key'); + }); + }); + + describe('calculateOutputPath', () => { + it('should handle index files', () => { + const output = calculateOutputPath('docs/channels', 'index'); + expect(output).toContain('public/docs/channels.md'); + expect(output).toMatch(/public\/docs\/channels\.md$/); + }); + + it('should handle non-index files', () => { + const output = calculateOutputPath('docs/chat', 'connect'); + expect(output).toContain('public/docs/chat/connect.md'); + expect(output).toMatch(/public\/docs\/chat\/connect\.md$/); + }); + + it('should handle top-level docs index', () => { + const output = calculateOutputPath('docs', 'index'); + expect(output).toContain('public/docs.md'); + expect(output).toMatch(/public\/docs\.md$/); + }); + + it('should handle nested index paths', () => { + const output = calculateOutputPath('docs/api/realtime-sdk', 'index'); + expect(output).toContain('public/docs/api/realtime-sdk.md'); + expect(output).toMatch(/public\/docs\/api\/realtime-sdk\.md$/); + }); + + it('should handle deeply nested files', () => { + const output = calculateOutputPath('docs/chat/moderation/direct', 'bodyguard'); + expect(output).toContain('public/docs/chat/moderation/direct/bodyguard.md'); + expect(output).toMatch(/public\/docs\/chat\/moderation\/direct\/bodyguard\.md$/); + }); 
+ }); +}); diff --git a/data/onPostBuild/transpileMdxToMarkdown.ts b/data/onPostBuild/transpileMdxToMarkdown.ts index 7924513073..1f69a5d0ea 100644 --- a/data/onPostBuild/transpileMdxToMarkdown.ts +++ b/data/onPostBuild/transpileMdxToMarkdown.ts @@ -37,16 +37,28 @@ interface FrontMatterAttributes { * Handles both single-line and multi-line statements */ function removeImportExportStatements(content: string): string { - return content - // Remove import statements (single and multi-line) + let result = content; + + // Remove import statements (single and multi-line) + result = result .replace(/^import\s+[\s\S]*?from\s+['"][^'"]+['"];?\s*$/gm, '') - .replace(/^import\s+['"][^'"]+['"];?\s*$/gm, '') - // Remove export statements (single and multi-line) + .replace(/^import\s+['"][^'"]+['"];?\s*$/gm, ''); + + // Remove export statements + // Handle: export { foo, bar }; (single and multi-line) + result = result .replace(/^export\s+\{[\s\S]*?\}\s*;?\s*$/gm, '') - .replace(/^export\s+\{[\s\S]*?\}\s+from\s+['"][^'"]+['"];?\s*$/gm, '') - .replace(/^export\s+(default|const|let|var|function|class)\s+.*$/gm, '') - // Clean up extra blank lines left behind - .replace(/\n\n\n+/g, '\n\n'); + .replace(/^export\s+\{[\s\S]*?\}\s+from\s+['"][^'"]+['"];?\s*$/gm, ''); + + // Handle: export default Component; or export const foo = 'bar'; + result = result.replace(/^export\s+(default|const|let|var)\s+.*$/gm, ''); + + // Handle: export function/class declarations (multi-line) + // Match from 'export function/class' until the closing brace + result = result.replace(/^export\s+(function|class)\s+\w+[\s\S]*?\n\}/gm, ''); + + // Clean up extra blank lines left behind + return result.replace(/\n\n\n+/g, '\n\n'); } /** @@ -196,28 +208,30 @@ function removeJsxComments(content: string): string { /** * Convert image paths to GitHub raw URLs * Handles relative (../), absolute (/images/), and direct (images/) paths + * Only converts paths with valid image extensions */ function 
convertImagePathsToGitHub(content: string): string { const githubBaseUrl = 'https://raw.githubusercontent.com/ably/docs/main/src'; + const imageExtensions = '(?:png|jpg|jpeg|gif|svg|webp|bmp|ico)'; return content - // Handle relative paths: ../../../images/... + // Handle relative paths: ../../../images/...{ext} .replace( - /!\[([^\]]*)\]\(((?:\.\.\/)+)(images\/[^)]+)\)/g, + new RegExp(`!\\[([^\\]]*)\\]\\(((?:\\.\\.\\/)+)(images\\/[^)]+\\.${imageExtensions})\\)`, 'gi'), (match, altText, relativePath, imagePath) => { return `![${altText}](${githubBaseUrl}/${imagePath})`; } ) - // Handle absolute paths: /images/... + // Handle absolute paths: /images/...{ext} .replace( - /!\[([^\]]*)\]\(\/(images\/[^)]+)\)/g, + new RegExp(`!\\[([^\\]]*)\\]\\(\\/(images\\/[^)]+\\.${imageExtensions})\\)`, 'gi'), (match, altText, imagePath) => { return `![${altText}](${githubBaseUrl}/${imagePath})`; } ) - // Handle direct paths: images/... (no prefix) + // Handle direct paths: images/...{ext} (no prefix) .replace( - /!\[([^\]]*)\]\((images\/[^)]+)\)/g, + new RegExp(`!\\[([^\\]]*)\\]\\((images\\/[^)]+\\.${imageExtensions})\\)`, 'gi'), (match, altText, imagePath) => { return `![${altText}](${githubBaseUrl}/${imagePath})`; } @@ -402,7 +416,14 @@ export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter } // Get siteUrl from GraphQL - const siteUrl = data.site.siteMetadata.siteUrl; + const siteUrl = data.site?.siteMetadata?.siteUrl; + + if (!siteUrl) { + reporter.panicOnBuild( + `${REPORTER_PREFIX} siteUrl is not configured in siteMetadata. 
Please check gatsby-config.ts` + ); + return; + } // Filter to only docs directory const mdxNodes = data.allMdx.nodes.filter((node) => { @@ -439,3 +460,16 @@ export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter ); } }; + +// Export functions for testing +export { + removeImportExportStatements, + removeScriptTags, + removeAnchorTags, + removeJsxComments, + convertImagePathsToGitHub, + convertRelativeUrls, + replaceTemplateVariables, + calculateOutputPath, + transformMdxToMarkdown, +}; From 3558d18d3520977d1e8ed8be161aa26af5b76ef2 Mon Sep 17 00:00:00 2001 From: Kenneth Kalmer Date: Wed, 3 Dec 2025 13:27:13 +0000 Subject: [PATCH 3/5] Add content negotiation for markdown documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements HTTP content negotiation in nginx to serve markdown versions of documentation pages based on the Accept header. This allows clients (like LLMs and text-based tools) to request markdown format while browsers continue to receive HTML by default. 
Key features: - Serves markdown for Accept: text/markdown, application/markdown, or text/plain - Maintains backward compatibility (HTML is default) - Works with existing authentication system - Supports both index and non-index file paths - No performance impact (uses nginx map blocks) Content negotiation behavior: - /docs/channels with Accept: text/markdown โ†’ serves docs/channels.md - /docs/channels with Accept: text/html โ†’ serves docs/channels/index.html - /docs/channels (browser default) โ†’ serves docs/channels/index.html - /docs/channels.md (direct access) โ†’ serves docs/channels.md Implementation: - Added text/markdown MIME type to config/mime.types - Added text/markdown to gzip_types for compression - Created map blocks to detect Accept header preferences - Updated location blocks to use content-negotiated file paths - Fallback to HTML when markdown doesn't exist ๐Ÿค– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- config/mime.types | 1 + config/nginx.conf.erb | 42 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/config/mime.types b/config/mime.types index 2961256950..7baa544b0f 100644 --- a/config/mime.types +++ b/config/mime.types @@ -11,6 +11,7 @@ types { text/mathml mml; text/plain txt; + text/markdown md markdown; text/vnd.sun.j2me.app-descriptor jad; text/vnd.wap.wml wml; text/x-component htc; diff --git a/config/nginx.conf.erb b/config/nginx.conf.erb index a1948850bb..8e0cf892e6 100644 --- a/config/nginx.conf.erb +++ b/config/nginx.conf.erb @@ -18,7 +18,7 @@ http { gzip on; gzip_comp_level 6; gzip_min_length 512; - gzip_types text/plain text/css application/json application/javascript text/xml application/xml application/xml+rss font/woff font/woff2 image/svg+xml; + gzip_types text/plain text/markdown text/css application/json application/javascript text/xml application/xml application/xml+rss font/woff font/woff2 image/svg+xml; gzip_vary on; 
gzip_proxied any; # Heroku router sends Via header @@ -62,6 +62,36 @@ http { <% end %> } + ## + # CONTENT NEGOTIATION FOR MARKDOWN + # Maps Accept header to file extension preference + + map $http_accept $docs_file_extension { + default ".html"; + + # Exact markdown MIME types + "text/markdown" ".md"; + "application/markdown" ".md"; + "text/plain" ".md"; + + # Handle multiple Accept values - prefer markdown if explicitly requested + "~*text/markdown" ".md"; + "~*application/markdown" ".md"; + "~*^text/plain" ".md"; + + # Explicit HTML request gets HTML (handles browser defaults) + "~*^text/html" ".html"; + "*/*" ".html"; + } + + # Translate extension to file path + map $docs_file_extension $docs_try_file { + ".html" "$request_uri/index.html"; + ".md" "$request_uri.md"; + } + + # / CONTENT NEGOTIATION FOR MARKDOWN + ## # CORS CONFIGURATION @@ -231,10 +261,10 @@ http { <% if content_request_protected %> # Serve the file if it exists, otherwise try to authenticate # (.html requests won't match here, they'll go to the @html_auth location) - try_files $request_uri @html_auth; + try_files $request_uri $docs_try_file @html_auth; <% else %> - # Serve the file if it exists, try index.html for paths without a trailing slash, otherwise 404 - try_files $request_uri $request_uri/index.html $request_uri/ =404; + # Serve the file if it exists, try content-negotiated file, then index.html, otherwise 404 + try_files $request_uri $docs_try_file $request_uri/index.html $request_uri/ =404; <% end %> } @@ -252,8 +282,8 @@ http { <% end %> } - # If the request is authenticated, break out of the location block and serve the file - try_files $request_uri.html $request_uri/index.html $request_uri/ =404; + # If the request is authenticated, try content-negotiated file, then fallback to HTML + try_files $request_uri.html $docs_try_file $request_uri/index.html $request_uri/ =404; } # Don't serve files with the .html extension here, send them to the canonical location From 
5a81081b311d9269b552326286fea3799710f001 Mon Sep 17 00:00:00 2001 From: Kenneth Kalmer Date: Wed, 3 Dec 2025 14:33:52 +0000 Subject: [PATCH 4/5] Add content negotiation test suite and fix map priority MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds comprehensive CI test suite to verify content negotiation works correctly for all Accept header scenarios. The test suite validates that nginx serves markdown or HTML based on the Accept header, with proper fallback behavior. Test coverage: - Basic content negotiation: text/markdown, application/markdown, text/plain, text/html, */* - Browser behavior: Complex Accept headers, HTML priority when listed first - Direct access: .md and .html file access - Path variations: Index paths, non-index paths, nested paths - Edge cases: 404 handling, fallback behavior, non-docs paths Also fixes nginx map priority order to ensure anchored patterns (^text/html, ^text/plain) are evaluated before wildcard patterns. This ensures "text/html, text/markdown" correctly serves HTML instead of markdown. 
Changes: - Created bin/assert-content-negotiation.sh with run_test() helper function - Integrated test into CircleCI test-nginx job - Reordered nginx map patterns for correct priority matching ๐Ÿค– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .circleci/config.yml | 5 ++ bin/assert-content-negotiation.sh | 138 ++++++++++++++++++++++++++++++ config/nginx.conf.erb | 11 +-- 3 files changed, 149 insertions(+), 5 deletions(-) create mode 100755 bin/assert-content-negotiation.sh diff --git a/.circleci/config.yml b/.circleci/config.yml index 2ad9210b01..fd47405d04 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -129,6 +129,11 @@ jobs: - run: name: Verify all files are compressed command: ./bin/assert-compressed.sh + - run: + name: Test content negotiation for markdown + command: | + export PATH="$PWD/bin:$PWD/buildpack/build/.heroku-buildpack-nginx/ruby/bin:$PATH" + ./bin/assert-content-negotiation.sh - run: name: Test content request auth tokens command: | diff --git a/bin/assert-content-negotiation.sh b/bin/assert-content-negotiation.sh new file mode 100755 index 0000000000..69011b99ec --- /dev/null +++ b/bin/assert-content-negotiation.sh @@ -0,0 +1,138 @@ +#!/bin/bash + +# Content Negotiation Test Suite +# Verifies that nginx serves markdown or HTML based on Accept header + +source "$(dirname "$0")/nginx-utils.sh" +trap stop_nginx EXIT + +set -euo pipefail + +# Disable auth for content negotiation tests +export ENABLE_BASIC_AUTH=false +export CONTENT_REQUEST_AUTH_TOKENS="" + +# Set default port if not already set +export PORT=${PORT:-3001} + +# Test helper function +# Parameters: +# $1: path - URL path to test +# $2: accept_header - Accept header value (empty string for default) +# $3: expected_status - Expected HTTP status code +# $4: expected_format - "html", "markdown", or "any" +# $5: test_name - Human-readable test description +run_test() { + local path="$1" + local accept_header="$2" + local 
expected_status="$3" + local expected_format="$4" + local test_name="$5" + + echo "๐Ÿงช $test_name" + + # Build curl command with optional Accept header + local curl_cmd="curl --silent --header \"X-Forwarded-Proto: https\"" + + if [ -n "$accept_header" ]; then + curl_cmd="$curl_cmd --header \"Accept: $accept_header\"" + fi + + curl_cmd="$curl_cmd --write-out \"\\n%{http_code}\\n%{content_type}\"" + curl_cmd="$curl_cmd \"http://localhost:\${PORT}\${path}\"" + + # Execute request and capture response + metadata + local response + response=$(eval "$curl_cmd") + + # Parse response components + local body=$(echo "$response" | sed '$d' | sed '$d') + local status=$(echo "$response" | tail -2 | head -1) + local content_type=$(echo "$response" | tail -1) + + # Assert status code + if [ "$status" != "$expected_status" ]; then + echo " โŒ Expected status $expected_status, got $status" + exit 1 + fi + + # Verify content format + if [ "$expected_format" = "markdown" ]; then + # Check for markdown heading (first line should start with #) + local first_line=$(echo "$body" | head -1) + if ! grep -q "^#" <<< "$first_line"; then + echo " โŒ Expected markdown (starting with #), got: ${first_line:0:50}" + exit 1 + fi + + # Verify Content-Type header (warning only, not fatal) + if ! grep -q "text/markdown" <<< "$content_type"; then + echo " โš ๏ธ Warning: Content-Type is '$content_type', expected 'text/markdown'" + fi + elif [ "$expected_format" = "html" ]; then + # Check for HTML doctype using here-string to avoid broken pipe + if ! 
grep -qi "<!DOCTYPE html" <<< "$body"; then + echo " โŒ Expected HTML (with DOCTYPE), but not found" + exit 1 + fi + fi + # "any" format means we don't validate content + + echo " โœ… Passed (status: $status, format: $expected_format)" +} + +# Main test suite +echo "================================" +echo "Content Negotiation Test Suite" +echo "================================" +echo + +start_nginx + +# Group 1: Basic Content Negotiation +echo "Group 1: Basic Content Negotiation" +echo "-----------------------------------" +run_test "/docs/channels" "" "200" "html" "Default serves HTML" +run_test "/docs/channels" "text/markdown" "200" "markdown" "Accept: text/markdown" +run_test "/docs/channels" "application/markdown" "200" "markdown" "Accept: application/markdown" +run_test "/docs/channels" "text/plain" "200" "markdown" "Accept: text/plain" +run_test "/docs/channels" "text/html" "200" "html" "Accept: text/html" +run_test "/docs/channels" "*/*" "200" "html" "Accept: */*" +echo + +# Group 2: Browser Behavior +echo "Group 2: Browser Behavior" +echo "-------------------------" +run_test "/docs/channels" "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" "200" "html" "Browser Accept header" +run_test "/docs/channels" "text/html, text/markdown" "200" "html" "HTML prioritized when first" +echo + +# Group 3: Direct Access +echo "Group 3: Direct Access" +echo "----------------------" +run_test "/docs/channels.md" "" "200" "markdown" "Direct .md access" +run_test "/docs/channels/index.html" "" "200" "html" "Direct index.html access" +echo + +# Group 4: Path Variations +echo "Group 4: Path Variations" +echo "------------------------" +run_test "/docs/chat/connect" "text/markdown" "200" "markdown" "Non-index path" +run_test "/docs/api/realtime-sdk" "text/markdown" "200" "markdown" "Nested index path" +run_test "/docs/basics" "text/markdown" "200" "markdown" "Simple path" +echo + +# Group 5: Edge Cases +echo "Group 5: Edge Cases" +echo "-------------------" +run_test
"/docs/nonexistent" "" "404" "any" "404 when path missing" +run_test "/docs/nonexistent" "text/markdown" "404" "any" "404 with markdown Accept" +run_test "/llms.txt" "" "200" "any" "Non-docs paths unaffected" +echo + +echo "================================" +echo "โœ… All 16 tests passed!" +echo "================================" + +# Exit explicitly with success +exit 0 diff --git a/config/nginx.conf.erb b/config/nginx.conf.erb index 8e0cf892e6..71c84261b4 100644 --- a/config/nginx.conf.erb +++ b/config/nginx.conf.erb @@ -74,13 +74,14 @@ http { "application/markdown" ".md"; "text/plain" ".md"; + # IMPORTANT: Check start-of-string patterns FIRST (before wildcard patterns) + # Explicit HTML request gets HTML (handles browser defaults like "text/html, text/markdown") + "~*^text/html" ".html"; + # Handle multiple Accept values - prefer markdown if explicitly requested "~*text/markdown" ".md"; "~*application/markdown" ".md"; - "~*^text/plain" ".md"; - # Explicit HTML request gets HTML (handles browser defaults) - "~*^text/html" ".html"; "*/*" ".html"; } @@ -282,8 +283,8 @@ http { <% end %> } - # If the request is authenticated, try content-negotiated file, then fallback to HTML - try_files $request_uri.html $docs_try_file $request_uri/index.html $request_uri/ =404; + # If the request is authenticated, try content-negotiated file first, then fallback to HTML + try_files $docs_try_file $request_uri.html $request_uri/index.html $request_uri/ =404; } # Don't serve files with the .html extension here, send them to the canonical location From 40b46d7d8fd75a873e1b8cb1093407a96c77e265 Mon Sep 17 00:00:00 2001 From: Kenneth Kalmer Date: Mon, 8 Dec 2025 16:09:15 +0000 Subject: [PATCH 5/5] Add User-Agent based bot detection for markdown serving MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enhances content negotiation to serve markdown to LLM bots based on User-Agent strings, in addition to Accept header detection. 
This ensures bots like Claude, ChatGPT, Perplexity, and Google AI get markdown even if they don't send proper Accept headers. Bot detection: - Detects several LLM bot User-Agents (Claude, ChatGPT, Perplexity, Google AI) - Conservative list - no generic HTTP libraries to avoid false positives - Combines with existing Accept header logic using nginx map variables - Serves markdown if EITHER bot detected OR Accept header requests markdown Implementation: - Added $is_llm_bot map for User-Agent pattern matching - Updated $docs_file_extension map to combine bot + Accept header detection - Uses map variable concatenation: "${is_llm_bot}${wants_markdown_via_accept}" - Works seamlessly with existing try_files logic Testing: - Added new tests for bot User-Agent detection - Tests bot override behavior (bot gets markdown even with Accept: text/html) - Verified browsers still get HTML by default ๐Ÿค– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- bin/assert-content-negotiation.sh | 28 +++++++++++- config/nginx.conf.erb | 74 +++++++++++++++++++++++++------ 2 files changed, 87 insertions(+), 15 deletions(-) diff --git a/bin/assert-content-negotiation.sh b/bin/assert-content-negotiation.sh index 69011b99ec..b5b6549eb6 100755 --- a/bin/assert-content-negotiation.sh +++ b/bin/assert-content-negotiation.sh @@ -22,18 +22,24 @@ export PORT=${PORT:-3001} # $3: expected_status - Expected HTTP status code # $4: expected_format - "html", "markdown", or "any" # $5: test_name - Human-readable test description +# $6: user_agent - Optional User-Agent string run_test() { local path="$1" local accept_header="$2" local expected_status="$3" local expected_format="$4" local test_name="$5" + local user_agent="${6:-}" echo "๐Ÿงช $test_name" - # Build curl command with optional Accept header + # Build curl command with optional Accept header and User-Agent local curl_cmd="curl --silent --header \"X-Forwarded-Proto: https\"" + if [ -n "$user_agent" ]; then + 
curl_cmd="$curl_cmd --user-agent \"$user_agent\"" + fi + if [ -n "$accept_header" ]; then curl_cmd="$curl_cmd --header \"Accept: $accept_header\"" fi @@ -130,8 +136,26 @@ run_test "/docs/nonexistent" "text/markdown" "404" "any" "404 with markdown Acce run_test "/llms.txt" "" "200" "any" "Non-docs paths unaffected" echo +# Group 6: Bot Detection (User-Agent) +echo "Group 6: Bot Detection (User-Agent)" +echo "------------------------------------" +run_test "/docs/channels" "" "200" "markdown" "Claude-User bot gets markdown" "Claude-User/1.0" +run_test "/docs/channels" "" "200" "markdown" "ClaudeBot gets markdown" "Mozilla/5.0 (compatible; ClaudeBot/1.0)" +run_test "/docs/channels" "" "200" "markdown" "ChatGPT-User bot gets markdown" "ChatGPT-User" +run_test "/docs/channels" "" "200" "markdown" "GPTBot gets markdown" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0)" +run_test "/docs/channels" "" "200" "markdown" "PerplexityBot gets markdown" "PerplexityBot" +run_test "/docs/channels" "" "200" "html" "Regular browser gets HTML" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)" +echo + +# Group 7: Combined Bot + Accept Header +echo "Group 7: Combined Bot + Accept Header" +echo "--------------------------------------" +run_test "/docs/channels" "text/html" "200" "markdown" "Bot overrides Accept: text/html" "Claude-User/1.0" +run_test "/docs/channels" "text/markdown" "200" "markdown" "Bot + markdown Accept both work" "GPTBot/1.0" +echo + echo "================================" -echo "โœ… All 16 tests passed!" +echo "โœ… All 24 tests passed!"
echo "================================" # Exit explicitly with success diff --git a/config/nginx.conf.erb b/config/nginx.conf.erb index 71c84261b4..659d448c80 100644 --- a/config/nginx.conf.erb +++ b/config/nginx.conf.erb @@ -64,25 +64,73 @@ http { ## # CONTENT NEGOTIATION FOR MARKDOWN - # Maps Accept header to file extension preference + # Serves markdown to LLM bots and clients that request it via Accept header - map $http_accept $docs_file_extension { - default ".html"; + # Detect LLM bots by User-Agent + map $http_user_agent $is_llm_bot { + default 0; + + # Anthropic / Claude + "~*Claude-User" 1; + "~*ClaudeBot" 1; + "~*anthropic-ai" 1; + + # OpenAI / ChatGPT + "~*ChatGPT-User" 1; + "~*GPTBot" 1; + + # Perplexity + "~*PerplexityBot" 1; + "~*Perplexity-User" 1; + + # Google AI + "~*Google-Extended" 1; + "~*GoogleOther" 1; + "~*Gemini" 1; + + # Mistral AI + "~*MistralAI-User" 1; + + # Meta / Facebook + "~*Meta-ExternalAgent" 1; + + # Amazon + "~*Amazonbot" 1; + + # ByteDance / TikTok + "~*Bytespider" 1; + } + + # Detect markdown request via Accept header + map $http_accept $wants_markdown_via_accept { + default 0; # Exact markdown MIME types - "text/markdown" ".md"; - "application/markdown" ".md"; - "text/plain" ".md"; + "text/markdown" 1; + "application/markdown" 1; + "text/plain" 1; + + # Browsers explicitly want HTML (check first before wildcard patterns) + "~*^text/html" 0; - # IMPORTANT: Check start-of-string patterns FIRST (before wildcard patterns) - # Explicit HTML request gets HTML (handles browser defaults like "text/html, text/markdown") - "~*^text/html" ".html"; + # Accept header contains markdown types + "~*text/markdown" 1; + "~*application/markdown" 1; - # Handle multiple Accept values - prefer markdown if explicitly requested - "~*text/markdown" ".md"; - "~*application/markdown" ".md"; + # Wildcard gets HTML + "*/*" 0; + } + + # Serve markdown if bot detected OR markdown requested via Accept header + # Combines: 
${is_llm_bot}${wants_markdown_via_accept} โ†’ "00", "01", "10", or "11" + map "${is_llm_bot}${wants_markdown_via_accept}" $docs_file_extension { + default ".html"; - "*/*" ".html"; + # If either variable is 1, serve markdown + "10" ".md"; # Bot detected, no markdown Accept + "01" ".md"; # No bot, markdown Accept + "11" ".md"; # Both + "00" ".html"; # Neither } # Translate extension to file path