diff --git a/.circleci/config.yml b/.circleci/config.yml
index 2ad9210b01..fd47405d04 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -129,6 +129,11 @@ jobs:
       - run:
           name: Verify all files are compressed
           command: ./bin/assert-compressed.sh
+      - run:
+          name: Test content negotiation for markdown
+          command: |
+            export PATH="$PWD/bin:$PWD/buildpack/build/.heroku-buildpack-nginx/ruby/bin:$PATH"
+            ./bin/assert-content-negotiation.sh
       - run:
           name: Test content request auth tokens
           command: |
diff --git a/bin/assert-content-negotiation.sh b/bin/assert-content-negotiation.sh
new file mode 100755
index 0000000000..b5b6549eb6
--- /dev/null
+++ b/bin/assert-content-negotiation.sh
@@ -0,0 +1,167 @@
+#!/bin/bash
+
+# Content Negotiation Test Suite
+# Verifies that nginx serves markdown or HTML based on Accept header
+
+source "$(dirname "$0")/nginx-utils.sh"
+trap stop_nginx EXIT
+
+set -euo pipefail
+
+# Disable auth for content negotiation tests
+export ENABLE_BASIC_AUTH=false
+export CONTENT_REQUEST_AUTH_TOKENS=""
+
+# Set default port if not already set
+export PORT=${PORT:-3001}
+
+# Count of passed tests, reported in the final summary (never hard-code the total)
+tests_passed=0
+
+# Test helper function
+# Sends one request and exits the script with status 1 on the first failed assertion
+# Parameters:
+#   $1: path - URL path to test
+#   $2: accept_header - Accept header value (empty string for default)
+#   $3: expected_status - Expected HTTP status code
+#   $4: expected_format - "html", "markdown", or "any"
+#   $5: test_name - Human-readable test description
+#   $6: user_agent - Optional User-Agent string
+run_test() {
+  local path="$1"
+  local accept_header="$2"
+  local expected_status="$3"
+  local expected_format="$4"
+  local test_name="$5"
+  local user_agent="${6:-}"
+
+  echo "🧪 $test_name"
+
+  # Build curl arguments as an array so header values are passed verbatim (no eval re-parsing)
+  local curl_args=(--silent --header "X-Forwarded-Proto: https")
+
+  if [ -n "$user_agent" ]; then
+    curl_args+=(--user-agent "$user_agent")
+  fi
+
+  if [ -n "$accept_header" ]; then
+    curl_args+=(--header "Accept: $accept_header")
+  fi
+
+  curl_args+=(--write-out '\n%{http_code}\n%{content_type}')
+  curl_args+=("http://localhost:${PORT}${path}")
+
+  # Execute request and capture response + metadata
+  local response
+  response=$(curl "${curl_args[@]}")
+
+  # Parse response components: the last two lines are the status and content type
+  local body=$(echo "$response" | sed '$d' | sed '$d')
+  local status=$(echo "$response" | tail -2 | head -1)
+  local content_type=$(echo "$response" | tail -1)
+
+  # Assert status code
+  if [ "$status" != "$expected_status" ]; then
+    echo "  ❌ Expected status $expected_status, got $status"
+    exit 1
+  fi
+
+  # Verify content format
+  if [ "$expected_format" = "markdown" ]; then
+    # Check for markdown heading (first line should start with #)
+    local first_line=$(echo "$body" | head -1)
+    if ! grep -q "^#" <<< "$first_line"; then
+      echo "  ❌ Expected markdown (starting with #), got: ${first_line:0:50}"
+      exit 1
+    fi
+
+    # Verify Content-Type header (warning only, not fatal)
+    if ! grep -q "text/markdown" <<< "$content_type"; then
+      echo "  ⚠️  Warning: Content-Type is '$content_type', expected 'text/markdown'"
+    fi
+  elif [ "$expected_format" = "html" ]; then
+    # Check for HTML doctype using here-string to avoid broken pipe
+    if ! grep -qi "<!DOCTYPE" <<< "$body"; then
+      echo "  ❌ Expected HTML (with DOCTYPE), but not found"
+      exit 1
+    fi
+  fi
+  # "any" format means we don't validate content
+
+  tests_passed=$((tests_passed + 1))
+  echo "  ✅ Passed (status: $status, format: $expected_format)"
+}
+
+# Main test suite
+echo "================================"
+echo "Content Negotiation Test Suite"
+echo "================================"
+echo
+
+start_nginx
+
+# Group 1: Basic Content Negotiation
+echo "Group 1: Basic Content Negotiation"
+echo "-----------------------------------"
+run_test "/docs/channels" "" "200" "html" "Default serves HTML"
+run_test "/docs/channels" "text/markdown" "200" "markdown" "Accept: text/markdown"
+run_test "/docs/channels" "application/markdown" "200" "markdown" "Accept: application/markdown"
+run_test "/docs/channels" "text/plain" "200" "markdown" "Accept: text/plain"
+run_test "/docs/channels" "text/html" "200" "html" "Accept: text/html"
+run_test "/docs/channels" "*/*" "200" "html" "Accept: */*"
+echo
+
+# Group 2: Browser Behavior
+echo "Group 2: Browser Behavior"
+echo "-------------------------"
+run_test "/docs/channels" "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" "200" "html" "Browser Accept header"
+run_test "/docs/channels" "text/html, text/markdown" "200" "html" "HTML prioritized when first"
+echo
+
+# Group 3: Direct Access
+echo "Group 3: Direct Access"
+echo "----------------------"
+run_test "/docs/channels.md" "" "200" "markdown" "Direct .md access"
+run_test "/docs/channels/index.html" "" "200" "html" "Direct index.html access"
+echo
+
+# Group 4: Path Variations
+echo "Group 4: Path Variations"
+echo "------------------------"
+run_test "/docs/chat/connect" "text/markdown" "200" "markdown" "Non-index path"
+run_test "/docs/api/realtime-sdk" "text/markdown" "200" "markdown" "Nested index path"
+run_test "/docs/basics" "text/markdown" "200" "markdown" "Simple path"
+echo
+
+# Group 5: Edge Cases
+echo "Group 5: Edge Cases"
+echo "-------------------"
+run_test "/docs/nonexistent" "" "404" "any" "404 when path missing"
+run_test "/docs/nonexistent" "text/markdown" "404" "any" "404 with markdown Accept"
+run_test "/llms.txt" "" "200" "any" "Non-docs paths unaffected"
+echo
+
+# Group 6: Bot Detection (User-Agent)
+echo "Group 6: Bot Detection (User-Agent)"
+echo "------------------------------------"
+run_test "/docs/channels" "" "200" "markdown" "Claude-User bot gets markdown" "Claude-User/1.0"
+run_test "/docs/channels" "" "200" "markdown" "ClaudeBot gets markdown" "Mozilla/5.0 (compatible; ClaudeBot/1.0)"
+run_test "/docs/channels" "" "200" "markdown" "ChatGPT-User bot gets markdown" "ChatGPT-User"
+run_test "/docs/channels" "" "200" "markdown" "GPTBot gets markdown" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0)"
+run_test "/docs/channels" "" "200" "markdown" "PerplexityBot gets markdown" "PerplexityBot"
+run_test "/docs/channels" "" "200" "html" "Regular browser gets HTML" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
+echo
+
+# Group 7: Combined Bot + Accept Header
+echo "Group 7: Combined Bot + Accept Header"
+echo "--------------------------------------"
+run_test "/docs/channels" "text/html" "200" "markdown" "Bot overrides Accept: text/html" "Claude-User/1.0"
+run_test "/docs/channels" "text/markdown" "200" "markdown" "Bot + markdown Accept both work" "GPTBot/1.0"
+echo
+
+echo "================================"
+echo "✅ All ${tests_passed} tests passed!"
+echo "================================"
+
+# Exit explicitly with success
+exit 0
diff --git a/config/mime.types b/config/mime.types
index 2961256950..7baa544b0f 100644
--- a/config/mime.types
+++ b/config/mime.types
@@ -11,6 +11,7 @@ types {

     text/mathml mml;
     text/plain txt;
+    text/markdown md markdown;
     text/vnd.sun.j2me.app-descriptor jad;
     text/vnd.wap.wml wml;
     text/x-component htc;
diff --git a/config/nginx.conf.erb b/config/nginx.conf.erb
index a1948850bb..659d448c80 100644
--- a/config/nginx.conf.erb
+++ b/config/nginx.conf.erb
@@ -18,7 +18,7 @@ http {
   gzip on;
   gzip_comp_level 6;
   gzip_min_length 512;
-  gzip_types text/plain text/css application/json application/javascript text/xml application/xml application/xml+rss font/woff font/woff2 image/svg+xml;
+  gzip_types text/plain text/markdown text/css application/json application/javascript text/xml application/xml application/xml+rss font/woff font/woff2 image/svg+xml;
   gzip_vary on;
   gzip_proxied any; # Heroku router sends Via header

@@ -62,6 +62,85 @@ http {
     <% end %>
   }

+  ##
+  # CONTENT NEGOTIATION FOR MARKDOWN
+  # Serves markdown to LLM bots and clients that request it via Accept header
+
+  # Detect LLM bots by User-Agent (case-insensitive substring match)
+  map $http_user_agent $is_llm_bot {
+    default 0;
+
+    # Anthropic / Claude
+    "~*Claude-User" 1;
+    "~*ClaudeBot" 1;
+    "~*anthropic-ai" 1;
+
+    # OpenAI / ChatGPT
+    "~*ChatGPT-User" 1;
+    "~*GPTBot" 1;
+
+    # Perplexity
+    "~*PerplexityBot" 1;
+    "~*Perplexity-User" 1;
+
+    # Google AI
+    "~*Google-Extended" 1;
+    "~*GoogleOther" 1;
+    "~*Gemini" 1;
+
+    # Mistral AI
+    "~*MistralAI-User" 1;
+
+    # Meta / Facebook
+    "~*Meta-ExternalAgent" 1;
+
+    # Amazon
+    "~*Amazonbot" 1;
+
+    # ByteDance / TikTok
+    "~*Bytespider" 1;
+  }
+
+  # Detect markdown request via Accept header
+  map $http_accept $wants_markdown_via_accept {
+    default 0;
+
+    # Exact Accept values served as markdown (text/plain is treated as a markdown request)
+    "text/markdown" 1;
+    "application/markdown" 1;
+    "text/plain" 1;
+
+    # Browsers explicitly want HTML (regexes are tried in order, so this must precede the markdown patterns)
+    "~*^text/html" 0;
+
+    # Accept header contains markdown types
+    "~*text/markdown" 1;
+    "~*application/markdown" 1;
+
+    # Wildcard gets HTML
+    "*/*" 0;
+  }
+
+  # Serve markdown if bot detected OR markdown requested via Accept header
+  # Combines: ${is_llm_bot}${wants_markdown_via_accept} → "00", "01", "10", or "11"
+  map "${is_llm_bot}${wants_markdown_via_accept}" $docs_file_extension {
+    default ".html";
+
+    # If either variable is 1, serve markdown
+    "10" ".md"; # Bot detected, no markdown Accept
+    "01" ".md"; # No bot, markdown Accept
+    "11" ".md"; # Both
+    "00" ".html"; # Neither
+  }
+
+  # Translate extension to the file path handed to try_files
+  map $docs_file_extension $docs_try_file {
+    ".html" "$request_uri/index.html";
+    ".md" "$request_uri.md";
+  }
+
+  # / CONTENT NEGOTIATION FOR MARKDOWN
+
   ##
   # CORS CONFIGURATION

@@ -231,10 +310,11 @@ http {
       <% if content_request_protected %>
       # Serve the file if it exists, otherwise try to authenticate
       # (.html requests won't match here, they'll go to the @html_auth location)
+      # $docs_try_file must not be tried here: it would serve docs pages before authentication
       try_files $request_uri @html_auth;
       <% else %>
-      # Serve the file if it exists, try index.html for paths without a trailing slash, otherwise 404
-      try_files $request_uri $request_uri/index.html $request_uri/ =404;
+      # Serve the file if it exists, try content-negotiated file, then index.html, otherwise 404
+      try_files $request_uri $docs_try_file $request_uri/index.html $request_uri/ =404;
       <% end %>
     }

@@ -252,8 +332,8 @@ http {
       <% end %>
     }

-    # If the request is authenticated, break out of the location block and serve the file
-    try_files $request_uri.html $request_uri/index.html $request_uri/ =404;
+    # If the request is authenticated, try content-negotiated file first, then fallback to HTML
+    try_files $docs_try_file $request_uri.html $request_uri/index.html $request_uri/ =404;
   }

   # Don't serve files with the .html extension here, send them to the canonical location
diff --git a/data/onPostBuild/__fixtures__/input.mdx
b/data/onPostBuild/__fixtures__/input.mdx new file mode 100644 index 0000000000..46fda6209b --- /dev/null +++ b/data/onPostBuild/__fixtures__/input.mdx @@ -0,0 +1,70 @@ +--- +title: Test Fixture +meta_description: "This is a test description" +redirect_from: + - /old-path +languages: + - javascript +some_other_field: "should be removed" +--- + +import Something from '../component' +import { + MultiLine, + Import +} from 'module' + +export const foo = 'bar'; +export default SomeComponent; + +{/* This is a JSX comment */} +{/* + Multi-line JSX comment + with multiple lines +*/} + +## Basic heading + +## Heading with anchor + +### Nested heading + + + +Regular content here. + +## Links and images + +- [Internal link](/docs/channels) +- [External link](https://example.com) +- [Hash link](#test-anchor) +- ![Relative image](../../../images/content/diagrams/test.png) +- ![Absolute image](/images/content/test.png) +- ![Direct image](images/content/test.png) + +## Template variables + +Use {{API_KEY}} and {{RANDOM_CHANNEL_NAME}} in your code. + +## Code blocks + + +```javascript +const channel = realtime.channels.get('{{RANDOM_CHANNEL_NAME}}'); +``` + + +Here's a code block with anchors and scripts that should be preserved: +```html + + +{/* preserve JSX comments in code */} +``` + +## JSX Components + + diff --git a/data/onPostBuild/__snapshots__/transpileMdxToMarkdown.test.ts.snap b/data/onPostBuild/__snapshots__/transpileMdxToMarkdown.test.ts.snap new file mode 100644 index 0000000000..43dd0ac5bc --- /dev/null +++ b/data/onPostBuild/__snapshots__/transpileMdxToMarkdown.test.ts.snap @@ -0,0 +1,55 @@ +// Jest Snapshot v1, https://goo.gl/fbAQLP + +exports[`MDX to Markdown Transpilation Full transformation with fixture should transform comprehensive fixture correctly 1`] = ` +"# Test Fixture + + + + + + +## Basic heading + +## Heading with anchor + +### Nested heading + + + +Regular content here. 
+ +## Links and images + +- [Internal link](http://localhost:3000/docs/channels) +- [External link](https://example.com) +- [Hash link](#test-anchor) +- ![Relative image](https://raw.githubusercontent.com/ably/docs/main/src/images/content/diagrams/test.png) +- ![Absolute image](https://raw.githubusercontent.com/ably/docs/main/src/images/content/test.png) +- ![Direct image](https://raw.githubusercontent.com/ably/docs/main/src/images/content/test.png) + +## Template variables + +Use your-api-key and your-channel-name in your code. + +## Code blocks + + +\`\`\`javascript +const channel = realtime.channels.get('your-channel-name'); +\`\`\` + + +Here's a code block with anchors and scripts that should be preserved: +\`\`\`html + + +{/* preserve JSX comments in code */} +\`\`\` + +## JSX Components + + +" +`; diff --git a/data/onPostBuild/index.ts b/data/onPostBuild/index.ts index b5738663c9..e8bc0eaad1 100644 --- a/data/onPostBuild/index.ts +++ b/data/onPostBuild/index.ts @@ -1,5 +1,6 @@ import { GatsbyNode, Reporter } from 'gatsby'; import { onPostBuild as llmstxt } from './llmstxt'; +import { onPostBuild as transpileMdxToMarkdown } from './transpileMdxToMarkdown'; import { onPostBuild as compressAssets } from './compressAssets'; import { validateRedirectFile, REDIRECT_FILE_PATH } from '../utils/validateRedirectFile'; @@ -33,5 +34,6 @@ export const onPostBuild: GatsbyNode['onPostBuild'] = async (args) => { // Run all onPostBuild functions in sequence await llmstxt(args); + await transpileMdxToMarkdown(args); await compressAssets(args); }; diff --git a/data/onPostBuild/transpileMdxToMarkdown.test.ts b/data/onPostBuild/transpileMdxToMarkdown.test.ts new file mode 100644 index 0000000000..3648d593be --- /dev/null +++ b/data/onPostBuild/transpileMdxToMarkdown.test.ts @@ -0,0 +1,284 @@ +import { + transformMdxToMarkdown, + removeImportExportStatements, + removeScriptTags, + removeAnchorTags, + removeJsxComments, + convertImagePathsToGitHub, + convertRelativeUrls, + 
replaceTemplateVariables, + calculateOutputPath, +} from './transpileMdxToMarkdown'; +import * as fs from 'fs'; +import * as path from 'path'; + +describe('MDX to Markdown Transpilation', () => { + const siteUrl = 'http://localhost:3000'; + + describe('Full transformation with fixture', () => { + it('should transform comprehensive fixture correctly', () => { + const inputPath = path.join(__dirname, '__fixtures__', 'input.mdx'); + const input = fs.readFileSync(inputPath, 'utf-8'); + + const { content, title } = transformMdxToMarkdown(input, siteUrl); + + expect(title).toBe('Test Fixture'); + expect(content).toMatchSnapshot(); + }); + + it('should throw error when title is missing', () => { + const input = `--- +meta_description: "Test" +--- + +Content without title`; + + expect(() => { + transformMdxToMarkdown(input, siteUrl); + }).toThrow('Missing title in frontmatter'); + }); + }); + + describe('removeImportExportStatements', () => { + it('should remove single-line imports', () => { + const input = `import Foo from 'bar'\n\nContent here`; + const output = removeImportExportStatements(input); + expect(output).not.toContain('import'); + expect(output).toContain('Content here'); + }); + + it('should remove multi-line imports', () => { + const input = `import {\n Foo,\n Bar\n} from 'module';\n\nContent here`; + const output = removeImportExportStatements(input); + expect(output).not.toContain('import'); + expect(output).not.toContain('from'); + expect(output).toContain('Content here'); + }); + + it('should remove export default statements', () => { + const input = `export default SomeComponent;\n\nContent here`; + const output = removeImportExportStatements(input); + expect(output).not.toContain('export'); + expect(output).toContain('Content here'); + }); + + it('should remove export const statements', () => { + const input = `export const foo = 'bar';\n\nContent here`; + const output = removeImportExportStatements(input); + expect(output).not.toContain('export'); + 
expect(output).toContain('Content here'); + }); + + it('should remove multi-line export functions', () => { + const input = `export function foo() {\n return 'bar';\n}\n\nContent here`; + const output = removeImportExportStatements(input); + expect(output).not.toContain('export'); + expect(output).not.toContain('function foo'); + expect(output).toContain('Content here'); + }); + + it('should remove multi-line export classes', () => { + const input = `export class Foo {\n bar() {}\n}\n\nContent here`; + const output = removeImportExportStatements(input); + expect(output).not.toContain('export'); + expect(output).not.toContain('class Foo'); + expect(output).toContain('Content here'); + }); + }); + + describe('removeScriptTags', () => { + it('should remove script tags outside code blocks', () => { + const input = `Text before\n\nText after`; + const output = removeScriptTags(input); + expect(output).not.toContain('\nContent`; + const output = removeScriptTags(input); + expect(output).not.toContain('\n```'; + const output = removeScriptTags(input); + expect(output).toContain('\n```\nAfter'; + const output = removeScriptTags(input); + expect(output).toContain(''); + }); + }); + + describe('removeAnchorTags', () => { + it('should remove self-closing anchor tags', () => { + const input = '## Heading '; + const output = removeAnchorTags(input); + expect(output).toBe('## Heading '); + }); + + it('should remove anchor tags with name attribute', () => { + const input = '## Heading '; + const output = removeAnchorTags(input); + expect(output).toBe('## Heading '); + }); + + it('should remove empty anchor tags', () => { + const input = '## Heading '; + const output = removeAnchorTags(input); + expect(output).toBe('## Heading '); + }); + + it('should preserve anchor tags in code blocks', () => { + const input = '```html\n\n```'; + const output = removeAnchorTags(input); + expect(output).toContain(''); + }); + + it('should preserve link anchors with href', () => { + const input = 
'[Link text](http://example.com)'; + const output = removeAnchorTags(input); + expect(output).toBe('[Link text](http://example.com)'); + }); + }); + + describe('removeJsxComments', () => { + it('should remove single-line JSX comments', () => { + const input = 'Text {/* comment */} more text'; + const output = removeJsxComments(input); + expect(output).not.toContain('{/*'); + expect(output).not.toContain('*/}'); + expect(output).toContain('Text'); + expect(output).toContain('more text'); + }); + + it('should remove multi-line JSX comments', () => { + const input = 'Text {/*\n multi\n line\n*/} more'; + const output = removeJsxComments(input); + expect(output).not.toContain('{/*'); + expect(output).toContain('Text'); + expect(output).toContain('more'); + }); + + it('should preserve JSX comments in code blocks', () => { + const input = '```jsx\n{/* code comment */}\n```'; + const output = removeJsxComments(input); + expect(output).toContain('{/* code comment */}'); + }); + }); + + describe('convertImagePathsToGitHub', () => { + const githubBase = 'https://raw.githubusercontent.com/ably/docs/main/src'; + + it('should convert relative image paths', () => { + const input = '![Alt text](../../../images/content/diagrams/test.png)'; + const output = convertImagePathsToGitHub(input); + expect(output).toBe(`![Alt text](${githubBase}/images/content/diagrams/test.png)`); + }); + + it('should convert absolute image paths', () => { + const input = '![Alt text](/images/content/test.png)'; + const output = convertImagePathsToGitHub(input); + expect(output).toBe(`![Alt text](${githubBase}/images/content/test.png)`); + }); + + it('should convert direct image paths', () => { + const input = '![Alt text](images/content/test.png)'; + const output = convertImagePathsToGitHub(input); + expect(output).toBe(`![Alt text](${githubBase}/images/content/test.png)`); + }); + + it('should handle multiple images', () => { + const input = `![One](../images/a.png)\n![Two](/images/b.png)`; + const 
output = convertImagePathsToGitHub(input); + expect(output).toContain(`${githubBase}/images/a.png`); + expect(output).toContain(`${githubBase}/images/b.png`); + }); + }); + + describe('convertRelativeUrls', () => { + it('should convert relative URLs to absolute', () => { + const input = '[Link text](/docs/channels)'; + const output = convertRelativeUrls(input, siteUrl); + expect(output).toBe('[Link text](http://localhost:3000/docs/channels)'); + }); + + it('should preserve external URLs', () => { + const input = '[Link](https://example.com/page)'; + const output = convertRelativeUrls(input, siteUrl); + expect(output).toBe('[Link](https://example.com/page)'); + }); + + it('should preserve hash-only links', () => { + const input = '[Anchor](#section)'; + const output = convertRelativeUrls(input, siteUrl); + expect(output).toBe('[Anchor](#section)'); + }); + + it('should handle multiple links', () => { + const input = '[Internal](/docs/a) and [External](https://b.com) and [Hash](#c)'; + const output = convertRelativeUrls(input, siteUrl); + expect(output).toContain('[Internal](http://localhost:3000/docs/a)'); + expect(output).toContain('[External](https://b.com)'); + expect(output).toContain('[Hash](#c)'); + }); + }); + + describe('replaceTemplateVariables', () => { + it('should replace API_KEY', () => { + const input = 'Use {{API_KEY}} in your code'; + const output = replaceTemplateVariables(input); + expect(output).toBe('Use your-api-key in your code'); + }); + + it('should replace RANDOM_CHANNEL_NAME', () => { + const input = 'Channel: {{RANDOM_CHANNEL_NAME}}'; + const output = replaceTemplateVariables(input); + expect(output).toBe('Channel: your-channel-name'); + }); + + it('should replace multiple occurrences', () => { + const input = '{{API_KEY}} and {{RANDOM_CHANNEL_NAME}} and {{API_KEY}}'; + const output = replaceTemplateVariables(input); + expect(output).toBe('your-api-key and your-channel-name and your-api-key'); + }); + }); + + 
describe('calculateOutputPath', () => {
+    it('should handle index files', () => {
+      const output = calculateOutputPath('docs/channels', 'index');
+      expect(output).toContain('public/docs/channels.md');
+      expect(output).toMatch(/public\/docs\/channels\.md$/);
+    });
+
+    it('should handle non-index files', () => {
+      const output = calculateOutputPath('docs/chat', 'connect');
+      expect(output).toContain('public/docs/chat/connect.md');
+      expect(output).toMatch(/public\/docs\/chat\/connect\.md$/);
+    });
+
+    it('should handle top-level docs index', () => {
+      const output = calculateOutputPath('docs', 'index');
+      expect(output).toContain('public/docs.md');
+      expect(output).toMatch(/public\/docs\.md$/);
+    });
+
+    it('should handle nested index paths', () => {
+      const output = calculateOutputPath('docs/api/realtime-sdk', 'index');
+      expect(output).toContain('public/docs/api/realtime-sdk.md');
+      expect(output).toMatch(/public\/docs\/api\/realtime-sdk\.md$/);
+    });
+
+    it('should handle deeply nested files', () => {
+      const output = calculateOutputPath('docs/chat/moderation/direct', 'bodyguard');
+      expect(output).toContain('public/docs/chat/moderation/direct/bodyguard.md');
+      expect(output).toMatch(/public\/docs\/chat\/moderation\/direct\/bodyguard\.md$/);
+    });
+  });
+});
diff --git a/data/onPostBuild/transpileMdxToMarkdown.ts b/data/onPostBuild/transpileMdxToMarkdown.ts
new file mode 100644
index 0000000000..1f69a5d0ea
--- /dev/null
+++ b/data/onPostBuild/transpileMdxToMarkdown.ts
@@ -0,0 +1,386 @@
+import { GatsbyNode } from 'gatsby';
+import * as path from 'path';
+import * as fs from 'fs-extra';
+import frontMatter from 'front-matter';
+
+const REPORTER_PREFIX = 'onPostBuild:transpileMdxToMarkdown';
+
+interface MdxNode {
+  parent: {
+    relativeDirectory: string;
+    name: string;
+    absolutePath: string;
+  };
+  internal: {
+    contentFilePath: string;
+  };
+}
+
+interface MdxQueryResult {
+  site: {
+    siteMetadata: {
+      siteUrl: string;
+    };
+  };
+  allMdx: {
+    nodes: MdxNode[];
+  };
+}
+
+interface FrontMatterAttributes {
+  title?: string;
+  [key: string]: any;
+}
+
+/**
+ * Apply `transform` to every part of `content` that lies outside a fenced
+ * code block (``` ... ```); fenced blocks are copied through unchanged.
+ *
+ * @param content - Markdown/MDX text to process
+ * @param transform - Rewrites one non-code segment and returns the result
+ * @returns The content with only its non-code segments transformed
+ */
+function transformOutsideCodeBlocks(content: string, transform: (segment: string) => string): string {
+  const fenceRegex = /```[\s\S]*?```/g;
+  let result = '';
+  let lastIndex = 0;
+
+  for (const match of Array.from(content.matchAll(fenceRegex))) {
+    const start = match.index ?? 0;
+    // Transform the text before this fence, then append the fence verbatim
+    result += transform(content.slice(lastIndex, start)) + match[0];
+    lastIndex = start + match[0].length;
+  }
+
+  // Transform whatever follows the last fence (or everything, if there are no fences)
+  return result + transform(content.slice(lastIndex));
+}
+
+/**
+ * Remove import and export statements from content
+ * Handles both single-line and multi-line statements
+ * Statements inside fenced code blocks are sample code and are preserved
+ */
+function removeImportExportStatements(content: string): string {
+  return transformOutsideCodeBlocks(content, (segment) => {
+    const cleaned = segment
+      // import Foo from 'x' / import { A, B } from 'x' (single and multi-line).
+      // The clause is limited to identifier/brace characters so a prose line
+      // starting with "import" cannot swallow text up to a later `from '...'`
+      .replace(/^import\s+[\w$*\s{},]+?\s+from\s+['"][^'"]+['"];?\s*$/gm, '')
+      // Side-effect import: import 'x'
+      .replace(/^import\s+['"][^'"]+['"];?\s*$/gm, '')
+      // export { foo } from 'x' (must run before the plain export-list rule)
+      .replace(/^export\s+\{[^}]*\}\s+from\s+['"][^'"]+['"];?\s*$/gm, '')
+      // export { foo, bar }; (single and multi-line)
+      .replace(/^export\s+\{[^}]*\}\s*;?\s*$/gm, '')
+      // export default Component; or export const foo = 'bar';
+      .replace(/^export\s+(default|const|let|var)\s+.*$/gm, '')
+      // export function/class declarations: up to the first line-leading `}`
+      .replace(/^export\s+(function|class)\s+\w+[\s\S]*?\n\}/gm, '');
+
+    // Clean up extra blank lines left behind
+    return cleaned.replace(/\n\n\n+/g, '\n\n');
+  });
+}
+
+/**
+ * Remove script tags that are not inside code blocks
+ */
+function removeScriptTags(content: string): string {
+  // Remove script tags with any attributes and their content
+  return transformOutsideCodeBlocks(content, (segment) =>
+    segment.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, ''),
+  );
+}
+
+/**
+ * Remove anchor tags that are used for HTML navigation
+ * Removes empty id/name anchors such as <a id="x"/> and <a name="x"></a>
+ * Preserves actual links with href attributes and anything inside code blocks
+ */
+function removeAnchorTags(content: string): string {
+  // NOTE(review): pattern rebuilt from the documented intent (empty id/name anchors,
+  // self-closing or immediately closed) - confirm it covers the anchors used in content
+  return transformOutsideCodeBlocks(content, (segment) =>
+    segment.replace(/<a\s+(?:id|name)="[^"]*"\s*(?:\/>|>\s*<\/a>)/gi, ''),
+  );
+}
+
+/**
+ * Remove JSX comments from content
+ * Removes patterns like: {slash-star comment star-slash}
+ * Preserves JSX comments in code blocks
+ */
+function removeJsxComments(content: string): string {
+  return transformOutsideCodeBlocks(content, (segment) => segment.replace(/\{\/\*[\s\S]*?\*\/\}/g, ''));
+}
+
+/**
+ * Convert image paths to GitHub raw URLs
+ * Handles relative (../), absolute (/images/), and direct (images/) paths
+ * Only converts paths with valid image extensions
+ */
+function convertImagePathsToGitHub(content: string): string {
+  const githubBaseUrl = 'https://raw.githubusercontent.com/ably/docs/main/src';
+  const imageExtensions = '(?:png|jpg|jpeg|gif|svg|webp|bmp|ico)';
+
+  return content
+    // Handle relative paths: ../../../images/...{ext}
+    .replace(
+      new RegExp(`!\\[([^\\]]*)\\]\\(((?:\\.\\.\\/)+)(images\\/[^)]+\\.${imageExtensions})\\)`, 'gi'),
+      (match, altText, relativePath, imagePath) => {
+        return `![${altText}](${githubBaseUrl}/${imagePath})`;
+      }
+    )
+    // Handle absolute paths: /images/...{ext}
+    .replace(
+      new RegExp(`!\\[([^\\]]*)\\]\\(\\/(images\\/[^)]+\\.${imageExtensions})\\)`, 'gi'),
+      (match, altText, imagePath) => {
+        return `![${altText}](${githubBaseUrl}/${imagePath})`;
+      }
+    )
+    // Handle direct paths: images/...{ext} (no prefix)
+    .replace(
+      new RegExp(`!\\[([^\\]]*)\\]\\((images\\/[^)]+\\.${imageExtensions})\\)`, 'gi'),
+      (match, altText, imagePath) => {
+        return `![${altText}](${githubBaseUrl}/${imagePath})`;
+      }
+    );
+}
+
+/**
+ * Convert relative URLs to absolute URLs using the main website domain
+ * Converts: [text](/docs/channels) → [text](https://ably.com/docs/channels)
+ * Preserves: External URLs (http://, https://), hash-only links (#anchor)
+ */
+function convertRelativeUrls(content: string, siteUrl: string): string {
+  const baseUrl = siteUrl.replace(/\/$/, ''); // Remove trailing slash
+
+  // Match markdown links: [text](url)
+  // Only convert URLs that start with / (relative) and are not external URLs or hash-only
+  return content.replace(
+    /\[([^\]]+)\]\(([^)]+)\)/g,
+    (match, linkText, url) => {
+      // Don't convert external URLs
+      if (url.startsWith('http://') || url.startsWith('https://')) {
+        return match;
+      }
+
+      // Don't convert hash-only anchors
+      if (url.startsWith('#')) {
+        return match;
+      }
+
+      // Convert relative URLs (starting with /)
+      if (url.startsWith('/')) {
+        return `[${linkText}](${baseUrl}${url})`;
+      }
+
+      // Keep other URLs as-is (relative paths without leading /)
+      return match;
+    }
+  );
+}
+
+/**
+ * Replace template variables with readable placeholders
+ */
+function replaceTemplateVariables(content: string): string {
+  return content
+    .replace(/{{API_KEY}}/g, 'your-api-key')
+    .replace(/{{RANDOM_CHANNEL_NAME}}/g, 'your-channel-name');
+}
+
+/**
+ * Calculate the output path for a markdown file based on its source location
+ */
+function calculateOutputPath(relativeDirectory: string, fileName: string): string {
+  // Remove a leading 'docs' path segment only: "docs/channels" → "channels", "docs" → "" ("docs-x" is kept)
+  const pathWithoutDocs = relativeDirectory.replace(/^docs(?:\/|$)/, '');
+  const pathParts = pathWithoutDocs.split('/').filter((p) => p);
+
+  if (fileName === 'index') {
+    // Special case: top-level docs/index.mdx → public/docs.md
+    if (pathParts.length === 0) {
+      return path.join(process.cwd(), 'public', 'docs.md');
+    }
+
+    // index.mdx: use parent directory name
+    // docs/channels/index.mdx → public/docs/channels.md
+    const dirName = pathParts.pop(); // Remove and get last element
+    return path.join(process.cwd(), 'public', 'docs', ...pathParts, `${dirName}.md`);
+  } else {
+    // Regular file: use filename
+    // docs/chat/connect.mdx → public/docs/chat/connect.md
+    return path.join(process.cwd(), 'public', 'docs', ...pathParts, `${fileName}.md`);
+  }
+}
+
+/**
+ * Transform MDX content to clean Markdown
+ */
+function transformMdxToMarkdown(sourceContent: string, siteUrl: string): { content: string; title: string } {
+  // Stage 1: Parse frontmatter
+  const parsed = frontMatter<FrontMatterAttributes>(sourceContent);
+
+  if (!parsed.attributes.title) {
+    throw new Error('Missing title in frontmatter');
+  }
+
+  const title = parsed.attributes.title;
+  let content = parsed.body;
+
+  // Stage 2: Remove import/export statements
+  content = removeImportExportStatements(content);
+
+  // Stage 3: Remove script tags (not in code blocks)
+  content = removeScriptTags(content);
+
+  // Stage 4: Remove anchor tags
+  content = removeAnchorTags(content);
+
+  // Stage 5: Remove JSX comments
+  content = removeJsxComments(content);
+
+  // Stage 6: Convert image paths to GitHub URLs
+  content = convertImagePathsToGitHub(content);
+
+  // Stage 7: Convert relative URLs to absolute URLs
+  content = convertRelativeUrls(content, siteUrl);
+
+  // Stage 8: Replace template variables
+  content = replaceTemplateVariables(content);
+
+  // Stage 9: Prepend title as markdown heading
+  const finalContent = `# ${title}\n\n${content}`;
+
+  return { content: finalContent, title };
+}
+
+/**
+ * Process a single MDX file
+ */
+async function processFile(node: MdxNode, siteUrl: string, reporter: any): Promise {
+  const sourcePath = node.internal.contentFilePath;
+  const relativeDirectory = node.parent.relativeDirectory;
+  const fileName = node.parent.name;
+
+  // Read source MDX file
+  const sourceContent = await fs.readFile(sourcePath, 'utf-8');
+
+  // Transform MDX to Markdown
+  const
{ content } = transformMdxToMarkdown(sourceContent, siteUrl); + + // Calculate output path + const outputPath = calculateOutputPath(relativeDirectory, fileName); + + // Ensure output directory exists + await fs.ensureDir(path.dirname(outputPath)); + + // Write markdown file + await fs.writeFile(outputPath, content, 'utf-8'); + + reporter.verbose(`${REPORTER_PREFIX} Transpiled: ${sourcePath} -> ${outputPath}`); +} + +/** + * Main onPostBuild function + */ +export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter }) => { + const query = ` + query { + site { + siteMetadata { + siteUrl + } + } + allMdx { + nodes { + parent { + ... on File { + relativeDirectory + name + absolutePath + } + } + internal { + contentFilePath + } + } + } + } + `; + + const { data, errors } = await graphql(query); + + if (errors) { + reporter.panicOnBuild( + `${REPORTER_PREFIX} Error running GraphQL query: ${JSON.stringify(errors)}` + ); + return; + } + + if (!data) { + reporter.warn(`${REPORTER_PREFIX} No MDX nodes found`); + return; + } + + // Get siteUrl from GraphQL + const siteUrl = data.site?.siteMetadata?.siteUrl; + + if (!siteUrl) { + reporter.panicOnBuild( + `${REPORTER_PREFIX} siteUrl is not configured in siteMetadata. Please check gatsby-config.ts` + ); + return; + } + + // Filter to only docs directory + const mdxNodes = data.allMdx.nodes.filter((node) => { + return node.parent.relativeDirectory.startsWith('docs'); + }); + + reporter.info(`${REPORTER_PREFIX} Found ${mdxNodes.length} MDX files to transpile`); + + let successCount = 0; + let failureCount = 0; + + // Process each file + for (const node of mdxNodes) { + try { + await processFile(node, siteUrl, reporter); + successCount++; + } catch (error) { + const errorMessage = error instanceof Error ? 
error.message : String(error); + reporter.warn( + `${REPORTER_PREFIX} Failed to transpile ${node.internal.contentFilePath}: ${errorMessage}` + ); + failureCount++; + } + } + + // Report summary + if (failureCount > 0) { + reporter.warn( + `${REPORTER_PREFIX} Transpiled ${successCount} files, ${failureCount} failed` + ); + } else { + reporter.info( + `${REPORTER_PREFIX} Successfully transpiled ${successCount} MDX files to Markdown` + ); + } +}; + +// Export functions for testing +export { + removeImportExportStatements, + removeScriptTags, + removeAnchorTags, + removeJsxComments, + convertImagePathsToGitHub, + convertRelativeUrls, + replaceTemplateVariables, + calculateOutputPath, + transformMdxToMarkdown, +};