From 93178345c68f18f8e9c3f9b84bc279c6c4742eef Mon Sep 17 00:00:00 2001 From: Kenneth Kalmer Date: Mon, 9 Jun 2025 16:47:15 +0100 Subject: [PATCH 1/7] poc: generate markdown copies of each HTML file for agents --- data/onPostBuild/index.ts | 2 + data/onPostBuild/markdownOutput.ts | 173 ++++++++++++++++++ package.json | 4 +- yarn.lock | 273 +++++++++++++++++++++++++++-- 4 files changed, 436 insertions(+), 16 deletions(-) create mode 100644 data/onPostBuild/markdownOutput.ts diff --git a/data/onPostBuild/index.ts b/data/onPostBuild/index.ts index 844392b4d6..84c131f0f4 100644 --- a/data/onPostBuild/index.ts +++ b/data/onPostBuild/index.ts @@ -1,9 +1,11 @@ import { GatsbyNode } from 'gatsby'; import { onPostBuild as llmstxt } from './llmstxt'; import { onPostBuild as compressAssets } from './compressAssets'; +import { onPostBuild as markdownOutput } from './markdownOutput'; export const onPostBuild: GatsbyNode['onPostBuild'] = async (args) => { // Run all onPostBuild functions in sequence await llmstxt(args); + await markdownOutput(args); await compressAssets(args); }; diff --git a/data/onPostBuild/markdownOutput.ts b/data/onPostBuild/markdownOutput.ts new file mode 100644 index 0000000000..18fd3b3eff --- /dev/null +++ b/data/onPostBuild/markdownOutput.ts @@ -0,0 +1,173 @@ +import { GatsbyNode, Reporter } from 'gatsby'; +import fs from 'fs/promises'; +import path from 'path'; +import { glob } from 'glob'; +import { JSDOM, VirtualConsole } from 'jsdom'; +import TurndownService from 'turndown'; + +const CONFIG = { + htmlDir: './public', + markdownDir: './public', + excludePatterns: ['404.html', 'api/**/*', 'page-data/**/*', 'static/**/*', 'docs/404.html'], + includeMetadata: true, +}; + +// Selectors for elements to remove from the HTML before converting to markdown +const UNWANTED_ELEMENTS_SELECTOR = + 'script, style, nav[role="navigation"], .header, #header, header, .footer, #footer, footer, [aria-label="breadcrumb"], aside'; + +// Prioritised selectors for the main 
content of the page, first match wins +const CONTENT_SELECTORS = ['main', '[role="main"]', '.content', '#content', 'article']; + +const withoutTrailingSlash = (path: string) => (path === `/` ? path : path.replace(/\/$/, ``)); + +const cleanAttribute = (attribute: string | null) => { + return attribute ? attribute.replace(/(\n+\s*)+/g, '\n') : ''; +}; + +async function exportToMarkdown({ reporter, siteUrl }: { reporter: Reporter; siteUrl: string }) { + const turndownService = new TurndownService({ + headingStyle: 'atx', + codeBlockStyle: 'fenced', + emDelimiter: '*', + }); + + // Remove the anchor tags from the headers + turndownService.addRule('header', { + filter: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'], + replacement: (_, node) => { + const level = parseInt(node.nodeName.charAt(1), 10); + return `${'#'.repeat(level)} ${node.textContent}`; + }, + }); + + // Update local links to use the siteUrl + turndownService.addRule('localLink', { + filter: (node) => (node.nodeName === 'A' && node.getAttribute('href')?.startsWith('/')) || false, + replacement: (content, node) => { + // most of this replacement is taken from the turndown library directly + let href = withoutTrailingSlash(siteUrl) + (node as HTMLElement).getAttribute('href'); + if (href) { + href = href.replace(/([()])/g, '\\$1'); + } + let title = cleanAttribute((node as HTMLElement).getAttribute('title')); + if (title) { + title = ' "' + title.replace(/"/g, '\\"') + '"'; + } + return '[' + content + '](' + href + title + ')'; + }, + }); + + // Find all HTML files + const htmlFiles = await glob('**/*.html', { + cwd: CONFIG.htmlDir, + ignore: CONFIG.excludePatterns, + }); + + reporter.info(`Found ${htmlFiles.length} HTML files to process`); + + for (const htmlFile of htmlFiles) { + try { + const fullPath = path.join(CONFIG.htmlDir, htmlFile); + const htmlContent = await fs.readFile(fullPath, 'utf-8'); + + // Parse and clean HTML + const virtualConsole = new VirtualConsole(); // Stop CSS parsing errors from 
polluting the console + const dom = new JSDOM(htmlContent, { url: siteUrl, virtualConsole }); + const document = dom.window.document; + + // Remove unwanted elements + const unwanted = document.querySelectorAll(UNWANTED_ELEMENTS_SELECTOR); + unwanted.forEach((el) => el.remove()); + + // Get main content + let mainContent = null; + + for (const selector of CONTENT_SELECTORS) { + mainContent = document.querySelector(selector); + if (mainContent) { + break; + } + } + + if (!mainContent) { + mainContent = document.body; + } + + // Convert to markdown + const markdown = turndownService.turndown(mainContent.innerHTML); + + // Prepare final content + let finalContent = ''; + + if (CONFIG.includeMetadata) { + const title = document.querySelector('title')?.textContent?.trim() || 'Untitled'; + const description = document.querySelector('meta[name="description"]')?.getAttribute('content')?.trim() || ''; + const canonicalUrl = document.querySelector('link[rel="canonical"]')?.getAttribute('href') || ''; + + finalContent = `--- +title: "${title}" +url: ${canonicalUrl || `/${htmlFile.replace('.html', '').replace('/index', '')}`} +generated_at: ${new Date().toISOString()} +description: "${description}" +--- + +${markdown}`; + } else { + finalContent = markdown; + } + + // Append .md to the filename, remove /index.html + const outputName = `${htmlFile.replace('/index.html', '')}.md`; + const outputPath = path.join(CONFIG.markdownDir, outputName); + + // Write markdown file + await fs.writeFile(outputPath, finalContent); + } catch (error) { + reporter.error(`✗ Error processing ${htmlFile}:`, error as Error); + } + } + + reporter.info(`Markdown export complete! 
${htmlFiles.length} files processed.`); +} + +interface QueryResult { + site: { + siteMetadata: { + siteUrl: string; + }; + }; +} + +// Run the export +export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter }) => { + const query = ` + query { + site { + siteMetadata { + siteUrl + } + } + } + `; + const { data, errors } = await graphql(query); + + if (errors) { + reporter.panicOnBuild(`Error while running GraphQL query.`); + throw errors; + } + + if (!data) { + reporter.panicOnBuild(`No documents found.`); + throw new Error('No documents found.'); + } + + const siteUrl = data.site.siteMetadata.siteUrl; + + if (!siteUrl) { + reporter.panicOnBuild(`Site URL not found.`); + throw new Error('Site URL not found.'); + } + + await exportToMarkdown({ reporter, siteUrl }); +}; diff --git a/package.json b/package.json index d49fe77cec..96430037b9 100644 --- a/package.json +++ b/package.json @@ -92,7 +92,7 @@ "react-select": "^5.7.0", "remark-gfm": "^1.0.0", "textile-js": "^2.1.1", - "turndown": "^7.1.1", + "turndown": "^7.2.0", "typescript": "^4.6.3", "use-keyboard-shortcut": "^1.1.6", "util": "^0.12.4", @@ -133,10 +133,12 @@ "eslint-plugin-react-hooks": "^4.6.0", "fast-check": "^3.4.0", "gatsby-plugin-postcss": "^6.3.0", + "glob": "^11.0.2", "identity-obj-proxy": "^3.0.0", "jest": "^29.3.1", "jest-axe": "^7.0.0", "jest-environment-jsdom": "^29.3.1", + "jsdom": "^26.1.0", "lint-staged": "^13.1.0", "msw": "^2.0.1", "postcss": "^8.4.31", diff --git a/yarn.lock b/yarn.lock index d08139af4f..e89bb2d8e7 100644 --- a/yarn.lock +++ b/yarn.lock @@ -78,6 +78,17 @@ signedsource "^1.0.0" yargs "^15.3.1" +"@asamuzakjp/css-color@^3.2.0": + version "3.2.0" + resolved "https://registry.yarnpkg.com/@asamuzakjp/css-color/-/css-color-3.2.0.tgz#cc42f5b85c593f79f1fa4f25d2b9b321e61d1794" + integrity sha512-K1A6z8tS3XsmCMM86xoWdn7Fkdn9m6RSVtocUrJYIwZnFVkng/PvkEoWtOWmP+Scc6saYWHWZYbndEEXxl24jw== + dependencies: + "@csstools/css-calc" "^2.1.3" + "@csstools/css-color-parser" 
"^3.0.9" + "@csstools/css-parser-algorithms" "^3.0.4" + "@csstools/css-tokenizer" "^3.0.3" + lru-cache "^10.4.3" + "@babel/code-frame@7.12.11": version "7.12.11" resolved "https://registry.yarnpkg.com/@babel/code-frame/-/code-frame-7.12.11.tgz#f4ad435aa263db935b8f10f2c552d23fb716a63f" @@ -1486,6 +1497,41 @@ resolved "https://registry.yarnpkg.com/@codesandbox/sandpack-themes/-/sandpack-themes-2.0.21.tgz#f1970f03537434fff008e9c9c3f6581b0e5940c2" integrity sha512-CMH/MO/dh6foPYb/3eSn2Cu/J3+1+/81Fsaj7VggICkCrmRk0qG5dmgjGAearPTnRkOGORIPHuRqwNXgw0E6YQ== +"@cspotcode/source-map-support@^0.8.0": + version "0.8.1" + resolved "https://registry.yarnpkg.com/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz#00629c35a688e05a88b1cda684fb9d5e73f000a1" + integrity sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw== + dependencies: + "@jridgewell/trace-mapping" "0.3.9" + +"@csstools/color-helpers@^5.0.2": + version "5.0.2" + resolved "https://registry.yarnpkg.com/@csstools/color-helpers/-/color-helpers-5.0.2.tgz#82592c9a7c2b83c293d9161894e2a6471feb97b8" + integrity sha512-JqWH1vsgdGcw2RR6VliXXdA0/59LttzlU8UlRT/iUUsEeWfYq8I+K0yhihEUTTHLRm1EXvpsCx3083EU15ecsA== + +"@csstools/css-calc@^2.1.3", "@csstools/css-calc@^2.1.4": + version "2.1.4" + resolved "https://registry.yarnpkg.com/@csstools/css-calc/-/css-calc-2.1.4.tgz#8473f63e2fcd6e459838dd412401d5948f224c65" + integrity sha512-3N8oaj+0juUw/1H3YwmDDJXCgTB1gKU6Hc/bB502u9zR0q2vd786XJH9QfrKIEgFlZmhZiq6epXl4rHqhzsIgQ== + +"@csstools/css-color-parser@^3.0.9": + version "3.0.10" + resolved "https://registry.yarnpkg.com/@csstools/css-color-parser/-/css-color-parser-3.0.10.tgz#79fc68864dd43c3b6782d2b3828bc0fa9d085c10" + integrity sha512-TiJ5Ajr6WRd1r8HSiwJvZBiJOqtH86aHpUjq5aEKWHiII2Qfjqd/HCWKPOW8EP4vcspXbHnXrwIDlu5savQipg== + dependencies: + "@csstools/color-helpers" "^5.0.2" + "@csstools/css-calc" "^2.1.4" + +"@csstools/css-parser-algorithms@^3.0.4": + version "3.0.5" + resolved 
"https://registry.yarnpkg.com/@csstools/css-parser-algorithms/-/css-parser-algorithms-3.0.5.tgz#5755370a9a29abaec5515b43c8b3f2cf9c2e3076" + integrity sha512-DaDeUkXZKjdGhgYaHNJTV9pV7Y9B3b644jCLs9Upc3VeNGg6LWARAT6O+Q+/COo+2gg/bM5rhpMAtf70WqfBdQ== + +"@csstools/css-tokenizer@^3.0.3": + version "3.0.4" + resolved "https://registry.yarnpkg.com/@csstools/css-tokenizer/-/css-tokenizer-3.0.4.tgz#333fedabc3fd1a8e5d0100013731cf19e6a8c5d3" + integrity sha512-Vd/9EVDiu6PPJt9yAh6roZP6El1xHrdvIVGjyBsHR0RYwNHgL7FJPyIIW4fANJNG6FtyZfvlRPpFI4ZM/lubvw== + "@emotion/babel-plugin@^11.11.0": version "11.11.0" resolved "https://registry.npmjs.org/@emotion/babel-plugin/-/babel-plugin-11.11.0.tgz" @@ -2420,6 +2466,11 @@ "@lezer/lr" "^1.0.0" json5 "^2.2.1" +"@mixmark-io/domino@^2.2.0": + version "2.2.0" + resolved "https://registry.yarnpkg.com/@mixmark-io/domino/-/domino-2.2.0.tgz#4e8ec69bf1afeb7a14f0628b7e2c0f35bdb336c3" + integrity sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw== + "@msgpackr-extract/msgpackr-extract-darwin-arm64@3.0.3": version "3.0.3" resolved "https://registry.yarnpkg.com/@msgpackr-extract/msgpackr-extract-darwin-arm64/-/msgpackr-extract-darwin-arm64-3.0.3.tgz#9edec61b22c3082018a79f6d1c30289ddf3d9d11" @@ -4801,6 +4852,26 @@ agent-base@6: dependencies: debug "4" +agent-base@^7.1.0, agent-base@^7.1.2: + version "7.1.3" + resolved "https://registry.yarnpkg.com/agent-base/-/agent-base-7.1.3.tgz#29435eb821bc4194633a5b89e5bc4703bafc25a1" + integrity sha512-jRR5wdylq8CkOe6hei19GGZnxM6rBGwFl3Bg0YItGDimvjGtAvdZk4Pu6Cl4u4Igsws4a1fd1Vq3ezrhn4KmFw== + +agentkeepalive@^4.2.1: + version "4.5.0" + resolved "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.5.0.tgz" + integrity sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew== + dependencies: + humanize-ms "^1.2.1" + +aggregate-error@^3.0.0: + version "3.1.0" + resolved 
"https://registry.npmjs.org/aggregate-error/-/aggregate-error-3.1.0.tgz" + integrity sha512-4I7Td01quW/RpocfNayFdFVk1qSuoh0E7JrbRJ16nH01HhKFQ88INq9Sd+nd72zqRySlr9BmDA8xlEJ6vJMrYA== + dependencies: + clean-stack "^2.0.0" + indent-string "^4.0.0" + ajv-formats@^2.1.1: version "2.1.1" resolved "https://registry.yarnpkg.com/ajv-formats/-/ajv-formats-2.1.1.tgz#6e669400659eb74973bbf2e33327180a0996b520" @@ -6634,6 +6705,14 @@ cssstyle@^2.3.0: dependencies: cssom "~0.3.6" +cssstyle@^4.2.1: + version "4.4.0" + resolved "https://registry.yarnpkg.com/cssstyle/-/cssstyle-4.4.0.tgz#a185e81564a6047693586d904d278cbe8565ba07" + integrity sha512-W0Y2HOXlPkb2yaKrCVRjinYKciu/qSLEmK0K9mcfDei3zwlnHFEHAs/Du3cIRwPqY+J4JsiBzUjoHyc8RsJ03A== + dependencies: + "@asamuzakjp/css-color" "^3.2.0" + rrweb-cssom "^0.8.0" + csstype@^3.0.2: version "3.1.3" resolved "https://registry.yarnpkg.com/csstype/-/csstype-3.1.3.tgz#d80ff294d114fb0e6ac500fbf85b60137d7eff81" @@ -6661,6 +6740,14 @@ data-urls@^3.0.2: whatwg-mimetype "^3.0.0" whatwg-url "^11.0.0" +data-urls@^5.0.0: + version "5.0.0" + resolved "https://registry.yarnpkg.com/data-urls/-/data-urls-5.0.0.tgz#2f76906bce1824429ffecb6920f45a0b30f00dde" + integrity sha512-ZYP5VBHshaDAiVZxjbRVcFJpc+4xGgT0bK3vzy1HLN8jTO975HEbuYzZJcHoQEY5K1a0z8YayJkyVETa08eNTg== + dependencies: + whatwg-mimetype "^4.0.0" + whatwg-url "^14.0.0" + data-view-buffer@^1.0.2: version "1.0.2" resolved "https://registry.yarnpkg.com/data-view-buffer/-/data-view-buffer-1.0.2.tgz#211a03ba95ecaf7798a8c7198d79536211f88570" @@ -6735,7 +6822,7 @@ decamelize@^1.2.0: resolved "https://registry.yarnpkg.com/decamelize/-/decamelize-1.2.0.tgz#f6534d15148269b20352e7bee26f501f9a191290" integrity sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA== -decimal.js@^10.4.2: +decimal.js@^10.4.2, decimal.js@^10.5.0: version "10.5.0" resolved "https://registry.npmjs.org/decimal.js/-/decimal.js-10.5.0.tgz" integrity 
sha512-8vDa8Qxvr/+d94hSh5P3IJwI5t8/c0KsMp+g8bNw9cY2icONa5aPfvKeieW1WlG0WQYwwhJ7mjui2xtiePQSXw== @@ -7002,11 +7089,6 @@ domhandler@^5.0.2, domhandler@^5.0.3: dependencies: domelementtype "^2.3.0" -domino@^2.1.6: - version "2.1.6" - resolved "https://registry.npmjs.org/domino/-/domino-2.1.6.tgz" - integrity sha512-3VdM/SXBZX2omc9JF9nOPCtDaYQ67BGp5CoLpIQlO2KCAPETs8TcDHacF26jXadGbvUteZzRTeos2fhID5+ucQ== - dompurify@^3.2.4: version "3.2.4" resolved "https://registry.npmjs.org/dompurify/-/dompurify-3.2.4.tgz" @@ -9255,7 +9337,19 @@ glob@^10.3.10, glob@^10.4.2: package-json-from-dist "^1.0.0" path-scurry "^1.11.1" -glob@^7.1.1, glob@^7.1.3, glob@^7.1.4, glob@^7.1.6, glob@^7.2.3: +glob@^11.0.2: + version "11.0.2" + resolved "https://registry.yarnpkg.com/glob/-/glob-11.0.2.tgz#3261e3897bbc603030b041fd77ba636022d51ce0" + integrity sha512-YT7U7Vye+t5fZ/QMkBFrTJ7ZQxInIUjwyAjVj84CYXqgBdv30MFUPGnBR6sQaVq6Is15wYJUsnzTuWaGRBhBAQ== + dependencies: + foreground-child "^3.1.0" + jackspeak "^4.0.1" + minimatch "^10.0.0" + minipass "^7.1.2" + package-json-from-dist "^1.0.0" + path-scurry "^2.0.0" + +glob@^7.1.1, glob@^7.1.2, glob@^7.1.3, glob@^7.1.4, glob@^7.1.6, glob@^7.2.3: version "7.2.3" resolved "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz" integrity sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q== @@ -9744,6 +9838,13 @@ html-encoding-sniffer@^3.0.0: dependencies: whatwg-encoding "^2.0.0" +html-encoding-sniffer@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/html-encoding-sniffer/-/html-encoding-sniffer-4.0.0.tgz#696df529a7cfd82446369dc5193e590a3735b448" + integrity sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ== + dependencies: + whatwg-encoding "^3.1.1" + html-entities@^2.1.0: version "2.5.2" resolved "https://registry.npmjs.org/html-entities/-/html-entities-2.5.2.tgz" @@ -9817,6 +9918,14 @@ http-proxy-agent@^5.0.0: agent-base "6" debug "4" 
+http-proxy-agent@^7.0.2: + version "7.0.2" + resolved "https://registry.yarnpkg.com/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz#9a8b1f246866c028509486585f62b8f2c18c270e" + integrity sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig== + dependencies: + agent-base "^7.1.0" + debug "^4.3.4" + http2-wrapper@^1.0.0-beta.5.2: version "1.0.3" resolved "https://registry.yarnpkg.com/http2-wrapper/-/http2-wrapper-1.0.3.tgz#b8f55e0c1f25d4ebd08b3b0c2c079f9590800b3d" @@ -9841,6 +9950,14 @@ https-proxy-agent@^5.0.0, https-proxy-agent@^5.0.1: agent-base "6" debug "4" +https-proxy-agent@^7.0.6: + version "7.0.6" + resolved "https://registry.yarnpkg.com/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz#da8dfeac7da130b05c2ba4b59c9b6cd66611a6b9" + integrity sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw== + dependencies: + agent-base "^7.1.2" + debug "4" + human-signals@^2.1.0: version "2.1.0" resolved "https://registry.yarnpkg.com/human-signals/-/human-signals-2.1.0.tgz#dc91fcba42e4d06e4abaed33b3e7a3c02f514ea0" @@ -10602,6 +10719,13 @@ jackspeak@^3.1.2: optionalDependencies: "@pkgjs/parseargs" "^0.11.0" +jackspeak@^4.0.1: + version "4.1.1" + resolved "https://registry.yarnpkg.com/jackspeak/-/jackspeak-4.1.1.tgz#96876030f450502047fc7e8c7fcf8ce8124e43ae" + integrity sha512-zptv57P3GpL+O0I7VdMJNBZCu+BPHVQUk55Ft8/QCJjTVxrnJHuVuX/0Bl2A6/+2oyR/ZMEuFKwmzqqZ/U5nPQ== + dependencies: + "@isaacs/cliui" "^8.0.2" + javascript-stringify@^2.0.1: version "2.1.0" resolved "https://registry.yarnpkg.com/javascript-stringify/-/javascript-stringify-2.1.0.tgz#27c76539be14d8bd128219a2d731b09337904e79" @@ -11095,6 +11219,32 @@ jsdom@^20.0.0: ws "^8.11.0" xml-name-validator "^4.0.0" +jsdom@^26.1.0: + version "26.1.0" + resolved "https://registry.yarnpkg.com/jsdom/-/jsdom-26.1.0.tgz#ab5f1c1cafc04bd878725490974ea5e8bf0c72b3" + integrity 
sha512-Cvc9WUhxSMEo4McES3P7oK3QaXldCfNWp7pl2NNeiIFlCoLr3kfq9kb1fxftiwk1FLV7CvpvDfonxtzUDeSOPg== + dependencies: + cssstyle "^4.2.1" + data-urls "^5.0.0" + decimal.js "^10.5.0" + html-encoding-sniffer "^4.0.0" + http-proxy-agent "^7.0.2" + https-proxy-agent "^7.0.6" + is-potential-custom-element-name "^1.0.1" + nwsapi "^2.2.16" + parse5 "^7.2.1" + rrweb-cssom "^0.8.0" + saxes "^6.0.0" + symbol-tree "^3.2.4" + tough-cookie "^5.1.1" + w3c-xmlserializer "^5.0.0" + webidl-conversions "^7.0.0" + whatwg-encoding "^3.1.1" + whatwg-mimetype "^4.0.0" + whatwg-url "^14.1.1" + ws "^8.18.0" + xml-name-validator "^5.0.0" + jsesc@^2.5.1: version "2.5.2" resolved "https://registry.yarnpkg.com/jsesc/-/jsesc-2.5.2.tgz#80564d2e483dacf6e8ef209650a67df3f0c283a4" @@ -11497,11 +11647,16 @@ lru-cache@4.0.0: pseudomap "^1.0.1" yallist "^2.0.0" -lru-cache@^10.2.0: +lru-cache@^10.2.0, lru-cache@^10.4.3: version "10.4.3" resolved "https://registry.yarnpkg.com/lru-cache/-/lru-cache-10.4.3.tgz#410fc8a17b70e598013df257c2446b7f3383f119" integrity sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ== +lru-cache@^11.0.0: + version "11.1.0" + resolved "https://registry.yarnpkg.com/lru-cache/-/lru-cache-11.1.0.tgz#afafb060607108132dbc1cf8ae661afb69486117" + integrity sha512-QIXZUBJUx+2zHUdQujWejBkcD9+cs94tLn0+YL8UrCh+D5sCXZ4c7LaEH48pNwRY3MLDgqUFyhlCyjJPf1WP0A== + lru-cache@^5.1.1: version "5.1.1" resolved "https://registry.yarnpkg.com/lru-cache/-/lru-cache-5.1.1.tgz#1da27e6710271947695daf6848e847f01d84b920" @@ -12351,6 +12506,13 @@ minimatch@9.0.3: dependencies: brace-expansion "^2.0.1" +minimatch@^10.0.0: + version "10.0.1" + resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-10.0.1.tgz#ce0521856b453c86e25f2c4c0d03e6ff7ddc440b" + integrity sha512-ethXTt3SGGR+95gudmqJ1eNhRO7eGEGIgYA9vnPatK4/etz2MEVDno5GMCibdMTuBMyElzIlgxMna3K94XDIDQ== + dependencies: + brace-expansion "^2.0.1" + minimatch@^3.0.4, minimatch@^3.0.5, minimatch@^3.1.1, minimatch@^3.1.2: 
version "3.1.2" resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.1.2.tgz#19cd194bfd3e428f049a70817c038d89ab4be35b" @@ -12716,7 +12878,7 @@ nullthrows@^1.1.1: resolved "https://registry.yarnpkg.com/nullthrows/-/nullthrows-1.1.1.tgz#7818258843856ae971eae4208ad7d7eb19a431b1" integrity sha512-2vPPEi+Z7WqML2jZYddDIfy5Dqb0r2fze2zTxNNknZaFpVHU3mFB3R+DWeJWGVx0ecvttSGlJTI+WG+8Z4cDWw== -nwsapi@^2.2.2: +nwsapi@^2.2.16, nwsapi@^2.2.2: version "2.2.20" resolved "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.20.tgz" integrity sha512-/ieB+mDe4MrrKMT8z+mQL8klXydZWGR5Dowt4RAGKbJ3kIGEx3X4ljUo+6V73IXtUPWgfOlU5B9MlGxFO5T+cA== @@ -13093,7 +13255,7 @@ parse5@^6.0.0: resolved "https://registry.yarnpkg.com/parse5/-/parse5-6.0.1.tgz#e1a1c085c569b3dc08321184f19a39cc27f7c30b" integrity sha512-Ofn/CTFzRGTTxwpNEs9PP93gXShHcTq255nzRYSKe8AkVpZY7e1fpmTfOyoIvjP5HG7Z2ZM7VS9PPhQGW2pOpw== -parse5@^7.0.0, parse5@^7.1.1: +parse5@^7.0.0, parse5@^7.1.1, parse5@^7.2.1: version "7.3.0" resolved "https://registry.yarnpkg.com/parse5/-/parse5-7.3.0.tgz#d7e224fa72399c7a175099f45fc2ad024b05ec05" integrity sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw== @@ -13171,6 +13333,19 @@ path-scurry@^1.11.1, path-scurry@^1.6.1: lru-cache "^10.2.0" minipass "^5.0.0 || ^6.0.2 || ^7.0.0" +path-scurry@^2.0.0: + version "2.0.0" + resolved "https://registry.yarnpkg.com/path-scurry/-/path-scurry-2.0.0.tgz#9f052289f23ad8bf9397a2a0425e7b8615c58580" + integrity sha512-ypGJsmGtdXUOeM5u93TyeIEfEhM6s+ljAhrk5vAvSx8uyY/02OvrZnA0YNGUrPXfpJMgI1ODd3nwz8Npx4O4cg== + dependencies: + lru-cache "^11.0.0" + minipass "^7.1.2" + +path-to-regexp@0.1.10: + version "0.1.10" + resolved "https://registry.yarnpkg.com/path-to-regexp/-/path-to-regexp-0.1.10.tgz#67e9108c5c0551b9e5326064387de4763c4d5f8b" + integrity sha512-7lf7qcQidTku0Gu3YDPc8DJ1q7OOucfa/BSsIwjuh56VU7katFvuM8hULfkwB3Fns/rsVF7PwPKVw1sl5KQS9w== + path-to-regexp@0.1.12: version "0.1.12" resolved 
"https://registry.yarnpkg.com/path-to-regexp/-/path-to-regexp-0.1.12.tgz#d5e1a12e478a976d432ef3c58d534b9923164bb7" @@ -14511,6 +14686,11 @@ rrdom@^2.0.0-alpha.13: dependencies: rrweb-snapshot "^2.0.0-alpha.18" +rrweb-cssom@^0.8.0: + version "0.8.0" + resolved "https://registry.yarnpkg.com/rrweb-cssom/-/rrweb-cssom-0.8.0.tgz#3021d1b4352fbf3b614aaeed0bc0d5739abe0bc2" + integrity sha512-guoltQEx+9aMf2gDZ0s62EcV8lsXR+0w8915TC3ITdn2YueuNjdAYh/levpU9nFaoChh9RUS5ZdQMrKfVEN9tw== + rrweb-snapshot@^2.0.0-alpha.13, rrweb-snapshot@^2.0.0-alpha.18: version "2.0.0-alpha.18" resolved "https://registry.npmjs.org/rrweb-snapshot/-/rrweb-snapshot-2.0.0-alpha.18.tgz" @@ -15717,6 +15897,18 @@ title-case@^3.0.3: dependencies: tslib "^2.0.3" +tldts-core@^6.1.86: + version "6.1.86" + resolved "https://registry.yarnpkg.com/tldts-core/-/tldts-core-6.1.86.tgz#a93e6ed9d505cb54c542ce43feb14c73913265d8" + integrity sha512-Je6p7pkk+KMzMv2XXKmAE3McmolOQFdxkKw0R8EYNr7sELW46JqnNeTX8ybPiQgvg1ymCoF8LXs5fzFaZvJPTA== + +tldts@^6.1.32: + version "6.1.86" + resolved "https://registry.yarnpkg.com/tldts/-/tldts-6.1.86.tgz#087e0555b31b9725ee48ca7e77edc56115cd82f7" + integrity sha512-WMi/OQ2axVTf/ykqCQgXiIct+mSQDFdH2fkwhPwgEwvJ1kSzZRiinb0zF2Xb8u4+OqPChmyI6MEu4EezNJz+FQ== + dependencies: + tldts-core "^6.1.86" + tmp@^0.0.33: version "0.0.33" resolved "https://registry.yarnpkg.com/tmp/-/tmp-0.0.33.tgz#6d34335889768d21b2bcda0aa277ced3b1bfadf9" @@ -15769,6 +15961,13 @@ tough-cookie@^4.1.2: universalify "^0.2.0" url-parse "^1.5.3" +tough-cookie@^5.1.1: + version "5.1.2" + resolved "https://registry.yarnpkg.com/tough-cookie/-/tough-cookie-5.1.2.tgz#66d774b4a1d9e12dc75089725af3ac75ec31bed7" + integrity sha512-FVDYdxtnj0G6Qm/DhNPSb8Ju59ULcup3tuJxkFb5K8Bv2pUXILbf0xZWU8PX8Ov19OXljbUyveOFwRMwkXzO+A== + dependencies: + tldts "^6.1.32" + tr46@^3.0.0: version "3.0.0" resolved "https://registry.npmjs.org/tr46/-/tr46-3.0.0.tgz" @@ -15776,6 +15975,13 @@ tr46@^3.0.0: dependencies: punycode "^2.1.1" +tr46@^5.1.0: + version 
"5.1.1" + resolved "https://registry.yarnpkg.com/tr46/-/tr46-5.1.1.tgz#96ae867cddb8fdb64a49cc3059a8d428bcf238ca" + integrity sha512-hdF5ZgjTqgAntKkklYw0R03MG2x/bSzTtkxmIRw/sTNV8YXsCJ1tfLAX23lhxhHJlEf3CRCOCGGWw3vI3GaSPw== + dependencies: + punycode "^2.3.1" + tr46@~0.0.3: version "0.0.3" resolved "https://registry.yarnpkg.com/tr46/-/tr46-0.0.3.tgz#8184fd347dac9cdc185992f3a6622e14b9d9ab6a" @@ -15867,12 +16073,12 @@ tunnel-agent@^0.6.0: dependencies: safe-buffer "^5.0.1" -turndown@^7.1.1: - version "7.1.2" - resolved "https://registry.npmjs.org/turndown/-/turndown-7.1.2.tgz" - integrity sha512-ntI9R7fcUKjqBP6QU8rBK2Ehyt8LAzt3UBT9JR9tgo6GtuKvyUzpayWmeMKJw1DPdXzktvtIT8m2mVXz+bL/Qg== +turndown@^7.2.0: + version "7.2.0" + resolved "https://registry.yarnpkg.com/turndown/-/turndown-7.2.0.tgz#67d614fe8371fb511079a93345abfd156c0ffcf4" + integrity sha512-eCZGBN4nNNqM9Owkv9HAtWRYfLA4h909E/WGAWWBpmB275ehNhZyk87/Tpvjbp0jjNl9XwCsbe6bm6CqFsgD+A== dependencies: - domino "^2.1.6" + "@mixmark-io/domino" "^2.2.0" type-check@^0.4.0, type-check@~0.4.0: version "0.4.0" @@ -16485,6 +16691,13 @@ w3c-xmlserializer@^4.0.0: dependencies: xml-name-validator "^4.0.0" +w3c-xmlserializer@^5.0.0: + version "5.0.0" + resolved "https://registry.yarnpkg.com/w3c-xmlserializer/-/w3c-xmlserializer-5.0.0.tgz#f925ba26855158594d907313cedd1476c5967f6c" + integrity sha512-o8qghlI8NZHU1lLPrpi2+Uq7abh4GGPpYANlalzWxyWteJOCsr/P+oPBA49TOLu5FTZO4d3F9MnWJfiMo4BkmA== + dependencies: + xml-name-validator "^5.0.0" + walker@^1.0.8: version "1.0.8" resolved "https://registry.npmjs.org/walker/-/walker-1.0.8.tgz" @@ -16614,11 +16827,23 @@ whatwg-encoding@^2.0.0: dependencies: iconv-lite "0.6.3" +whatwg-encoding@^3.1.1: + version "3.1.1" + resolved "https://registry.yarnpkg.com/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz#d0f4ef769905d426e1688f3e34381a99b60b76e5" + integrity sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ== + dependencies: + iconv-lite "0.6.3" + 
whatwg-mimetype@^3.0.0: version "3.0.0" resolved "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-3.0.0.tgz" integrity sha512-nt+N2dzIutVRxARx1nghPKGv1xHikU7HKdfafKkLNLindmPU/ch3U31NOCGGA/dmPcmb1VlofO0vnKAcsm0o/Q== +whatwg-mimetype@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz#bc1bf94a985dc50388d54a9258ac405c3ca2fc0a" + integrity sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg== + whatwg-url@^11.0.0: version "11.0.0" resolved "https://registry.npmjs.org/whatwg-url/-/whatwg-url-11.0.0.tgz" @@ -16627,6 +16852,14 @@ whatwg-url@^11.0.0: tr46 "^3.0.0" webidl-conversions "^7.0.0" +whatwg-url@^14.0.0, whatwg-url@^14.1.1: + version "14.2.0" + resolved "https://registry.yarnpkg.com/whatwg-url/-/whatwg-url-14.2.0.tgz#4ee02d5d725155dae004f6ae95c73e7ef5d95663" + integrity sha512-De72GdQZzNTUBBChsXueQUnPKDkg/5A5zp7pFDuQAj5UFoENpiACU0wlCvzpAGnTkj++ihpKwKyYewn/XNUbKw== + dependencies: + tr46 "^5.1.0" + webidl-conversions "^7.0.0" + whatwg-url@^5.0.0: version "5.0.0" resolved "https://registry.yarnpkg.com/whatwg-url/-/whatwg-url-5.0.0.tgz#966454e8765462e37644d3626f6742ce8b70965d" @@ -16799,6 +17032,11 @@ ws@^8.11.0: resolved "https://registry.yarnpkg.com/ws/-/ws-8.16.0.tgz#d1cd774f36fbc07165066a60e40323eab6446fd4" integrity sha512-HS0c//TP7Ina87TfiPUz1rQzMhHrl/SG2guqRcTOIUYD2q8uhUdNHZYJUaQ8aTGPzCh+c6oawMKW35nFl1dxyQ== +ws@^8.18.0: + version "8.18.2" + resolved "https://registry.yarnpkg.com/ws/-/ws-8.18.2.tgz#42738b2be57ced85f46154320aabb51ab003705a" + integrity sha512-DMricUmwGZUVr++AEAe2uiVM7UoO9MAVZMDu05UQOaUII0lp+zOzLLU4Xqh/JvTqklB1T4uELaaPBKyjE1r4fQ== + ws@~8.17.1: version "8.17.1" resolved "https://registry.yarnpkg.com/ws/-/ws-8.17.1.tgz#9293da530bb548febc95371d90f9c878727d919b" @@ -16814,6 +17052,11 @@ xml-name-validator@^4.0.0: resolved "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-4.0.0.tgz" integrity 
sha512-ICP2e+jsHvAj2E2lIHxa5tjXRlKDJo4IdvPvCXbXQGdzSfmSpNVyIKMvoZHjDY9DP0zV17iI85o90vRFXNccRw== +xml-name-validator@^5.0.0: + version "5.0.0" + resolved "https://registry.yarnpkg.com/xml-name-validator/-/xml-name-validator-5.0.0.tgz#82be9b957f7afdacf961e5980f1bf227c0bf7673" + integrity sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg== + xml2js@^0.5.0: version "0.5.0" resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.5.0.tgz#d9440631fbb2ed800203fad106f2724f62c493b7" From 6eee0253cbcb473217a9bc91188bc4c6b10880a9 Mon Sep 17 00:00:00 2001 From: Kenneth Kalmer Date: Mon, 9 Jun 2025 17:00:42 +0100 Subject: [PATCH 2/7] chore: add text/markdown mime type --- config/mime.types | 1 + 1 file changed, 1 insertion(+) diff --git a/config/mime.types b/config/mime.types index 2961256950..86b7539dd6 100644 --- a/config/mime.types +++ b/config/mime.types @@ -8,6 +8,7 @@ types { application/javascript js; application/atom+xml atom; application/rss+xml rss; + text/markdown md; text/mathml mml; text/plain txt; From 2fc28dfd5efe808500e64b773aff8c459675c217 Mon Sep 17 00:00:00 2001 From: Kenneth Kalmer Date: Fri, 28 Nov 2025 17:28:50 +0000 Subject: [PATCH 3/7] feat: generate language-specific page variants for MDX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create variant pages at /{page}/{language} for each detected language. Extract languages from code blocks, set in pageContext for pre-rendering. 
🤖 Generated with Claude Code Co-Authored-By: Claude --- data/onCreatePage.ts | 65 +++++++++++++++++++++++++++++++-- src/contexts/layout-context.tsx | 19 +++++++++- 2 files changed, 78 insertions(+), 6 deletions(-) diff --git a/data/onCreatePage.ts b/data/onCreatePage.ts index 09906c9a36..61eef6ad10 100644 --- a/data/onCreatePage.ts +++ b/data/onCreatePage.ts @@ -1,6 +1,8 @@ import { GatsbyNode } from 'gatsby'; import path from 'path'; import fs from 'fs'; +import { stripSdkType } from '@ably/ui/core/CodeSnippet/languages'; +import { IGNORED_LANGUAGES } from './createPages/constants'; export type LayoutOptions = { leftSidebar: boolean; @@ -55,6 +57,45 @@ const extractCodeLanguages = async (filePath: string): Promise> => { } }; +// Get unique base languages for variant page creation (without modifying original array) +const getBaseLanguagesForVariants = (detectedLanguages: Set): string[] => { + const baseLanguages = new Set(); + + detectedLanguages.forEach((lang) => { + // Skip ignored languages + const baseLang = stripSdkType(lang); + if (!IGNORED_LANGUAGES.includes(lang) && !IGNORED_LANGUAGES.includes(baseLang)) { + baseLanguages.add(baseLang); + } + }); + + return Array.from(baseLanguages).sort(); +}; + +// Create language variant pages +const createLanguageVariants = ( + page: any, + originalLanguages: string[], + baseLanguages: string[], + layout: LayoutOptions, + createPage: Function, + mdxWrapper: string +): void => { + baseLanguages.forEach((baseLang) => { + createPage({ + ...page, + path: `${page.path}/${baseLang}`, + context: { + ...page.context, + layout, // Include layout config for sidebars! + language: baseLang, + languages: originalLanguages, // Keep original array untouched! 
+ }, + component: `${mdxWrapper}?__contentFilePath=${page.component}`, + }); + }); +}; + export const onCreatePage: GatsbyNode['onCreatePage'] = async ({ page, actions }) => { const { createPage } = actions; const pathOptions = Object.entries(pageLayoutOptions).find(([path]) => page.path === path); @@ -62,16 +103,32 @@ export const onCreatePage: GatsbyNode['onCreatePage'] = async ({ page, actions } const detectedLanguages = isMDX ? await extractCodeLanguages(page.component) : new Set(); if (pathOptions || isMDX) { + // IMPORTANT: Keep original languages array for default page (don't modify!) + const originalLanguages = Array.from(detectedLanguages); + + // Determine layout configuration + const layout = pathOptions + ? pathOptions[1] + : { leftSidebar: true, rightSidebar: true, searchBar: true, template: 'base', mdx: isMDX }; + + // Create/update the default page with ORIGINAL languages array createPage({ ...page, context: { ...page.context, - layout: pathOptions - ? pathOptions[1] - : { leftSidebar: true, rightSidebar: true, searchBar: true, template: 'base', mdx: isMDX }, - ...(isMDX ? { languages: Array.from(detectedLanguages) } : {}), + layout, + ...(isMDX ? { languages: originalLanguages } : {}), }, component: isMDX ? 
`${mdxWrapper}?__contentFilePath=${page.component}` : page.component, }); + + // Create language variant pages for MDX files + if (isMDX && detectedLanguages.size > 0) { + const baseLanguages = getBaseLanguagesForVariants(detectedLanguages); + if (baseLanguages.length > 0) { + console.log(`Creating ${baseLanguages.length} language variants for ${page.path}: ${baseLanguages.join(', ')}`); + createLanguageVariants(page, originalLanguages, baseLanguages, layout, createPage, mdxWrapper); + } + } } }; diff --git a/src/contexts/layout-context.tsx b/src/contexts/layout-context.tsx index 2be52ce4b1..c980e19e07 100644 --- a/src/contexts/layout-context.tsx +++ b/src/contexts/layout-context.tsx @@ -33,13 +33,23 @@ const determineActiveLanguage = ( activeLanguages: LanguageKey[], location: string, product: ProductKey | null, + pageContextLanguage?: string, ): LanguageKey => { + // Priority 1: Pre-set language from pageContext (for language variant pages) + if (pageContextLanguage && Object.keys(languageInfo).includes(pageContextLanguage)) { + return pageContextLanguage as LanguageKey; + } + + // Priority 2: Query parameter const params = new URLSearchParams(location); const langParam = params.get('lang') as LanguageKey; if (langParam && Object.keys(languageInfo).includes(langParam) && activeLanguages.includes(langParam)) { return langParam; - } else if (activeLanguages.length > 0 && product) { + } + + // Priority 3: First relevant language for product + if (activeLanguages.length > 0 && product) { const relevantLanguages = activeLanguages.filter((lang) => Object.keys(languageData[product]).includes(lang)); return relevantLanguages[0]; } @@ -94,7 +104,12 @@ export const LayoutProvider: React.FC Date: Fri, 28 Nov 2025 17:29:08 +0000 Subject: [PATCH 4/7] feat: clean URLs and canonical tags for language variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Strip language paths from browser URLs and canonical tags. 
Add noindex to variant pages. Public URLs use query params. 🤖 Generated with Claude Code Co-Authored-By: Claude --- src/components/Head.tsx | 3 +++ src/components/Layout/MDXWrapper.tsx | 27 +++++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/components/Head.tsx b/src/components/Head.tsx index 6789294f81..5716786e14 100644 --- a/src/components/Head.tsx +++ b/src/components/Head.tsx @@ -7,12 +7,14 @@ export const Head = ({ description, metaTitle, keywords, + robots, }: { title: string; canonical: string; description: string; metaTitle?: string; keywords?: string; + robots?: string; }) => ( {metaTitle || title} @@ -23,6 +25,7 @@ export const Head = ({ {keywords && } + {robots && } diff --git a/src/components/Layout/MDXWrapper.tsx b/src/components/Layout/MDXWrapper.tsx index 5e74554bad..8ac207002c 100644 --- a/src/components/Layout/MDXWrapper.tsx +++ b/src/components/Layout/MDXWrapper.tsx @@ -1,6 +1,7 @@ import React, { PropsWithChildren, useState, + useEffect, createContext, isValidElement, cloneElement, @@ -182,7 +183,29 @@ const MDXWrapper: React.FC = ({ children, pageContext, location const metaTitle = getMetaTitle(title, (activePage.product as ProductName) || META_PRODUCT_FALLBACK) as string; const { canonicalUrl } = useSiteMetadata(); - const canonical = canonicalUrl(location.pathname); + + // Check if current path is a language variant (ends with a language identifier) + // Pattern: /docs/auth/token/java → language variant + const languageSegmentPattern = /\/(javascript|typescript|python|java|ruby|php|go|swift|kotlin|csharp|objc|nodejs|react|flutter|laravel|shell)$/i; + const isLanguageVariant = languageSegmentPattern.test(location.pathname); + + // Strip language segment from pathname for canonical URL and browser display + const cleanPath = isLanguageVariant + ? 
location.pathname.replace(languageSegmentPattern, '') + : location.pathname; + const canonical = canonicalUrl(cleanPath) + location.search; + + // Prevent indexing of language variant URLs (nginx serves these via query params) + const robots = isLanguageVariant ? 'noindex, follow' : undefined; + + // Clean up browser URL if it's a language variant (strip language segment) + // Use Gatsby's navigate to keep router state in sync + useEffect(() => { + if (isLanguageVariant) { + const cleanUrl = cleanPath + location.search; + navigate(cleanUrl, { replace: true }); + } + }, [isLanguageVariant, cleanPath, location.search]); // Use the copyable headers hook useCopyableHeaders(); @@ -206,7 +229,7 @@ const MDXWrapper: React.FC = ({ children, pageContext, location return ( - +
Date: Fri, 28 Nov 2025 17:29:23 +0000 Subject: [PATCH 5/7] feat: nginx routing for language query parameters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Map ?lang=java to /java path using nginx map directive. Serves pre-rendered language-specific HTML files. 🤖 Generated with Claude Code Co-Authored-By: Claude --- bin/start-nginx | 2 ++ config/nginx.conf.erb | 20 +++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/bin/start-nginx b/bin/start-nginx index 5c6b4d4f1f..b551b50691 100755 --- a/bin/start-nginx +++ b/bin/start-nginx @@ -13,6 +13,8 @@ PORT=${PORT:-3001} \ NGINX_ERROR_LOG_PATH=${NGINX_ERROR_LOG_PATH:-"/dev/stderr"} \ NGINX_ROOT=$(pwd)/public \ NGINX_PID_FILE=${NGINX_PID_FILE:-"/tmp/nginx.pid"} \ + DEBUG_LOGGING=${DEBUG_LOGGING:-false} \ + PORT_IN_REDIRECT=${PORT_IN_REDIRECT:-true} \ SKIP_HTTPS=${SKIP_HTTPS:-true} \ ENABLE_BASIC_AUTH=${ENABLE_BASIC_AUTH:-false} \ erb config/nginx.conf.erb > config/nginx.conf diff --git a/config/nginx.conf.erb b/config/nginx.conf.erb index a1948850bb..c567bde159 100644 --- a/config/nginx.conf.erb +++ b/config/nginx.conf.erb @@ -40,7 +40,10 @@ http { 'content_type="$sent_http_content_type" accept="$http_accept"'; log_format l2met 'measure#nginx.service=$request_time request_id=$http_x_request_id'; access_log <%= ENV['NGINX_ACCESS_LOG_PATH'] || 'logs/nginx/access.log' %> logfmtably; - error_log <%= ENV['NGINX_ERROR_LOG_PATH'] || 'logs/nginx/error.log' %> notice; + error_log <%= ENV['NGINX_ERROR_LOG_PATH'] || 'logs/nginx/error.log' %> <%= ENV['DEBUG_LOGGING'] == 'true' ? 
'debug' : 'notice' %>; + <% if ENV['DEBUG_LOGGING'] == 'true' %> + rewrite_log on; + <% end %> include mime.types; default_type application/octet-stream; @@ -62,6 +65,12 @@ http { <% end %> } + # This is used to map ?lang=<lang> to a /<lang> path segment for serving up files + map $arg_lang $lang_path { + default ""; + ~^(.+)$ "/$1"; + } + ## # CORS CONFIGURATION @@ -130,7 +139,7 @@ http { server { listen <%= ENV["PORT"] %>; charset UTF-8; - port_in_redirect off; + port_in_redirect <%= ENV['PORT_IN_REDIRECT'] ? 'on' : 'off' %>; keepalive_timeout 5; root <%= ENV['NGINX_ROOT'] || '/app/public' %>; @@ -231,10 +240,11 @@ http { <% if content_request_protected %> # Serve the file if it exists, otherwise try to authenticate # (.html requests won't match here, they'll go to the @html_auth location) - try_files $request_uri @html_auth; + try_files $uri @html_auth; <% else %> # Serve the file if it exists, try index.html for paths without a trailing slash, otherwise 404 - try_files $request_uri $request_uri/index.html $request_uri/ =404; + # try_files $uri $uri/index.html =404; + try_files $uri$lang_path/index.html $uri/index.html =404; <% end %> } @@ -253,7 +263,7 @@ http { } # If the request is authenticated, break out of the location block and serve the file - try_files $request_uri.html $request_uri/index.html $request_uri/ =404; + try_files $uri.html $uri/index.html =404; } # Don't serve files with the .html extension here, send them to the canonical location From 9fd77990a7a555ba25e41d38f3280ea2f89cf5ba Mon Sep 17 00:00:00 2001 From: Kenneth Kalmer Date: Fri, 28 Nov 2025 17:29:39 +0000 Subject: [PATCH 6/7] docs: markdown generation documentation and tooling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ⚠️ NOTE: This commit will be dropped before final PR merge. Experimental markdown generation with React hydration. Currently disabled via MARKDOWN_SIMPLE_MODE=true.
🤖 Generated with Claude Code Co-Authored-By: Claude --- LANGUAGE_MARKDOWN_GENERATION.md | 369 ++++++++++ data/onPostBuild/markdownOutput.ts | 63 +- .../markdownOutputWithLanguages.ts | 631 ++++++++++++++++++ package.json | 5 +- scripts/generate-language-markdown.ts | 223 +++++++ 5 files changed, 1283 insertions(+), 8 deletions(-) create mode 100644 LANGUAGE_MARKDOWN_GENERATION.md create mode 100644 data/onPostBuild/markdownOutputWithLanguages.ts create mode 100644 scripts/generate-language-markdown.ts diff --git a/LANGUAGE_MARKDOWN_GENERATION.md b/LANGUAGE_MARKDOWN_GENERATION.md new file mode 100644 index 0000000000..abf2e0e94d --- /dev/null +++ b/LANGUAGE_MARKDOWN_GENERATION.md @@ -0,0 +1,369 @@ +# Language-Specific Markdown Generation + +This implementation generates language-specific markdown files from HTML pages with React-based language selectors. + +## Overview + +The system can operate in two modes: + +1. **Simple Mode** (legacy): Converts static HTML to markdown without language awareness +2. **Advanced Mode** (new): Hydrates React, switches languages, and generates separate markdown files per language + +## How It Works + +### Advanced Mode (Default) + +1. **Load HTML**: Reads built HTML files from `./public` +2. **Setup JSDOM**: Creates a browser-like environment with React support +3. **Asset Rewriting**: Rewrites `ASSET_PREFIX` URLs to local paths (since assets aren't deployed yet) +4. **React Hydration**: Loads and executes Gatsby bundles (webpack-runtime, framework, app, page bundles) +5. **Language Detection**: Identifies available languages from: + - Language selector DOM elements + - Page metadata + - Product-based language data (`src/data/languages/languageData.ts`) +6. **Language Switching**: For each language: + - Updates URL search params (`?lang=javascript`) + - Triggers React re-render + - Waits for content to update +7. **Content Extraction**: Extracts main content and converts to markdown +8. 
**File Generation**: Saves as `page.{language}.md` (e.g., `docs/realtime/channels.javascript.md`) + +### File Naming Convention + +- **With languages**: `/docs/foo/index.html` → `/docs/foo.javascript.md`, `/docs/foo.python.md`, etc. +- **Without languages**: `/docs/foo/index.html` → `/docs/foo.md` (current behavior) + +## Usage + +### During Build (Automatic) + +Advanced mode runs automatically after each build: + +```bash +yarn build +``` + +To force simple mode: + +```bash +MARKDOWN_SIMPLE_MODE=true yarn build +``` + +### Standalone Script + +Generate markdown without rebuilding the site: + +```bash +# Default (advanced mode, all pages, all languages) +yarn generate-markdown + +# Simple mode (static HTML conversion) +yarn generate-markdown:simple + +# Verbose logging +yarn generate-markdown:verbose + +# Custom options +node scripts/generate-language-markdown.ts --mode=advanced --verbose +``` + +#### CLI Options + +``` +--mode=<mode> Export mode: "simple" or "advanced" (default: advanced) +--env=<name> Environment to load (.env.<name>)
+--pages=<glob> Glob pattern to filter pages (e.g., "docs/realtime/*") +--languages=<list> Comma-separated languages (e.g., "javascript,python") +--site-url=<url> Site URL for absolute links +--verbose, -v Enable verbose logging +--help, -h Show help message +``` + +### Examples + +```bash +# Generate for specific pages +yarn generate-markdown --pages="docs/realtime/*" + +# Generate specific languages only +yarn generate-markdown --languages="javascript,python" --verbose + +# Use different environment +yarn generate-markdown --env=staging +``` + +## Environment Variables + +- `ASSET_PREFIX`: Asset CDN URL (automatically rewritten to local paths) +- `MARKDOWN_SIMPLE_MODE`: Set to `'true'` to force simple mode +- `VERBOSE`: Set to `'true'` for detailed logging +- `GATSBY_ABLY_MAIN_WEBSITE`: Site URL for absolute links + +## Implementation Details + +### File Structure + +``` +data/onPostBuild/ +├── markdownOutput.ts # Mode switcher and simple implementation +├── markdownOutputWithLanguages.ts # Advanced mode with React hydration +└── index.ts # Post-build hook orchestration + +scripts/ +└── generate-language-markdown.ts # Standalone CLI script +``` + +### Key Components + +#### 1. JSDOM Setup (`markdownOutputWithLanguages.ts`) + +```typescript +class LocalAssetResourceLoader extends ResourceLoader { + // Rewrites ASSET_PREFIX URLs to local ./public paths + async fetch(url: string, options: any) { + if (this.assetPrefix && url.includes(this.assetPrefix)) { + const localPath = url.replace(this.assetPrefix, ''); + return fs.readFile(path.join('./public', localPath)); + } + return super.fetch(url, options); + } +} +``` + +#### 2. Language Detection + +```typescript +function detectAvailableLanguages(document: Document, htmlFile: string): string[] { + // 1. Try DOM selectors + const options = document.querySelectorAll('[data-language-selector] option'); + if (options.length > 0) { + return Array.from(options).map(opt => opt.value); + } + + // 2. 
Fallback to product-based data + const product = extractProductFromPath(htmlFile); // e.g., 'realtime' → 'pubsub' + return Object.keys(languageData[product]); +} +``` + +#### 3. Language Switching + +```typescript +async function switchLanguage(dom: JSDOM, language: string): Promise<boolean> { + // Update URL search params + window.location.search = `?lang=${language}`; + + // Trigger events + window.dispatchEvent(new Event('popstate')); + window.dispatchEvent(new Event('hashchange')); + + // Manipulate selector + const selector = document.querySelector('[data-language-selector] select'); + selector.value = language; + selector.dispatchEvent(new Event('change')); + + // Wait for content to update + await waitFor(() => contentChanged(), 5000); +} +``` + +### Frontmatter Schema + +```yaml +--- +title: "Channel Lifecycle" +url: "/docs/realtime/channels" +generated_at: "2025-11-18T10:30:00Z" +description: "Learn about channel lifecycle and state management" +language: "javascript" +language_version: "2.11" +--- +``` + +## Supported Languages + +Languages are defined per product in `src/data/languages/languageData.ts`: + +- **Pub/Sub**: javascript, nodejs, typescript, react, csharp, flutter, java, kotlin, objc, php, python, ruby, swift, go, laravel +- **Chat**: javascript, react, swift, kotlin +- **Spaces**: javascript, react +- **Asset Tracking**: javascript, swift, kotlin + +## Troubleshooting + +### React Hydration Fails + +**Symptom**: The page falls back to static extraction (a single `.md` file with no per-language variants) + +**Causes**: +- Missing Gatsby bundles +- JavaScript errors during hydration +- Timeout (default: 30s) + +**Solution**: Check browser console logs, increase timeout in `CONFIG.hydrationTimeout` + +### Language Switching Doesn't Work + +**Symptom**: All language files have identical content + +**Causes**: +- Language selector not found +- React state not updating +- Content not conditional on language + +**Solution**: +- Verify language selector exists: `document.querySelector('[data-language-selector]')` +- Check if 
content actually changes by language in browser +- Increase `CONFIG.languageSwitchTimeout` + +### Asset Loading Errors + +**Symptom**: Scripts fail to load, 404 errors + +**Causes**: +- `ASSET_PREFIX` not properly rewritten +- Assets not built yet +- Incorrect path resolution + +**Solution**: +- Ensure `./public` directory exists with all assets +- Check `ASSET_PREFIX` value matches expected URL +- Verify `rewriteAssetUrls()` is working correctly + +### Memory Issues + +**Symptom**: Process crashes with OOM + +**Causes**: +- Too many JSDOM instances +- Large pages +- Memory leaks + +**Solution**: +- Process files sequentially (current implementation) +- Reduce `CONFIG.hydrationTimeout` +- Use `--max-old-space-size=4096` Node flag + +## Performance Considerations + +### Simple Mode +- **Speed**: ~50-100ms per page +- **Memory**: ~50MB for 100 pages +- **Use Case**: No language selectors, static content + +### Advanced Mode +- **Speed**: ~2-5 seconds per page (per language) +- **Memory**: ~200-500MB for 100 pages +- **Use Case**: Language selectors, conditional content + +### Optimization Strategies + +1. **Parallel Processing** (future): Use worker threads for multiple pages +2. **Caching**: Reuse JSDOM environment for same template types +3. **Selective Generation**: Only regenerate changed pages +4. **Hybrid Mode**: Use simple mode for pages without language selectors + +## Future Enhancements + +### 1. Smart Detection +- Detect which pages actually need language processing +- Skip pages where content doesn't change by language + +### 2. Incremental Generation +```typescript +interface IncrementalOptions { + changedFiles?: string[]; // Only regenerate these + compareHash?: boolean; // Skip if content hash unchanged +} +``` + +### 3. Parallel Processing +```typescript +import { Worker } from 'worker_threads'; + +async function processInParallel(files: string[], workers: number) { + // Distribute files across worker threads +} +``` + +### 4. 
Page Filtering +The `--pages` and `--languages` options are already designed into the CLI but are not implemented yet: + +```bash +yarn generate-markdown --pages="docs/realtime/*" +yarn generate-markdown --languages="javascript,python" +``` + +## Testing + +### Manual Testing + +```bash +# 1. Build the site +yarn build + +# 2. Check generated files +ls public/docs/realtime/*.md + +# 3. Verify content differs by language +diff public/docs/realtime/channels.javascript.md public/docs/realtime/channels.python.md + +# 4. Test CLI +yarn generate-markdown:verbose +``` + +### Test Cases + +1. **Pages with language selector**: Should generate multiple `.{lang}.md` files +2. **Pages without language selector**: Should generate single `.md` file +3. **Invalid HTML**: Should fall back to simple mode +4. **Missing assets**: Should handle gracefully +5. **ASSET_PREFIX**: Should rewrite URLs correctly + +### Debugging + +Enable verbose logging: + +```bash +VERBOSE=true yarn generate-markdown +``` + +Or use Node debugger: + +```bash +node --inspect-brk scripts/generate-language-markdown.ts +``` + +## Known Limitations + +1. **Server-Side Only**: Cannot run in browser +2. **Sequential Processing**: One page at a time (slow for large sites) +3. **React Dependency**: Requires React to be fully functional +4. **Limited Language Detection**: Relies on DOM or product mapping +5. **No Incremental Updates**: Regenerates all files every time +6. **Memory Intensive**: JSDOM + React uses significant RAM + +## Contributing + +When modifying the language generation: + +1. Test both simple and advanced modes +2. Verify ASSET_PREFIX handling for staging/production +3. Check memory usage for large page sets +4. Update this documentation +5. 
Add tests for new features + +## Related Files + +- `src/components/Layout/LanguageSelector.tsx` - Language selector component +- `src/data/languages/languageData.ts` - Language versions per product +- `gatsby-config.ts` - Asset prefix configuration +- `data/onPostBuild/index.ts` - Post-build hook orchestration + +## Questions? + +For issues or questions: +1. Check the troubleshooting section above +2. Review JSDOM and Gatsby documentation +3. Examine browser console for client-side behavior +4. Contact the documentation team diff --git a/data/onPostBuild/markdownOutput.ts b/data/onPostBuild/markdownOutput.ts index 18fd3b3eff..afaaa7ee1e 100644 --- a/data/onPostBuild/markdownOutput.ts +++ b/data/onPostBuild/markdownOutput.ts @@ -1,9 +1,10 @@ import { GatsbyNode, Reporter } from 'gatsby'; -import fs from 'fs/promises'; -import path from 'path'; +import * as fs from 'fs/promises'; +import * as path from 'path'; import { glob } from 'glob'; import { JSDOM, VirtualConsole } from 'jsdom'; -import TurndownService from 'turndown'; +import * as TurndownService from 'turndown'; +import { exportToMarkdownWithLanguages } from './markdownOutputWithLanguages'; const CONFIG = { htmlDir: './public', @@ -25,8 +26,12 @@ const cleanAttribute = (attribute: string | null) => { return attribute ? 
attribute.replace(/(\n+\s*)+/g, '\n') : ''; }; -async function exportToMarkdown({ reporter, siteUrl }: { reporter: Reporter; siteUrl: string }) { - const turndownService = new TurndownService({ +/** + * Simple markdown export (original implementation) + * Converts static HTML to markdown without language support + */ +async function exportToMarkdownSimple({ reporter, siteUrl }: { reporter: Reporter; siteUrl: string }) { + const turndownService = new (TurndownService as any)({ headingStyle: 'atx', codeBlockStyle: 'fenced', emDelimiter: '*', @@ -139,7 +144,50 @@ interface QueryResult { }; } -// Run the export +export interface MarkdownOutputOptions { + /** + * Use advanced mode with React hydration and language support (default: true) + * Set to false for simple static HTML to markdown conversion + */ + advancedMode?: boolean; +} + +/** + * Main markdown export function with mode switching + */ +export async function exportToMarkdown( + { reporter, siteUrl }: { reporter: Reporter; siteUrl: string }, + options: MarkdownOutputOptions = {} +) { + const { advancedMode = true } = options; + + // Check if advanced mode is disabled via environment variable + const forceSimpleMode = process.env.MARKDOWN_SIMPLE_MODE === 'true'; + + if (forceSimpleMode || !advancedMode) { + reporter.info('Using simple markdown export (static HTML conversion)'); + return exportToMarkdownSimple({ reporter, siteUrl }); + } + + // Use advanced mode with language support + reporter.info('Using advanced markdown export (React hydration + language support)'); + + const assetPrefix = process.env.ASSET_PREFIX; + + try { + await exportToMarkdownWithLanguages({ + reporter, + siteUrl, + assetPrefix, + }); + } catch (error) { + reporter.error('Advanced markdown export failed, falling back to simple mode:', error as Error); + // Fallback to simple mode if advanced mode fails + await exportToMarkdownSimple({ reporter, siteUrl }); + } +} + +// Run the export (Gatsby post-build hook) export const onPostBuild: 
GatsbyNode['onPostBuild'] = async ({ graphql, reporter }) => { const query = ` query { @@ -169,5 +217,6 @@ export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter throw new Error('Site URL not found.'); } - await exportToMarkdown({ reporter, siteUrl }); + // Default to advanced mode + await exportToMarkdown({ reporter, siteUrl }, { advancedMode: true }); }; diff --git a/data/onPostBuild/markdownOutputWithLanguages.ts b/data/onPostBuild/markdownOutputWithLanguages.ts new file mode 100644 index 0000000000..4de9602dfa --- /dev/null +++ b/data/onPostBuild/markdownOutputWithLanguages.ts @@ -0,0 +1,631 @@ +import { GatsbyNode, Reporter } from 'gatsby'; +import * as fs from 'fs/promises'; +import * as path from 'path'; +import { glob } from 'glob'; +import { JSDOM, VirtualConsole, ResourceLoader, FetchOptions, AbortablePromise } from 'jsdom'; +import * as TurndownService from 'turndown'; +import languageData from '../../src/data/languages/languageData'; + +const CONFIG = { + htmlDir: './public', + markdownDir: './public', + excludePatterns: ['404.html', 'api/**/*', 'page-data/**/*', 'static/**/*', 'docs/404.html'], + includeMetadata: true, + hydrationTimeout: 30000, // 30 seconds + languageSwitchTimeout: 5000, // 5 seconds per language +}; + +// Selectors for elements to remove from the HTML before converting to markdown +const UNWANTED_ELEMENTS_SELECTOR = + 'script, style, nav[role="navigation"], .header, #header, header, .footer, #footer, footer, [aria-label="breadcrumb"], aside'; + +// Prioritised selectors for the main content of the page, first match wins +const CONTENT_SELECTORS = ['main', '[role="main"]', '.content', '#content', 'article']; + +const withoutTrailingSlash = (urlPath: string) => (urlPath === `/` ? urlPath : urlPath.replace(/\/$/, ``)); + +const cleanAttribute = (attribute: string | null) => { + return attribute ? 
attribute.replace(/(\n+\s*)+/g, '\n') : ''; +}; + +interface LanguageMarkdownOptions { + reporter: Reporter; + siteUrl: string; + assetPrefix?: string; +} + +/** + * Custom ResourceLoader that rewrites ASSET_PREFIX URLs to local file paths + */ +class LocalAssetResourceLoader extends ResourceLoader { + private assetPrefix?: string; + private publicDir: string; + + constructor(assetPrefix?: string) { + super(); + this.assetPrefix = assetPrefix; + this.publicDir = path.resolve('./public'); + } + + fetch(url: string, options: FetchOptions): AbortablePromise { + // If URL contains ASSET_PREFIX, rewrite to local path + if (this.assetPrefix && url.includes(this.assetPrefix)) { + const localPath = url.replace(this.assetPrefix, ''); + const fullPath = path.join(this.publicDir, localPath); + + const promise = (async () => { + try { + const content = await fs.readFile(fullPath, 'utf-8'); + return Buffer.from(content); + } catch (error) { + // If file doesn't exist locally, return null + return null as any; + } + })() as AbortablePromise; + + // Add abort method to make it an AbortablePromise + promise.abort = () => {}; + + return promise; + } + + // For other URLs, use default behavior (but we'll configure JSDOM to not load external resources) + return super.fetch(url, options); + } +} + +/** + * Rewrite asset URLs in HTML content from ASSET_PREFIX to relative paths + */ +function rewriteAssetUrls(html: string, assetPrefix?: string): string { + if (!assetPrefix) return html; + + // Rewrite src and href attributes + return html + .replace(new RegExp(`src="${assetPrefix}`, 'g'), 'src="') + .replace(new RegExp(`href="${assetPrefix}`, 'g'), 'href="') + .replace(new RegExp(`src='${assetPrefix}`, 'g'), "src='") + .replace(new RegExp(`href='${assetPrefix}`, 'g'), "href='"); +} + +/** + * Extract product key from page path + * Examples: /docs/realtime/channels -> realtime (maps to pubsub) + */ +function extractProductFromPath(htmlFile: string): string | null { + const match = 
htmlFile.match(/docs\/(realtime|chat|spaces|asset-tracking)/i); + if (!match) return null; + + const segment = match[1].toLowerCase(); + + // Map path segments to product keys + const productMap: Record = { + 'realtime': 'pubsub', + 'chat': 'chat', + 'spaces': 'spaces', + 'asset-tracking': 'assetTracking', + }; + + return productMap[segment] || null; +} + +/** + * Detect available languages for a page + */ +function detectAvailableLanguages(document: Document, htmlFile: string): string[] { + // Try to find language selector in the DOM + const languageOptions = document.querySelectorAll('[data-language-selector] option, .language-selector option'); + if (languageOptions.length > 0) { + return Array.from(languageOptions) + .map(option => option.getAttribute('value')) + .filter((val): val is string => !!val); + } + + // Fallback: use product-based language data + const product = extractProductFromPath(htmlFile); + if (product && languageData[product as keyof typeof languageData]) { + return Object.keys(languageData[product as keyof typeof languageData]); + } + + return []; +} + +/** + * Wait for a condition with timeout + */ +function waitFor( + condition: () => boolean, + timeout: number, + checkInterval: number = 100 +): Promise { + return new Promise((resolve) => { + const startTime = Date.now(); + + const check = () => { + if (condition()) { + resolve(true); + return; + } + + if (Date.now() - startTime > timeout) { + resolve(false); + return; + } + + setTimeout(check, checkInterval); + }; + + check(); + }); +} + +/** + * Load and execute Gatsby bundles in JSDOM + */ +async function loadGatsbyBundles( + dom: JSDOM, + htmlFile: string, + reporter: Reporter +): Promise { + const { window } = dom; + const document = window.document; + + try { + // Gatsby bundles are already in the HTML as script tags + // We need to execute them in order + const scripts = Array.from(document.querySelectorAll('script[src]')); + + // Find the Gatsby scripts + const gatsbyScripts = 
scripts.filter(script => { + const src = script.getAttribute('src') || ''; + return src.includes('webpack-runtime') || + src.includes('framework') || + src.includes('app') || + src.match(/^\/component---/); + }); + + // Scripts are already loaded by JSDOM, but we need to ensure they executed + // Wait for Gatsby to be ready + const gatsbyReady = await waitFor( + () => { + return !!(window as any).___gatsby && !!(window as any).React; + }, + CONFIG.hydrationTimeout + ); + + if (!gatsbyReady) { + reporter.warn(`Gatsby failed to hydrate for ${htmlFile}`); + return false; + } + + // Wait for the page to be fully rendered + await waitFor( + () => { + const mainContent = document.querySelector(CONTENT_SELECTORS.join(',')); + return !!mainContent && mainContent.children.length > 0; + }, + CONFIG.hydrationTimeout + ); + + return true; + } catch (error) { + reporter.error(`Error loading Gatsby bundles for ${htmlFile}:`, error as Error); + return false; + } +} + +/** + * Switch to a specific language and wait for content to update + */ +async function switchLanguage( + dom: JSDOM, + language: string, + reporter: Reporter +): Promise { + const { window } = dom; + const document = window.document; + + try { + // Get current content hash to detect changes + const getContentHash = () => { + const mainContent = document.querySelector(CONTENT_SELECTORS.join(',')); + return mainContent ? 
mainContent.innerHTML.substring(0, 1000) : ''; + }; + + const beforeHash = getContentHash(); + + // Method 1: Try to manipulate the URL search params + const currentUrl = new URL(window.location.href); + currentUrl.searchParams.set('lang', language); + + // Update window.location + Object.defineProperty(window.location, 'search', { + writable: true, + value: currentUrl.search + }); + Object.defineProperty(window.location, 'href', { + writable: true, + value: currentUrl.href + }); + + // Dispatch events that might trigger language change + window.dispatchEvent(new window.Event('popstate')); + window.dispatchEvent(new window.Event('hashchange')); + + // Method 2: Try to find and manipulate the language selector directly + const languageSelector = document.querySelector('[data-language-selector] select, .language-selector select') as HTMLSelectElement; + if (languageSelector) { + languageSelector.value = language; + + // Trigger change event + const changeEvent = new window.Event('change', { bubbles: true }); + languageSelector.dispatchEvent(changeEvent); + } + + // Method 3: Try to manipulate React state directly if available + if ((window as any).___LANGUAGE_CONTEXT___) { + (window as any).___LANGUAGE_CONTEXT___.setLanguage(language); + } + + // Wait for content to change (or timeout) + const contentChanged = await waitFor( + () => { + const afterHash = getContentHash(); + return afterHash !== beforeHash && afterHash.length > 0; + }, + CONFIG.languageSwitchTimeout + ); + + if (!contentChanged) { + reporter.verbose(`Language switch to ${language} did not change content (might already be in that language)`); + } + + // Additional wait to ensure all React updates are complete + await new Promise(resolve => setTimeout(resolve, 500)); + + return true; + } catch (error) { + reporter.error(`Error switching to language ${language}:`, error as Error); + return false; + } +} + +/** + * Extract and convert content to markdown for a specific language + */ +function 
extractMarkdownForLanguage( + document: Document, + turndownService: TurndownService, + language: string | null, + siteUrl: string, + htmlFile: string +): { markdown: string; metadata: Record } { + // Remove unwanted elements (create a clone to avoid modifying the original) + const docClone = document.cloneNode(true) as Document; + const unwanted = docClone.querySelectorAll(UNWANTED_ELEMENTS_SELECTOR); + unwanted.forEach((el) => el.remove()); + + // Get main content + let mainContent = null; + for (const selector of CONTENT_SELECTORS) { + mainContent = docClone.querySelector(selector); + if (mainContent) { + break; + } + } + + if (!mainContent) { + mainContent = docClone.body; + } + + // Convert to markdown + const markdown = turndownService.turndown(mainContent.innerHTML); + + // Extract metadata + const title = document.querySelector('title')?.textContent?.trim() || 'Untitled'; + const description = document.querySelector('meta[name="description"]')?.getAttribute('content')?.trim() || ''; + const canonicalUrl = document.querySelector('link[rel="canonical"]')?.getAttribute('href') || ''; + + // Get language version if available + const product = extractProductFromPath(htmlFile); + let languageVersion: string | undefined; + if (language && product) { + const productData = languageData[product as keyof typeof languageData]; + if (productData && typeof productData === 'object') { + languageVersion = (productData as any)[language]; + } + } + + return { + markdown, + metadata: { + title, + url: canonicalUrl || `/${htmlFile.replace('.html', '').replace('/index', '')}`, + description, + language, + languageVersion, + generatedAt: new Date().toISOString(), + }, + }; +} + +/** + * Process a single HTML file with language support + */ +async function processHtmlFileWithLanguages( + htmlFile: string, + options: LanguageMarkdownOptions +): Promise { + const { reporter, siteUrl, assetPrefix } = options; + + try { + const fullPath = path.join(CONFIG.htmlDir, htmlFile); + let 
htmlContent = await fs.readFile(fullPath, 'utf-8'); + + // Rewrite asset URLs if ASSET_PREFIX is set + if (assetPrefix) { + htmlContent = rewriteAssetUrls(htmlContent, assetPrefix); + } + + // Create TurndownService + const turndownService = new (TurndownService as any)({ + headingStyle: 'atx', + codeBlockStyle: 'fenced', + emDelimiter: '*', + }); + + // Add custom rules (same as original) + turndownService.addRule('header', { + filter: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'], + replacement: (_, node) => { + const level = parseInt(node.nodeName.charAt(1), 10); + return `${'#'.repeat(level)} ${node.textContent}`; + }, + }); + + turndownService.addRule('localLink', { + filter: (node) => (node.nodeName === 'A' && node.getAttribute('href')?.startsWith('/')) || false, + replacement: (content, node) => { + let href = withoutTrailingSlash(siteUrl) + (node as HTMLElement).getAttribute('href'); + if (href) { + href = href.replace(/([()])/g, '\\$1'); + } + let title = cleanAttribute((node as HTMLElement).getAttribute('title')); + if (title) { + title = ' "' + title.replace(/"/g, '\\"') + '"'; + } + return '[' + content + '](' + href + title + ')'; + }, + }); + + // Setup JSDOM with custom resource loader + const virtualConsole = new VirtualConsole(); + virtualConsole.on('error', () => {}); // Suppress errors + + const resourceLoader = new LocalAssetResourceLoader(assetPrefix); + + const dom = new JSDOM(htmlContent, { + url: siteUrl + '/' + htmlFile, + runScripts: 'dangerously', + resources: resourceLoader, + virtualConsole, + beforeParse(window) { + // Mock necessary browser APIs + (window as any).requestAnimationFrame = (cb: any) => setTimeout(cb, 0); + (window as any).cancelAnimationFrame = (id: any) => clearTimeout(id); + + // Mock IntersectionObserver + (window as any).IntersectionObserver = class IntersectionObserver { + observe() {} + unobserve() {} + disconnect() {} + }; + + // Mock localStorage + if (!(window as any).localStorage) { + (window as any).localStorage = { + 
getItem: () => null, + setItem: () => {}, + removeItem: () => {}, + clear: () => {}, + }; + } + }, + }); + + const { window } = dom; + const document = window.document; + + // Detect available languages + const availableLanguages = detectAvailableLanguages(document, htmlFile); + + if (availableLanguages.length === 0) { + // No language selector found - generate single markdown file (current behavior) + reporter.verbose(`No languages found for ${htmlFile}, generating single markdown`); + + const { markdown, metadata } = extractMarkdownForLanguage( + document, + turndownService, + null, + siteUrl, + htmlFile + ); + + const finalContent = CONFIG.includeMetadata + ? `--- +title: "${metadata.title}" +url: ${metadata.url} +generated_at: ${metadata.generatedAt} +description: "${metadata.description}" +--- + +${markdown}` + : markdown; + + const outputName = `${htmlFile.replace('/index.html', '')}.md`; + const outputPath = path.join(CONFIG.markdownDir, outputName); + await fs.writeFile(outputPath, finalContent); + + return; + } + + reporter.info(`Found ${availableLanguages.length} languages for ${htmlFile}: ${availableLanguages.join(', ')}`); + + // Try to hydrate React + const hydrated = await loadGatsbyBundles(dom, htmlFile, reporter); + + if (!hydrated) { + reporter.warn(`Failed to hydrate React for ${htmlFile}, falling back to static extraction`); + + // Fall back to generating a single file without language switching + const { markdown, metadata } = extractMarkdownForLanguage( + document, + turndownService, + null, + siteUrl, + htmlFile + ); + + const finalContent = CONFIG.includeMetadata + ? 
`--- +title: "${metadata.title}" +url: ${metadata.url} +generated_at: ${metadata.generatedAt} +description: "${metadata.description}" +--- + +${markdown}` + : markdown; + + const outputName = `${htmlFile.replace('/index.html', '')}.md`; + const outputPath = path.join(CONFIG.markdownDir, outputName); + await fs.writeFile(outputPath, finalContent); + + return; + } + + // Generate markdown for each language + for (const language of availableLanguages) { + reporter.verbose(`Processing language: ${language} for ${htmlFile}`); + + // Switch to this language + const switched = await switchLanguage(dom, language, reporter); + + if (!switched) { + reporter.warn(`Failed to switch to language ${language} for ${htmlFile}`); + continue; + } + + // Extract content for this language + const { markdown, metadata } = extractMarkdownForLanguage( + document, + turndownService, + language, + siteUrl, + htmlFile + ); + + // Create final content with metadata + const finalContent = CONFIG.includeMetadata + ? `--- +title: "${metadata.title}" +url: ${metadata.url} +generated_at: ${metadata.generatedAt} +description: "${metadata.description}" +language: "${metadata.language}"${metadata.languageVersion ? 
`\nlanguage_version: "${metadata.languageVersion}"` : ''} +--- + +${markdown}` + : markdown; + + // Save with language suffix: page.javascript.md + const baseName = htmlFile.replace('/index.html', ''); + const outputName = `${baseName}.${language}.md`; + const outputPath = path.join(CONFIG.markdownDir, outputName); + + await fs.writeFile(outputPath, finalContent); + reporter.verbose(`✓ Generated ${outputName}`); + } + } catch (error) { + reporter.error(`Error processing ${htmlFile}:`, error as Error); + throw error; + } +}
+
+/**
+ * Export every built HTML page to language-specific markdown.
+ *
+ * Pages are discovered under `CONFIG.htmlDir` (minus `CONFIG.excludePatterns`) and
+ * converted one at a time, because running JSDOM with React is resource-intensive.
+ * A page that fails to convert is logged and skipped so the remaining pages are
+ * still exported.
+ *
+ * @param options - Reporter plus conversion settings; forwarded unchanged to
+ *   `processHtmlFileWithLanguages` for every page.
+ * @returns Resolves once every page has been attempted; per-page failures do not reject.
+ */
+export async function exportToMarkdownWithLanguages(options: LanguageMarkdownOptions): Promise<void> {
+  const { reporter } = options;
+
+  // Find all HTML files
+  const htmlFiles = await glob('**/*.html', {
+    cwd: CONFIG.htmlDir,
+    ignore: CONFIG.excludePatterns,
+  });
+
+  reporter.info(`Found ${htmlFiles.length} HTML files to process with language support`);
+
+  // Process files sequentially to avoid overwhelming the system
+  // (JSDOM with React is resource-intensive)
+  const failedFiles: string[] = [];
+  for (const htmlFile of htmlFiles) {
+    try {
+      await processHtmlFileWithLanguages(htmlFile, options);
+    } catch (error) {
+      reporter.error(
+        `Failed to process ${htmlFile}:`,
+        error instanceof Error ? error : new Error(String(error)),
+      );
+      // Remember the failure and continue with the next file
+      failedFiles.push(htmlFile);
+    }
+  }
+
+  if (failedFiles.length > 0) {
+    reporter.warn(
+      `${failedFiles.length} of ${htmlFiles.length} files could not be converted: ${failedFiles.join(', ')}`,
+    );
+  }
+
+  // Count only the pages that were actually converted, not every page that was found
+  reporter.info(
+    `✓ Language-aware markdown export complete! ${htmlFiles.length - failedFiles.length} files processed.`,
+  );
+}
+
+/** Shape of the site metadata query issued by `onPostBuild`. */
+interface QueryResult {
+  site: {
+    siteMetadata: {
+      siteUrl: string;
+    };
+  };
+}
+
+/**
+ * Gatsby post-build hook
+ *
+ * Reads `siteUrl` from the site metadata and `ASSET_PREFIX` from the environment,
+ * then runs the language-aware markdown export. Panics the build (and throws) when
+ * the GraphQL query fails or yields no site URL.
+ */
+export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter }) => {
+  const query = `
+    query {
+      site {
+        siteMetadata {
+          siteUrl
+        }
+      }
+    }
+  `;
+
+  const { data, errors } = await graphql<QueryResult>(query);
+
+  if (errors) {
+    // Hand the underlying errors to the reporter so the build log shows the cause
+    reporter.panicOnBuild(`Error while running GraphQL query.`, errors);
+    throw errors;
+  }
+
+  // An empty result and an empty siteUrl are the same failure for this hook
+  const siteUrl = data?.site.siteMetadata.siteUrl;
+
+  if (!siteUrl) {
+    reporter.panicOnBuild(`Site URL not found.`);
+    throw new Error('Site URL not found.');
+  }
+
+  const assetPrefix = process.env.ASSET_PREFIX;
+
+  await exportToMarkdownWithLanguages({
+    reporter,
+    siteUrl,
+    assetPrefix,
+  });
+};
diff --git a/package.json b/package.json index 96430037b9..1654947fb5 100644 --- a/package.json +++ b/package.json @@ -38,7 +38,10 @@ "lint-staged": "lint-staged", "repo-githooks": "git config core.hooksPath .githooks", "no-githooks": "git config --unset core.hooksPath", - "validate-llms-txt": "node bin/validate-llms.txt.ts" + "validate-llms-txt": "node bin/validate-llms.txt.ts", + "generate-markdown": "node scripts/generate-language-markdown.ts", + "generate-markdown:simple": "node scripts/generate-language-markdown.ts --mode=simple", + "generate-markdown:verbose": "node scripts/generate-language-markdown.ts --verbose" }, "dependencies": { "@ably/ui": "17.9.3", diff --git a/scripts/generate-language-markdown.ts b/scripts/generate-language-markdown.ts new file mode 100644 index 0000000000..29644f85f5 --- /dev/null +++ b/scripts/generate-language-markdown.ts @@ -0,0 +1,223 @@ +#!/usr/bin/env node + +/** + * Standalone CLI script for generating language-specific markdown files + * + * Usage: + * yarn generate-markdown + * yarn generate-markdown --mode=simple + * yarn
generate-markdown --pages "docs/realtime/*"
+ * yarn generate-markdown --languages "javascript,python"
+ * yarn generate-markdown --help
+ */
+
+import dotenv from 'dotenv';
+import { exportToMarkdown } from '../data/onPostBuild/markdownOutput';
+import { exportToMarkdownWithLanguages } from '../data/onPostBuild/markdownOutputWithLanguages';
+
+/**
+ * Mock reporter for standalone execution.
+ *
+ * Provides the reporter methods this script and the exporters call
+ * (`info`, `warn`, `error`, `verbose`, `panicOnBuild`) on top of `console`.
+ */
+class ConsoleReporter {
+  info(message: string): void {
+    console.log(`ℹ️ ${message}`);
+  }
+
+  warn(message: string): void {
+    console.warn(`⚠️ ${message}`);
+  }
+
+  error(message: string, error?: Error): void {
+    console.error(`❌ ${message}`);
+    if (error) {
+      console.error(error);
+    }
+  }
+
+  /** Only prints when the VERBOSE environment variable is exactly 'true'. */
+  verbose(message: string): void {
+    if (process.env.VERBOSE === 'true') {
+      console.log(`🔍 ${message}`);
+    }
+  }
+
+  /** Prints the message and terminates the process with exit code 1. */
+  panicOnBuild(message: string): never {
+    console.error(`💥 PANIC: ${message}`);
+    process.exit(1);
+  }
+}
+
+/** Options accepted on the command line. */
+interface CliOptions {
+  mode: 'simple' | 'advanced';
+  env?: string;
+  pages?: string;
+  languages?: string;
+  siteUrl?: string;
+  verbose: boolean;
+  help: boolean;
+}
+
+/**
+ * Parse command-line arguments into `CliOptions`.
+ *
+ * Value options may be written as `--name=value` or `--name value`. Only the first
+ * `=` separates name from value, so values that themselves contain `=` (for example
+ * a site URL with a query string) are kept intact. Unrecognised arguments are ignored.
+ * Exits the process with code 1 on an invalid `--mode` or a value option without a value.
+ *
+ * @param argv - Arguments to parse; defaults to the arguments passed to this process.
+ * @returns The parsed options. `siteUrl` is only set when `--site-url` was given.
+ */
+function parseArgs(argv: readonly string[] = process.argv.slice(2)): CliOptions {
+  const options: CliOptions = {
+    mode: 'advanced',
+    env: process.env.NODE_ENV || 'production',
+    pages: undefined,
+    languages: undefined,
+    // Deliberately unset: the caller resolves the default after the .env file has been
+    // loaded, so a GATSBY_ABLY_MAIN_WEBSITE defined there is not shadowed by an early default.
+    siteUrl: undefined,
+    verbose: false,
+    help: false,
+  };
+
+  const valueFlags = ['--mode', '--env', '--pages', '--languages', '--site-url'];
+  const pending = [...argv];
+
+  for (let arg = pending.shift(); arg !== undefined; arg = pending.shift()) {
+    if (arg === '--help' || arg === '-h') {
+      options.help = true;
+      continue;
+    }
+
+    if (arg === '--verbose' || arg === '-v') {
+      options.verbose = true;
+      continue;
+    }
+
+    // Split on the FIRST "=" only; everything after it belongs to the value
+    const separator = arg.indexOf('=');
+    const flag = separator === -1 ? arg : arg.slice(0, separator);
+
+    if (!valueFlags.includes(flag)) {
+      continue;
+    }
+
+    let value: string | undefined;
+    if (separator !== -1) {
+      value = arg.slice(separator + 1);
+    } else {
+      // "--name value" form: take the next argument unless it is another option
+      const next = pending[0];
+      if (next !== undefined && !next.startsWith('-')) {
+        value = pending.shift();
+      }
+    }
+
+    if (value === undefined) {
+      console.error(`Missing value for ${flag}`);
+      process.exit(1);
+    }
+
+    switch (flag) {
+      case '--mode':
+        if (value === 'simple' || value === 'advanced') {
+          options.mode = value;
+        } else {
+          console.error(`Invalid mode: ${value}. Must be "simple" or "advanced"`);
+          process.exit(1);
+        }
+        break;
+      case '--env':
+        options.env = value;
+        break;
+      case '--pages':
+        options.pages = value;
+        break;
+      case '--languages':
+        options.languages = value;
+        break;
+      case '--site-url':
+        options.siteUrl = value;
+        break;
+    }
+  }
+
+  return options;
+}
+
+/** Print usage information for this script to stdout. */
+function printHelp() {
+  console.log(`
+Generate language-specific markdown files from built HTML
+
+Usage:
+  yarn generate-markdown [options]
+
+Options:
+  --mode=<mode>          Export mode: "simple" or "advanced" (default: advanced)
+  --env=<env>            Environment to load (.env.<env>)
+  --pages=<pattern>      Glob pattern to filter pages (e.g., "docs/realtime/*") (not yet implemented)
+  --languages=<list>     Comma-separated language list (e.g., "javascript,python") (not yet implemented)
+  --site-url=<url>       Site URL for absolute links
+  --verbose, -v          Enable verbose logging
+  --help, -h             Show this help message
+
+Examples:
+  yarn generate-markdown
+  yarn generate-markdown --mode=simple
+  yarn generate-markdown --pages="docs/realtime/*"
+  yarn generate-markdown --languages="javascript,python" --verbose
+
+Environment Variables:
+  MARKDOWN_SIMPLE_MODE   Force simple mode (set to 'true')
+  ASSET_PREFIX           Asset prefix for rewriting URLs
+  VERBOSE                Enable verbose logging
+  `);
+}
+
+/** Load environment variables from a `.env.*` file via dotenv. */
+async function loadEnvironment(env?: string) {
+  // Load environment variables
+  const envFile = env ?
`.env.${env}` : `.env.${process.env.NODE_ENV || 'production'}`;
+
+  // dotenv reports a missing or unreadable file through `error` instead of throwing,
+  // so check it rather than announcing a load that never happened
+  const result = dotenv.config({ path: envFile });
+
+  if (result.error) {
+    console.warn(`⚠️ Could not load environment from ${envFile}: ${result.error.message}`);
+  } else {
+    console.log(`📦 Loaded environment from ${envFile}`);
+  }
+}
+
+/**
+ * Script entry point: parse the CLI options, load the environment file, run the
+ * selected markdown export and exit with 0 on success or 1 on failure.
+ */
+async function main() {
+  const options = parseArgs();
+
+  if (options.help) {
+    printHelp();
+    process.exit(0);
+  }
+
+  // Set verbose mode
+  if (options.verbose) {
+    process.env.VERBOSE = 'true';
+  }
+
+  // Load environment
+  await loadEnvironment(options.env);
+
+  // Resolved after the environment is loaded so MARKDOWN_SIMPLE_MODE (documented in
+  // the help text) can force simple mode from the shell or from the .env file
+  const mode = process.env.MARKDOWN_SIMPLE_MODE === 'true' ? 'simple' : options.mode;
+
+  // The exporters expect Gatsby's Reporter; ConsoleReporter only implements the
+  // methods they call, hence the loose cast
+  const reporter = new ConsoleReporter() as any;
+  const siteUrl = options.siteUrl || process.env.GATSBY_ABLY_MAIN_WEBSITE || 'https://ably.com';
+
+  console.log('');
+  console.log('🚀 Starting markdown generation...');
+  console.log('');
+  console.log(` Mode: ${mode}`);
+  console.log(` Site URL: ${siteUrl}`);
+  console.log(` Environment: ${options.env}`);
+
+  if (options.pages) {
+    console.log(` Pages filter: ${options.pages}`);
+  }
+
+  if (options.languages) {
+    console.log(` Languages: ${options.languages}`);
+  }
+
+  if (process.env.ASSET_PREFIX) {
+    console.log(` Asset Prefix: ${process.env.ASSET_PREFIX}`);
+  }
+
+  console.log('');
+
+  // Neither export mode applies the filters yet, so warn regardless of mode
+  if (options.pages || options.languages) {
+    reporter.warn('Page and language filtering not yet implemented, generating all pages/languages');
+    // TODO: Implement filtering by passing options to exportToMarkdownWithLanguages
+  }
+
+  const startTime = Date.now();
+
+  try {
+    if (mode === 'simple') {
+      // Simple mode
+      await exportToMarkdown(
+        { reporter, siteUrl },
+        { advancedMode: false }
+      );
+    } else {
+      // Advanced mode with language support
+      const assetPrefix = process.env.ASSET_PREFIX;
+
+      await exportToMarkdownWithLanguages({
+        reporter,
+        siteUrl,
+        assetPrefix,
+      });
+    }
+
+    const duration = ((Date.now() - startTime) / 1000).toFixed(2);
+
+    console.log('');
+    console.log(`✅ Markdown generation complete in ${duration}s`);
+    console.log('');
+
+    process.exit(0);
+  } catch (error) {
+    console.error('');
+    console.error('❌ Markdown generation failed:');
+    console.error(error);
+
console.error(''); + + process.exit(1); + } +} + +// Handle unhandled rejections +process.on('unhandledRejection', (error) => { + console.error('Unhandled rejection:', error); + process.exit(1); +}); + +// Run main function +main(); From 8571faea47f4efdaae293f1994465787681d236d Mon Sep 17 00:00:00 2001 From: Kenneth Kalmer Date: Fri, 28 Nov 2025 17:34:41 +0000 Subject: [PATCH 7/7] chore: `yarn.lock` update --- yarn.lock | 29 +---------------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/yarn.lock b/yarn.lock index e89bb2d8e7..4b98509161 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1497,13 +1497,6 @@ resolved "https://registry.yarnpkg.com/@codesandbox/sandpack-themes/-/sandpack-themes-2.0.21.tgz#f1970f03537434fff008e9c9c3f6581b0e5940c2" integrity sha512-CMH/MO/dh6foPYb/3eSn2Cu/J3+1+/81Fsaj7VggICkCrmRk0qG5dmgjGAearPTnRkOGORIPHuRqwNXgw0E6YQ== -"@cspotcode/source-map-support@^0.8.0": - version "0.8.1" - resolved "https://registry.yarnpkg.com/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz#00629c35a688e05a88b1cda684fb9d5e73f000a1" - integrity sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw== - dependencies: - "@jridgewell/trace-mapping" "0.3.9" - "@csstools/color-helpers@^5.0.2": version "5.0.2" resolved "https://registry.yarnpkg.com/@csstools/color-helpers/-/color-helpers-5.0.2.tgz#82592c9a7c2b83c293d9161894e2a6471feb97b8" @@ -4857,21 +4850,6 @@ agent-base@^7.1.0, agent-base@^7.1.2: resolved "https://registry.yarnpkg.com/agent-base/-/agent-base-7.1.3.tgz#29435eb821bc4194633a5b89e5bc4703bafc25a1" integrity sha512-jRR5wdylq8CkOe6hei19GGZnxM6rBGwFl3Bg0YItGDimvjGtAvdZk4Pu6Cl4u4Igsws4a1fd1Vq3ezrhn4KmFw== -agentkeepalive@^4.2.1: - version "4.5.0" - resolved "https://registry.npmjs.org/agentkeepalive/-/agentkeepalive-4.5.0.tgz" - integrity sha512-5GG/5IbQQpC9FpkRGsSvZI5QYeSCzlJHdpBQntCsuTOxhKD8lqKhrleg2Yi7yvMIf82Ycmmqln9U8V9qwEiJew== - dependencies: - humanize-ms "^1.2.1" - 
-aggregate-error@^3.0.0: - version "3.1.0" - resolved "https://registry.npmjs.org/aggregate-error/-/aggregate-error-3.1.0.tgz" - integrity sha512-4I7Td01quW/RpocfNayFdFVk1qSuoh0E7JrbRJ16nH01HhKFQ88INq9Sd+nd72zqRySlr9BmDA8xlEJ6vJMrYA== - dependencies: - clean-stack "^2.0.0" - indent-string "^4.0.0" - ajv-formats@^2.1.1: version "2.1.1" resolved "https://registry.yarnpkg.com/ajv-formats/-/ajv-formats-2.1.1.tgz#6e669400659eb74973bbf2e33327180a0996b520" @@ -9349,7 +9327,7 @@ glob@^11.0.2: package-json-from-dist "^1.0.0" path-scurry "^2.0.0" -glob@^7.1.1, glob@^7.1.2, glob@^7.1.3, glob@^7.1.4, glob@^7.1.6, glob@^7.2.3: +glob@^7.1.1, glob@^7.1.3, glob@^7.1.4, glob@^7.1.6, glob@^7.2.3: version "7.2.3" resolved "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz" integrity sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q== @@ -13341,11 +13319,6 @@ path-scurry@^2.0.0: lru-cache "^11.0.0" minipass "^7.1.2" -path-to-regexp@0.1.10: - version "0.1.10" - resolved "https://registry.yarnpkg.com/path-to-regexp/-/path-to-regexp-0.1.10.tgz#67e9108c5c0551b9e5326064387de4763c4d5f8b" - integrity sha512-7lf7qcQidTku0Gu3YDPc8DJ1q7OOucfa/BSsIwjuh56VU7katFvuM8hULfkwB3Fns/rsVF7PwPKVw1sl5KQS9w== - path-to-regexp@0.1.12: version "0.1.12" resolved "https://registry.yarnpkg.com/path-to-regexp/-/path-to-regexp-0.1.12.tgz#d5e1a12e478a976d432ef3c58d534b9923164bb7"