diff --git a/LANGUAGE_MARKDOWN_GENERATION.md b/LANGUAGE_MARKDOWN_GENERATION.md new file mode 100644 index 0000000000..abf2e0e94d --- /dev/null +++ b/LANGUAGE_MARKDOWN_GENERATION.md @@ -0,0 +1,369 @@ +# Language-Specific Markdown Generation + +This implementation generates language-specific markdown files from HTML pages with React-based language selectors. + +## Overview + +The system can operate in two modes: + +1. **Simple Mode** (legacy): Converts static HTML to markdown without language awareness +2. **Advanced Mode** (new): Hydrates React, switches languages, and generates separate markdown files per language + +## How It Works + +### Advanced Mode (Default) + +1. **Load HTML**: Reads built HTML files from `./public` +2. **Setup JSDOM**: Creates a browser-like environment with React support +3. **Asset Rewriting**: Rewrites `ASSET_PREFIX` URLs to local paths (since assets aren't deployed yet) +4. **React Hydration**: Loads and executes Gatsby bundles (webpack-runtime, framework, app, page bundles) +5. **Language Detection**: Identifies available languages from: + - Language selector DOM elements + - Page metadata + - Product-based language data (`src/data/languages/languageData.ts`) +6. **Language Switching**: For each language: + - Updates URL search params (`?lang=javascript`) + - Triggers React re-render + - Waits for content to update +7. **Content Extraction**: Extracts main content and converts to markdown +8. **File Generation**: Saves as `page.{language}.md` (e.g., `docs/realtime/channels.javascript.md`) + +### File Naming Convention + +- **With languages**: `/docs/foo/index.html` → `/docs/foo.javascript.md`, `/docs/foo.python.md`, etc. +- **Without languages**: `/docs/foo/index.html` → `/docs/foo.md` (current behavior) + +## Usage + +### During Build (Automatic) + +Advanced mode runs automatically after each build: + +```bash +yarn build +``` + +To force simple mode: + +```bash +MARKDOWN_SIMPLE_MODE=true yarn build +``` + +### Standalone Script + +Generate markdown without rebuilding the site: + +```bash +# Default (advanced mode, all pages, all languages) +yarn generate-markdown + +# Simple mode (static HTML conversion) +yarn generate-markdown:simple + +# Verbose logging +yarn generate-markdown:verbose + +# Custom options +node scripts/generate-language-markdown.ts --mode=advanced --verbose +``` + +#### CLI Options + +``` +--mode= Export mode: "simple" or "advanced" (default: advanced) +--env= Environment to load (.env.) +--pages= Glob pattern to filter pages (e.g., "docs/realtime/*") +--languages= Comma-separated languages (e.g., "javascript,python") +--site-url= Site URL for absolute links +--verbose, -v Enable verbose logging +--help, -h Show help message +``` + +### Examples + +```bash +# Generate for specific pages +yarn generate-markdown --pages="docs/realtime/*" + +# Generate specific languages only +yarn generate-markdown --languages="javascript,python" --verbose + +# Use different environment +yarn generate-markdown --env=staging +``` + +## Environment Variables + +- `ASSET_PREFIX`: Asset CDN URL (automatically rewritten to local paths) +- `MARKDOWN_SIMPLE_MODE`: Set to `'true'` to force simple mode +- `VERBOSE`: Set to `'true'` for detailed logging +- `GATSBY_ABLY_MAIN_WEBSITE`: Site URL for absolute links + +## Implementation Details + +### File Structure + +``` +data/onPostBuild/ +├── markdownOutput.ts # Mode switcher and simple implementation +├── markdownOutputWithLanguages.ts # Advanced mode with React hydration +└── index.ts # Post-build hook orchestration + +scripts/ +└── generate-language-markdown.ts # Standalone CLI script +``` + +### Key Components + +#### 1. JSDOM Setup (`markdownOutputWithLanguages.ts`) + +```typescript +class LocalAssetResourceLoader extends ResourceLoader { + // Rewrites ASSET_PREFIX URLs to local ./public paths + async fetch(url: string, options: any) { + if (this.assetPrefix && url.includes(this.assetPrefix)) { + const localPath = url.replace(this.assetPrefix, ''); + return fs.readFile(path.join('./public', localPath)); + } + return super.fetch(url, options); + } +} +``` + +#### 2. Language Detection + +```typescript +function detectAvailableLanguages(document: Document, htmlFile: string): string[] { + // 1. Try DOM selectors + const options = document.querySelectorAll('[data-language-selector] option'); + if (options.length > 0) { + return Array.from(options).map(opt => opt.value); + } + + // 2. Fallback to product-based data + const product = extractProductFromPath(htmlFile); // e.g., 'realtime' → 'pubsub' + return Object.keys(languageData[product]); +} +``` + +#### 3. Language Switching + +```typescript +async function switchLanguage(dom: JSDOM, language: string): Promise { + // Update URL search params + window.location.search = `?lang=${language}`; + + // Trigger events + window.dispatchEvent(new Event('popstate')); + window.dispatchEvent(new Event('hashchange')); + + // Manipulate selector + const selector = document.querySelector('[data-language-selector] select'); + selector.value = language; + selector.dispatchEvent(new Event('change')); + + // Wait for content to update + await waitFor(() => contentChanged(), 5000); +} +``` + +### Frontmatter Schema + +```yaml +--- +title: "Channel Lifecycle" +url: "/docs/realtime/channels" +generated_at: "2025-11-18T10:30:00Z" +description: "Learn about channel lifecycle and state management" +language: "javascript" +language_version: "2.11" +--- +``` + +## Supported Languages + +Languages are defined per product in `src/data/languages/languageData.ts`: + +- **Pub/Sub**: javascript, nodejs, typescript, react, csharp, flutter, java, kotlin, objc, php, python, ruby, swift, go, laravel +- **Chat**: javascript, react, swift, kotlin +- **Spaces**: javascript, react +- **Asset Tracking**: javascript, swift, kotlin + +## Troubleshooting + +### React Hydration Fails + +**Symptom**: Falls back to simple mode + +**Causes**: +- Missing Gatsby bundles +- JavaScript errors during hydration +- Timeout (default: 30s) + +**Solution**: Check browser console logs, increase timeout in `CONFIG.hydrationTimeout` + +### Language Switching Doesn't Work + +**Symptom**: All language files have identical content + +**Causes**: +- Language selector not found +- React state not updating +- Content not conditional on language + +**Solution**: +- Verify language selector exists: `document.querySelector('[data-language-selector]')` +- Check if content actually changes by language in browser +- Increase `CONFIG.languageSwitchTimeout` + +### Asset Loading Errors + +**Symptom**: Scripts fail to load, 404 errors + +**Causes**: +- `ASSET_PREFIX` not properly rewritten +- Assets not built yet +- Incorrect path resolution + +**Solution**: +- Ensure `./public` directory exists with all assets +- Check `ASSET_PREFIX` value matches expected URL +- Verify `rewriteAssetUrls()` is working correctly + +### Memory Issues + +**Symptom**: Process crashes with OOM + +**Causes**: +- Too many JSDOM instances +- Large pages +- Memory leaks + +**Solution**: +- Process files sequentially (current implementation) +- Reduce `CONFIG.hydrationTimeout` +- Use `--max-old-space-size=4096` Node flag + +## Performance Considerations + +### Simple Mode +- **Speed**: ~50-100ms per page +- **Memory**: ~50MB for 100 pages +- **Use Case**: No language selectors, static content + +### Advanced Mode +- **Speed**: ~2-5 seconds per page (per language) +- **Memory**: ~200-500MB for 100 pages +- **Use Case**: Language selectors, conditional content + +### Optimization Strategies + +1. **Parallel Processing** (future): Use worker threads for multiple pages +2. **Caching**: Reuse JSDOM environment for same template types +3. **Selective Generation**: Only regenerate changed pages +4. **Hybrid Mode**: Use simple mode for pages without language selectors + +## Future Enhancements + +### 1. Smart Detection +- Detect which pages actually need language processing +- Skip pages where content doesn't change by language + +### 2. Incremental Generation +```typescript +interface IncrementalOptions { + changedFiles?: string[]; // Only regenerate these + compareHash?: boolean; // Skip if content hash unchanged +} +``` + +### 3. Parallel Processing +```typescript +import { Worker } from 'worker_threads'; + +async function processInParallel(files: string[], workers: number) { + // Distribute files across worker threads +} +``` + +### 4. Page Filtering +Already designed in CLI but not implemented: + +```bash +yarn generate-markdown --pages="docs/realtime/*" +yarn generate-markdown --languages="javascript,python" +``` + +## Testing + +### Manual Testing + +```bash +# 1. Build the site +yarn build + +# 2. Check generated files +ls public/docs/realtime/*.md + +# 3. Verify content differs by language +diff public/docs/realtime/channels.javascript.md public/docs/realtime/channels.python.md + +# 4. Test CLI +yarn generate-markdown:verbose +``` + +### Test Cases + +1. **Pages with language selector**: Should generate multiple `.{lang}.md` files +2. **Pages without language selector**: Should generate single `.md` file +3. **Invalid HTML**: Should fall back to simple mode +4. **Missing assets**: Should handle gracefully +5. **ASSET_PREFIX**: Should rewrite URLs correctly + +### Debugging + +Enable verbose logging: + +```bash +VERBOSE=true yarn generate-markdown +``` + +Or use Node debugger: + +```bash +node --inspect-brk scripts/generate-language-markdown.ts +``` + +## Known Limitations + +1. **Server-Side Only**: Cannot run in browser +2. **Sequential Processing**: One page at a time (slow for large sites) +3. **React Dependency**: Requires React to be fully functional +4. **Limited Language Detection**: Relies on DOM or product mapping +5. **No Incremental Updates**: Regenerates all files every time +6. **Memory Intensive**: JSDOM + React uses significant RAM + +## Contributing + +When modifying the language generation: + +1. Test both simple and advanced modes +2. Verify ASSET_PREFIX handling for staging/production +3. Check memory usage for large page sets +4. Update this documentation +5. Add tests for new features + +## Related Files + +- `src/components/Layout/LanguageSelector.tsx` - Language selector component +- `src/data/languages/languageData.ts` - Language versions per product +- `gatsby-config.ts` - Asset prefix configuration +- `data/onPostBuild/index.ts` - Post-build hook orchestration + +## Questions? + +For issues or questions: +1. Check the troubleshooting section above +2. Review JSDOM and Gatsby documentation +3. Examine browser console for client-side behavior +4. Contact the documentation team diff --git a/bin/start-nginx b/bin/start-nginx index 5c6b4d4f1f..b551b50691 100755 --- a/bin/start-nginx +++ b/bin/start-nginx @@ -13,6 +13,8 @@ PORT=${PORT:-3001} \ NGINX_ERROR_LOG_PATH=${NGINX_ERROR_LOG_PATH:-"/dev/stderr"} \ NGINX_ROOT=$(pwd)/public \ NGINX_PID_FILE=${NGINX_PID_FILE:-"/tmp/nginx.pid"} \ + DEBUG_LOGGING=${DEBUG_LOGGING:-false} \ + PORT_IN_REDIRECT=${PORT_IN_REDIRECT:-true} \ SKIP_HTTPS=${SKIP_HTTPS:-true} \ ENABLE_BASIC_AUTH=${ENABLE_BASIC_AUTH:-false} \ erb config/nginx.conf.erb > config/nginx.conf diff --git a/config/mime.types b/config/mime.types index 2961256950..86b7539dd6 100644 --- a/config/mime.types +++ b/config/mime.types @@ -8,6 +8,7 @@ types { application/javascript js; application/atom+xml atom; application/rss+xml rss; + text/markdown md; text/mathml mml; text/plain txt; diff --git a/config/nginx.conf.erb b/config/nginx.conf.erb index a1948850bb..c567bde159 100644 --- a/config/nginx.conf.erb +++ b/config/nginx.conf.erb @@ -40,7 +40,10 @@ http { 'content_type="$sent_http_content_type" accept="$http_accept"'; log_format l2met 'measure#nginx.service=$request_time request_id=$http_x_request_id'; access_log <%= ENV['NGINX_ACCESS_LOG_PATH'] || 'logs/nginx/access.log' %> logfmtably; - error_log <%= ENV['NGINX_ERROR_LOG_PATH'] || 'logs/nginx/error.log' %> notice; + error_log <%= ENV['NGINX_ERROR_LOG_PATH'] || 'logs/nginx/error.log' %> <%= ENV['DEBUG_LOGGING'] == 'true' ? 'debug' : 'notice' %>; + <% if ENV['DEBUG_LOGGING'] == 'true' %> + rewrite_log on; + <% end %> include mime.types; default_type application/octet-stream; @@ -62,6 +65,12 @@ http { <% end %> } + # This is used map ?lang= to // for serving up files + map $arg_lang $lang_path { + default ""; + ~^(.+)$ "/$1"; + } + ## # CORS CONFIGURATION @@ -130,7 +139,7 @@ http { server { listen <%= ENV["PORT"] %>; charset UTF-8; - port_in_redirect off; + port_in_redirect <%= ENV['PORT_IN_REDIRECT'] ? 'on' : 'off' %>; keepalive_timeout 5; root <%= ENV['NGINX_ROOT'] || '/app/public' %>; @@ -231,10 +240,11 @@ http { <% if content_request_protected %> # Serve the file if it exists, otherwise try to authenticate # (.html requests won't match here, they'll go to the @html_auth location) - try_files $request_uri @html_auth; + try_files $uri @html_auth; <% else %> # Serve the file if it exists, try index.html for paths without a trailing slash, otherwise 404 - try_files $request_uri $request_uri/index.html $request_uri/ =404; + # try_files $uri $uri/index.html =404; + try_files $uri$lang_path/index.html $uri/index.html =404; <% end %> } @@ -253,7 +263,7 @@ http { } # If the request is authenticated, break out of the location block and serve the file - try_files $request_uri.html $request_uri/index.html $request_uri/ =404; + try_files $uri.html $uri/index.html =404; } # Don't serve files with the .html extension here, send them to the canonical location diff --git a/data/onCreatePage.ts b/data/onCreatePage.ts index 09906c9a36..61eef6ad10 100644 --- a/data/onCreatePage.ts +++ b/data/onCreatePage.ts @@ -1,6 +1,8 @@ import { GatsbyNode } from 'gatsby'; import path from 'path'; import fs from 'fs'; +import { stripSdkType } from '@ably/ui/core/CodeSnippet/languages'; +import { IGNORED_LANGUAGES } from './createPages/constants'; export type LayoutOptions = { leftSidebar: boolean; @@ -55,6 +57,45 @@ const extractCodeLanguages = async (filePath: string): Promise> => { } }; +// Get unique base languages for variant page creation (without modifying original array) +const getBaseLanguagesForVariants = (detectedLanguages: Set): string[] => { + const baseLanguages = new Set(); + + detectedLanguages.forEach((lang) => { + // Skip ignored languages + const baseLang = stripSdkType(lang); + if (!IGNORED_LANGUAGES.includes(lang) && !IGNORED_LANGUAGES.includes(baseLang)) { + baseLanguages.add(baseLang); + } + }); + + return Array.from(baseLanguages).sort(); +}; + +// Create language variant pages +const createLanguageVariants = ( + page: any, + originalLanguages: string[], + baseLanguages: string[], + layout: LayoutOptions, + createPage: Function, + mdxWrapper: string +): void => { + baseLanguages.forEach((baseLang) => { + createPage({ + ...page, + path: `${page.path}/${baseLang}`, + context: { + ...page.context, + layout, // Include layout config for sidebars! + language: baseLang, + languages: originalLanguages, // Keep original array untouched! + }, + component: `${mdxWrapper}?__contentFilePath=${page.component}`, + }); + }); +}; + export const onCreatePage: GatsbyNode['onCreatePage'] = async ({ page, actions }) => { const { createPage } = actions; const pathOptions = Object.entries(pageLayoutOptions).find(([path]) => page.path === path); @@ -62,16 +103,32 @@ export const onCreatePage: GatsbyNode['onCreatePage'] = async ({ page, actions } const detectedLanguages = isMDX ? await extractCodeLanguages(page.component) : new Set(); if (pathOptions || isMDX) { + // IMPORTANT: Keep original languages array for default page (don't modify!) + const originalLanguages = Array.from(detectedLanguages); + + // Determine layout configuration + const layout = pathOptions + ? pathOptions[1] + : { leftSidebar: true, rightSidebar: true, searchBar: true, template: 'base', mdx: isMDX }; + + // Create/update the default page with ORIGINAL languages array createPage({ ...page, context: { ...page.context, - layout: pathOptions - ? pathOptions[1] - : { leftSidebar: true, rightSidebar: true, searchBar: true, template: 'base', mdx: isMDX }, - ...(isMDX ? { languages: Array.from(detectedLanguages) } : {}), + layout, + ...(isMDX ? { languages: originalLanguages } : {}), }, component: isMDX ? `${mdxWrapper}?__contentFilePath=${page.component}` : page.component, }); + + // Create language variant pages for MDX files + if (isMDX && detectedLanguages.size > 0) { + const baseLanguages = getBaseLanguagesForVariants(detectedLanguages); + if (baseLanguages.length > 0) { + console.log(`Creating ${baseLanguages.length} language variants for ${page.path}: ${baseLanguages.join(', ')}`); + createLanguageVariants(page, originalLanguages, baseLanguages, layout, createPage, mdxWrapper); + } + } } }; diff --git a/data/onPostBuild/index.ts b/data/onPostBuild/index.ts index 844392b4d6..84c131f0f4 100644 --- a/data/onPostBuild/index.ts +++ b/data/onPostBuild/index.ts @@ -1,9 +1,11 @@ import { GatsbyNode } from 'gatsby'; import { onPostBuild as llmstxt } from './llmstxt'; import { onPostBuild as compressAssets } from './compressAssets'; +import { onPostBuild as markdownOutput } from './markdownOutput'; export const onPostBuild: GatsbyNode['onPostBuild'] = async (args) => { // Run all onPostBuild functions in sequence await llmstxt(args); + await markdownOutput(args); await compressAssets(args); }; diff --git a/data/onPostBuild/markdownOutput.ts b/data/onPostBuild/markdownOutput.ts new file mode 100644 index 0000000000..afaaa7ee1e --- /dev/null +++ b/data/onPostBuild/markdownOutput.ts @@ -0,0 +1,222 @@ +import { GatsbyNode, Reporter } from 'gatsby'; +import * as fs from 'fs/promises'; +import * as path from 'path'; +import { glob } from 'glob'; +import { JSDOM, VirtualConsole } from 'jsdom'; +import * as TurndownService from 'turndown'; +import { exportToMarkdownWithLanguages } from './markdownOutputWithLanguages'; + +const CONFIG = { + htmlDir: './public', + markdownDir: './public', + excludePatterns: ['404.html', 'api/**/*', 'page-data/**/*', 'static/**/*', 'docs/404.html'], + includeMetadata: true, +}; + +// Selectors for elements to remove from the HTML before converting to markdown +const UNWANTED_ELEMENTS_SELECTOR = + 'script, style, nav[role="navigation"], .header, #header, header, .footer, #footer, footer, [aria-label="breadcrumb"], aside'; + +// Prioritised selectors for the main content of the page, first match wins +const CONTENT_SELECTORS = ['main', '[role="main"]', '.content', '#content', 'article']; + +const withoutTrailingSlash = (path: string) => (path === `/` ? path : path.replace(/\/$/, ``)); + +const cleanAttribute = (attribute: string | null) => { + return attribute ? attribute.replace(/(\n+\s*)+/g, '\n') : ''; +}; + +/** + * Simple markdown export (original implementation) + * Converts static HTML to markdown without language support + */ +async function exportToMarkdownSimple({ reporter, siteUrl }: { reporter: Reporter; siteUrl: string }) { + const turndownService = new (TurndownService as any)({ + headingStyle: 'atx', + codeBlockStyle: 'fenced', + emDelimiter: '*', + }); + + // Remove the anchor tags from the headers + turndownService.addRule('header', { + filter: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'], + replacement: (_, node) => { + const level = parseInt(node.nodeName.charAt(1), 10); + return `${'#'.repeat(level)} ${node.textContent}`; + }, + }); + + // Update local links to use the siteUrl + turndownService.addRule('localLink', { + filter: (node) => (node.nodeName === 'A' && node.getAttribute('href')?.startsWith('/')) || false, + replacement: (content, node) => { + // most of this replacement is taken from the turndown library directly + let href = withoutTrailingSlash(siteUrl) + (node as HTMLElement).getAttribute('href'); + if (href) { + href = href.replace(/([()])/g, '\\$1'); + } + let title = cleanAttribute((node as HTMLElement).getAttribute('title')); + if (title) { + title = ' "' + title.replace(/"/g, '\\"') + '"'; + } + return '[' + content + '](' + href + title + ')'; + }, + }); + + // Find all HTML files + const htmlFiles = await glob('**/*.html', { + cwd: CONFIG.htmlDir, + ignore: CONFIG.excludePatterns, + }); + + reporter.info(`Found ${htmlFiles.length} HTML files to process`); + + for (const htmlFile of htmlFiles) { + try { + const fullPath = path.join(CONFIG.htmlDir, htmlFile); + const htmlContent = await fs.readFile(fullPath, 'utf-8'); + + // Parse and clean HTML + const virtualConsole = new VirtualConsole(); // Stop CSS parsing errors from polluting the console + const dom = new JSDOM(htmlContent, { url: siteUrl, virtualConsole }); + const document = dom.window.document; + + // Remove unwanted elements + const unwanted = document.querySelectorAll(UNWANTED_ELEMENTS_SELECTOR); + unwanted.forEach((el) => el.remove()); + + // Get main content + let mainContent = null; + + for (const selector of CONTENT_SELECTORS) { + mainContent = document.querySelector(selector); + if (mainContent) { + break; + } + } + + if (!mainContent) { + mainContent = document.body; + } + + // Convert to markdown + const markdown = turndownService.turndown(mainContent.innerHTML); + + // Prepare final content + let finalContent = ''; + + if (CONFIG.includeMetadata) { + const title = document.querySelector('title')?.textContent?.trim() || 'Untitled'; + const description = document.querySelector('meta[name="description"]')?.getAttribute('content')?.trim() || ''; + const canonicalUrl = document.querySelector('link[rel="canonical"]')?.getAttribute('href') || ''; + + finalContent = `--- +title: "${title}" +url: ${canonicalUrl || `/${htmlFile.replace('.html', '').replace('/index', '')}`} +generated_at: ${new Date().toISOString()} +description: "${description}" +--- + +${markdown}`; + } else { + finalContent = markdown; + } + + // Append .md to the filename, remove /index.html + const outputName = `${htmlFile.replace('/index.html', '')}.md`; + const outputPath = path.join(CONFIG.markdownDir, outputName); + + // Write markdown file + await fs.writeFile(outputPath, finalContent); + } catch (error) { + reporter.error(`✗ Error processing ${htmlFile}:`, error as Error); + } + } + + reporter.info(`Markdown export complete! ${htmlFiles.length} files processed.`); +} + +interface QueryResult { + site: { + siteMetadata: { + siteUrl: string; + }; + }; +} + +export interface MarkdownOutputOptions { + /** + * Use advanced mode with React hydration and language support (default: true) + * Set to false for simple static HTML to markdown conversion + */ + advancedMode?: boolean; +} + +/** + * Main markdown export function with mode switching + */ +export async function exportToMarkdown( + { reporter, siteUrl }: { reporter: Reporter; siteUrl: string }, + options: MarkdownOutputOptions = {} +) { + const { advancedMode = true } = options; + + // Check if advanced mode is disabled via environment variable + const forceSimpleMode = process.env.MARKDOWN_SIMPLE_MODE === 'true'; + + if (forceSimpleMode || !advancedMode) { + reporter.info('Using simple markdown export (static HTML conversion)'); + return exportToMarkdownSimple({ reporter, siteUrl }); + } + + // Use advanced mode with language support + reporter.info('Using advanced markdown export (React hydration + language support)'); + + const assetPrefix = process.env.ASSET_PREFIX; + + try { + await exportToMarkdownWithLanguages({ + reporter, + siteUrl, + assetPrefix, + }); + } catch (error) { + reporter.error('Advanced markdown export failed, falling back to simple mode:', error as Error); + // Fallback to simple mode if advanced mode fails + await exportToMarkdownSimple({ reporter, siteUrl }); + } +} + +// Run the export (Gatsby post-build hook) +export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter }) => { + const query = ` + query { + site { + siteMetadata { + siteUrl + } + } + } + `; + const { data, errors } = await graphql(query); + + if (errors) { + reporter.panicOnBuild(`Error while running GraphQL query.`); + throw errors; + } + + if (!data) { + reporter.panicOnBuild(`No documents found.`); + throw new Error('No documents found.'); + } + + const siteUrl = data.site.siteMetadata.siteUrl; + + if (!siteUrl) { + reporter.panicOnBuild(`Site URL not found.`); + throw new Error('Site URL not found.'); + } + + // Default to advanced mode + await exportToMarkdown({ reporter, siteUrl }, { advancedMode: true }); +}; diff --git a/data/onPostBuild/markdownOutputWithLanguages.ts b/data/onPostBuild/markdownOutputWithLanguages.ts new file mode 100644 index 0000000000..4de9602dfa --- /dev/null +++ b/data/onPostBuild/markdownOutputWithLanguages.ts @@ -0,0 +1,631 @@ +import { GatsbyNode, Reporter } from 'gatsby'; +import * as fs from 'fs/promises'; +import * as path from 'path'; +import { glob } from 'glob'; +import { JSDOM, VirtualConsole, ResourceLoader, FetchOptions, AbortablePromise } from 'jsdom'; +import * as TurndownService from 'turndown'; +import languageData from '../../src/data/languages/languageData'; + +const CONFIG = { + htmlDir: './public', + markdownDir: './public', + excludePatterns: ['404.html', 'api/**/*', 'page-data/**/*', 'static/**/*', 'docs/404.html'], + includeMetadata: true, + hydrationTimeout: 30000, // 30 seconds + languageSwitchTimeout: 5000, // 5 seconds per language +}; + +// Selectors for elements to remove from the HTML before converting to markdown +const UNWANTED_ELEMENTS_SELECTOR = + 'script, style, nav[role="navigation"], .header, #header, header, .footer, #footer, footer, [aria-label="breadcrumb"], aside'; + +// Prioritised selectors for the main content of the page, first match wins +const CONTENT_SELECTORS = ['main', '[role="main"]', '.content', '#content', 'article']; + +const withoutTrailingSlash = (urlPath: string) => (urlPath === `/` ? urlPath : urlPath.replace(/\/$/, ``)); + +const cleanAttribute = (attribute: string | null) => { + return attribute ? attribute.replace(/(\n+\s*)+/g, '\n') : ''; +}; + +interface LanguageMarkdownOptions { + reporter: Reporter; + siteUrl: string; + assetPrefix?: string; +} + +/** + * Custom ResourceLoader that rewrites ASSET_PREFIX URLs to local file paths + */ +class LocalAssetResourceLoader extends ResourceLoader { + private assetPrefix?: string; + private publicDir: string; + + constructor(assetPrefix?: string) { + super(); + this.assetPrefix = assetPrefix; + this.publicDir = path.resolve('./public'); + } + + fetch(url: string, options: FetchOptions): AbortablePromise { + // If URL contains ASSET_PREFIX, rewrite to local path + if (this.assetPrefix && url.includes(this.assetPrefix)) { + const localPath = url.replace(this.assetPrefix, ''); + const fullPath = path.join(this.publicDir, localPath); + + const promise = (async () => { + try { + const content = await fs.readFile(fullPath, 'utf-8'); + return Buffer.from(content); + } catch (error) { + // If file doesn't exist locally, return null + return null as any; + } + })() as AbortablePromise; + + // Add abort method to make it an AbortablePromise + promise.abort = () => {}; + + return promise; + } + + // For other URLs, use default behavior (but we'll configure JSDOM to not load external resources) + return super.fetch(url, options); + } +} + +/** + * Rewrite asset URLs in HTML content from ASSET_PREFIX to relative paths + */ +function rewriteAssetUrls(html: string, assetPrefix?: string): string { + if (!assetPrefix) return html; + + // Rewrite src and href attributes + return html + .replace(new RegExp(`src="${assetPrefix}`, 'g'), 'src="') + .replace(new RegExp(`href="${assetPrefix}`, 'g'), 'href="') + .replace(new RegExp(`src='${assetPrefix}`, 'g'), "src='") + .replace(new RegExp(`href='${assetPrefix}`, 'g'), "href='"); +} + +/** + * Extract product key from page path + * Examples: /docs/realtime/channels -> realtime (maps to pubsub) + */ +function extractProductFromPath(htmlFile: string): string | null { + const match = htmlFile.match(/docs\/(realtime|chat|spaces|asset-tracking)/i); + if (!match) return null; + + const segment = match[1].toLowerCase(); + + // Map path segments to product keys + const productMap: Record = { + 'realtime': 'pubsub', + 'chat': 'chat', + 'spaces': 'spaces', + 'asset-tracking': 'assetTracking', + }; + + return productMap[segment] || null; +} + +/** + * Detect available languages for a page + */ +function detectAvailableLanguages(document: Document, htmlFile: string): string[] { + // Try to find language selector in the DOM + const languageOptions = document.querySelectorAll('[data-language-selector] option, .language-selector option'); + if (languageOptions.length > 0) { + return Array.from(languageOptions) + .map(option => option.getAttribute('value')) + .filter((val): val is string => !!val); + } + + // Fallback: use product-based language data + const product = extractProductFromPath(htmlFile); + if (product && languageData[product as keyof typeof languageData]) { + return Object.keys(languageData[product as keyof typeof languageData]); + } + + return []; +} + +/** + * Wait for a condition with timeout + */ +function waitFor( + condition: () => boolean, + timeout: number, + checkInterval: number = 100 +): Promise { + return new Promise((resolve) => { + const startTime = Date.now(); + + const check = () => { + if (condition()) { + resolve(true); + return; + } + + if (Date.now() - startTime > timeout) { + resolve(false); + return; + } + + setTimeout(check, checkInterval); + }; + + check(); + }); +} + +/** + * Load and execute Gatsby bundles in JSDOM + */ +async function loadGatsbyBundles( + dom: JSDOM, + htmlFile: string, + reporter: Reporter +): Promise { + const { window } = dom; + const document = window.document; + + try { + // Gatsby bundles are already in the HTML as script tags + // We need to execute them in order + const scripts = Array.from(document.querySelectorAll('script[src]')); + + // Find the Gatsby scripts + const gatsbyScripts = scripts.filter(script => { + const src = script.getAttribute('src') || ''; + return src.includes('webpack-runtime') || + src.includes('framework') || + src.includes('app') || + src.match(/^\/component---/); + }); + + // Scripts are already loaded by JSDOM, but we need to ensure they executed + // Wait for Gatsby to be ready + const gatsbyReady = await waitFor( + () => { + return !!(window as any).___gatsby && !!(window as any).React; + }, + CONFIG.hydrationTimeout + ); + + if (!gatsbyReady) { + reporter.warn(`Gatsby failed to hydrate for ${htmlFile}`); + return false; + } + + // Wait for the page to be fully rendered + await waitFor( + () => { + const mainContent = document.querySelector(CONTENT_SELECTORS.join(',')); + return !!mainContent && mainContent.children.length > 0; + }, + CONFIG.hydrationTimeout + ); + + return true; + } catch (error) { + reporter.error(`Error loading Gatsby bundles for ${htmlFile}:`, error as Error); + return false; + } +} + +/** + * Switch to a specific language and wait for content to update + */ +async function switchLanguage( + dom: JSDOM, + language: string, + reporter: Reporter +): Promise { + const { window } = dom; + const document = window.document; + + try { + // Get current content hash to detect changes + const getContentHash = () => { + const mainContent = document.querySelector(CONTENT_SELECTORS.join(',')); + return mainContent ? mainContent.innerHTML.substring(0, 1000) : ''; + }; + + const beforeHash = getContentHash(); + + // Method 1: Try to manipulate the URL search params + const currentUrl = new URL(window.location.href); + currentUrl.searchParams.set('lang', language); + + // Update window.location + Object.defineProperty(window.location, 'search', { + writable: true, + value: currentUrl.search + }); + Object.defineProperty(window.location, 'href', { + writable: true, + value: currentUrl.href + }); + + // Dispatch events that might trigger language change + window.dispatchEvent(new window.Event('popstate')); + window.dispatchEvent(new window.Event('hashchange')); + + // Method 2: Try to find and manipulate the language selector directly + const languageSelector = document.querySelector('[data-language-selector] select, .language-selector select') as HTMLSelectElement; + if (languageSelector) { + languageSelector.value = language; + + // Trigger change event + const changeEvent = new window.Event('change', { bubbles: true }); + languageSelector.dispatchEvent(changeEvent); + } + + // Method 3: Try to manipulate React state directly if available + if ((window as any).___LANGUAGE_CONTEXT___) { + (window as any).___LANGUAGE_CONTEXT___.setLanguage(language); + } + + // Wait for content to change (or timeout) + const contentChanged = await waitFor( + () => { + const afterHash = getContentHash(); + return afterHash !== beforeHash && afterHash.length > 0; + }, + CONFIG.languageSwitchTimeout + ); + + if (!contentChanged) { + reporter.verbose(`Language switch to ${language} did not change content (might already be in that language)`); + } + + // Additional wait to ensure all React updates are complete + await new Promise(resolve => setTimeout(resolve, 500)); + + return true; + } catch (error) { + reporter.error(`Error switching to language ${language}:`, error as Error); + return false; + } +} + +/** + * Extract and convert content to markdown for a specific language + */ +function extractMarkdownForLanguage( + document: Document, + turndownService: TurndownService, + language: string | null, + siteUrl: string, + htmlFile: string +): { markdown: string; metadata: Record } { + // Remove unwanted elements (create a clone to avoid modifying the original) + const docClone = document.cloneNode(true) as Document; + const unwanted = docClone.querySelectorAll(UNWANTED_ELEMENTS_SELECTOR); + unwanted.forEach((el) => el.remove()); + + // Get main content + let mainContent = null; + for (const selector of CONTENT_SELECTORS) { + mainContent = docClone.querySelector(selector); + if (mainContent) { + break; + } + } + + if (!mainContent) { + mainContent = docClone.body; + } + + // Convert to markdown + const markdown = turndownService.turndown(mainContent.innerHTML); + + // Extract metadata + const title = document.querySelector('title')?.textContent?.trim() || 'Untitled'; + const description = document.querySelector('meta[name="description"]')?.getAttribute('content')?.trim() || ''; + const canonicalUrl = document.querySelector('link[rel="canonical"]')?.getAttribute('href') || ''; + + // Get language version if available + const product = extractProductFromPath(htmlFile); + let languageVersion: string | undefined; + if (language && product) { + const productData = languageData[product as keyof typeof languageData]; + if (productData && typeof productData === 'object') { + languageVersion = (productData as any)[language]; + } + } + + return { + markdown, + metadata: { + title, + url: canonicalUrl || `/${htmlFile.replace('.html', '').replace('/index', '')}`, + description, + language, + languageVersion, + generatedAt: new Date().toISOString(), + }, + }; +} + +/** + * Process a single HTML file with language support + */ +async function processHtmlFileWithLanguages( + htmlFile: string, + options: LanguageMarkdownOptions +): Promise { + const { reporter, siteUrl, assetPrefix } = options; + + try { + const fullPath = path.join(CONFIG.htmlDir, htmlFile); + let htmlContent = await fs.readFile(fullPath, 'utf-8'); + + // Rewrite asset URLs if ASSET_PREFIX is set + if (assetPrefix) { + htmlContent = rewriteAssetUrls(htmlContent, assetPrefix); + } + + // Create TurndownService + const turndownService = new (TurndownService as any)({ + headingStyle: 'atx', + codeBlockStyle: 'fenced', + emDelimiter: '*', + }); + + // Add custom rules (same as original) + turndownService.addRule('header', { + filter: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'], + replacement: (_, node) => { + const level = parseInt(node.nodeName.charAt(1), 10); + return `${'#'.repeat(level)} ${node.textContent}`; + }, + }); + + turndownService.addRule('localLink', { + filter: (node) => (node.nodeName === 'A' && node.getAttribute('href')?.startsWith('/')) || false, + replacement: (content, node) => { + let href = withoutTrailingSlash(siteUrl) + (node as HTMLElement).getAttribute('href'); + if (href) { + href = href.replace(/([()])/g, '\\$1'); + } + let title = cleanAttribute((node as HTMLElement).getAttribute('title')); + if (title) { + title = ' "' + title.replace(/"/g, '\\"') + '"'; + } + return '[' + content + '](' + href + title + ')'; + }, + }); + + // Setup JSDOM with custom resource loader + const virtualConsole = new VirtualConsole(); + virtualConsole.on('error', () => {}); // Suppress errors + + const resourceLoader = new LocalAssetResourceLoader(assetPrefix); + + const dom = new JSDOM(htmlContent, { + url: siteUrl + '/' + htmlFile, + runScripts: 'dangerously', + resources: resourceLoader, + virtualConsole, + beforeParse(window) { + // Mock necessary browser APIs + (window as any).requestAnimationFrame = (cb: any) => setTimeout(cb, 0); + (window as any).cancelAnimationFrame = (id: any) => clearTimeout(id); + + // Mock IntersectionObserver + (window as any).IntersectionObserver = class IntersectionObserver { + observe() {} + unobserve() {} + disconnect() {} + }; + + // Mock localStorage + if (!(window as any).localStorage) { + (window as any).localStorage = { + getItem: () => null, + setItem: () => {}, + removeItem: () => {}, + clear: () => {}, + }; + } + }, + }); + + const { window } = dom; + const document = window.document; + + // Detect available languages + const availableLanguages = detectAvailableLanguages(document, htmlFile); + + if (availableLanguages.length === 0) { + // No language selector found - generate single markdown file (current behavior) + reporter.verbose(`No languages found for ${htmlFile}, generating single markdown`); + + const { markdown, metadata } = extractMarkdownForLanguage( + document, + turndownService, + null, + siteUrl, + htmlFile + ); + + const finalContent = CONFIG.includeMetadata + ? `--- +title: "${metadata.title}" +url: ${metadata.url} +generated_at: ${metadata.generatedAt} +description: "${metadata.description}" +--- + +${markdown}` + : markdown; + + const outputName = `${htmlFile.replace('/index.html', '')}.md`; + const outputPath = path.join(CONFIG.markdownDir, outputName); + await fs.writeFile(outputPath, finalContent); + + return; + } + + reporter.info(`Found ${availableLanguages.length} languages for ${htmlFile}: ${availableLanguages.join(', ')}`); + + // Try to hydrate React + const hydrated = await loadGatsbyBundles(dom, htmlFile, reporter); + + if (!hydrated) { + reporter.warn(`Failed to hydrate React for ${htmlFile}, falling back to static extraction`); + + // Fall back to generating a single file without language switching + const { markdown, metadata } = extractMarkdownForLanguage( + document, + turndownService, + null, + siteUrl, + htmlFile + ); + + const finalContent = CONFIG.includeMetadata + ? `--- +title: "${metadata.title}" +url: ${metadata.url} +generated_at: ${metadata.generatedAt} +description: "${metadata.description}" +--- + +${markdown}` + : markdown; + + const outputName = `${htmlFile.replace('/index.html', '')}.md`; + const outputPath = path.join(CONFIG.markdownDir, outputName); + await fs.writeFile(outputPath, finalContent); + + return; + } + + // Generate markdown for each language + for (const language of availableLanguages) { + reporter.verbose(`Processing language: ${language} for ${htmlFile}`); + + // Switch to this language + const switched = await switchLanguage(dom, language, reporter); + + if (!switched) { + reporter.warn(`Failed to switch to language ${language} for ${htmlFile}`); + continue; + } + + // Extract content for this language + const { markdown, metadata } = extractMarkdownForLanguage( + document, + turndownService, + language, + siteUrl, + htmlFile + ); + + // Create final content with metadata + const finalContent = CONFIG.includeMetadata + ? `--- +title: "${metadata.title}" +url: ${metadata.url} +generated_at: ${metadata.generatedAt} +description: "${metadata.description}" +language: "${metadata.language}"${metadata.languageVersion ? `\nlanguage_version: "${metadata.languageVersion}"` : ''} +--- + +${markdown}` + : markdown; + + // Save with language suffix: page.javascript.md + const baseName = htmlFile.replace('/index.html', ''); + const outputName = `${baseName}.${language}.md`; + const outputPath = path.join(CONFIG.markdownDir, outputName); + + await fs.writeFile(outputPath, finalContent); + reporter.verbose(`✓ Generated ${outputName}`); + } + } catch (error) { + reporter.error(`Error processing ${htmlFile}:`, error as Error); + throw error; + } +} + +/** + * Export all HTML files to language-specific markdown + */ +export async function exportToMarkdownWithLanguages(options: LanguageMarkdownOptions): Promise { + const { reporter } = options; + + // Find all HTML files + const htmlFiles = await glob('**/*.html', { + cwd: CONFIG.htmlDir, + ignore: CONFIG.excludePatterns, + }); + + reporter.info(`Found ${htmlFiles.length} HTML files to process with language support`); + + // Process files sequentially to avoid overwhelming the system + // (JSDOM with React is resource-intensive) + for (const htmlFile of htmlFiles) { + try { + await processHtmlFileWithLanguages(htmlFile, options); + } catch (error) { + reporter.error(`Failed to process ${htmlFile}:`, error as Error); + // Continue with next file + } + } + + reporter.info(`✓ Language-aware markdown export complete! ${htmlFiles.length} files processed.`); +} + +interface QueryResult { + site: { + siteMetadata: { + siteUrl: string; + }; + }; +} + +/** + * Gatsby post-build hook + */ +export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter }) => { + const query = ` + query { + site { + siteMetadata { + siteUrl + } + } + } + `; + + const { data, errors } = await graphql(query); + + if (errors) { + reporter.panicOnBuild(`Error while running GraphQL query.`); + throw errors; + } + + if (!data) { + reporter.panicOnBuild(`No documents found.`); + throw new Error('No documents found.'); + } + + const siteUrl = data.site.siteMetadata.siteUrl; + + if (!siteUrl) { + reporter.panicOnBuild(`Site URL not found.`); + throw new Error('Site URL not found.'); + } + + const assetPrefix = process.env.ASSET_PREFIX; + + await exportToMarkdownWithLanguages({ + reporter, + siteUrl, + assetPrefix, + }); +}; diff --git a/package.json b/package.json index d49fe77cec..1654947fb5 100644 --- a/package.json +++ b/package.json @@ -38,7 +38,10 @@ "lint-staged": "lint-staged", "repo-githooks": "git config core.hooksPath .githooks", "no-githooks": "git config --unset core.hooksPath", - "validate-llms-txt": "node bin/validate-llms.txt.ts" + "validate-llms-txt": "node bin/validate-llms.txt.ts", + "generate-markdown": "node scripts/generate-language-markdown.ts", + "generate-markdown:simple": "node scripts/generate-language-markdown.ts --mode=simple", + "generate-markdown:verbose": "node scripts/generate-language-markdown.ts --verbose" }, "dependencies": { "@ably/ui": "17.9.3", @@ -92,7 +95,7 @@ "react-select": "^5.7.0", "remark-gfm": "^1.0.0", "textile-js": "^2.1.1", - "turndown": "^7.1.1", + "turndown": "^7.2.0", "typescript": "^4.6.3", "use-keyboard-shortcut": "^1.1.6", "util": "^0.12.4", @@ -133,10 +136,12 @@ "eslint-plugin-react-hooks": "^4.6.0", "fast-check": "^3.4.0", "gatsby-plugin-postcss": "^6.3.0", + "glob": "^11.0.2", "identity-obj-proxy": "^3.0.0", "jest": "^29.3.1", "jest-axe": "^7.0.0", "jest-environment-jsdom": "^29.3.1", + "jsdom": "^26.1.0", "lint-staged": "^13.1.0", "msw": "^2.0.1", "postcss": "^8.4.31", diff --git a/scripts/generate-language-markdown.ts b/scripts/generate-language-markdown.ts new file mode 100644 index 0000000000..29644f85f5 --- /dev/null +++ b/scripts/generate-language-markdown.ts @@ -0,0 +1,223 @@ +#!/usr/bin/env node + +/** + * Standalone CLI script for generating language-specific markdown files + * + * Usage: + * yarn generate-markdown + * yarn generate-markdown --mode=simple + * yarn generate-markdown --pages "docs/realtime/*" + * yarn generate-markdown --languages "javascript,python" + * yarn generate-markdown --help + */ + +import dotenv from 'dotenv'; +import { exportToMarkdown } from '../data/onPostBuild/markdownOutput'; +import { exportToMarkdownWithLanguages } from '../data/onPostBuild/markdownOutputWithLanguages'; + +// Mock reporter for standalone execution +class ConsoleReporter { + info(message: string) { + console.log(`ℹ️ ${message}`); + } + + warn(message: string) { + console.warn(`⚠️ ${message}`); + } + + error(message: string, error?: Error) { + console.error(`❌ ${message}`); + if (error) { + console.error(error); + } + } + + verbose(message: string) { + if (process.env.VERBOSE === 'true') { + console.log(`🔍 ${message}`); + } + } + + panicOnBuild(message: string) { + console.error(`💥 PANIC: ${message}`); + process.exit(1); + } +} + +interface CliOptions { + mode: 'simple' | 'advanced'; + env?: string; + pages?: string; + languages?: string; + siteUrl?: string; + verbose: boolean; + help: boolean; +} + +function parseArgs(): CliOptions { + const args = process.argv.slice(2); + const options: CliOptions = { + mode: 'advanced', + env: process.env.NODE_ENV || 'production', + pages: undefined, + languages: undefined, + siteUrl: process.env.GATSBY_ABLY_MAIN_WEBSITE || 'https://ably.com', + verbose: false, + help: false, + }; + + for (const arg of args) { + if (arg === '--help' || arg === '-h') { + options.help = true; + } else if (arg.startsWith('--mode=')) { + const mode = arg.split('=')[1]; + if (mode === 'simple' || mode === 'advanced') { + options.mode = mode; + } else { + console.error(`Invalid mode: ${mode}. Must be "simple" or "advanced"`); + process.exit(1); + } + } else if (arg.startsWith('--env=')) { + options.env = arg.split('=')[1]; + } else if (arg.startsWith('--pages=')) { + options.pages = arg.split('=')[1]; + } else if (arg.startsWith('--languages=')) { + options.languages = arg.split('=')[1]; + } else if (arg.startsWith('--site-url=')) { + options.siteUrl = arg.split('=')[1]; + } else if (arg === '--verbose' || arg === '-v') { + options.verbose = true; + } + } + + return options; +} + +function printHelp() { + console.log(` +Generate language-specific markdown files from built HTML + +Usage: + yarn generate-markdown [options] + +Options: + --mode= Export mode: "simple" or "advanced" (default: advanced) + --env= Environment to load (.env.) + --pages= Glob pattern to filter pages (e.g., "docs/realtime/*") + --languages= Comma-separated language list (e.g., "javascript,python") + --site-url= Site URL for absolute links + --verbose, -v Enable verbose logging + --help, -h Show this help message + +Examples: + yarn generate-markdown + yarn generate-markdown --mode=simple + yarn generate-markdown --pages="docs/realtime/*" + yarn generate-markdown --languages="javascript,python" --verbose + +Environment Variables: + MARKDOWN_SIMPLE_MODE Force simple mode (set to 'true') + ASSET_PREFIX Asset prefix for rewriting URLs + VERBOSE Enable verbose logging + `); +} + +async function loadEnvironment(env?: string) { + // Load environment variables + const envFile = env ? `.env.${env}` : `.env.${process.env.NODE_ENV || 'production'}`; + + dotenv.config({ path: envFile }); + + console.log(`📦 Loaded environment from ${envFile}`); +} + +async function main() { + const options = parseArgs(); + + if (options.help) { + printHelp(); + process.exit(0); + } + + // Set verbose mode + if (options.verbose) { + process.env.VERBOSE = 'true'; + } + + // Load environment + await loadEnvironment(options.env); + + const reporter = new ConsoleReporter() as any; + const siteUrl = options.siteUrl || process.env.GATSBY_ABLY_MAIN_WEBSITE || 'https://ably.com'; + + console.log(''); + console.log('🚀 Starting markdown generation...'); + console.log(''); + console.log(` Mode: ${options.mode}`); + console.log(` Site URL: ${siteUrl}`); + console.log(` Environment: ${options.env}`); + + if (options.pages) { + console.log(` Pages filter: ${options.pages}`); + } + + if (options.languages) { + console.log(` Languages: ${options.languages}`); + } + + if (process.env.ASSET_PREFIX) { + console.log(` Asset Prefix: ${process.env.ASSET_PREFIX}`); + } + + console.log(''); + + const startTime = Date.now(); + + try { + if (options.mode === 'simple') { + // Simple mode + await exportToMarkdown( + { reporter, siteUrl }, + { advancedMode: false } + ); + } else { + // Advanced mode with language support + const assetPrefix = process.env.ASSET_PREFIX; + + if (options.pages || options.languages) { + reporter.warn('Page and language filtering not yet implemented, generating all pages/languages'); + // TODO: Implement filtering by passing options to exportToMarkdownWithLanguages + } + + await exportToMarkdownWithLanguages({ + reporter, + siteUrl, + assetPrefix, + }); + } + + const duration = ((Date.now() - startTime) / 1000).toFixed(2); + + console.log(''); + console.log(`✅ Markdown generation complete in ${duration}s`); + console.log(''); + + process.exit(0); + } catch (error) { + console.error(''); + console.error('❌ Markdown generation failed:'); + console.error(error); + console.error(''); + + process.exit(1); + } +} + +// Handle unhandled rejections +process.on('unhandledRejection', (error) => { + console.error('Unhandled rejection:', error); + process.exit(1); +}); + +// Run main function +main(); diff --git a/src/components/Head.tsx b/src/components/Head.tsx index 6789294f81..5716786e14 100644 --- a/src/components/Head.tsx +++ b/src/components/Head.tsx @@ -7,12 +7,14 @@ export const Head = ({ description, metaTitle, keywords, + robots, }: { title: string; canonical: string; description: string; metaTitle?: string; keywords?: string; + robots?: string; }) => ( {metaTitle || title} @@ -23,6 +25,7 @@ export const Head = ({ {keywords && } + {robots && } diff --git a/src/components/Layout/MDXWrapper.tsx b/src/components/Layout/MDXWrapper.tsx index 5e74554bad..8ac207002c 100644 --- a/src/components/Layout/MDXWrapper.tsx +++ b/src/components/Layout/MDXWrapper.tsx @@ -1,6 +1,7 @@ import React, { PropsWithChildren, useState, + useEffect, createContext, isValidElement, cloneElement, @@ -182,7 +183,29 @@ const MDXWrapper: React.FC = ({ children, pageContext, location const metaTitle = getMetaTitle(title, (activePage.product as ProductName) || META_PRODUCT_FALLBACK) as string; const { canonicalUrl } = useSiteMetadata(); - const canonical = canonicalUrl(location.pathname); + + // Check if current path is a language variant (ends with a language identifier) + // Pattern: /docs/auth/token/java → language variant + const languageSegmentPattern = /\/(javascript|typescript|python|java|ruby|php|go|swift|kotlin|csharp|objc|nodejs|react|flutter|laravel|shell)$/i; + const isLanguageVariant = languageSegmentPattern.test(location.pathname); + + // Strip language segment from pathname for canonical URL and browser display + const cleanPath = isLanguageVariant + ? location.pathname.replace(languageSegmentPattern, '') + : location.pathname; + const canonical = canonicalUrl(cleanPath) + location.search; + + // Prevent indexing of language variant URLs (nginx serves these via query params) + const robots = isLanguageVariant ? 'noindex, follow' : undefined; + + // Clean up browser URL if it's a language variant (strip language segment) + // Use Gatsby's navigate to keep router state in sync + useEffect(() => { + if (isLanguageVariant) { + const cleanUrl = cleanPath + location.search; + navigate(cleanUrl, { replace: true }); + } + }, [isLanguageVariant, cleanPath, location.search]); // Use the copyable headers hook useCopyableHeaders(); @@ -206,7 +229,7 @@ const MDXWrapper: React.FC = ({ children, pageContext, location return ( - +
{ + // Priority 1: Pre-set language from pageContext (for language variant pages) + if (pageContextLanguage && Object.keys(languageInfo).includes(pageContextLanguage)) { + return pageContextLanguage as LanguageKey; + } + + // Priority 2: Query parameter const params = new URLSearchParams(location); const langParam = params.get('lang') as LanguageKey; if (langParam && Object.keys(languageInfo).includes(langParam) && activeLanguages.includes(langParam)) { return langParam; - } else if (activeLanguages.length > 0 && product) { + } + + // Priority 3: First relevant language for product + if (activeLanguages.length > 0 && product) { const relevantLanguages = activeLanguages.filter((lang) => Object.keys(languageData[product]).includes(lang)); return relevantLanguages[0]; } @@ -94,7 +104,12 @@ export const LayoutProvider: React.FC