diff --git a/.docsearch.README.md b/.docsearch.README.md new file mode 100644 index 00000000000..2310abe4854 --- /dev/null +++ b/.docsearch.README.md @@ -0,0 +1,111 @@ +# DocSearch Configuration + +This directory contains the Algolia DocSearch configuration for the Apache Camel website. + +## Overview + +The `.docsearch.config.json` file defines how Algolia's crawler indexes the Camel website for search functionality. This configuration ensures that all relevant content is discoverable through the site search, including: + +- All component documentation (not just canonical versions) +- Tables with component specifications and supported models +- Metadata sections and inline code +- Multiple documentation versions (next, latest, and release branches) + +## Key Configuration Elements + +### Index Settings (`index`) +- **name**: `apache_camel` - The Algolia index where content is stored +- **startUrls**: Entry points for the crawler +- **pathsToMatch**: URL patterns to include in indexing +- **pathsToIgnore**: URLs to skip (search pages, error pages, etc.) +- **includeHeadingLevels**: All heading levels (h1-h6) are indexed for better navigation + +### Content Selectors (`selectors`) + +These CSS selectors define what content gets indexed: + +- **lvl0-lvl5**: Heading hierarchy (h1-h6) used to build the breadcrumb structure +- **text**: Main content to index including: + - Paragraphs (`p`), list items (`li`) + - Table cells (`td`, `th`) - **Important for component specs** + - Definition terms (`dt`, `dd`) + - Code blocks (`code`, `pre`) + +This ensures keywords like "PyTorch" in Model Zoo tables are indexed, fixing issue #1209. 
+ +### Exclusions (`selectors_exclude`) + +Navigation, sidebars, footers, and other non-content elements are excluded to improve search quality: +- `.no_index`, `[data-no-index]` - Custom exclusion attributes +- Navigation elements: `nav`, `.navbar`, `.menu`, `.sidebar`, `.toc` +- Footer and copyright +- Hidden elements: `.hidden`, `[aria-hidden='true']` + +### Crawling Rules (`crawler`) + +- **maxDepth**: 20 - Allows deep navigation through component docs +- **maxUrls**: 50,000 - Sufficient for Camel's comprehensive documentation +- **sitemapUrls**: Uses sitemap for efficient crawling +- **timeoutMs**: 30,000 - Adequate for large pages with tables + +### Multi-Version Support (`start_urls`) + +The configuration crawls multiple documentation versions: + +1. **next** (page_rank: 5) - Development version +2. **latest** (page_rank: 5) - Latest stable +3. **`\d+\.\d+\.x`** (page_rank: 4) - Release branches (4.4.x, 4.10.x, etc.) +4. **manual** (page_rank: 7) - Core documentation (highest priority among the documentation sections; the site root itself uses page_rank 8) +5. **docs** (page_rank: 6) - General documentation +6. **blog** (page_rank: 3) - Blog posts + +This addresses the issue where only canonical (4.4.x) pages were indexed. + +### Search Behavior (`custom_settings`) + +- **searchableAttributes**: Fields available for full-text search +- **separatorsToIndex**: Include underscores, dots, and dashes in search (important for component names like `camel-k`) +- **attributeForDistinctResults**: Deduplicate results by URL to avoid showing the same page multiple times + +## Maintenance + +When making changes to this configuration: + +1. **Test locally** - Build the site and verify crawling works +2. **Document changes** - Explain why selectors or URLs were modified +3. **Consider impacts** - Changes affect search indexing across all users +4. 
**Verify coverage** - Use Algolia dashboard to check what's indexed + +### Common Modifications + +**Adding new documentation sections:** +```json +{ + "url": "https://camel.apache.org/new-section/", + "page_rank": 5 +} +``` + +**Excluding problematic content:** +```json +"selectors_exclude": [ + ".no_index", + ".problematic-element" +] +``` + +**Adjusting content extraction:** +Modify the `text` selector in the `selectors` section to include additional elements. + +## Related Issue + +- **Issue #1209**: "The search is not finding several fields" + - Problem: Keywords like Bradley, firmata, PyTorch not indexed from component documentation + - Root cause: Missing configuration for table content and non-canonical versions + - Solution: This configuration file with improved selectors and multi-version crawling + +## References + +- [Algolia DocSearch Documentation](https://docsearch.algolia.com/) +- [Camel Website GitHub](https://github.com/apache/camel-website) +- [Issue #1209](https://github.com/apache/camel-website/issues/1209) diff --git a/.docsearch.config.json b/.docsearch.config.json new file mode 100644 index 00000000000..557fd70e84f --- /dev/null +++ b/.docsearch.config.json @@ -0,0 +1,125 @@ +{ + "index": { + "name": "apache_camel", + "startUrls": [ + "https://camel.apache.org/" + ], + "ignoreCanonicalTo": false, + "pathsToMatch": [ + "https://camel.apache.org/**" + ], + "pathsToIgnore": [ + "https://camel.apache.org/search", + "https://camel.apache.org/404.html" + ], + "includeHeadingLevels": [1, 2, 3, 4, 5, 6], + "stripQueryParameters": true + }, + "crawler": { + "userAgent": "Algolia Crawler", + "maxDepth": 20, + "maxUrls": 50000, + "waitUntilFired": true, + "timeoutMs": 30000, + "sitemapUrls": [ + "https://camel.apache.org/sitemap.xml" + ], + "ignoreRobotsTxt": false, + "allowedDomains": [ + "camel.apache.org" + ] + }, + "selectors": { + "lvl0": { + "selector": "h1", + "global": true, + "default_value": "Documentation" + }, + "lvl1": "h2", + "lvl2": 
"h3", + "lvl3": "h4", + "lvl4": "h5", + "lvl5": "h6", + "text": "p, li, td, th, dt, dd, span:not(.tooltip), div:not([class*='hidden']), table tbody, code, pre" + }, + "selectors_exclude": [ + ".no_index", + "[data-no-index]", + ".sidebar", + ".breadcrumb", + "nav", + ".navbar", + ".menu", + ".toc", + "footer", + ".footer", + ".copyright", + ".hide", + ".hidden", + "[aria-hidden='true']", + "script", + "style", + ".language-toggle", + ".sidebar-toggle" + ], + "min_indexed_level": 1, + "only_content_level": false, + "start_urls": [ + { + "url": "https://camel.apache.org/components/next/", + "page_rank": 5 + }, + { + "url": "https://camel.apache.org/components/latest/", + "page_rank": 5 + }, + { + "url": "https://camel.apache.org/components/(\\d+)\\.(\\d+)\\.x/", + "page_rank": 4 + }, + { + "url": "https://camel.apache.org/manual/", + "page_rank": 7 + }, + { + "url": "https://camel.apache.org/docs/", + "page_rank": 6 + }, + { + "url": "https://camel.apache.org/blog/", + "page_rank": 3 + }, + { + "url": "https://camel.apache.org/", + "page_rank": 8 + } + ], + "stop_urls": [ + "\\?", + "#" + ], + "custom_settings": { + "separatorsToIndex": "_.-", + "attributesForFaceting": [ + "version" + ], + "attributesToIndex": [ + "hierarchy", + "content", + "url" + ], + "minWordSizefor1Typo": 4, + "minWordSizefor2Typos": 8, + "exactOnSingleWordQuery": "none", + "attributeForDistinctResults": "url", + "searchableAttributes": [ + "hierarchy.lvl0", + "hierarchy.lvl1", + "hierarchy.lvl2", + "hierarchy.lvl3", + "hierarchy.lvl4", + "hierarchy.lvl5", + "content" + ] + } +} diff --git a/README.md b/README.md index 62a0c645f5b..81c42a8ceb2 100644 --- a/README.md +++ b/README.md @@ -453,6 +453,35 @@ all generated sources in the project first. Of course this then takes some more time than an optimized rebuild (time to grab another coffee!). +## Search Indexing Configuration + +The website uses [Algolia DocSearch](https://docsearch.algolia.com/) to provide site-wide search functionality. 
The search configuration is defined in [`.docsearch.config.json`](.docsearch.config.json). + +### What is indexed + +The configuration ensures that Algolia's crawler indexes: +- All documentation versions (development `next`, latest, and release branches like `4.4.x`) +- Component specifications and tables (fixing issue #1209) +- All heading levels and content blocks +- Code blocks and inline code snippets + +### Maintaining the search configuration + +If you need to modify what gets indexed or how content is crawled: + +1. Edit [`.docsearch.config.json`](.docsearch.config.json) to change selectors or crawling rules +2. Review the detailed documentation in [`.docsearch.README.md`](.docsearch.README.md) +3. Test your changes by building the site locally: `yarn build` +4. Verify content is indexable by visiting the search functionality in the preview + +Key elements to be aware of: +- **Selectors** define what HTML elements are indexed (headings, paragraphs, tables, code) +- **start_urls** control which parts of the site are crawled and their search priority +- **selectors_exclude** specify elements to skip (navigation, sidebars, footers) +- **custom_settings** control search behavior and index settings + +For more details, see [`.docsearch.README.md`](.docsearch.README.md). 
// Core docs patterns - these should rank higher than component pages
const CORE_DOCS_PATTERNS = [
  '/manual/',
  '/user-guide/',
  '/architecture/',
  '/getting-started/',
  '/faq/',
]

// Check if a URL points to core documentation (should rank higher).
// Returns false for empty/missing URLs.
function isCoreDocsUrl (url) {
  if (!url) return false
  return CORE_DOCS_PATTERNS.some((pattern) => url.includes(pattern))
}

// Check if a URL points to component documentation.
// Returns false for empty/missing URLs.
function isComponentUrl (url) {
  if (!url) return false
  return url.includes('/components/')
}

// Numeric sort bucket for a URL: core docs (0-1) come before everything else
// (2-3); within each group, component pages come last.
function coreDocsSortRank (url) {
  return (isCoreDocsUrl(url) ? 0 : 2) + (isComponentUrl(url) ? 1 : 0)
}

// Sort hits to prioritize core docs over components.
// Returns a NEW array; the caller's array is left untouched. Hits with the
// same rank keep their incoming (relevance) order on engines with a stable sort.
function sortByCoreDocs (hits) {
  return hits.slice().sort((a, b) => coreDocsSortRank(a.url) - coreDocsSortRank(b.url))
}

// Extract the parent page path from a URL: drops the '#anchor' fragment and a
// single trailing slash so that 'page/', 'page' and 'page#x' share one key.
function getParentPagePath (url) {
  if (!url) return ''
  const path = url.split('#')[0]
  return path.endsWith('/') ? path.slice(0, -1) : path
}

// Check if hit represents a sub-section of a page (its URL carries an anchor)
function isSubSection (hit) {
  if (!hit || !hit.url) return false
  return hit.url.includes('#')
}

// Get the breadcrumb depth: the number of hierarchy levels that are set.
// Both null and undefined levels are treated as "not set".
function getBreadcrumbDepth (hit) {
  if (!hit || !hit.hierarchy) return 0
  return Object.values(hit.hierarchy).filter((lvl) => lvl != null).length
}

// Remove duplicate results for the same parent page.
//
// At most ONE hit is returned per parent page, placed where the page first
// appeared in `hits` so the incoming relevance order is preserved. A later hit
// for a page that is already present replaces the kept one only when it is
// shallower (smaller breadcrumb depth) and is neither
//   - a sub-section of a page whose lvl0/lvl1 title contains the query, nor
//   - a deep (depth > 2) entry of a component page.
//
// hits  - array of Algolia hit objects ({ url, hierarchy, ... })
// query - the search string; an empty/blank query marks no page as a
//         direct title match
function deduplicateHits (hits, query) {
  const queryLower = (query || '').toLowerCase().trim()

  // First pass: pages whose top-level titles match the query directly.
  // Skipped for a blank query, because every string "contains" ''.
  const parentMatches = new Set()
  if (queryLower) {
    hits.forEach((hit) => {
      const hierarchy = hit.hierarchy || {}
      const titleMatches = [hierarchy.lvl0, hierarchy.lvl1].some((title) => {
        return Boolean(title) && title.toLowerCase().includes(queryLower)
      })
      if (titleMatches) parentMatches.add(getParentPagePath(hit.url))
    })
  }

  // Second pass: keep one representative per page. `seenPages` remembers the
  // slot of the kept hit so a better candidate can replace it in place instead
  // of being appended as a duplicate.
  const kept = []
  const seenPages = new Map()

  hits.forEach((hit) => {
    const parentPath = getParentPagePath(hit.url)
    const depth = getBreadcrumbDepth(hit)
    const seen = seenPages.get(parentPath)

    if (!seen) {
      seenPages.set(parentPath, { depth: depth, index: kept.length })
      kept.push(hit)
      return
    }

    // Page already represented: sub-sections of a directly matched page and
    // deep component entries never displace the kept hit.
    if (isSubSection(hit) && parentMatches.has(parentPath)) return
    if (isComponentUrl(hit.url) && depth > 2) return

    if (depth < seen.depth) {
      kept[seen.index] = hit
      seen.depth = depth
    }
  })

  return kept
}
const sortedHits = sortByCoreDocs(dedupedHits).slice(0, RESULTS_LIMIT) + const data = sortedHits.reduce((data, hit) => { const section = hit.hierarchy.lvl0 const sectionKey = `${section}-${hit.version || ''}` diff --git a/antora-ui-camel/src/partials/header-content.hbs b/antora-ui-camel/src/partials/header-content.hbs index 3173cfc98e4..4ae105d550c 100644 --- a/antora-ui-camel/src/partials/header-content.hbs +++ b/antora-ui-camel/src/partials/header-content.hbs @@ -17,7 +17,7 @@