From e72ac7ac2da79785c3009329f731078e4eaad735 Mon Sep 17 00:00:00 2001 From: thinko Date: Tue, 1 Apr 2025 11:48:18 -0700 Subject: [PATCH] feat: add direct HTML file parsing support --- src/service/document/parse/html.js | 13 +++++++++++++ src/service/document/parse/index.js | 4 +++- src/service/document/parse/odt.js | 4 ++-- src/service/document/reader.js | 10 +++++++++- 4 files changed, 27 insertions(+), 4 deletions(-) diff --git a/src/service/document/parse/html.js b/src/service/document/parse/html.js index fb885a4..8a29042 100644 --- a/src/service/document/parse/html.js +++ b/src/service/document/parse/html.js @@ -33,6 +33,19 @@ function extractSectionsAndContent(html) { return extractedSections; } +async function parseHtml(htmlContent, documentPath) { + try { + // HTML content doesn't need citations and hyperlinks removed as extractSectionsAndContent handles that + const sections = extractSectionsAndContent(htmlContent); + // If a document path is provided, preserve it for session reset + return documentPath ? preserveDocumentContext(sections, documentPath) : sections; + } catch (err) { + console.error("Error parsing HTML content:", err); + return []; + } +} + module.exports = { extractSectionsAndContent, + parseHtml, }; diff --git a/src/service/document/parse/index.js b/src/service/document/parse/index.js index d9de47e..1e2bbc8 100644 --- a/src/service/document/parse/index.js +++ b/src/service/document/parse/index.js @@ -3,11 +3,13 @@ const { parseTxt } = require("./txt"); const { parseMd } = require("./md"); const { parseDocx } = require("./docx"); const { parseOdt } = require("./odt"); +const { parseHtml } = require("./html"); module.exports = { parseMd, parseOdt, parsePdf, parseTxt, - parseDocx + parseDocx, + parseHtml }; diff --git a/src/service/document/parse/odt.js b/src/service/document/parse/odt.js index 6a7d94b..8f510b8 100644 --- a/src/service/document/parse/odt.js +++ b/src/service/document/parse/odt.js @@ -11,8 +11,8 @@ async function parseOdt(odtFilePath) { return []; } - html = removeCitations(html); - html = removeHyperlinks(html); + html = removeCitations(html.value); // Ensure .value is used correctly + html = removeHyperlinks(html.value); return extractSectionsAndContent(html); } catch (err) { diff --git a/src/service/document/reader.js b/src/service/document/reader.js index 5c957e9..cd437ca 100644 --- a/src/service/document/reader.js +++ b/src/service/document/reader.js @@ -1,6 +1,6 @@ const path = require("path"); const fs = require("fs").promises; -const { parsePdf, parseMd, parseOdt, parseTxt, parseDocx } = require("./parse"); +const { parsePdf, parseMd, parseOdt, parseTxt, parseDocx, parseHtml } = require("./parse"); async function loadFile(filePath) { const fileExtension = path.extname(filePath).toLowerCase(); @@ -25,6 +25,14 @@ async function loadFile(filePath) { }; case ".pdf": return await parsePdf(filePath); + case ".html": + case ".xhtml": + case ".htm": + let htmlContent = await fs.readFile(filePath, "utf-8"); + return { + fileName: path.basename(filePath), + data: await parseHtml(htmlContent), + }; default: // just try to parse it as a text file let rawText = await fs.readFile(filePath, "utf-8");