From e72ac7ac2da79785c3009329f731078e4eaad735 Mon Sep 17 00:00:00 2001
From: thinko <ahandy@gmail.com>
Date: Tue, 1 Apr 2025 11:48:18 -0700
Subject: [PATCH] feat: add direct HTML file parsing support

---
 src/service/document/parse/html.js  | 13 +++++++++++++
 src/service/document/parse/index.js |  4 +++-
 src/service/document/parse/odt.js   |  4 ++--
 src/service/document/reader.js      | 10 +++++++++-
 4 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/src/service/document/parse/html.js b/src/service/document/parse/html.js
index fb885a4..8a29042 100644
--- a/src/service/document/parse/html.js
+++ b/src/service/document/parse/html.js
@@ -33,6 +33,19 @@ function extractSectionsAndContent(html) {
     return extractedSections;
 }
 
+async function parseHtml(htmlContent, documentPath) {
+    try {
+        // HTML content doesn't need citations and hyperlinks removed as extractSectionsAndContent handles that
+        const sections = extractSectionsAndContent(htmlContent);
+        // If a document path is provided, preserve it for session reset
+        return documentPath ? preserveDocumentContext(sections, documentPath) : sections;
+    } catch (err) {
+        console.error("Error parsing HTML content:", err);
+        return [];
+    }
+}
+
 module.exports = {
     extractSectionsAndContent,
+    parseHtml,
 };
diff --git a/src/service/document/parse/index.js b/src/service/document/parse/index.js
index d9de47e..1e2bbc8 100644
--- a/src/service/document/parse/index.js
+++ b/src/service/document/parse/index.js
@@ -3,11 +3,13 @@ const { parseTxt } = require("./txt");
 const { parseMd } = require("./md");
 const { parseDocx } = require("./docx");
 const { parseOdt } = require("./odt");
+const { parseHtml } = require("./html");
 
 module.exports = {
   parseMd,
   parseOdt,
   parsePdf,
   parseTxt,
-  parseDocx
+  parseDocx,
+  parseHtml
 };
diff --git a/src/service/document/parse/odt.js b/src/service/document/parse/odt.js
index 6a7d94b..8f510b8 100644
--- a/src/service/document/parse/odt.js
+++ b/src/service/document/parse/odt.js
@@ -11,8 +11,8 @@ async function parseOdt(odtFilePath) {
       return [];
     }
 
-    html = removeCitations(html);
-    html = removeHyperlinks(html);
+    html = removeCitations(html.value); // Ensure .value is used correctly
+    html = removeHyperlinks(html.value);
 
     return extractSectionsAndContent(html);
   } catch (err) {
diff --git a/src/service/document/reader.js b/src/service/document/reader.js
index 5c957e9..cd437ca 100644
--- a/src/service/document/reader.js
+++ b/src/service/document/reader.js
@@ -1,6 +1,6 @@
 const path = require("path");
 const fs = require("fs").promises;
-const { parsePdf, parseMd, parseOdt, parseTxt, parseDocx } = require("./parse");
+const { parsePdf, parseMd, parseOdt, parseTxt, parseDocx, parseHtml } = require("./parse");
 
 async function loadFile(filePath) {
   const fileExtension = path.extname(filePath).toLowerCase();
@@ -25,6 +25,14 @@ async function loadFile(filePath) {
       };
     case ".pdf":
       return await parsePdf(filePath);
+    case ".html":
+    case ".xhtml":
+    case ".htm":
+      let htmlContent = await fs.readFile(filePath, "utf-8");
+      return {
+        fileName: path.basename(filePath),
+        data: await parseHtml(htmlContent),
+      };
     default:
       // just try to parse it as a text file
       let rawText = await fs.readFile(filePath, "utf-8");