diff --git a/src/service/document/parse/html.js b/src/service/document/parse/html.js
index fb885a4..8a29042 100644
--- a/src/service/document/parse/html.js
+++ b/src/service/document/parse/html.js
@@ -33,6 +33,19 @@ function extractSectionsAndContent(html) {
return extractedSections;
}
+async function parseHtml(htmlContent, documentPath) { // Runs extractSectionsAndContent over htmlContent; resolves to [] if anything inside the try throws.
+ try {
+ // NOTE(review): parseOdt runs removeCitations/removeHyperlinks before extractSectionsAndContent; this path skips both — confirm the extractor really covers that for raw HTML.
+ const sections = extractSectionsAndContent(htmlContent);
+ // With a truthy documentPath the sections go through preserveDocumentContext; NOTE(review): that helper is not defined or imported in the visible part of this file — confirm it is in scope.
+ return documentPath ? preserveDocumentContext(sections, documentPath) : sections;
+ } catch (err) {
+ console.error("Error parsing HTML content:", err);
+ return []; // The failure is logged above; the caller receives an empty array instead of an exception.
+ }
+}
+
module.exports = {
extractSectionsAndContent,
+ parseHtml,
};
diff --git a/src/service/document/parse/index.js b/src/service/document/parse/index.js
index d9de47e..1e2bbc8 100644
--- a/src/service/document/parse/index.js
+++ b/src/service/document/parse/index.js
@@ -3,11 +3,13 @@ const { parseTxt } = require("./txt");
const { parseMd } = require("./md");
const { parseDocx } = require("./docx");
const { parseOdt } = require("./odt");
+const { parseHtml } = require("./html");
module.exports = {
parseMd,
parseOdt,
parsePdf,
parseTxt,
- parseDocx
+ parseDocx,
+ parseHtml
};
diff --git a/src/service/document/parse/odt.js b/src/service/document/parse/odt.js
index 6a7d94b..8f510b8 100644
--- a/src/service/document/parse/odt.js
+++ b/src/service/document/parse/odt.js
@@ -11,8 +11,8 @@ async function parseOdt(odtFilePath) {
return [];
}
- html = removeCitations(html);
- html = removeHyperlinks(html);
+ html = removeCitations(html.value); // Ensure .value is used correctly
+ html = removeHyperlinks(html.value);
return extractSectionsAndContent(html);
} catch (err) {
diff --git a/src/service/document/reader.js b/src/service/document/reader.js
index 5c957e9..cd437ca 100644
--- a/src/service/document/reader.js
+++ b/src/service/document/reader.js
@@ -1,6 +1,6 @@
const path = require("path");
const fs = require("fs").promises;
-const { parsePdf, parseMd, parseOdt, parseTxt, parseDocx } = require("./parse");
+const { parsePdf, parseMd, parseOdt, parseTxt, parseDocx, parseHtml } = require("./parse");
async function loadFile(filePath) {
const fileExtension = path.extname(filePath).toLowerCase();
@@ -25,6 +25,14 @@ async function loadFile(filePath) {
};
case ".pdf":
return await parsePdf(filePath);
+ case ".html":
+ case ".xhtml":
+ case ".htm":
+ let htmlContent = await fs.readFile(filePath, "utf-8");
+ return {
+ fileName: path.basename(filePath),
+ data: await parseHtml(htmlContent),
+ };
default:
// just try to parse it as a text file
let rawText = await fs.readFile(filePath, "utf-8");