From 85f2a4a3bb077ca30280a5aba612891f5fcf29e9 Mon Sep 17 00:00:00 2001 From: sencoders Date: Sun, 14 Sep 2025 22:12:32 +0530 Subject: [PATCH 1/8] added an endpoint for ambiguity resolving --- bundle/server.js | 93 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/bundle/server.js b/bundle/server.js index 3ad2f83..85d0851 100644 --- a/bundle/server.js +++ b/bundle/server.js @@ -193,6 +193,99 @@ app.post('/generate-json', async (req, res) => { } ); +// --- NEW AMBIGUITY HANDLING ENDPOINT --- + +/** + * Endpoint for Command Processing + * This endpoint accepts a user's text prompt and the sanitized HTML of the current page. + * It intelligently decides whether the command is clear, ambiguous but resolvable, + * or ambiguous and requires a clarifying question for the user. + * * Route: POST /process-command + * Content-Type: application/json + * Body: { "userPrompt": "text of user's command", "pageHtmlContext": "sanitized html string" } + */ +app.post('/process-command', async (req, res) => { + console.log('Received request for /process-command'); + try { + const { userPrompt, pageHtmlContext } = req.body; + + if (!userPrompt || !pageHtmlContext) { + return res.status(400).json({ error: 'User prompt and page HTML context are required.' }); + } + + const generationConfig = { + responseMimeType: "application/json", + responseSchema: { + type: "OBJECT", + properties: { + type: { + type: "STRING", + enum: ["action", "clarification"], + }, + command: { + type: "OBJECT", + properties: { + key: { + type: "STRING", + enum: ["click", "hover", "input", "back", "forward", "search", "bookmark"], + }, + value: { + type: "ANY" // Value can be a string, array, or object depending on the key + }, + }, + }, + question: { + type: "STRING", + }, + }, + required: ["type"], + }, + }; + + const model = genAI.getGenerativeModel({ model: MODEL_NAME, safetySettings, generationConfig }); + + // This is the core prompt that instructs the AI on how to handle ambiguity. + const instructionPrompt = ` +You are an AI assistant for a browser extension for visually impaired users. Your goal is to translate a user's voice command into a precise action or, if the command is ambiguous, a clarifying question. You will be given the user's command and the sanitized HTML of the current webpage. + +Analyze the user's command based on the provided HTML context and respond in one of two JSON formats: + +1. If the command is clear OR if it is ambiguous but you can resolve it using the HTML context (e.g., user says "click the first video" and you can identify it), respond with an "action" object. + - For actions like 'click', 'hover', or 'input', use the exact text from the HTML element as the target. + - The format is: {"type": "action", "command": {"key": "...", "value": ...}} + + Examples: + - User says: "search for funny cat videos" -> {"type": "action", "command": {"key": "search", "value": "funny cat videos"}} + - User says: "click on the contact us button" -> {"type": "action", "command": {"key": "click", "value": {"text": "Contact Us"}}} + - User says: "type hello world into the username field" -> {"type": "action", "command": {"key": "input", "value": ["hello world", {"text": "username"}]}} + +2. If the command is ambiguous and you CANNOT resolve it with the given HTML (e.g., user says "click the link" and there are many links), you MUST ask a clarifying question. Do not try to guess. + - The format is: {"type": "clarification", "question": "Your question to the user."} + + Example: + - User says: "click the button" and the HTML contains "Login", "Sign Up", and "Learn More" buttons. + - Your response: {"type": "clarification", "question": "I see a few buttons: Login, Sign Up, and Learn More. Which one would you like me to click?"} + +--- +USER COMMAND: "${userPrompt}" + +PAGE HTML CONTEXT: +${pageHtmlContext} +--- + `; + + const result = await model.generateContent(instructionPrompt); + const response = result.response; + const jsonResponse = JSON.parse(response.text()); + + console.log('Successfully processed command. Sending response:', jsonResponse); + res.status(200).json(jsonResponse); + + } catch (error) { + console.error('Error in /process-command:', error); + res.status(500).json({ error: 'Failed to process command.', details: error.message }); + } +}); /** From 90ac77d051b23ce3c8ae979b59efa28a0ec28a1f Mon Sep 17 00:00:00 2001 From: sencoders Date: Sun, 14 Sep 2025 22:23:02 +0530 Subject: [PATCH 2/8] increased payload for html --- bundle/server.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bundle/server.js b/bundle/server.js index 85d0851..5799fdb 100644 --- a/bundle/server.js +++ b/bundle/server.js @@ -14,7 +14,7 @@ const MODEL2_NAME = 'gemini-2.5-flash-preview-tts'; const app = express(); app.use(cors()); -app.use(express.json()); +app.use(express.json({ limit: '10mb' }));// Increase payload limit for HTML context // Using memory storage to handle the file as a buffer const storage = multer.memoryStorage(); From ccc466a93ce45e2a8100022293e6faa3f2ef6235 Mon Sep 17 00:00:00 2001 From: sencoders Date: Sun, 14 Sep 2025 22:53:09 +0530 Subject: [PATCH 3/8] updated addListener for sanitized html --- contentScript.js | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/contentScript.js b/contentScript.js index e347f29..4734efc 100644 --- a/contentScript.js +++ b/contentScript.js @@ -1,11 +1,11 @@ // Listens for messages from the background script. browser.runtime.onMessage.addListener((message, sender, sendResponse) => { - // Handles the request for HTML from the background script. - if (message.action === "getPageHtml") { - console.log("Background script requested HTML. Sending it now."); - sendResponse({ html: document }); - parseAndSanitize(document); - return true; // Required for asynchronous responses. + // Handles the request for SANITIZED HTML from the background script. + if (message.action === "getSanitizedPageHtml") { + console.log("Background script requested sanitized HTML. Sanitizing and Sending it now."); + const sanitizedHtml = parseAndSanitizePage(document.body); + sendResponse({ html: sanitizedHtml }); + return true; // Required for asynchronous response. } // If the popup wrapped the backend result as { key: 'ai_result', value: result } From 6ade3445ba995f9be7ea86c5e797a9945884e6e8 Mon Sep 17 00:00:00 2001 From: sencoders Date: Sun, 14 Sep 2025 23:01:05 +0530 Subject: [PATCH 4/8] changed the sanitation function --- contentScript.js | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/contentScript.js b/contentScript.js index 4734efc..971d30f 100644 --- a/contentScript.js +++ b/contentScript.js @@ -22,7 +22,7 @@ browser.runtime.onMessage.addListener((message, sender, sendResponse) => { } }); - +/* function parseAndSanitize(dom) { // Select all elements that have an

as a direct child const nodes = dom.querySelectorAll('a:has(> h3)'); @@ -34,6 +34,37 @@ function parseAndSanitize(dom) { return htmlString; // optionally return the concatenated HTML string } +*/ + +/** + * Sanitizes the page's body to create a clean, simple HTML string for the AI. + * This focuses on interactive elements and text content, removing clutter. + * @param {HTMLElement} body - The document.body element. + * @returns {string} - A simplified HTML string representing the page content. + */ +function parseAndSanitizePage(body) { + if (!body) return ""; + + // Create a clone of the body to avoid modifying the actual page. + const clone = body.cloneNode(true); + + // Remove tags that are usually irrelevant for navigation and clutter the context. + const tagsToRemove = ['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'img', 'link', 'meta']; + clone.querySelectorAll(tagsToRemove.join(',')).forEach(el => el.remove()); + + // Remove hidden elements + clone.querySelectorAll('[style*="display: none"], [hidden]').forEach(el => el.remove()); + + // Reduce long text to avoid exceeding token limits + clone.querySelectorAll('p, div, span').forEach(el => { + if (el.textContent.length > 200) { + el.textContent = el.textContent.substring(0, 200) + '...'; + } + }); + + // Return the cleaned HTML as a string + return clone.innerHTML.replace(/\s{2,}/g, ' ').trim(); // Collapse whitespace +} //Procedure S function executeCommand(command) { From ecd16b197d336ca85c7455737634aa1b6f22beff Mon Sep 17 00:00:00 2001 From: sencoders Date: Sun, 14 Sep 2025 23:10:11 +0530 Subject: [PATCH 5/8] modified the helper functions --- contentScript.js | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/contentScript.js b/contentScript.js index 971d30f..23ceefa 100644 --- a/contentScript.js +++ b/contentScript.js @@ -87,7 +87,14 @@ function findElementByText(text) { if (!text) return null; const lowerCaseText = text.trim().toLowerCase(); const candidates = document.querySelectorAll('a, button, [role="button"], [role="link"], input[type="submit"]'); - return Array.from(candidates).find(el => el.textContent.trim().toLowerCase().includes(lowerCaseText)); + // Find the best match, preferring exact matches + let bestMatch = null; + for (const el of Array.from(candidates)) { + const elText = el.textContent.trim().toLowerCase(); + if (elText === lowerCaseText) return el; // Exact match found + if (elText.includes(lowerCaseText)) bestMatch = el; // Partial match + } + return bestMatch; } function findElementForInput(labelText) { @@ -100,7 +107,8 @@ function findElementForInput(labelText) { return label.querySelector('input, textarea, select'); } } - return null; + // Fallback for inputs without labels + return document.querySelector(`[aria-label*="${labelText}" i], [placeholder*="${labelText}" i]`); } //Procedure Execution From 659e541e78b2edae36d8563609a519c13ee0a055 Mon Sep 17 00:00:00 2001 From: sencoders Date: Mon, 15 Sep 2025 00:02:22 +0530 Subject: [PATCH 6/8] changed the request sanitized html function --- background.js | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/background.js b/background.js index 9e7df48..f56b021 100644 --- a/background.js +++ b/background.js @@ -4,14 +4,41 @@ // }); // }); +// --- State Management --- +let activeTabId = null; +let activeTabSanitizedHtml = ''; +const SERVER_URL = 'http://localhost:3000'; // IMPORTANT: Replace with the server's URL and port. But it has to be replaced by our deployed server URL in production. -// This listener triggers automatically whenever a tab finishes loading. +// When a tab finishes loading, get its sanitized HTML. browser.tabs.onUpdated.addListener((tabId, changeInfo, tab) => { // We only act when a page has completely loaded and has a web URL. - if (changeInfo.status === 'complete' && tab.url && tab.url.startsWith('http')) { - console.log(`Page loaded: ${tab.url}. Requesting HTML...`); + if (tabId === activeTabId && changeInfo.status === 'complete' && tab.url && tab.url.startsWith('http')) { + requestSanitizedHtml(tabId); + }}); + + // When the user switches to a different tab, update the active tab ID and get its HTML. +browser.tabs.onActivated.addListener(activeInfo => { + activeTabId = activeInfo.tabId; + requestSanitizedHtml(activeTabId); +}); + +/** + * Asks the content script of a given tab for its sanitized HTML content. + * @param {number} tabId The ID of the tab to request HTML from. + */ +function requestSanitizedHtml(tabId) { + if (!tabId) return; + browser.tabs.sendMessage(tabId, { action: "getSanitizedPageHtml" }) + .then(response => { + if (response && response.html) { + console.log(`Successfully updated HTML context for tab ${tabId}.`); + activeTabSanitizedHtml = response.html; + } + }) + .catch(error => console.error(`Could not get HTML from content script for tab ${tabId}: ${error}`)); +} - // Ask the content script on that tab to send us its HTML content. +/* // Ask the content script on that tab to send us its HTML content. browser.tabs.sendMessage(tabId, { action: "getPageHtml" }) .then(response => { if (response && response.html) { @@ -21,7 +48,7 @@ browser.tabs.onUpdated.addListener((tabId, changeInfo, tab) => { .catch(error => console.error(`Could not get HTML from content script: ${error}`)); } }); - +*/ //Take the captured HTML and log it. function handleRawHtml(rawHtml) { From 307e83cd956be49b3a3d256dfaefd68f79fe2bef Mon Sep 17 00:00:00 2001 From: sencoders Date: Mon, 15 Sep 2025 00:10:50 +0530 Subject: [PATCH 7/8] created functions interacting with the endpoints --- background.js | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/background.js b/background.js index f56b021..57882f5 100644 --- a/background.js +++ b/background.js @@ -48,13 +48,74 @@ function requestSanitizedHtml(tabId) { .catch(error => console.error(`Could not get HTML from content script: ${error}`)); } }); -*/ //Take the captured HTML and log it. function handleRawHtml(rawHtml) { console.log("Successfully captured raw HTML content from the page:"); console.log(rawHtml); } +*/ + + +/** + * This is the main entry point to be called from our UI (e.g., popup.js). + * It takes the user's transcribed text, sends it to the backend with context, and handles the response. + * @param {string} promptText The transcribed text from the user's voice command. + */ +async function processUserCommand(promptText) { + if (!activeTabId || !activeTabSanitizedHtml) { + console.error("No active tab or page context available."); + speakText("I'm sorry, I don't have the context of the page yet. Please wait a moment and try again."); + return; + } + + console.log(`Processing command: "${promptText}"`); + try { + const response = await fetch(`${SERVER_URL}/process-command`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + userPrompt: promptText, + pageHtmlContext: activeTabSanitizedHtml + }) + }); + + if (!response.ok) { + throw new Error(`Server responded with status: ${response.status}`); + } + + const result = await response.json(); + + if (result.type === 'action') { + // Send the executable command to the active content script + browser.tabs.sendMessage(activeTabId, result.command); + } else if (result.type === 'clarification') { + // Speak the clarifying question back to the user + speakText(result.question); + } + + } catch (error) { + console.error('Error in processUserCommand:', error); + speakText("I'm sorry, I encountered an error trying to process your command."); + } +} +/** + * Sends text to our backend's TTS endpoint to be converted to speech. + * @param {string} text The text to be spoken. + */ +async function speakText(text) { + try { + await fetch(`${SERVER_URL}/generate-tts`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ text: text }) + }); + // NOTE: This assumes our client-side UI is set up to play the audio + // that this endpoint generates and saves (e.g., out.wav). + } catch (error) { + console.error('Failed to call TTS endpoint:', error); + } +} //handling actions requested by the Content Script browser.runtime.onMessage.addListener((message, sender) => { From 1213f36b830fb1d55a3f5b560e4f23f64e6d1f2d Mon Sep 17 00:00:00 2001 From: sencoders Date: Mon, 15 Sep 2025 00:33:03 +0530 Subject: [PATCH 8/8] modified listener for helper + recorded tabID --- background.js | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/background.js b/background.js index 57882f5..44aad46 100644 --- a/background.js +++ b/background.js @@ -7,7 +7,8 @@ // --- State Management --- let activeTabId = null; let activeTabSanitizedHtml = ''; -const SERVER_URL = 'http://localhost:3000'; // IMPORTANT: Replace with the server's URL and port. But it has to be replaced by our deployed server URL in production. +const SERVER_URL = 'http://localhost:3000'; // IMPORTANT: Replace with the server's URL and port. +// But it has to be later replaced by our deployed server URL in production. // When a tab finishes loading, get its sanitized HTML. browser.tabs.onUpdated.addListener((tabId, changeInfo, tab) => { @@ -120,18 +121,22 @@ async function speakText(text) { //handling actions requested by the Content Script browser.runtime.onMessage.addListener((message, sender) => { if (message.action === 'search') { - performSearch(message.query); + performSearch(message.query, sender.tab); } else if (message.action === 'addBookmark') { addBookmark(sender.tab); } }); -function performSearch(query) { - browser.tabs.create({ - url: `https://www.google.com/search?q=${encodeURIComponent(query)}` - }); +function performSearch(query, tab) { + const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(query)}`; + // For a better user experience, perform the search in the user's current tab. + if (tab && tab.id) { + browser.tabs.update(tab.id, { url: searchUrl }); + } else { + browser.tabs.create({ url: searchUrl }); // Fallback to a new tab + } } function addBookmark(tab) { - if (tab && tab.url) { + if (tab && tab.url && tab.title) { browser.bookmarks.create({ title: tab.title || 'New Bookmark', url: tab.url @@ -139,4 +144,10 @@ function addBookmark(tab) { } } - +// Initialize the active tab ID when the extension starts. +browser.tabs.query({ active: true, currentWindow: true }).then(tabs => { + if (tabs[0]) { + activeTabId = tabs[0].id; + requestSanitizedHtml(activeTabId); + } +});