urffsamhunt · sencoders · Sep 14, 2025 · Sep 14, 2025 · Sep 14, 2025 · Sep 14, 2025
diff --git a/background.js b/background.js
@@ -4,13 +4,45 @@
 //   });
 // });
 
+// --- State Management ---
+let activeTabId = null; // ID of the tab currently tracked as active; null until one is known.
+let activeTabSanitizedHtml = ''; // Latest sanitized HTML received from the active tab's content script.
+const SERVER_URL = 'http://localhost:3000'; // Backend base URL; currently points at a local development server.
+// TODO: replace SERVER_URL with the deployed server's URL before shipping to production.
 
-// This listener triggers automatically whenever a tab finishes loading.
+// When the active tab finishes loading a web page, refresh its sanitized HTML.
 browser.tabs.onUpdated.addListener((tabId, changeInfo, tab) => {
     // We only act when a page has completely loaded and has a web URL.
-    if (changeInfo.status === 'complete' && tab.url && tab.url.startsWith('http')) {
-        console.log(`Page loaded: ${tab.url}. Requesting HTML...`);
+    if (tabId === activeTabId && changeInfo.status === 'complete' && tab.url && tab.url.startsWith('http')) {
+        requestSanitizedHtml(tabId);
+    }});
+
+// Tab switch: remember the newly focused tab and fetch its sanitized HTML.
+browser.tabs.onActivated.addListener(({ tabId }) => {
+    activeTabId = tabId;
+    requestSanitizedHtml(tabId);
+});
+
+/**
+ * Fetches a tab's sanitized HTML into the active-tab cache, clearing stale HTML and ignoring late replies.
+ * @param {number} tabId The ID of the tab to request HTML from.
+ */
+function requestSanitizedHtml(tabId) {
+    if (!tabId) return;
+    if (tabId === activeTabId) activeTabSanitizedHtml = ''; // The cached HTML is stale once a refresh starts.
+    browser.tabs.sendMessage(tabId, { action: "getSanitizedPageHtml" })
+        .then(response => {
+            if (tabId !== activeTabId || !response || !response.html) return; // Drop replies from inactive tabs.
+            activeTabSanitizedHtml = response.html;
+            console.log(`Successfully updated HTML context for tab ${tabId}.`);
+        })
+        .catch(error => console.error(`Could not get HTML from content script for tab ${tabId}: ${error}`));
+}
 
+<<<<<<< HEAD
+/*        // Ask the content script on that tab to send us its HTML content.
+=======
+>>>>>>> 70e1aab686b724bb60b57e04dbf22a2ca9be0c47
         browser.tabs.sendMessage(tabId, { action: "getPageHtml" })
             .then(response => {
                 if (response && response.html) {
@@ -26,15 +58,91 @@ function handleRawHtml(rawHtml) {
     console.log("Successfully captured raw HTML content from the page:");
     console.log(rawHtml); 
 }
+*/
+
+
+/**
+ * Main entry point for the UI (e.g., popup.js): sends the user's transcribed command and the
+ * active page's sanitized HTML to the backend, then runs the returned action or speaks its question.
+ * Failures are logged and spoken to the user; the returned promise does not reject.
+ * @param {string} promptText The transcribed text from the user's voice command.
+ */
+async function processUserCommand(promptText) {
+    // Snapshot shared state so the action reaches the tab whose HTML was sent, even after a tab switch.
+    const tabId = activeTabId;
+    const pageHtmlContext = activeTabSanitizedHtml;
+    if (!tabId || !pageHtmlContext) {
+        console.error("No active tab or page context available.");
+        speakText("I'm sorry, I don't have the context of the page yet. Please wait a moment and try again.");
+        return;
+    }
+
+    console.log(`Processing command: "${promptText}"`);
+    try {
+        const response = await fetch(`${SERVER_URL}/process-command`, {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({ userPrompt: promptText, pageHtmlContext })
+        });
+        if (!response.ok) {
+            throw new Error(`Server responded with status: ${response.status}`);
+        }
+        const result = await response.json();
+        if (result.type === 'action' && result.command) {
+            // Awaited so that a failed delivery (e.g. the tab was closed) reaches the catch block.
+            await browser.tabs.sendMessage(tabId, result.command);
+        } else if (result.type === 'clarification' && result.question) {
+            // Speak the clarifying question back to the user
+            speakText(result.question);
+        } else {
+            // A reply with an unknown type or without its payload cannot be acted on.
+            throw new Error(`Unusable server response: ${JSON.stringify(result)}`);
+        }
+    } catch (error) {
+        console.error('Error in processUserCommand:', error);
+        speakText("I'm sorry, I encountered an error trying to process your command.");
+    }
+}
+/**
+ * Posts text to the backend's TTS endpoint (the client UI is assumed to play the generated audio, e.g. out.wav).
+ * @param {string} text The text to be spoken.
+ */
+async function speakText(text) {
+    try {
+        const response = await fetch(`${SERVER_URL}/generate-tts`, {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({ text })
+        });
+        // fetch() only rejects on network failure, so HTTP error statuses are checked explicitly.
+        if (!response.ok) throw new Error(`TTS endpoint responded with status: ${response.status}`);
+    } catch (error) {
+        console.error('Failed to call TTS endpoint:', error);
+    }
+}
 
 //handling actions requested by the Content Script
 browser.runtime.onMessage.addListener((message, sender) => {
     if (message.action === 'search') {
+<<<<<<< HEAD
+        performSearch(message.query, sender.tab);
+=======
         performSearch(message.query, sender && sender.tab);
+>>>>>>> 70e1aab686b724bb60b57e04dbf22a2ca9be0c47
     } else if (message.action === 'addBookmark') {
         addBookmark(sender.tab);
     }
 });
+<<<<<<< HEAD
+function performSearch(query, tab) {
+    const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(query)}`;
+    // For a better user experience, perform the search in the user's current tab.
+    if (tab && tab.id) {
+        browser.tabs.update(tab.id, { url: searchUrl });
+    } else {
+        browser.tabs.create({ url: searchUrl }); // Fallback to a new tab
+    }
+=======
 
 function performSearch(query, tab) {
     const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(query)}`;
@@ -61,15 +169,22 @@ function performSearch(query, tab) {
             console.error('Error finding active tab for search, creating new tab instead:', err);
             browser.tabs.create({ url: searchUrl });
         });
+>>>>>>> 70e1aab686b724bb60b57e04dbf22a2ca9be0c47
 }
 
 function addBookmark(tab) {
-    if (tab && tab.url) {
+    if (tab && tab.url) { // Title is optional: the create() call below falls back to 'New Bookmark'.
         browser.bookmarks.create({
             title: tab.title || 'New Bookmark',
             url: tab.url
         });
     }
 }
 
-
+// Seed the active-tab state when the extension starts, before any tab switch has been observed.
+// A rejected query is logged rather than left as an unhandled promise rejection.
+browser.tabs.query({ active: true, currentWindow: true }).then(([tab]) => {
+    if (!tab) return;
+    activeTabId = tab.id;
+    requestSanitizedHtml(tab.id);
+}).catch(error => console.error(`Could not determine the active tab at startup: ${error}`));
diff --git a/bundle/server.js b/bundle/server.js
@@ -14,7 +14,7 @@ const MODEL2_NAME = 'gemini-2.5-flash-preview-tts';
 
 const app = express();
 app.use(cors());
-app.use(express.json());
+app.use(express.json({ limit: '10mb' })); // Raise the JSON body-size limit so a page's HTML context fits.
 
 // Using memory storage to handle the file as a buffer
 const storage = multer.memoryStorage();
@@ -193,6 +193,99 @@ app.post('/generate-json', async (req, res) => {
 }
 );
 
+// --- NEW AMBIGUITY HANDLING ENDPOINT ---
+
+/**
+ * POST /process-command (Content-Type: application/json)
+ * Accepts a user's text prompt and the sanitized HTML of the current page, asks the model
+ * (constrained to the JSON schema below) for either an executable action or a clarifying
+ * question, and relays the parsed JSON to the client.
+ * Body: { "userPrompt": "text of user's command", "pageHtmlContext": "sanitized html string" }
+ * Responses: 200 with the model's JSON; 400 if either body field is missing or empty;
+ * 500 (with the error message in `details`) if anything in the handler throws.
+ */
+app.post('/process-command', async (req, res) => {
+    console.log('Received request for /process-command');
+    try {
+        const { userPrompt, pageHtmlContext } = req.body;
+
+        if (!userPrompt || !pageHtmlContext) {
+            return res.status(400).json({ error: 'User prompt and page HTML context are required.' });
+        }
+
+        const generationConfig = {
+            responseMimeType: "application/json",
+            responseSchema: {
+                type: "OBJECT",
+                properties: {
+                    type: {
+                        type: "STRING",
+                        enum: ["action", "clarification"],
+                    },
+                    command: {
+                        type: "OBJECT",
+                        properties: {
+                            key: {
+                                type: "STRING",
+                                enum: ["click", "hover", "input", "back", "forward", "search", "bookmark"],
+                            },
+                            value: {
+                                type: "ANY" // String, array, or object depending on the key. NOTE(review): confirm the SDK accepts "ANY" as a schema type.
+                            },
+                        },
+                    },
+                    question: {
+                        type: "STRING",
+                    },
+                },
+                required: ["type"], // `command` and `question` are optional in the schema, so either may be absent.
+            },
+        };
+
+        const model = genAI.getGenerativeModel({ model: MODEL_NAME, safetySettings, generationConfig });
+
+        // Core prompt telling the AI how to handle ambiguity; the user command and page HTML are interpolated verbatim at the end.
+        const instructionPrompt = `
+You are an AI assistant for a browser extension for visually impaired users. Your goal is to translate a user's voice command into a precise action or, if the command is ambiguous, a clarifying question. You will be given the user's command and the sanitized HTML of the current webpage.
+
+Analyze the user's command based on the provided HTML context and respond in one of two JSON formats:
+
+1.  If the command is clear OR if it is ambiguous but you can resolve it using the HTML context (e.g., user says "click the first video" and you can identify it), respond with an "action" object.
+    - For actions like 'click', 'hover', or 'input', use the exact text from the HTML element as the target.
+    - The format is: {"type": "action", "command": {"key": "...", "value": ...}}
+
+    Examples:
+    - User says: "search for funny cat videos" -> {"type": "action", "command": {"key": "search", "value": "funny cat videos"}}
+    - User says: "click on the contact us button" -> {"type": "action", "command": {"key": "click", "value": {"text": "Contact Us"}}}
+    - User says: "type hello world into the username field" -> {"type": "action", "command": {"key": "input", "value": ["hello world", {"text": "username"}]}}
+
+2.  If the command is ambiguous and you CANNOT resolve it with the given HTML (e.g., user says "click the link" and there are many links), you MUST ask a clarifying question. Do not try to guess.
+    - The format is: {"type": "clarification", "question": "Your question to the user."}
+
+    Example:
+    - User says: "click the button" and the HTML contains "Login", "Sign Up", and "Learn More" buttons.
+    - Your response: {"type": "clarification", "question": "I see a few buttons: Login, Sign Up, and Learn More. Which one would you like me to click?"}
+
+---
+USER COMMAND: "${userPrompt}"
+
+PAGE HTML CONTEXT:
+${pageHtmlContext}
+---
+        `;
+
+        const result = await model.generateContent(instructionPrompt);
+        const response = result.response;
+        const jsonResponse = JSON.parse(response.text()); // Throws on malformed JSON; handled by the catch below.
+
+        console.log('Successfully processed command. Sending response:', jsonResponse);
+        res.status(200).json(jsonResponse);
+
+    } catch (error) {
+        console.error('Error in /process-command:', error);
+        res.status(500).json({ error: 'Failed to process command.', details: error.message });
+    }
+});
 
 
 /**

diff --git a/contentScript.js b/contentScript.js
@@ -1,12 +1,22 @@
 // Listens for messages from the background script.
 browser.runtime.onMessage.addListener((message, sender, sendResponse) => {
+<<<<<<< HEAD
+    // Handles the request for SANITIZED HTML from the background script.
+    if (message.action === "getSanitizedPageHtml") {
+        console.log("Background script requested sanitized HTML. Sanitizing and Sending it now.");
+        const sanitizedHtml = parseAndSanitizePage(document.body);
+        sendResponse({ html: sanitizedHtml });
+        return true; // Required for asynchronous response.
+    }
+=======
   // Handles the request for HTML from the background script.
   if (message.action === "getPageHtml") {
     console.log("Background script requested HTML. Sending it now.");
     sendResponse({ html: document });
     parseAndSanitize(document);
     return true; // Required for asynchronous responses.
   }
+>>>>>>> 70e1aab686b724bb60b57e04dbf22a2ca9be0c47
 
   if (message.key === "ai_result" && message.value) {
     console.log(
@@ -40,6 +50,10 @@ browser.tabs.onUpdated.addListener((tabId, changeInfo, tab) => {
     }
 });
 
+<<<<<<< HEAD
+/*
+=======
+>>>>>>> 70e1aab686b724bb60b57e04dbf22a2ca9be0c47
 function parseAndSanitize(dom) {
     // Select all <a> elements that have an <h3> as a direct child
     const nodes = dom.querySelectorAll('a:has(> h3)');
@@ -51,6 +65,37 @@ function parseAndSanitize(dom) {
 
     return htmlString; // optionally return the concatenated HTML string
 }
+*/ 
+
+/**
+ * Sanitizes the page's body into a compact HTML string for the AI: clutter and hidden elements
+ * are removed and long text is shortened, while the remaining element structure is kept.
+ * NOTE(review): a second `function parseAndSanitizePage(dommy)` follows this one; confirm it does not shadow it.
+ * @param {HTMLElement} body - The document.body element.
+ * @returns {string} - A simplified HTML string representing the page content.
+ */
+function parseAndSanitizePage(body) {
+    if (!body) return "";
+    // Work on a clone so the live page is never modified.
+    const clone = body.cloneNode(true);
+
+    // Remove tags that are usually irrelevant for navigation and clutter the context.
+    const tagsToRemove = ['script', 'style', 'noscript', 'iframe', 'svg', 'header', 'footer', 'nav', 'img', 'link', 'meta'];
+    clone.querySelectorAll(tagsToRemove.join(',')).forEach(el => el.remove());
+
+    // Remove hidden elements (inline styles may be written with or without a space after the colon).
+    clone.querySelectorAll('[style*="display: none"], [style*="display:none"], [hidden]').forEach(el => el.remove());
+
+    // Shorten long text per text node; assigning textContent on a container would delete its child elements.
+    const walker = document.createTreeWalker(clone, NodeFilter.SHOW_TEXT);
+    while (walker.nextNode()) {
+        const node = walker.currentNode;
+        if (node.nodeValue.length > 200) node.nodeValue = `${node.nodeValue.substring(0, 200)}...`;
+    }
+
+    // Collapse whitespace runs and return the cleaned HTML as a string.
+    return clone.innerHTML.replace(/\s{2,}/g, ' ').trim();
+}
 
 function parseAndSanitizePage(dommy) {
     const nodes = dommy.querySelector(body);
@@ -93,6 +138,19 @@ function executeCommand(command) {
 
 // --- Helper Functions to Find Elements on the Page ---
 function findElementByText(text) {
+<<<<<<< HEAD
+    if (!text) return null;
+    const lowerCaseText = text.trim().toLowerCase();
+    const candidates = document.querySelectorAll('a, button, [role="button"], [role="link"], input[type="submit"]');
+    // Find the best match, preferring exact matches
+    let bestMatch = null;
+    for (const el of Array.from(candidates)) {
+        const elText = el.textContent.trim().toLowerCase();
+        if (elText === lowerCaseText) return el; // Exact match found
+        if (elText.includes(lowerCaseText)) bestMatch = el; // Partial match
+    }
+    return bestMatch;
+=======
   if (!text) return null;
   const lowerCaseText = text.trim().toLowerCase();
   const candidates = document.querySelectorAll(
@@ -101,6 +159,7 @@ function findElementByText(text) {
   return Array.from(candidates).find((el) =>
     el.textContent.trim().toLowerCase().includes(lowerCaseText)
   );
+>>>>>>> 70e1aab686b724bb60b57e04dbf22a2ca9be0c47
 }
 
 function findElementForInput(labelText) {
@@ -112,8 +171,13 @@ function findElementForInput(labelText) {
       if (inputId) return document.getElementById(inputId);
       return label.querySelector("input, textarea, select");
     }
+<<<<<<< HEAD
+    // Fallback for inputs without labels
+    return document.querySelector(`[aria-label*="${labelText}" i], [placeholder*="${labelText}" i]`);
+=======
   }
   return null;
+>>>>>>> 70e1aab686b724bb60b57e04dbf22a2ca9be0c47
 }
 
 //Procedure Execution