From 102eaa9a7e27a5ba703872fd8e89057e8f801d84 Mon Sep 17 00:00:00 2001 From: Corey McCallum Date: Tue, 3 Feb 2026 14:16:23 -0800 Subject: [PATCH 01/10] feat(memory): add short-term memory system Adds deterministic short-term memory with three storage mechanisms: - Auto-store from tool responses via memory_hint field - Explicit memory_short tool (store/get/delete/list actions) - HTTP API endpoints for external access Backend: src/caal/memory/ package with file-based JSON persistence, singleton pattern, TTL support, and context injection into LLM. Frontend: Memory Panel UI with Brain icon button, entry list, detail modal, and clear all functionality. Includes i18n translations for en, fr, it. Co-Authored-By: Claude Opus 4.5 --- frontend/app/api/memory/[key]/route.ts | 63 ++++ frontend/app/api/memory/route.ts | 87 ++++++ frontend/components/app/view-controller.tsx | 6 + frontend/components/app/welcome-view.tsx | 13 +- frontend/components/memory/index.ts | 1 + frontend/components/memory/memory-panel.tsx | 309 ++++++++++++++++++++ frontend/messages/en.json | 24 ++ frontend/messages/fr.json | 24 ++ frontend/messages/it.json | 24 ++ src/caal/integrations/__init__.py | 2 + src/caal/integrations/memory_tool.py | 113 +++++++ src/caal/llm/llm_node.py | 34 ++- src/caal/memory/__init__.py | 34 +++ src/caal/memory/base.py | 51 ++++ src/caal/memory/short_term.py | 294 +++++++++++++++++++ src/caal/webhooks.py | 168 +++++++++++ voice_agent.py | 20 +- 17 files changed, 1261 insertions(+), 6 deletions(-) create mode 100644 frontend/app/api/memory/[key]/route.ts create mode 100644 frontend/app/api/memory/route.ts create mode 100644 frontend/components/memory/index.ts create mode 100644 frontend/components/memory/memory-panel.tsx create mode 100644 src/caal/integrations/memory_tool.py create mode 100644 src/caal/memory/__init__.py create mode 100644 src/caal/memory/base.py create mode 100644 src/caal/memory/short_term.py diff --git a/frontend/app/api/memory/[key]/route.ts 
b/frontend/app/api/memory/[key]/route.ts new file mode 100644 index 0000000..27b11ed --- /dev/null +++ b/frontend/app/api/memory/[key]/route.ts @@ -0,0 +1,63 @@ +import { NextRequest, NextResponse } from 'next/server'; + +const WEBHOOK_URL = process.env.WEBHOOK_URL || 'http://agent:8889'; + +interface RouteParams { + params: Promise<{ key: string }>; +} + +/** + * GET /api/memory/[key] - Get a single memory entry + */ +export async function GET(_request: NextRequest, { params }: RouteParams) { + try { + const { key } = await params; + const res = await fetch(`${WEBHOOK_URL}/memory/${encodeURIComponent(key)}`, { + method: 'GET', + headers: { 'Content-Type': 'application/json' }, + }); + + if (!res.ok) { + const text = await res.text(); + console.error(`[/api/memory/${key}] Backend error:`, res.status, text); + return NextResponse.json({ error: text || 'Backend error' }, { status: res.status }); + } + + const data = await res.json(); + return NextResponse.json(data); + } catch (error) { + console.error('[/api/memory/[key]] Error:', error); + return NextResponse.json( + { error: error instanceof Error ? error.message : 'Unknown error' }, + { status: 500 } + ); + } +} + +/** + * DELETE /api/memory/[key] - Delete a single memory entry + */ +export async function DELETE(_request: NextRequest, { params }: RouteParams) { + try { + const { key } = await params; + const res = await fetch(`${WEBHOOK_URL}/memory/${encodeURIComponent(key)}`, { + method: 'DELETE', + headers: { 'Content-Type': 'application/json' }, + }); + + if (!res.ok) { + const text = await res.text(); + console.error(`[/api/memory/${key}] Backend error:`, res.status, text); + return NextResponse.json({ error: text || 'Backend error' }, { status: res.status }); + } + + const data = await res.json(); + return NextResponse.json(data); + } catch (error) { + console.error('[/api/memory/[key]] Error:', error); + return NextResponse.json( + { error: error instanceof Error ? 
error.message : 'Unknown error' }, + { status: 500 } + ); + } +} diff --git a/frontend/app/api/memory/route.ts b/frontend/app/api/memory/route.ts new file mode 100644 index 0000000..ab43e22 --- /dev/null +++ b/frontend/app/api/memory/route.ts @@ -0,0 +1,87 @@ +import { NextRequest, NextResponse } from 'next/server'; + +const WEBHOOK_URL = process.env.WEBHOOK_URL || 'http://agent:8889'; + +/** + * GET /api/memory - List all memory entries + */ +export async function GET() { + try { + const res = await fetch(`${WEBHOOK_URL}/memory`, { + method: 'GET', + headers: { 'Content-Type': 'application/json' }, + }); + + if (!res.ok) { + const text = await res.text(); + console.error('[/api/memory] Backend error:', res.status, text); + return NextResponse.json({ error: text || 'Backend error' }, { status: res.status }); + } + + const data = await res.json(); + return NextResponse.json(data); + } catch (error) { + console.error('[/api/memory] Error:', error); + return NextResponse.json( + { error: error instanceof Error ? error.message : 'Unknown error' }, + { status: 500 } + ); + } +} + +/** + * POST /api/memory - Store a new memory entry + */ +export async function POST(request: NextRequest) { + try { + const body = await request.json(); + + const res = await fetch(`${WEBHOOK_URL}/memory`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(body), + }); + + if (!res.ok) { + const text = await res.text(); + console.error('[/api/memory] Backend error:', res.status, text); + return NextResponse.json({ error: text || 'Backend error' }, { status: res.status }); + } + + const data = await res.json(); + return NextResponse.json(data); + } catch (error) { + console.error('[/api/memory] Error:', error); + return NextResponse.json( + { error: error instanceof Error ? 
error.message : 'Unknown error' }, + { status: 500 } + ); + } +} + +/** + * DELETE /api/memory - Clear all memory entries + */ +export async function DELETE() { + try { + const res = await fetch(`${WEBHOOK_URL}/memory`, { + method: 'DELETE', + headers: { 'Content-Type': 'application/json' }, + }); + + if (!res.ok) { + const text = await res.text(); + console.error('[/api/memory] Backend error:', res.status, text); + return NextResponse.json({ error: text || 'Backend error' }, { status: res.status }); + } + + const data = await res.json(); + return NextResponse.json(data); + } catch (error) { + console.error('[/api/memory] Error:', error); + return NextResponse.json( + { error: error instanceof Error ? error.message : 'Unknown error' }, + { status: 500 } + ); + } +} diff --git a/frontend/components/app/view-controller.tsx b/frontend/components/app/view-controller.tsx index 655f13b..c419fa9 100644 --- a/frontend/components/app/view-controller.tsx +++ b/frontend/components/app/view-controller.tsx @@ -6,6 +6,7 @@ import { useSessionContext } from '@livekit/components-react'; import type { AppConfig } from '@/app-config'; import { SessionView } from '@/components/app/session-view'; import { WelcomeView } from '@/components/app/welcome-view'; +import { MemoryPanel } from '@/components/memory'; import { SettingsPanel } from '@/components/settings/settings-panel'; import { ToolsPanel } from '@/components/tools'; @@ -38,6 +39,7 @@ export function ViewController({ appConfig }: ViewControllerProps) { const { isConnected, start } = useSessionContext(); const [settingsOpen, setSettingsOpen] = useState(false); const [toolsOpen, setToolsOpen] = useState(false); + const [memoryOpen, setMemoryOpen] = useState(false); return ( <> @@ -50,6 +52,7 @@ export function ViewController({ appConfig }: ViewControllerProps) { onStartCall={start} onOpenSettings={() => setSettingsOpen(true)} onOpenTools={() => setToolsOpen(true)} + onOpenMemory={() => setMemoryOpen(true)} /> )} {/* Session view 
*/} @@ -63,6 +66,9 @@ export function ViewController({ appConfig }: ViewControllerProps) { {/* Tools panel */} setToolsOpen(false)} /> + + {/* Memory panel */} + setMemoryOpen(false)} /> ); } diff --git a/frontend/components/app/welcome-view.tsx b/frontend/components/app/welcome-view.tsx index 9778b3f..7cb4b45 100644 --- a/frontend/components/app/welcome-view.tsx +++ b/frontend/components/app/welcome-view.tsx @@ -1,7 +1,7 @@ 'use client'; import { useTranslations } from 'next-intl'; -import { Gear, Wrench } from '@phosphor-icons/react/dist/ssr'; +import { Brain, Gear, Wrench } from '@phosphor-icons/react/dist/ssr'; import { Button } from '@/components/livekit/button'; function WelcomeImage() { @@ -26,12 +26,14 @@ interface WelcomeViewProps { onStartCall: () => void; onOpenSettings?: () => void; onOpenTools?: () => void; + onOpenMemory?: () => void; } export const WelcomeView = ({ onStartCall, onOpenSettings, onOpenTools, + onOpenMemory, ref, }: React.ComponentProps<'div'> & WelcomeViewProps) => { const t = useTranslations('Welcome'); @@ -41,6 +43,15 @@ export const WelcomeView = ({
{/* Top right buttons */}
+ {onOpenMemory && ( + + )} {onOpenTools && ( +
+ + + {/* Content */} +
+ {loading ? ( +
{tCommon('loading')}
+ ) : entries.length === 0 ? ( +
+ +

{t('panel.emptyState')}

+

{t('panel.emptyHint')}

+
+ ) : ( +
+ {entries.map((entry) => ( +
setSelectedEntry(entry)} + > +
+
+ {entry.key} + +
+

+ {formatValue(entry.value)} +

+
+ {formatTimeAgo(entry.stored_at)} + {entry.expires_at && ( + + + {formatExpiry(entry.expires_at)} + + )} +
+
+ +
+ ))} +
+ )} +
+ + {/* Footer */} +
+ +
+
+ + {/* Detail modal */} + {selectedEntry && ( +
+
setSelectedEntry(null)} /> +
+ + +
+

{selectedEntry.key}

+ +
+ +
+ +
+                {typeof selectedEntry.value === 'string'
+                  ? selectedEntry.value
+                  : JSON.stringify(selectedEntry.value, null, 2)}
+              
+
+ +
+
+ + {new Date(selectedEntry.stored_at * 1000).toLocaleString()} +
+
+ + + {selectedEntry.expires_at + ? new Date(selectedEntry.expires_at * 1000).toLocaleString() + : t('detail.noExpiry')} + +
+
+ +
+ + +
+
+
+ )} +
, + document.body + ); +} diff --git a/frontend/messages/en.json b/frontend/messages/en.json index 55e513e..e93280e 100644 --- a/frontend/messages/en.json +++ b/frontend/messages/en.json @@ -8,11 +8,13 @@ "continue": "Continue", "cancel": "Cancel", "close": "Close", + "delete": "Delete", "retry": "Retry", "enabled": "Enabled", "disabled": "Disabled", "settings": "Settings", "tools": "Tools", + "memory": "Memory", "connected": "Connected" }, "Welcome": { @@ -221,6 +223,28 @@ "parametersTitle": "Tool Parameters" } }, + "Memory": { + "panel": { + "title": "Memory", + "subtitle": "{count} entries stored", + "emptyState": "No memories stored", + "emptyHint": "Memories are created when tools return data or when you ask me to remember something", + "clearAll": "Clear All" + }, + "confirmClearAll": "Clear all memory entries?", + "detail": { + "value": "Value", + "storedAt": "Stored", + "expiresAt": "Expires", + "noExpiry": "No expiry", + "source": "Source", + "sources": { + "tool_hint": "Auto-stored from tool", + "explicit": "Stored by request", + "api": "Stored via API" + } + } + }, "Setup": { "welcome": "Welcome to CAAL", "stepOf": "Step {current} of {total}", diff --git a/frontend/messages/fr.json b/frontend/messages/fr.json index b31bfc2..7bd7a71 100644 --- a/frontend/messages/fr.json +++ b/frontend/messages/fr.json @@ -8,11 +8,13 @@ "continue": "Continuer", "cancel": "Annuler", "close": "Fermer", + "delete": "Supprimer", "retry": "Réessayer", "enabled": "Actif", "disabled": "Inactif", "settings": "Paramètres", "tools": "Outils", + "memory": "Mémoire", "connected": "Connecté" }, "Welcome": { @@ -221,6 +223,28 @@ "parametersTitle": "Paramètres de l'outil" } }, + "Memory": { + "panel": { + "title": "Mémoire", + "subtitle": "{count} entrées stockées", + "emptyState": "Aucune mémoire stockée", + "emptyHint": "Les mémoires sont créées lorsque les outils renvoient des données ou lorsque vous me demandez de mémoriser quelque chose", + "clearAll": "Tout effacer" + }, + 
"confirmClearAll": "Effacer toutes les entrées de mémoire ?", + "detail": { + "value": "Valeur", + "storedAt": "Stocké", + "expiresAt": "Expire", + "noExpiry": "Pas d'expiration", + "source": "Source", + "sources": { + "tool_hint": "Stocké automatiquement par l'outil", + "explicit": "Stocké sur demande", + "api": "Stocké via API" + } + } + }, "Setup": { "welcome": "Bienvenue sur CAAL", "stepOf": "Étape {current} sur {total}", diff --git a/frontend/messages/it.json b/frontend/messages/it.json index d426444..d7f2005 100644 --- a/frontend/messages/it.json +++ b/frontend/messages/it.json @@ -8,11 +8,13 @@ "continue": "Continua", "cancel": "Annulla", "close": "Chiudi", + "delete": "Elimina", "retry": "Riprova", "enabled": "Attivo", "disabled": "Disattivo", "settings": "Impostazioni", "tools": "Strumenti", + "memory": "Memoria", "connected": "Connesso" }, "Welcome": { @@ -221,6 +223,28 @@ "parametersTitle": "Parametri dello strumento" } }, + "Memory": { + "panel": { + "title": "Memoria", + "subtitle": "{count} voci memorizzate", + "emptyState": "Nessuna memoria memorizzata", + "emptyHint": "Le memorie vengono create quando gli strumenti restituiscono dati o quando mi chiedi di ricordare qualcosa", + "clearAll": "Cancella tutto" + }, + "confirmClearAll": "Cancellare tutte le voci di memoria?", + "detail": { + "value": "Valore", + "storedAt": "Memorizzato", + "expiresAt": "Scade", + "noExpiry": "Nessuna scadenza", + "source": "Fonte", + "sources": { + "tool_hint": "Memorizzato automaticamente dallo strumento", + "explicit": "Memorizzato su richiesta", + "api": "Memorizzato tramite API" + } + } + }, "Setup": { "welcome": "Benvenuto su CAAL", "stepOf": "Passaggio {current} di {total}", diff --git a/src/caal/integrations/__init__.py b/src/caal/integrations/__init__.py index 9134700..b50b827 100644 --- a/src/caal/integrations/__init__.py +++ b/src/caal/integrations/__init__.py @@ -3,6 +3,7 @@ """ from .mcp_loader import MCPServerConfig, initialize_mcp_servers, load_mcp_config 
"""Short-term memory tool for CAAL.

Provides explicit memory operations for the LLM to store and retrieve data.
This is mechanism #2 of the three storage mechanisms:
  1. Auto-store: Tool responses with memory_hint field (automatic)
  2. Explicit: memory_short tool (this file) - user says "remember X"
  3. Passive: Tool descriptions instruct LLM to check memory first

Usage:
    class VoiceAssistant(MemoryTools, Agent):
        pass  # memory_short tool is automatically available
"""

import json
import logging
from typing import TYPE_CHECKING

from livekit.agents import function_tool

if TYPE_CHECKING:
    from ..memory import ShortTermMemory

logger = logging.getLogger(__name__)


class MemoryTools:
    """Mixin providing the memory_short tool for explicit memory operations.

    Requires the concrete agent class to provide:
        self._short_term_memory: ShortTermMemory instance

    When the attribute is absent the tool degrades gracefully and reports
    "Memory not available" instead of raising.
    """

    @function_tool
    async def memory_short(
        self,
        action: str,
        key: str = "",
        value: str = "",
    ) -> str:
        """Store or retrieve information for later use in this conversation.

        Use this to remember things the user tells you like flight numbers,
        tracking codes, preferences, or any data you'll need to reference later.

        IMPORTANT: Before asking the user for information you've already
        discussed (like a tracking number or flight), check memory first
        with action="get" or action="list".

        Args:
            action: One of "store", "get", "delete", "list"
            key: The key to store/get/delete (e.g., "flight_number", "tracking_code")
            value: The value to store (only required for action="store")

        Returns:
            Result of the operation
        """
        # Annotated as Optional: the mixin cannot guarantee the attribute was set.
        memory: "ShortTermMemory | None" = getattr(self, "_short_term_memory", None)

        if memory is None:
            logger.warning("memory_short called but no memory instance available")
            return "Memory not available"

        # Normalize so LLM-generated variants like "Store" or " get " still work.
        action = action.strip().lower()
        logger.info(f"memory_short: action={action}, key={key}")

        if action == "store":
            return self._memory_store(memory, key, value)
        if action == "get":
            return self._memory_get(memory, key)
        if action == "delete":
            return self._memory_delete(memory, key)
        if action == "list":
            return self._memory_list(memory)
        return f"Unknown action: {action}. Valid actions: store, get, delete, list"

    @staticmethod
    def _memory_store(memory: "ShortTermMemory", key: str, value: str) -> str:
        """Handle action="store": persist value under key, JSON-decoding if possible."""
        if not key:
            return "Key is required for store action"
        if not value:
            return "Value is required for store action"

        # Store structured data when the LLM passed serialized JSON; the
        # strip() tolerates leading whitespace before '{' or '['.
        parsed_value: object = value
        if value.strip().startswith(("{", "[")):
            try:
                parsed_value = json.loads(value)
            except json.JSONDecodeError:
                parsed_value = value  # keep the raw string on malformed JSON

        memory.store(key=key, value=parsed_value, source="explicit")
        return f"Stored: {key}"

    @staticmethod
    def _memory_get(memory: "ShortTermMemory", key: str) -> str:
        """Handle action="get": return the stored value rendered as a string."""
        if not key:
            return "Key is required for get action"

        result = memory.get(key)
        if result is None:
            return f"No value found for key: {key}"
        if isinstance(result, (dict, list)):
            return json.dumps(result)
        return str(result)

    @staticmethod
    def _memory_delete(memory: "ShortTermMemory", key: str) -> str:
        """Handle action="delete": remove the key if present."""
        if not key:
            return "Key is required for delete action"

        deleted = memory.delete(key)
        return f"Deleted: {key}" if deleted else f"Key not found: {key}"

    @staticmethod
    def _memory_list(memory: "ShortTermMemory") -> str:
        """Handle action="list": enumerate stored keys with their sources."""
        entries = memory.list_keys()
        if not entries:
            return "Memory is empty"

        lines = ["Stored memory keys:"]
        for entry in entries:
            lines.append(f"- {entry['key']} (source: {entry['source']})")
        return "\n".join(lines)
System prompt (always first, never trimmed) 2. Tool data context (injected from cache) - 3. Chat history (sliding window applied) + 3. Short-term memory context (persistent facts) + 4. Chat history (sliding window applied) Args: chat_ctx: LiveKit chat context tool_data_cache: Cache of recent tool response data + short_term_memory: Short-term memory for persistent context max_turns: Max conversation turns to keep (1 turn = user + assistant) """ system_prompt = None @@ -294,7 +302,13 @@ def _build_messages_from_context( if context: messages.append({"role": "system", "content": context}) - # 3. Apply sliding window to chat history + # 3. Inject short-term memory context + if short_term_memory: + memory_context = short_term_memory.get_context_message() + if memory_context: + messages.append({"role": "system", "content": memory_context}) + + # 4. Apply sliding window to chat history # max_turns * 2 accounts for user + assistant pairs max_messages = max_turns * 2 if len(chat_messages) > max_messages: @@ -441,6 +455,7 @@ async def _execute_tool_calls( response_content: str | None, provider: LLMProvider, tool_data_cache: ToolDataCache | None = None, + short_term_memory: ShortTermMemory | None = None, ) -> list[dict]: """Execute tool calls and append results to messages. 
@@ -451,6 +466,7 @@ async def _execute_tool_calls( response_content: Original LLM response content (if any) provider: LLM provider (for formatting tool results) tool_data_cache: Optional cache to store structured tool response data + short_term_memory: Optional memory to store memory_hint from tool responses """ # Add assistant message with tool calls tool_call_message = provider.format_tool_call_message( @@ -468,6 +484,18 @@ async def _execute_tool_calls( try: tool_result = await _execute_single_tool(agent, tool_name, arguments) + # Extract and store memory_hint from tool response (deterministic) + if short_term_memory and isinstance(tool_result, dict): + memory_hint = tool_result.get("memory_hint") + if memory_hint and isinstance(memory_hint, dict): + for key, value in memory_hint.items(): + short_term_memory.store( + key=key, + value=value, + source="tool_hint", + ) + logger.info(f"Stored memory hint from {tool_name}: {key}") + # Cache structured data if present if tool_data_cache and isinstance(tool_result, dict): # Look for common data fields, otherwise cache the whole result diff --git a/src/caal/memory/__init__.py b/src/caal/memory/__init__.py new file mode 100644 index 0000000..f825d57 --- /dev/null +++ b/src/caal/memory/__init__.py @@ -0,0 +1,34 @@ +"""CAAL Memory System. 
"""Base types and constants for CAAL memory systems.

Shared by the short-term store and any future long-term implementation so
both layers of the memory subsystem expose a consistent interface.

Memory Architecture:
    Short-term: Session/task context, JSON key-value, TTL-based expiry
    Long-term (future): Knowledge graph, embeddings, semantic search
"""

from __future__ import annotations

import os
from pathlib import Path
from typing import Any, Literal, TypedDict

# Where memory files live. Defaults to the project root (four levels up from
# src/caal/memory/base.py, same directory as settings.json), overridable via
# the CAAL_MEMORY_DIR environment variable.
_PROJECT_ROOT = Path(__file__).parent.parent.parent.parent
MEMORY_DIR = Path(os.getenv("CAAL_MEMORY_DIR", _PROJECT_ROOT))

# Auto-stored entries expire after 24 hours unless a TTL is given explicitly.
DEFAULT_TTL_SECONDS = 86400

# Provenance of a memory entry: tool memory_hint auto-store, explicit
# memory_short tool call, or the HTTP API.
MemorySource = Literal["tool_hint", "explicit", "api"]


class MemoryEntry(TypedDict):
    """One stored value plus its bookkeeping metadata.

    Attributes:
        value: The stored data (any JSON-serializable type)
        stored_at: Unix timestamp when the entry was created
        expires_at: Unix timestamp for expiry, or None for no expiry
        source: How the entry was created (tool_hint, explicit, api)
    """

    value: Any
    stored_at: float
    expires_at: float | None
    source: MemorySource


class MemoryStore(TypedDict):
    """Top-level shape of the persisted memory store.

    Attributes:
        entries: Mapping of key -> MemoryEntry
    """

    entries: dict[str, MemoryEntry]
"""Short-term memory for CAAL voice agent.

Provides session-persistent memory storage with optional TTL-based expiry.
Memory survives agent restarts via file-based JSON persistence.

Three storage mechanisms feed into this:
    1. Auto-store: Tool responses with memory_hint field
    2. Explicit: memory_short tool with store/get/delete/list actions
    3. API: HTTP endpoints for external systems

Example:
    >>> from caal.memory import ShortTermMemory
    >>> memory = ShortTermMemory()
    >>> memory.store("flight_number", "UA1234", source="explicit")
    >>> memory.get("flight_number")
    'UA1234'
"""

from __future__ import annotations

import json
import logging
import threading
import time
from typing import Any

from .base import (
    DEFAULT_TTL_SECONDS,
    MEMORY_DIR,
    MemoryEntry,
    MemorySource,
    MemoryStore,
)

logger = logging.getLogger(__name__)

# File path for short-term memory persistence
SHORT_TERM_MEMORY_PATH = MEMORY_DIR / "short_term_memory.json"


class ShortTermMemory:
    """Global singleton for short-term memory.

    File-backed storage with an in-memory cache. All public operations are
    serialized through a re-entrant class lock, so the shared singleton is
    safe to use from both the agent and the webhook server thread.
    Expired entries are cleaned up lazily on access.

    Attributes:
        _instance: Singleton instance (class-level)
        _cache: In-memory cache of the memory store
        _lock: Re-entrant lock guarding cache and file access
    """

    _instance: ShortTermMemory | None = None
    _cache: MemoryStore | None = None
    # RLock (not Lock) because public methods call each other while holding
    # the lock: get() -> delete(), list_keys() -> cleanup_expired(), etc.
    _lock = threading.RLock()

    def __new__(cls) -> ShortTermMemory:
        """Singleton pattern - returns existing instance or creates new one."""
        with cls._lock:
            if cls._instance is None:
                cls._instance = super().__new__(cls)
                cls._instance._load()
            return cls._instance

    def store(
        self,
        key: str,
        value: Any,
        ttl_seconds: int | None = DEFAULT_TTL_SECONDS,
        source: MemorySource = "explicit",
    ) -> None:
        """Store a value in memory.

        Args:
            key: Descriptive key (e.g., "flight_number", "tracking_code")
            value: Any JSON-serializable data to store
            ttl_seconds: Time-to-live in seconds. None for no expiry.
            source: Origin of data ("tool_hint", "explicit", "api")
        """
        now = time.time()
        expires_at = now + ttl_seconds if ttl_seconds is not None else None

        entry: MemoryEntry = {
            "value": value,
            "stored_at": now,
            "expires_at": expires_at,
            "source": source,
        }

        with self._lock:
            self._ensure_cache()
            self._cache["entries"][key] = entry
            self._save()

        ttl_str = f"ttl={ttl_seconds}s" if ttl_seconds else "no expiry"
        logger.info(f"Stored memory: {key} (source={source}, {ttl_str})")

    def get(self, key: str) -> Any | None:
        """Get a value from memory.

        Automatically removes the entry if it has expired.

        Args:
            key: The memory key to retrieve

        Returns:
            The stored value, or None if not found or expired
        """
        with self._lock:
            self._ensure_cache()
            entry = self._cache["entries"].get(key)

            if entry is None:
                return None

            # Expired entries are deleted on read rather than by a timer.
            if entry["expires_at"] is not None and time.time() > entry["expires_at"]:
                self.delete(key)
                logger.debug(f"Memory expired: {key}")
                return None

            return entry["value"]

    def delete(self, key: str) -> bool:
        """Delete a key from memory.

        Args:
            key: The memory key to delete

        Returns:
            True if key existed and was deleted, False otherwise
        """
        with self._lock:
            self._ensure_cache()

            if key in self._cache["entries"]:
                del self._cache["entries"][key]
                self._save()
                logger.info(f"Deleted memory: {key}")
                return True

            return False

    def list_keys(self) -> list[dict[str, Any]]:
        """List all non-expired memory keys with metadata.

        Automatically cleans up expired entries.

        Returns:
            List of dicts with key, stored_at, expires_at, source
        """
        with self._lock:
            self._ensure_cache()
            self.cleanup_expired()

            return [
                {
                    "key": key,
                    "stored_at": entry["stored_at"],
                    "expires_at": entry["expires_at"],
                    "source": entry["source"],
                }
                for key, entry in self._cache["entries"].items()
            ]

    def get_all(self) -> list[dict[str, Any]]:
        """Get all non-expired memory entries with full data.

        Returns:
            List of dicts with key, value, stored_at, expires_at, source
        """
        with self._lock:
            self._ensure_cache()
            self.cleanup_expired()

            return [
                {
                    "key": key,
                    "value": entry["value"],
                    "stored_at": entry["stored_at"],
                    "expires_at": entry["expires_at"],
                    "source": entry["source"],
                }
                for key, entry in self._cache["entries"].items()
            ]

    def get_context_message(self) -> str | None:
        """Format memory as context string for LLM injection.

        Returns:
            Formatted string for system message, or None if empty
        """
        with self._lock:
            self._ensure_cache()
            self.cleanup_expired()

            if not self._cache["entries"]:
                return None

            now = time.time()
            lines = [
                "Short-term memory (check before asking user for previously-mentioned info):"
            ]

            for key, entry in self._cache["entries"].items():
                value = entry["value"]
                source = entry["source"]

                # Render structured values as JSON, truncated to keep the
                # injected context small.
                if isinstance(value, (dict, list)):
                    value_str = json.dumps(value)
                else:
                    value_str = str(value)

                if len(value_str) > 100:
                    value_str = value_str[:97] + "..."

                # Human-readable remaining TTL (hours/minutes/seconds).
                if entry["expires_at"] is not None:
                    remaining = entry["expires_at"] - now
                    if remaining > 3600:
                        expiry_str = f"{remaining / 3600:.1f}h"
                    elif remaining > 60:
                        expiry_str = f"{remaining / 60:.0f}m"
                    else:
                        expiry_str = f"{remaining:.0f}s"
                    expiry_info = f", expires: {expiry_str}"
                else:
                    expiry_info = ", no expiry"

                lines.append(f"- {key}: {value_str} (source: {source}{expiry_info})")

            return "\n".join(lines)

    def clear(self) -> None:
        """Clear all memory entries."""
        with self._lock:
            self._cache = {"entries": {}}
            self._save()
        logger.info("Cleared all short-term memory")

    def cleanup_expired(self) -> int:
        """Remove all expired entries.

        Returns:
            Number of entries cleaned up
        """
        with self._lock:
            self._ensure_cache()
            now = time.time()
            expired_keys = [
                key
                for key, entry in self._cache["entries"].items()
                if entry["expires_at"] is not None and now > entry["expires_at"]
            ]

            for key in expired_keys:
                del self._cache["entries"][key]

            if expired_keys:
                self._save()
                logger.info(f"Cleaned up {len(expired_keys)} expired memory entries")

            return len(expired_keys)

    def _ensure_cache(self) -> None:
        """Ensure cache is loaded. Callers must hold the lock."""
        if self._cache is None:
            self._load()

    def _load(self) -> None:
        """Load memory from file. Callers must hold the lock."""
        with self._lock:
            if SHORT_TERM_MEMORY_PATH.exists():
                try:
                    with open(SHORT_TERM_MEMORY_PATH) as f:
                        data = json.load(f)
                    self._cache = {"entries": data.get("entries", {})}
                    logger.debug(f"Loaded short-term memory from {SHORT_TERM_MEMORY_PATH}")
                    return
                except Exception as e:
                    # A corrupt file falls through to an empty store rather
                    # than crashing the agent.
                    logger.warning(f"Failed to load short-term memory: {e}")

            self._cache = {"entries": {}}

    def _save(self) -> None:
        """Save memory to file. Callers must hold the lock."""
        if self._cache is None:
            return

        try:
            SHORT_TERM_MEMORY_PATH.parent.mkdir(parents=True, exist_ok=True)
            with open(SHORT_TERM_MEMORY_PATH, "w") as f:
                json.dump(self._cache, f, indent=2)
            logger.debug(f"Saved short-term memory to {SHORT_TERM_MEMORY_PATH}")
        except Exception as e:
            # Persistence is best-effort; the in-memory cache stays valid.
            logger.error(f"Failed to save short-term memory: {e}")

    def reload(self) -> None:
        """Force reload memory from disk."""
        with self._lock:
            self._cache = None
            self._load()
        logger.info("Reloaded short-term memory from disk")
import settings as settings_module +from .memory import ShortTermMemory logger = logging.getLogger(__name__) @@ -1449,6 +1450,173 @@ async def get_n8n_workflow(workflow_id: str) -> N8nWorkflowDetailResponse: ) +# ============================================================================= +# Short-Term Memory Endpoints +# ============================================================================= + + +class MemoryEntryResponse(BaseModel): + """Response body for a single memory entry.""" + + key: str + value: dict | list | str | int | float | bool | None + stored_at: float + expires_at: float | None + source: str + + +class MemoryListResponse(BaseModel): + """Response body for GET /memory endpoint.""" + + entries: list[MemoryEntryResponse] + + +class MemoryStoreRequest(BaseModel): + """Request body for POST /memory endpoint.""" + + key: str + value: dict | list | str | int | float | bool | None + ttl_seconds: int | None = None + + +class MemoryStoreResponse(BaseModel): + """Response body for POST /memory endpoint.""" + + status: str + key: str + + +class MemoryDeleteResponse(BaseModel): + """Response body for DELETE /memory endpoint.""" + + status: str + key: str + + +class MemoryClearResponse(BaseModel): + """Response body for DELETE /memory (clear all) endpoint.""" + + status: str + cleared_count: int + + +@app.get("/memory", response_model=MemoryListResponse) +async def get_memory() -> MemoryListResponse: + """Get all short-term memory entries. + + Returns: + MemoryListResponse with all non-expired entries + """ + memory = ShortTermMemory() + entries = memory.get_all() + + return MemoryListResponse( + entries=[ + MemoryEntryResponse( + key=e["key"], + value=e["value"], + stored_at=e["stored_at"], + expires_at=e["expires_at"], + source=e["source"], + ) + for e in entries + ] + ) + + +@app.get("/memory/{key}", response_model=MemoryEntryResponse) +async def get_memory_entry(key: str) -> MemoryEntryResponse: + """Get a single memory entry by key. 
+ + Args: + key: The memory key to retrieve + + Returns: + MemoryEntryResponse with the entry data + + Raises: + HTTPException: 404 if key not found or expired + """ + memory = ShortTermMemory() + entries = memory.get_all() + + # Find the entry with matching key + for entry in entries: + if entry["key"] == key: + return MemoryEntryResponse( + key=entry["key"], + value=entry["value"], + stored_at=entry["stored_at"], + expires_at=entry["expires_at"], + source=entry["source"], + ) + + raise HTTPException(status_code=404, detail=f"Key not found: {key}") + + +@app.post("/memory", response_model=MemoryStoreResponse) +async def store_memory(req: MemoryStoreRequest) -> MemoryStoreResponse: + """Store a value in short-term memory. + + Args: + req: MemoryStoreRequest with key, value, and optional TTL + + Returns: + MemoryStoreResponse with status + """ + memory = ShortTermMemory() + memory.store( + key=req.key, + value=req.value, + ttl_seconds=req.ttl_seconds, + source="api", + ) + + logger.info(f"Memory stored via API: {req.key}") + + return MemoryStoreResponse(status="stored", key=req.key) + + +@app.delete("/memory/{key}", response_model=MemoryDeleteResponse) +async def delete_memory_entry(key: str) -> MemoryDeleteResponse: + """Delete a single memory entry. + + Args: + key: The memory key to delete + + Returns: + MemoryDeleteResponse with status + + Raises: + HTTPException: 404 if key not found + """ + memory = ShortTermMemory() + deleted = memory.delete(key) + + if not deleted: + raise HTTPException(status_code=404, detail=f"Key not found: {key}") + + return MemoryDeleteResponse(status="deleted", key=key) + + +@app.delete("/memory", response_model=MemoryClearResponse) +async def clear_memory() -> MemoryClearResponse: + """Clear all short-term memory entries. 
+ + Returns: + MemoryClearResponse with status and count of cleared entries + """ + memory = ShortTermMemory() + entries = memory.list_keys() + count = len(entries) + + memory.clear() + + logger.info(f"Memory cleared via API: {count} entries removed") + + return MemoryClearResponse(status="cleared", cleared_count=count) + + # ============================================================================= # Prewarm Endpoint # ============================================================================= diff --git a/voice_agent.py b/voice_agent.py index b22c4a0..f7f125f 100644 --- a/voice_agent.py +++ b/voice_agent.py @@ -49,12 +49,14 @@ from caal import CAALLLM # noqa: E402 from caal.integrations import ( # noqa: E402 + MemoryTools, WebSearchTools, discover_n8n_workflows, initialize_mcp_servers, load_mcp_config, ) from caal.llm import ToolDataCache, llm_node # noqa: E402 +from caal.memory import ShortTermMemory # noqa: E402 from caal.stt import WakeWordGatedSTT # noqa: E402 # Configure logging - LiveKit adds LogQueueHandler to root in worker processes, @@ -336,8 +338,8 @@ async def hass_get_state(target: str = None) -> str: return tool_definitions, tool_callables -class VoiceAssistant(WebSearchTools, Agent): - """Voice assistant with MCP tools and web search.""" +class VoiceAssistant(MemoryTools, WebSearchTools, Agent): + """Voice assistant with MCP tools, web search, and short-term memory.""" def __init__( self, @@ -352,6 +354,7 @@ def __init__( max_turns: int = 20, hass_tool_definitions: list[dict] | None = None, hass_tool_callables: dict | None = None, + short_term_memory: ShortTermMemory | None = None, ) -> None: super().__init__( instructions=load_prompt(language=language), @@ -381,6 +384,9 @@ def __init__( self._tool_data_cache = ToolDataCache(max_entries=tool_cache_size) self._max_turns = max_turns + # Short-term memory for persistent context (MemoryTools mixin requirement) + self._short_term_memory = short_term_memory + async def llm_node(self, chat_ctx, 
tools, model_settings): """Custom LLM node using provider-agnostic interface.""" async for chunk in llm_node( @@ -388,6 +394,7 @@ async def llm_node(self, chat_ctx, tools, model_settings): chat_ctx, provider=self._provider, tool_data_cache=self._tool_data_cache, + short_term_memory=self._short_term_memory, max_turns=self._max_turns, ): yield chunk @@ -702,6 +709,14 @@ async def _publish_tool_status( hass_tool_definitions, hass_tool_callables = create_hass_tools(hass_server) logger.info("Home Assistant tools enabled: hass_control, hass_get_state") + # Initialize short-term memory (singleton, persists across restarts) + short_term_memory = ShortTermMemory() + memory_count = len(short_term_memory.list_keys()) + if memory_count > 0: + logger.info(f"Short-term memory loaded: {memory_count} entries") + else: + logger.info("Short-term memory initialized (empty)") + # Create agent with CAALLLM and all MCP servers assistant = VoiceAssistant( caal_llm=caal_llm, @@ -715,6 +730,7 @@ async def _publish_tool_status( max_turns=runtime["max_turns"], hass_tool_definitions=hass_tool_definitions, hass_tool_callables=hass_tool_callables, + short_term_memory=short_term_memory, ) # Create event to wait for session close (BEFORE session.start to avoid race condition) From aaae3be91ae1397f58c26a8c48dae84e99847aba Mon Sep 17 00:00:00 2001 From: Corey McCallum Date: Tue, 3 Feb 2026 14:29:55 -0800 Subject: [PATCH 02/10] feat(memory): extend TTL to 7 days, support tool-specified TTL - Change default TTL from 24h to 7 days (604800s) - Allow tools to specify custom TTL in memory_hint: - Simple value: uses default 7d TTL - {"value": ..., "ttl": seconds}: custom TTL - {"value": ..., "ttl": null}: no expiry Co-Authored-By: Claude Opus 4.5 --- src/caal/llm/llm_node.py | 22 ++++++++++++++++++++-- src/caal/memory/base.py | 4 ++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/caal/llm/llm_node.py b/src/caal/llm/llm_node.py index 931639a..e2849e8 100644 --- 
a/src/caal/llm/llm_node.py +++ b/src/caal/llm/llm_node.py @@ -485,13 +485,31 @@ async def _execute_tool_calls( tool_result = await _execute_single_tool(agent, tool_name, arguments) # Extract and store memory_hint from tool response (deterministic) + # Supports two formats: + # {"memory_hint": {"key": "simple_value"}} → 7d default TTL + # {"memory_hint": {"key": {"value": "...", "ttl": 3600}}} → custom TTL + # {"memory_hint": {"key": {"value": "...", "ttl": null}}} → no expiry if short_term_memory and isinstance(tool_result, dict): memory_hint = tool_result.get("memory_hint") if memory_hint and isinstance(memory_hint, dict): - for key, value in memory_hint.items(): + from caal.memory.base import DEFAULT_TTL_SECONDS + + for key, hint_value in memory_hint.items(): + # Check if hint_value is extended format with ttl + if isinstance(hint_value, dict) and "value" in hint_value: + actual_value = hint_value["value"] + # "ttl" in dict distinguishes missing (use default) vs null (no expiry) + if "ttl" in hint_value: + ttl = hint_value["ttl"] # Could be int or None + else: + ttl = DEFAULT_TTL_SECONDS + else: + actual_value = hint_value + ttl = DEFAULT_TTL_SECONDS short_term_memory.store( key=key, - value=value, + value=actual_value, + ttl_seconds=ttl, source="tool_hint", ) logger.info(f"Stored memory hint from {tool_name}: {key}") diff --git a/src/caal/memory/base.py b/src/caal/memory/base.py index d205419..e4bc91e 100644 --- a/src/caal/memory/base.py +++ b/src/caal/memory/base.py @@ -18,8 +18,8 @@ _SCRIPT_DIR = Path(__file__).parent.parent.parent.parent # src/caal/memory -> project root MEMORY_DIR = Path(os.getenv("CAAL_MEMORY_DIR", _SCRIPT_DIR)) -# Default TTL for auto-stored data (24 hours) -DEFAULT_TTL_SECONDS = 86400 +# Default TTL for auto-stored data (7 days) +DEFAULT_TTL_SECONDS = 604800 # Source types for tracking where memory entries came from MemorySource = Literal["tool_hint", "explicit", "api"] From 59c251209501b4e5e9ff572f7f63850684c42992 Mon Sep 17 00:00:00 
2001 From: Corey McCallum Date: Tue, 3 Feb 2026 21:10:47 -0800 Subject: [PATCH 03/10] fix(agent): proper tool call chaining in llm_node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace linear execute→stream→retry with a loop that supports multi-step tool chaining. Model can now: call tool A → get result → call tool B → get result → generate text response. Previously, after one tool execution the code tried to stream a text response. If the model wanted to chain (call another tool), it produced 0 text chunks, triggering a retry without tools that crashed Ollama (tool references in messages but no tools registered). New flow: - Loop non-streaming chat() calls (max 5 rounds) - Each round: if tool_calls → execute → loop back - When no tool_calls → yield content or stream final response - Safety fallback: _strip_tool_messages converts tool messages to plain text if Ollama still crashes on the streaming path Co-Authored-By: Claude Opus 4.5 --- src/caal/llm/llm_node.py | 193 ++++++++++++++++++++++++++------------- 1 file changed, 127 insertions(+), 66 deletions(-) diff --git a/src/caal/llm/llm_node.py b/src/caal/llm/llm_node.py index e2849e8..0088b9a 100644 --- a/src/caal/llm/llm_node.py +++ b/src/caal/llm/llm_node.py @@ -116,47 +116,81 @@ async def llm_node(self, chat_ctx, tools, model_settings): # Discover tools from agent and MCP servers tools = await _discover_tools(agent) - # If tools available, check for tool calls first (non-streaming) + # If tools available, loop non-streaming calls to support chaining + # Model can call tool A → get result → call tool B → get result → text + max_tool_rounds = 5 + tool_round = 0 + if tools: - try: - response = await provider.chat(messages=messages, tools=tools) - except Exception as tool_err: - # Tool call generation failed (e.g., model garbled tool name) - # Retry once, then fall back to streaming without tools - err_msg = str(tool_err) - is_malformed_tool = ( - "tool_use_failed" 
in err_msg - or "Failed to call a function" in err_msg - # Ollama: model leaked think/control tokens into tool name - # e.g. "tool '{}[/THINK][TOOL_CALLS]notion_tasks' not found" - or "[/THINK]" in err_msg - or "[TOOL_CALLS]" in err_msg - or "= max_tool_rounds: + logger.warning( + f"Hit max tool rounds ({max_tool_rounds}), streaming response" + ) - # No tools or no tool calls - stream directly - # Publish no-tool status immediately + # Stream final response (after tool chain or no tools) + # If we executed tools, messages contains full tool history if hasattr(agent, "_on_tool_status") and agent._on_tool_status: import asyncio asyncio.create_task(agent._on_tool_status(False, [], [])) - async for chunk in provider.chat_stream(messages=messages): - yield strip_markdown_for_tts(chunk) + if tool_round > 0: + # After tool execution — pass tools so Ollama can validate + # tool_calls in message history + logger.info("Streaming response after tool execution...") + try: + async for chunk in provider.chat_stream( + messages=messages, tools=tools + ): + yield strip_markdown_for_tts(chunk) + except Exception as stream_err: + # Safety fallback: strip tool messages and retry without tools + logger.warning( + f"Post-tool streaming failed: {stream_err}. " + "Retrying with stripped tool messages..." + ) + clean_messages = _strip_tool_messages(messages) + async for chunk in provider.chat_stream(messages=clean_messages): + yield strip_markdown_for_tts(chunk) + else: + # No tools or no tool calls — plain streaming + async for chunk in provider.chat_stream(messages=messages): + yield strip_markdown_for_tts(chunk) except Exception as e: logger.error(f"Error in llm_node: {e}", exc_info=True) yield f"I encountered an error: {e}" +def _strip_tool_messages(messages: list[dict]) -> list[dict]: + """Convert tool call/result messages to plain text. + + Ollama crashes if messages contain tool references but no tools are + registered. 
This converts tool messages to plain text equivalents, + preserving the context so the model knows what happened. + """ + cleaned = [] + for msg in messages: + if msg.get("role") == "tool": + # Tool result → system message with the content + cleaned.append({ + "role": "system", + "content": f"Tool result: {msg.get('content', '')}", + }) + elif msg.get("role") == "assistant" and msg.get("tool_calls"): + # Assistant tool call → plain assistant message + parts = [] + if msg.get("content"): + parts.append(msg["content"]) + for tc in msg["tool_calls"]: + func = tc.get("function", {}) + name = func.get("name", "unknown") + args = func.get("arguments", {}) + parts.append(f"[Called {name} with {args}]") + cleaned.append({ + "role": "assistant", + "content": "\n".join(parts), + }) + else: + cleaned.append(msg) + return cleaned + + def _build_messages_from_context( chat_ctx, tool_data_cache: ToolDataCache | None = None, From d8d5fcd19953bc04d95b7911629c97ffd5d20524 Mon Sep 17 00:00:00 2001 From: Corey McCallum Date: Tue, 3 Feb 2026 22:01:06 -0800 Subject: [PATCH 04/10] fix(agent): dedup tool calls, persist indicator, inject call args into context - Deduplicate identical tool calls within a single round (same name + args) - Accumulate tool names/params across chained rounds for frontend indicator - Keep tool indicator showing after response (don't clear when tools were used) - Include tool call arguments in ToolDataCache context injection Co-Authored-By: Claude Opus 4.5 --- src/caal/llm/llm_node.py | 48 ++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/src/caal/llm/llm_node.py b/src/caal/llm/llm_node.py index 0088b9a..9c5e221 100644 --- a/src/caal/llm/llm_node.py +++ b/src/caal/llm/llm_node.py @@ -52,9 +52,9 @@ def __init__(self, max_entries: int = 3): self.max_entries = max_entries self._cache: list[dict] = [] - def add(self, tool_name: str, data: Any) -> None: - """Add tool response data to cache.""" - entry = {"tool": 
tool_name, "data": data, "timestamp": time.time()} + def add(self, tool_name: str, data: Any, arguments: dict | None = None) -> None: + """Add tool call and response data to cache.""" + entry = {"tool": tool_name, "args": arguments, "data": data, "timestamp": time.time()} self._cache.append(entry) if len(self._cache) > self.max_entries: self._cache.pop(0) # Remove oldest @@ -63,9 +63,10 @@ def get_context_message(self) -> str | None: """Format cached data as context string for LLM injection.""" if not self._cache: return None - parts = ["Recent tool response data for reference:"] + parts = ["Recent tool calls and responses for reference:"] for entry in self._cache: - parts.append(f"\n{entry['tool']}: {json.dumps(entry['data'])}") + args = json.dumps(entry['args']) if entry.get('args') else '' + parts.append(f"\n{entry['tool']}({args}) → {json.dumps(entry['data'])}") return "\n".join(parts) def clear(self) -> None: @@ -120,6 +121,8 @@ async def llm_node(self, chat_ctx, tools, model_settings): # Model can call tool A → get result → call tool B → get result → text max_tool_rounds = 5 tool_round = 0 + all_tool_names: list[str] = [] + all_tool_params: list[dict] = [] if tools: while tool_round < max_tool_rounds: @@ -169,13 +172,8 @@ async def llm_node(self, chat_ctx, tools, model_settings): if not response.tool_calls: # Model is done with tools if response.content: - # Publish no-tool status - if hasattr(agent, "_on_tool_status") and agent._on_tool_status: - import asyncio - - asyncio.create_task( - agent._on_tool_status(False, [], []) - ) + # Don't clear tool status — keep indicator showing + # which tools were used for this response yield strip_markdown_for_tts(response.content) return break # No content either, fall through to streaming @@ -187,15 +185,15 @@ async def llm_node(self, chat_ctx, tools, model_settings): f"{len(response.tool_calls)} call(s)" ) - # Track tool usage for frontend indicator - tool_names = [tc.name for tc in response.tool_calls] - tool_params 
= [tc.arguments for tc in response.tool_calls] + # Accumulate tool usage across rounds for frontend indicator + all_tool_names.extend(tc.name for tc in response.tool_calls) + all_tool_params.extend(tc.arguments for tc in response.tool_calls) if hasattr(agent, "_on_tool_status") and agent._on_tool_status: import asyncio asyncio.create_task( - agent._on_tool_status(True, tool_names, tool_params) + agent._on_tool_status(True, all_tool_names, all_tool_params) ) messages = await _execute_tool_calls( @@ -215,8 +213,8 @@ async def llm_node(self, chat_ctx, tools, model_settings): ) # Stream final response (after tool chain or no tools) - # If we executed tools, messages contains full tool history - if hasattr(agent, "_on_tool_status") and agent._on_tool_status: + # Only clear tool indicator if no tools were called this turn + if tool_round == 0 and hasattr(agent, "_on_tool_status") and agent._on_tool_status: import asyncio asyncio.create_task(agent._on_tool_status(False, [], [])) @@ -536,6 +534,18 @@ async def _execute_tool_calls( ) messages.append(tool_call_message) + # Deduplicate identical tool calls (same name + same args) + seen = set() + unique_tool_calls = [] + for tc in tool_calls: + key = (tc.name, json.dumps(tc.arguments, sort_keys=True)) + if key not in seen: + seen.add(key) + unique_tool_calls.append(tc) + else: + logger.info(f"Dedup: skipping duplicate {tc.name} call with identical args") + tool_calls = unique_tool_calls + # Execute each tool for tool_call in tool_calls: tool_name = tool_call.name @@ -583,7 +593,7 @@ async def _execute_tool_calls( or tool_result.get("results") or tool_result ) - tool_data_cache.add(tool_name, data) + tool_data_cache.add(tool_name, data, arguments=arguments) logger.debug(f"Cached tool data for {tool_name}") # Format tool result - preserve JSON structure for LLM From c4cd33c306ff2b8d86a08f447c33c091d0a3d70e Mon Sep 17 00:00:00 2001 From: Corey McCallum Date: Thu, 5 Feb 2026 20:28:56 -0800 Subject: [PATCH 05/10] fix(memory): 
write to /app/data volume for Docker persistence Memory file was failing with permission denied because /app is owned by root. Now uses CAAL_MEMORY_DIR=/app/data (the caal-memory volume) and entrypoint ensures directory is writable by agent user. Co-Authored-By: Claude Opus 4.6 --- docker-compose.apple.yaml | 1 + docker-compose.cpu.yaml | 1 + docker-compose.yaml | 1 + entrypoint.sh | 5 +++++ 4 files changed, 8 insertions(+) diff --git a/docker-compose.apple.yaml b/docker-compose.apple.yaml index 895b635..4a6997e 100644 --- a/docker-compose.apple.yaml +++ b/docker-compose.apple.yaml @@ -88,6 +88,7 @@ services: - ./.env environment: - LIVEKIT_URL=ws://livekit:7880 + - CAAL_MEMORY_DIR=/app/data # Point to mlx-audio (single service for STT + TTS) - SPEACHES_URL=${MLX_AUDIO_URL:-http://host.docker.internal:8001} - KOKORO_URL=${MLX_AUDIO_URL:-http://host.docker.internal:8001} diff --git a/docker-compose.cpu.yaml b/docker-compose.cpu.yaml index 454fcce..ef2600a 100644 --- a/docker-compose.cpu.yaml +++ b/docker-compose.cpu.yaml @@ -80,6 +80,7 @@ services: environment: - LIVEKIT_URL=ws://livekit:7880 - SPEACHES_URL=http://speaches:8000 + - CAAL_MEMORY_DIR=/app/data volumes: - caal-memory:/app/data - caal-config:/app/config # Runtime config (settings.json, mcp_servers.json) diff --git a/docker-compose.yaml b/docker-compose.yaml index 190fb28..443c459 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -76,6 +76,7 @@ services: - LIVEKIT_URL=ws://livekit:7880 - SPEACHES_URL=http://speaches:8000 - KOKORO_URL=http://kokoro:8880 + - CAAL_MEMORY_DIR=/app/data volumes: - caal-memory:/app/data - caal-config:/app/config # Runtime config (settings.json, mcp_servers.json) diff --git a/entrypoint.sh b/entrypoint.sh index 1b58e9c..5491d6a 100755 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -5,11 +5,16 @@ set -e CONFIG_DIR="/app/config" +DATA_DIR="/app/data" # Ensure config directory exists and is writable by agent mkdir -p "$CONFIG_DIR" chown agent:agent "$CONFIG_DIR" +# Ensure 
data directory exists and is writable by agent (memory persistence) +mkdir -p "$DATA_DIR" +chown agent:agent "$DATA_DIR" + # settings.json - copy default if missing if [ ! -f "$CONFIG_DIR/settings.json" ]; then echo "Creating settings.json from defaults..." From 323fb3291d501a792ea1c4e560bd0b482bdfabc1 Mon Sep 17 00:00:00 2001 From: Corey McCallum Date: Thu, 5 Feb 2026 20:36:00 -0800 Subject: [PATCH 06/10] fix(memory): mark context as internal so LLM won't read it aloud Co-Authored-By: Claude Opus 4.6 --- src/caal/memory/short_term.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/caal/memory/short_term.py b/src/caal/memory/short_term.py index 713f163..28c58aa 100644 --- a/src/caal/memory/short_term.py +++ b/src/caal/memory/short_term.py @@ -191,7 +191,8 @@ def get_context_message(self) -> str | None: now = time.time() lines = [ - "Short-term memory (check before asking user for previously-mentioned info):" + "[INTERNAL CONTEXT - DO NOT read aloud or announce. " + "Use silently when relevant to the conversation.]" ] for key, entry in self._cache["entries"].items(): From fdde93dff37a80ada351fed0415343a2275c4aa6 Mon Sep 17 00:00:00 2001 From: Corey McCallum Date: Thu, 5 Feb 2026 20:37:17 -0800 Subject: [PATCH 07/10] fix(memory): only inject context after user has spoken Prevents the LLM from using memory data in the initial greeting. Memory context is now skipped when there are no user messages yet. Co-Authored-By: Claude Opus 4.6 --- src/caal/llm/llm_node.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/caal/llm/llm_node.py b/src/caal/llm/llm_node.py index 9c5e221..486fc5e 100644 --- a/src/caal/llm/llm_node.py +++ b/src/caal/llm/llm_node.py @@ -361,8 +361,9 @@ def _build_messages_from_context( if context: messages.append({"role": "system", "content": context}) - # 3. Inject short-term memory context - if short_term_memory: + # 3. 
Inject short-term memory context (only after user has spoken) + has_user_message = any(m["role"] == "user" for m in chat_messages) + if short_term_memory and has_user_message: memory_context = short_term_memory.get_context_message() if memory_context: messages.append({"role": "system", "content": memory_context}) From 351b374bf78ce38e01ee0d55873e7d468d7963d4 Mon Sep 17 00:00:00 2001 From: Corey McCallum Date: Thu, 5 Feb 2026 20:50:33 -0800 Subject: [PATCH 08/10] refactor(memory): keep context injection as awareness hint for tool chaining MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Context injection helps the LLM know what's in memory so it can chain tools correctly (e.g. memory_short → flight_tracker). Without it, the model may skip memory and go to other tools directly. Co-Authored-By: Claude Opus 4.6 --- src/caal/llm/llm_node.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/caal/llm/llm_node.py b/src/caal/llm/llm_node.py index 486fc5e..8300ee4 100644 --- a/src/caal/llm/llm_node.py +++ b/src/caal/llm/llm_node.py @@ -292,13 +292,13 @@ def _build_messages_from_context( Message order: 1. System prompt (always first, never trimmed) 2. Tool data context (injected from cache) - 3. Short-term memory context (persistent facts) + 3. Short-term memory context (awareness hint for tool chaining) 4. 
Chat history (sliding window applied) Args: chat_ctx: LiveKit chat context tool_data_cache: Cache of recent tool response data - short_term_memory: Short-term memory for persistent context + short_term_memory: Short-term memory for context awareness max_turns: Max conversation turns to keep (1 turn = user + assistant) """ system_prompt = None From 0e4603cd6274388408d8984d046e554ae836c1f7 Mon Sep 17 00:00:00 2001 From: Corey McCallum Date: Thu, 5 Feb 2026 21:23:46 -0800 Subject: [PATCH 09/10] feat(memory): add inline edit in memory panel, fix registry_cache permissions - Memory detail modal now has pencil icon to edit values in-place - Add registry_cache.json symlink to entrypoint.sh (same pattern as settings.json) to fix permission denied on /app/registry_cache.json Co-Authored-By: Claude Opus 4.6 --- entrypoint.sh | 7 ++ frontend/components/memory/memory-panel.tsx | 84 ++++++++++++++++++--- 2 files changed, 81 insertions(+), 10 deletions(-) diff --git a/entrypoint.sh b/entrypoint.sh index 5491d6a..17e68f9 100755 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -29,9 +29,16 @@ if [ ! -f "$CONFIG_DIR/mcp_servers.json" ]; then chown agent:agent "$CONFIG_DIR/mcp_servers.json" fi +# registry_cache.json - create empty if missing +if [ ! 
-f "$CONFIG_DIR/registry_cache.json" ]; then + echo '{}' > "$CONFIG_DIR/registry_cache.json" + chown agent:agent "$CONFIG_DIR/registry_cache.json" +fi + # Create symlinks from /app to config files (for code that expects them in /app) ln -sf "$CONFIG_DIR/settings.json" /app/settings.json ln -sf "$CONFIG_DIR/mcp_servers.json" /app/mcp_servers.json +ln -sf "$CONFIG_DIR/registry_cache.json" /app/registry_cache.json # Drop privileges and execute the main command as agent user exec gosu agent "$@" diff --git a/frontend/components/memory/memory-panel.tsx b/frontend/components/memory/memory-panel.tsx index 663d15a..bff3673 100644 --- a/frontend/components/memory/memory-panel.tsx +++ b/frontend/components/memory/memory-panel.tsx @@ -3,7 +3,7 @@ import { useCallback, useEffect, useState } from 'react'; import { createPortal } from 'react-dom'; import { useTranslations } from 'next-intl'; -import { Brain, Clock, Trash, X } from '@phosphor-icons/react/dist/ssr'; +import { Brain, Clock, FloppyDisk, PencilSimple, Trash, X } from '@phosphor-icons/react/dist/ssr'; import { Button } from '@/components/livekit/button'; interface MemoryEntry { @@ -82,6 +82,8 @@ export function MemoryPanel({ isOpen, onClose }: MemoryPanelProps) { const [entries, setEntries] = useState([]); const [loading, setLoading] = useState(true); const [selectedEntry, setSelectedEntry] = useState(null); + const [editing, setEditing] = useState(false); + const [editValue, setEditValue] = useState(''); const fetchMemory = useCallback(async () => { setLoading(true); @@ -131,6 +133,27 @@ export function MemoryPanel({ isOpen, onClose }: MemoryPanelProps) { } }; + const handleSaveEdit = async () => { + if (!selectedEntry) return; + + try { + const res = await fetch('/api/memory', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ key: selectedEntry.key, value: editValue }), + }); + if (res.ok) { + setEntries( + entries.map((e) => (e.key === selectedEntry.key ? 
{ ...e, value: editValue } : e)) + ); + setSelectedEntry({ ...selectedEntry, value: editValue }); + setEditing(false); + } + } catch (err) { + console.error('Failed to save memory entry:', err); + } + }; + if (!isOpen) return null; return createPortal( @@ -187,7 +210,10 @@ export function MemoryPanel({ isOpen, onClose }: MemoryPanelProps) { key={entry.key} className="hover:bg-muted/50 flex cursor-pointer items-center justify-between rounded-lg border p-4 transition-colors" style={{ borderColor: 'var(--border-subtle)' }} - onClick={() => setSelectedEntry(entry)} + onClick={() => { + setSelectedEntry(entry); + setEditing(false); + }} >
@@ -261,14 +287,52 @@ export function MemoryPanel({ isOpen, onClose }: MemoryPanelProps) {
- -
-                {typeof selectedEntry.value === 'string'
-                  ? selectedEntry.value
-                  : JSON.stringify(selectedEntry.value, null, 2)}
-              
+
+ + {!editing && ( + + )} +
+ {editing ? ( +
+