From 25c76b969c3d5527acb7a093e2115e3f1c5718db Mon Sep 17 00:00:00 2001
From: techiejd <62455039+techiejd@users.noreply.github.com>
Date: Wed, 19 Nov 2025 04:03:05 +0700
Subject: [PATCH 1/3] WIP

---
 dev/payload.config.ts             |  16 +-
 dev/specs/e2e.spec.ts             |   1 -
 dev/specs/extensionFields.spec.ts | 157 +++++++++++++++
 dev/specs/int.spec.ts             |   4 +-
 dev/specs/queueName.spec.ts       |  16 +-
 dev/specs/vectorSearch.spec.ts    |  67 ++++++-
 package.json                      |   6 +-
 pnpm-lock.yaml                    |  12 ++
 src/collections/embeddings.ts     | 143 +++++++-------
 src/endpoints/vectorSearch.ts     | 317 +++++++++++++++++++++++++-----
 src/index.ts                      |  13 +-
 src/tasks/vectorize.ts            |  58 +++---
 src/types.ts                      |  35 ++--
 13 files changed, 658 insertions(+), 187 deletions(-)
 create mode 100644 dev/specs/extensionFields.spec.ts

diff --git a/dev/payload.config.ts b/dev/payload.config.ts
index ced11d8..74ea031 100644
--- a/dev/payload.config.ts
+++ b/dev/payload.config.ts
@@ -103,9 +103,19 @@ const buildConfigWithPostgres = async () => {
       default: {
         collections: {
           posts: {
-            fields: {
-              title: { chunker: chunkText },
-              content: { chunker: chunkRichText },
+            toKnowledgePool: async (doc, payload) => {
+              const chunks: Array<{ chunk: string }> = []
+              // Process title
+              if (doc.title) {
+                const titleChunks = chunkText(doc.title)
+                chunks.push(...titleChunks.map((chunk) => ({ chunk })))
+              }
+              // Process content
+              if (doc.content) {
+                const contentChunks = await chunkRichText(doc.content, payload)
+                chunks.push(...contentChunks.map((chunk) => ({ chunk })))
+              }
+              return chunks
             },
           },
         },
diff --git a/dev/specs/e2e.spec.ts b/dev/specs/e2e.spec.ts
index a019e17..d60b29e 100644
--- a/dev/specs/e2e.spec.ts
+++ b/dev/specs/e2e.spec.ts
@@ -34,7 +34,6 @@ test('querying the endpoint should return the title when queried', async ({ requ
       expect.objectContaining({
         sourceCollection: 'posts',
         docId: String(post.id),
-        fieldPath: 'title',
         chunkIndex: 0,
         chunkText: title,
         embeddingVersion: testEmbeddingVersion,
diff --git a/dev/specs/extensionFields.spec.ts b/dev/specs/extensionFields.spec.ts
new file mode 100644
index 0000000..ce967d4
--- /dev/null
+++ b/dev/specs/extensionFields.spec.ts
@@ -0,0 +1,157 @@
+import type { Payload } from 'payload'
+import { getPayload } from 'payload'
+import { beforeAll, describe, expect, test } from 'vitest'
+import { postgresAdapter } from '@payloadcms/db-postgres'
+import { buildDummyConfig, integration, plugin } from './constants.js'
+import { createTestDb } from './utils.js'
+import { PostgresPayload } from '../../src/types.js'
+import { chunkText, chunkRichText } from 'helpers/chunkers.js'
+import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js'
+import { DIMS } from './constants.js'
+
+describe('Extension fields integration tests', () => {
+  let payload: Payload
+  const dbName = 'extension_fields_test'
+
+  beforeAll(async () => {
+    await createTestDb({ dbName })
+    const config = await buildDummyConfig({
+      collections: [
+        {
+          slug: 'posts',
+          fields: [
+            { name: 'title', type: 'text' },
+            { name: 'content', type: 'richText' },
+          ],
+        },
+      ],
+      db: postgresAdapter({
+        extensions: ['vector'],
+        afterSchemaInit: [integration.afterSchemaInitHook],
+        pool: {
+          connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`,
+        },
+      }),
+      plugins: [
+        plugin({
+          knowledgePools: {
+            default: {
+              collections: {
+                posts: {
+                  toKnowledgePool: async (doc, payload) => {
+                    const chunks: Array<{ chunk: string; category?: string; priority?: number }> =
+                      []
+                    // Process title
+                    if (doc.title) {
+                      const titleChunks = chunkText(doc.title)
+                      chunks.push(
+                        ...titleChunks.map((chunk) => ({
+                          chunk,
+                          category: doc.category || 'general',
+                          priority: doc.priority || 0,
+                        })),
+                      )
+                    }
+                    // Process content
+                    if (doc.content) {
+                      const contentChunks = await chunkRichText(doc.content, payload)
+                      chunks.push(
+                        ...contentChunks.map((chunk) => ({
+                          chunk,
+                          category: doc.category || 'general',
+                          priority: doc.priority || 0,
+                        })),
+                      )
+                    }
+                    return chunks
+                  },
+                  extensionFields: [
+                    {
+                      name: 'category',
+                      type: 'text',
+                      admin: {
+                        description: 'Category for filtering embeddings',
+                      },
+                    },
+                    {
+                      name: 'priority',
+                      type: 'number',
+                      admin: {
+                        description: 'Priority level for the embedding',
+                      },
+                    },
+                  ],
+                },
+              },
+              embedDocs: makeDummyEmbedDocs(DIMS),
+              embedQuery: makeDummyEmbedQuery(DIMS),
+              embeddingVersion: testEmbeddingVersion,
+            },
+          },
+        }),
+      ],
+    })
+    payload = await getPayload({ config })
+  })
+
+  test('extension fields are added to the embeddings table schema', async () => {
+    const db = (payload as PostgresPayload).db
+    const sql = `
+      SELECT column_name, data_type, udt_name
+      FROM information_schema.columns
+      WHERE table_schema = 'public' AND table_name = 'default'
+      ORDER BY column_name
+    `
+
+    let rows: any[] = []
+    if (db?.pool?.query) {
+      const res = await db.pool.query(sql)
+      rows = res?.rows || []
+    } else if (db?.drizzle?.execute) {
+      const res = await db.drizzle.execute(sql)
+      rows = Array.isArray(res) ? res : res?.rows || []
+    }
+
+    const columnsByName = Object.fromEntries(rows.map((r: any) => [r.column_name, r]))
+
+    // Check that reserved fields exist
+    expect(columnsByName.source_collection).toBeDefined()
+    expect(columnsByName.doc_id).toBeDefined()
+    expect(columnsByName.chunk_index).toBeDefined()
+    expect(columnsByName.chunk_text).toBeDefined()
+    expect(columnsByName.embedding_version).toBeDefined()
+    expect(columnsByName.embedding).toBeDefined()
+
+    // Check that extension fields exist
+    expect(columnsByName.category).toBeDefined()
+    expect(columnsByName.category.data_type).toBe('text')
+    expect(columnsByName.priority).toBeDefined()
+    expect(columnsByName.priority.data_type).toBe('numeric' || 'integer')
+  })
+
+  test('extension field values are stored with embeddings', async () => {
+    const post = await payload.create({
+      collection: 'posts',
+      data: {
+        title: 'Test Post',
+        content: null,
+        category: 'tech',
+        priority: 5,
+      },
+    })
+
+    // Wait for vectorization to complete
+    await new Promise((resolve) => setTimeout(resolve, 6000))
+
+    const embeddings = await payload.find({
+      collection: 'default',
+      where: {
+        and: [{ sourceCollection: { equals: 'posts' } }, { docId: { equals: String(post.id) } }],
+      },
+    })
+
+    expect(embeddings.docs.length).toBeGreaterThan(0)
+    expect(embeddings.docs[0]).toHaveProperty('category', 'tech')
+    expect(embeddings.docs[0]).toHaveProperty('priority', 5)
+  })
+})
diff --git a/dev/specs/int.spec.ts b/dev/specs/int.spec.ts
index 9d562fa..d39a15d 100644
--- a/dev/specs/int.spec.ts
+++ b/dev/specs/int.spec.ts
@@ -102,7 +102,6 @@ describe('Plugin integration tests', () => {
     const expectedTitleDoc = {
       sourceCollection: 'posts',
       docId: String(post.id),
-      fieldPath: 'title',
       chunkIndex: 0,
       chunkText: title,
       embeddingVersion: testEmbeddingVersion,
@@ -112,8 +111,7 @@
     const expectedContentDocs = contentChunks.map((chunkText, index) => ({
       sourceCollection: 'posts',
       docId: String(post.id),
-      fieldPath: 'content',
-      chunkIndex: index,
+      chunkIndex: index + 1, // +1 because title chunk is at index 0
       chunkText,
       embeddingVersion: testEmbeddingVersion,
     }))
diff --git a/dev/specs/queueName.spec.ts b/dev/specs/queueName.spec.ts
index f253206..36fcdbc 100644
--- a/dev/specs/queueName.spec.ts
+++ b/dev/specs/queueName.spec.ts
@@ -39,9 +39,19 @@ describe('Queue tests', () => {
       default: {
         collections: {
           posts: {
-            fields: {
-              title: { chunker: chunkText },
-              content: { chunker: chunkRichText },
+            toKnowledgePool: async (doc, payload) => {
+              const chunks: Array<{ chunk: string }> = []
+              // Process title
+              if (doc.title) {
+                const titleChunks = chunkText(doc.title)
+                chunks.push(...titleChunks.map((chunk) => ({ chunk })))
+              }
+              // Process content
+              if (doc.content) {
+                const contentChunks = await chunkRichText(doc.content, payload)
+                chunks.push(...contentChunks.map((chunk) => ({ chunk })))
+              }
+              return chunks
             },
           },
         },
diff --git a/dev/specs/vectorSearch.spec.ts b/dev/specs/vectorSearch.spec.ts
index 3f0c1ec..eb74d28 100644
--- a/dev/specs/vectorSearch.spec.ts
+++ b/dev/specs/vectorSearch.spec.ts
@@ -24,6 +24,8 @@ async function performVectorSearch(
   payload: Payload,
   query: any,
   knowledgePool: string = 'default',
+  where?: any,
+  limit?: number,
 ): Promise<Response> {
   const knowledgePools: Record<string, KnowledgePoolDynamicConfig> = {
     default: {
@@ -37,7 +39,12 @@

   // Create a mock request object
   const mockRequest = {
-    json: async () => ({ query, knowledgePool }),
+    json: async () => ({
+      query,
+      knowledgePool,
+      ...(where ? { where } : {}),
+      ...(limit ? { limit } : {}),
+    }),
     payload,
   } as any
@@ -83,9 +90,19 @@ describe('Search endpoint integration tests', () => {
       default: {
         collections: {
           posts: {
-            fields: {
-              title: { chunker: chunkText },
-              content: { chunker: chunkRichText },
+            toKnowledgePool: async (doc, payload) => {
+              const chunks: Array<{ chunk: string }> = []
+              // Process title
+              if (doc.title) {
+                const titleChunks = chunkText(doc.title)
+                chunks.push(...titleChunks.map((chunk) => ({ chunk })))
+              }
+              // Process content
+              if (doc.content) {
+                const contentChunks = await chunkRichText(doc.content, payload)
+                chunks.push(...contentChunks.map((chunk) => ({ chunk })))
+              }
+              return chunks
             },
           },
         },
@@ -125,7 +142,6 @@
       expect.objectContaining({
         sourceCollection: 'posts',
         docId: String(post.id),
-        fieldPath: 'title',
         chunkIndex: 0,
         chunkText: titleAndQuery,
         embeddingVersion: testEmbeddingVersion,
@@ -172,4 +188,45 @@
     expect(error).toHaveProperty('error')
     expect(error.error).toContain('Query is required and must be a string')
   })
+
+  describe('where', () => {
+    test('filters results by extensionFields using WHERE clause', async () => {
+      const sharedText = 'Shared searchable content'
+
+      // Create two posts with same text but different categories
+      const post1 = await payload.create({
+        collection: 'posts',
+        data: {
+          title: sharedText,
+          content: null,
+        },
+      })
+
+      const post2 = await payload.create({
+        collection: 'posts',
+        data: {
+          title: sharedText,
+          content: null,
+        },
+      })
+
+      // Wait for vectorization jobs to complete
+      await waitForVectorizationJobs(payload)
+
+      // Search without WHERE - should return both
+      const responseAll = await performVectorSearch(payload, sharedText)
+      const jsonAll = await responseAll.json()
+      expect(jsonAll.results.length).toBeGreaterThanOrEqual(2)
+
+      // Search with WHERE clause filtering by docId - should return only one
+      const responseFiltered = await performVectorSearch(payload, sharedText, 'default', {
+        where: {
+          docId: { equals: String(post1.id) },
+        },
+      })
+      const jsonFiltered = await responseFiltered.json()
+      expect(jsonFiltered.results.length).toBeGreaterThan(0)
+      expect(jsonFiltered.results.every((r: any) => r.docId === String(post1.id))).toBe(true)
+    })
+  })
 })
diff --git a/package.json b/package.json
index 80a03a5..7a880a2 100644
--- a/package.json
+++ b/package.json
@@ -64,6 +64,7 @@
     "@types/pg": "^8.15.5",
     "@types/react": "19.1.8",
     "@types/react-dom": "19.1.6",
+    "@types/to-snake-case": "^1.0.2",
     "ai": "^5.0.72",
     "autoprefixer": "^10.4.21",
     "copyfiles": "2.4.1",
@@ -128,5 +129,8 @@
     ]
   },
   "registry": "https://registry.npmjs.org/",
-  "packageManager": "pnpm@10.17.1+sha512.17c560fca4867ae9473a3899ad84a88334914f379be46d455cbf92e5cf4b39d34985d452d2583baf19967fa76cb5c17bc9e245529d0b98745721aa7200ecaf7a"
+  "packageManager": "pnpm@10.17.1+sha512.17c560fca4867ae9473a3899ad84a88334914f379be46d455cbf92e5cf4b39d34985d452d2583baf19967fa76cb5c17bc9e245529d0b98745721aa7200ecaf7a",
+  "dependencies": {
+    "to-snake-case": "1.0.0"
+  }
 }
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 43b96a7..414ccc9 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -7,6 +7,10 @@ settings:
 importers:

   .:
+    dependencies:
+      to-snake-case:
+        specifier: 1.0.0
+        version: 1.0.0
     devDependencies:
       '@eslint/eslintrc':
         specifier: ^3.2.0
@@ -56,6 +60,9 @@
       '@types/react-dom':
        specifier: 19.1.6
        version: 19.1.6(@types/react@19.1.8)
+      '@types/to-snake-case':
+        specifier: ^1.0.2
+        version: 1.0.2
       ai:
         specifier: ^5.0.72
         version: 5.0.72(zod@4.1.12)
@@ -2457,6 +2464,9 @@
  '@types/stack-utils@2.0.3':
    resolution: {integrity: sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw==}

+  '@types/to-snake-case@1.0.2':
+    resolution: {integrity: sha512-uWcU4i5CsTPECCaqPSzmgdAa2wDFjmHw96RAy6kImMp9hmpn/XLocxNEO2PWoUFNbkV/QwKukfDwVN7bcHXNPg==}
+
   '@types/trusted-types@1.0.6':
     resolution: {integrity: sha512-230RC8sFeHoT6sSUlRO6a8cAnclO06eeiq1QDfiv2FGCLWFvvERWgwIQD4FWqD9A69BN7Lzee4OXwoMVnnsWDw==}
@@ -9046,6 +9056,8 @@
   '@types/stack-utils@2.0.3': {}

+  '@types/to-snake-case@1.0.2': {}
+
   '@types/trusted-types@1.0.6': {}

   '@types/unist@2.0.11': {}
diff --git a/src/collections/embeddings.ts b/src/collections/embeddings.ts
index 16de6a7..2b02bd7 100644
--- a/src/collections/embeddings.ts
+++ b/src/collections/embeddings.ts
@@ -1,74 +1,85 @@
-import type { CollectionConfig } from 'payload'
+import type { CollectionConfig, Field } from 'payload'
 import type { KnowledgePoolName } from '../types.js'

-export const createEmbeddingsCollection = (slug: KnowledgePoolName): CollectionConfig => ({
-  slug,
-  admin: {
-    description:
-      'Vector embeddings for search and similarity queries. Created by the payloadcms-vectorize plugin. Embeddings cannot be added or modified, only deleted, through the admin panel. No other restrictions enforced.',
-  },
-  access: {
-    create: () => false, // Cannot add new embeddings through admin panel
-    update: () => false, // Cannot modify any embeddings field through admin panel
-  },
-  fields: [
-    {
-      name: 'sourceCollection',
-      type: 'text',
-      required: true,
-      admin: {
-        description: 'The collection that this embedding belongs to',
-      },
+const RESERVED_FIELDS = ['sourceCollection', 'docId', 'chunkIndex', 'chunkText', 'embeddingVersion']
+
+export const createEmbeddingsCollection = (
+  slug: KnowledgePoolName,
+  extensionFields?: Field[],
+): CollectionConfig => {
+  // Validate that extensionFields don't conflict with reserved fields
+  if (extensionFields) {
+    const conflictingFields = extensionFields
+      .map((f) => (typeof f === 'object' && 'name' in f ? f.name : null))
+      .filter((name): name is string => name !== null && RESERVED_FIELDS.includes(name))
+
+    if (conflictingFields.length > 0) {
+      throw new Error(
+        `[payloadcms-vectorize] Extension fields cannot use reserved field names: ${conflictingFields.join(', ')}`,
+      )
+    }
+  }
+
+  return {
+    slug,
+    admin: {
+      description:
+        'Vector embeddings for search and similarity queries. Created by the payloadcms-vectorize plugin. Embeddings cannot be added or modified, only deleted, through the admin panel. No other restrictions enforced.',
     },
-    // TODO(techiejd): This could probably be a relationship field to the source document.
-    // Is it possible to use a relationship field to an `ANY` collection?
-    {
-      name: 'docId',
-      type: 'text',
-      required: true,
-      admin: {
-        description: 'The ID of the source document',
-      },
+    access: {
+      create: () => false, // Cannot add new embeddings through admin panel
+      update: () => false, // Cannot modify any embeddings field through admin panel
     },
-    {
-      name: 'fieldPath',
-      type: 'text',
-      required: true,
-      admin: {
-        description: 'The field path that was vectorized (e.g., "title", "content")',
+    fields: [
+      {
+        name: 'sourceCollection',
+        type: 'text',
+        required: true,
+        admin: {
+          description: 'The collection that this embedding belongs to',
+        },
       },
-    },
-    {
-      name: 'chunkIndex',
-      type: 'number',
-      required: true,
-      admin: {
-        description: 'The index of this chunk within the field',
+      // TODO(techiejd): This could probably be a relationship field to the source document.
+      // Is it possible to use a relationship field to an `ANY` collection?
+      {
+        name: 'docId',
+        type: 'text',
+        required: true,
+        admin: {
+          description: 'The ID of the source document',
+        },
       },
-    },
-    {
-      name: 'chunkText',
-      type: 'textarea',
-      admin: {
-        description: 'The original text that was vectorized',
+      {
+        name: 'chunkIndex',
+        type: 'number',
+        required: true,
+        admin: {
+          description: 'The index of this chunk',
+        },
       },
-    },
-    {
-      name: 'embeddingVersion',
-      type: 'text',
-      admin: {
-        description: 'The version of the embedding model used',
+      {
+        name: 'chunkText',
+        type: 'textarea',
+        admin: {
+          description: 'The original text that was vectorized',
+        },
       },
-    },
-    // Note: 'embedding' field is added via pgvector SQL, not as a Payload field
-  ],
-  timestamps: true,
-  indexes: [
-    {
-      fields: ['sourceCollection', 'docId'],
-    },
-    {
-      fields: ['sourceCollection', 'fieldPath'],
-    },
-  ],
-})
+      {
+        name: 'embeddingVersion',
+        type: 'text',
+        admin: {
+          description: 'The version of the embedding model used',
+        },
+      },
+      // Note: 'embedding' field is added via pgvector SQL, not as a Payload field
+      // Extension fields are merged here
+      ...(extensionFields || []),
+    ],
+    timestamps: true,
+    indexes: [
+      {
+        fields: ['sourceCollection', 'docId'],
+      },
+    ],
+  }
+}
diff --git a/src/endpoints/vectorSearch.ts b/src/endpoints/vectorSearch.ts
index 3fdb22a..6c86ac1 100644
--- a/src/endpoints/vectorSearch.ts
+++ b/src/endpoints/vectorSearch.ts
@@ -1,8 +1,28 @@
-import type { PayloadHandler } from 'payload'
+import type { PayloadHandler, Where } from 'payload'
+import {
+  sql,
+  cosineDistance,
+  inArray,
+  eq,
+  and,
+  or,
+  not,
+  like,
+  gt,
+  gte,
+  lt,
+  lte,
+  ne,
+  isNull,
+  isNotNull,
+} from '@payloadcms/db-postgres/drizzle'
+
+import toSnakeCase from 'to-snake-case'
 import type {
   VectorSearchResult,
   KnowledgePoolName,
   KnowledgePoolDynamicConfig,
+  VectorSearchQuery,
 } from 'payloadcms-vectorize'

 export const vectorSearch = (
@@ -13,7 +33,7 @@
     return Response.json({ error: 'Request is required' }, { status: 400 })
   }
   try {
-    const { query, knowledgePool } = await req.json()
+    const { query, knowledgePool, where, limit = 10 }: VectorSearchQuery = await req.json()
     if (!query || typeof query !== 'string') {
       return Response.json({ error: 'Query is required and must be a string' }, { status: 400 })
     }
@@ -40,8 +60,14 @@
      return Array.isArray(qE) ? qE : Array.from(qE)
    })()

-    // Perform cosine similarity search using raw SQL
-    const results = await performCosineSearch(payload, queryEmbedding, knowledgePool, 10)
+    // Perform cosine similarity search using Drizzle
+    const results = await performCosineSearch(
+      payload,
+      queryEmbedding,
+      knowledgePool,
+      limit,
+      where,
+    )

     return Response.json({ results })
   } catch (error) {
@@ -56,58 +82,247 @@
 async function performCosineSearch(
   queryEmbedding: number[],
   poolName: KnowledgePoolName,
   limit: number = 10,
+  whereClause?: Where,
 ): Promise<Array<VectorSearchResult>> {
-  const isPostgres = payload.db?.pool?.query || payload.db?.drizzle?.execute
+  const isPostgres = payload.db?.pool?.query || payload.db?.drizzle?.execute || payload.db?.adapter

   if (!isPostgres) {
     throw new Error('Only works with Postgres')
   }

-  const runSQL = async (sql: string, params?: any[]) => {
-    if (payload.db.pool?.query) {
-      return payload.db.pool.query(sql, params)
-    }
-    if (payload.db.drizzle?.execute) {
-      return payload.db.drizzle.execute(sql)
-    }
-    throw new Error('Failed to execute SQL')
-  }
-
-  // Convert embedding array to PostgreSQL vector format
-  const vectorString = `[${queryEmbedding.join(',')}]`
-
-  // SQL query for cosine similarity search - use the specified embeddings table
-  const sql = `
-    SELECT
-      "doc_id",
-      "chunk_text",
-      "field_path",
-      "source_collection",
-      "chunk_index",
-      "embedding_version",
-      1 - (embedding <=> $1::vector) as similarity
-    FROM "${poolName}"
-    ORDER BY embedding <=> $1::vector
-    LIMIT $2
-  `
-
-  try {
-    const result = await runSQL(sql, [vectorString, limit])
-
-    // Handle different result formats from different database adapters
-    const rows = result.rows || result || []
-
-    return rows.map((row: any) => ({
-      id: String(row.doc_id), // Convert to string for consistency
-      docId: row.doc_id,
-      similarity: parseFloat(row.similarity),
-      chunkText: row.chunk_text,
-      fieldPath: row.field_path,
-      sourceCollection: row.source_collection,
-      chunkIndex: parseInt(row.chunk_index, 10), // Convert to number
-      embeddingVersion: row.embedding_version,
-    }))
-  } catch (error) {
-    throw new Error(`Cosine search failed: ${error}`)
+  // Access Drizzle adapter instance
+  const adapter = payload.db?.adapter
+  if (!adapter) {
+    throw new Error('Drizzle adapter not found')
+  }
+  const drizzle = adapter.drizzle
+  if (!drizzle) {
+    throw new Error('Drizzle instance not found in adapter')
+  }
+
+  // Get collection config and table name
+  const collectionConfig = payload.collections[poolName]?.config
+  if (!collectionConfig) {
+    throw new Error(`Collection ${poolName} not found`)
+  }
+  const tableName = adapter.tableNameMap?.get(toSnakeCase(collectionConfig.slug))
+  if (!tableName) {
+    throw new Error(
+      `[payloadcms-vectorize] Table name not found in adapter for collection "${poolName}" (slug: "${collectionConfig.slug}"). This typically indicates a configuration issue with the embeddings collection.`,
+    )
+  }
+  const table = adapter.tables[tableName]
+  if (!table) {
+    throw new Error(`Table ${tableName} not found in adapter`)
+  }
+
+  // Use Drizzle's query builder with cosineDistance function
+  // cosineDistance returns distance, so we calculate similarity as 1 - distance
+  const embeddingColumn = table.embedding
+  if (!embeddingColumn) {
+    throw new Error(`Embedding column not found in table ${tableName}`)
+  }
+
+  // Convert WHERE clause to Drizzle conditions
+  let drizzleWhere: any = undefined
+  if (whereClause) {
+    drizzleWhere = convertWhereToDrizzle(whereClause, table, collectionConfig.flattenedFields)
+    if (drizzleWhere === null) {
+      // WHERE clause resulted in an empty condition (e.g., empty 'and' or 'or' array)
+      // This semantically means "match nothing", so return empty results
+      throw new Error(
+        `[payloadcms-vectorize] WHERE clause resulted in no valid conditions. This typically occurs when using empty 'and' or 'or' arrays, or when all field conditions reference non-existent columns.`,
+      )
+    }
+    if (drizzleWhere === undefined) {
+      // WHERE clause could not be converted (invalid structure or unsupported operators)
+      throw new Error(
+        `[payloadcms-vectorize] WHERE clause could not be converted to Drizzle conditions. Please check that all field names exist and operators are supported.`,
+      )
+    }
+  }
+
+  // Build query using Drizzle's query builder
+  let query = drizzle
+    .select({
+      id: table.id,
+      docId: table.docId || (table as any).doc_id,
+      chunkText: table.chunkText || (table as any).chunk_text,
+      sourceCollection: table.sourceCollection || (table as any).source_collection,
+      chunkIndex: table.chunkIndex || (table as any).chunk_index,
+      embeddingVersion: table.embeddingVersion || (table as any).embedding_version,
+      // Calculate similarity: 1 - cosineDistance (distance)
+      similarity: sql`1 - ${cosineDistance(embeddingColumn, queryEmbedding)}`,
+    })
+    .from(table)
+
+  // Add WHERE clause if provided
+  if (drizzleWhere) {
+    query = query.where(drizzleWhere)
+  }
+
+  // Order by cosine distance (ascending = most similar first) and limit
+  query = query.orderBy(cosineDistance(embeddingColumn, queryEmbedding)).limit(limit)
+
+  // Execute the query
+  const result = await query
+
+  return mapRowsToResults(result)
+}
+
+/**
+ * Convert Payload WHERE clause to Drizzle conditions
+ * Simplified version inspired by Payload's buildQuery
+ */
+function convertWhereToDrizzle(where: Where, table: any, fields: any[]): any {
+  if (!where || typeof where !== 'object') {
+    return undefined
+  }
+
+  // Handle 'and' operator
+  if ('and' in where && Array.isArray(where.and)) {
+    const conditions = where.and
+      .map((condition) => convertWhereToDrizzle(condition, table, fields))
+      .filter((c) => c !== undefined && c !== null)
+    if (conditions.length === 0) return null
+    if (conditions.length === 1) return conditions[0]
+    return and(...conditions)
+  }
+
+  // Handle 'or' operator
+  if ('or' in where && Array.isArray(where.or)) {
+    const conditions = where.or
+      .map((condition) => convertWhereToDrizzle(condition, table, fields))
+      .filter((c) => c !== undefined && c !== null)
+    if (conditions.length === 0) return null
+    if (conditions.length === 1) return conditions[0]
+    return or(...conditions)
+  }
+
+  // Handle field conditions - collect all field conditions and combine with AND
+  const fieldConditions: any[] = []
+  for (const [fieldName, condition] of Object.entries(where)) {
+    if (fieldName === 'and' || fieldName === 'or') continue
+
+    // Get the column from the table (handle both camelCase and snake_case)
+    const column = table[fieldName] || table[toSnakeCase(fieldName)]
+    if (!column) {
+      // Field not found, skip (could be a nested field we don't support)
+      continue
+    }
+
+    if (typeof condition !== 'object' || condition === null || Array.isArray(condition)) {
+      continue
+    }
+
+    const cond = condition as Record<string, unknown>
+
+    // Handle equals
+    if ('equals' in cond) {
+      fieldConditions.push(eq(column, cond.equals))
+      continue
+    }
+
+    // Handle not_equals / notEquals
+    if ('not_equals' in cond || 'notEquals' in cond) {
+      fieldConditions.push(ne(column, cond.not_equals ?? cond.notEquals))
+      continue
+    }
+
+    // Handle in
+    if ('in' in cond && Array.isArray(cond.in)) {
+      fieldConditions.push(inArray(column, cond.in))
+      continue
+    }
+
+    // Handle not_in / notIn
+    if ('not_in' in cond || 'notIn' in cond) {
+      const values = cond.not_in ?? cond.notIn
+      if (Array.isArray(values)) {
+        fieldConditions.push(not(inArray(column, values)))
+      }
+      continue
+    }
+
+    // Handle like
+    if ('like' in cond && typeof cond.like === 'string') {
+      fieldConditions.push(like(column, cond.like))
+      continue
+    }
+
+    // Handle contains
+    if ('contains' in cond && typeof cond.contains === 'string') {
+      fieldConditions.push(like(column, `%${cond.contains}%`))
+      continue
+    }
+
+    // Handle greater_than / greaterThan
+    if ('greater_than' in cond || 'greaterThan' in cond) {
+      fieldConditions.push(gt(column, cond.greater_than ?? cond.greaterThan))
+      continue
+    }
+
+    // Handle greater_than_equal / greaterThanEqual
+    if ('greater_than_equal' in cond || 'greaterThanEqual' in cond) {
+      fieldConditions.push(gte(column, cond.greater_than_equal ?? cond.greaterThanEqual))
+      continue
+    }
+
+    // Handle less_than / lessThan
+    if ('less_than' in cond || 'lessThan' in cond) {
+      fieldConditions.push(lt(column, cond.less_than ?? cond.lessThan))
+      continue
+    }
+
+    // Handle less_than_equal / lessThanEqual
+    if ('less_than_equal' in cond || 'lessThanEqual' in cond) {
+      fieldConditions.push(lte(column, cond.less_than_equal ?? cond.lessThanEqual))
+      continue
+    }
+
+    // Handle exists (null check)
+    if ('exists' in cond && typeof cond.exists === 'boolean') {
+      fieldConditions.push(cond.exists ? isNotNull(column) : isNull(column))
+      continue
+    }
+  }
+
+  // Combine all field conditions with AND
+  if (fieldConditions.length === 0) {
+    return undefined
+  }
+  if (fieldConditions.length === 1) {
+    return fieldConditions[0]
+  }
+  return and(...fieldConditions)
+}
+
+function mapRowsToResults(rows: any[]): Array<VectorSearchResult> {
+  return rows.map((row: any) => ({
+    id: String(row.id),
+    docId: String(row.docId),
+    similarity:
+      typeof row.similarity === 'number' ? row.similarity : parseFloat(String(row.similarity)),
+    chunkText: row.chunkText || '',
+    sourceCollection: row.sourceCollection || '',
+    chunkIndex:
+      typeof row.chunkIndex === 'number' ? row.chunkIndex : parseInt(String(row.chunkIndex), 10),
+    embeddingVersion: row.embeddingVersion || '',
+    // Include any extension fields that might be in the row
+    ...Object.fromEntries(
+      Object.entries(row).filter(
+        ([key]) =>
+          ![
+            'id',
+            'docId',
+            'chunkText',
+            'sourceCollection',
+            'chunkIndex',
+            'embeddingVersion',
+            'similarity',
+            'embedding',
+          ].includes(key),
+      ),
+    ),
+  }))
 }
diff --git a/src/index.ts b/src/index.ts
index fd87521..68f3f4c 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -132,8 +132,16 @@
     // Process each knowledge pool
     for (const [poolName, dynamicConfig] of Object.entries(pluginOptions.knowledgePools)) {
-      // Add the embeddings collection for this knowledge pool
-      const embeddingsCollection = createEmbeddingsCollection(poolName)
+      // Collect all extensionFields from all collections in this pool
+      const allExtensionFields: any[] = []
+      for (const collectionConfig of Object.values(dynamicConfig.collections)) {
+        if (collectionConfig?.extensionFields) {
+          allExtensionFields.push(...collectionConfig.extensionFields)
+        }
+      }
+
+      // Add the embeddings collection for this knowledge pool with extensionFields
+      const embeddingsCollection = createEmbeddingsCollection(poolName, allExtensionFields)
       if (!config.collections.find((c) => c.slug === poolName)) {
         config.collections.push(embeddingsCollection)
       }
@@ -190,7 +198,6 @@
                 doc,
                 collection: collectionSlug,
                 knowledgePool: pool,
-                fieldsConfig: collectionConfig.fields,
               },
               req: req,
               ...(pluginOptions.queueName ? { queue: pluginOptions.queueName } : {}),
diff --git a/src/tasks/vectorize.ts b/src/tasks/vectorize.ts
index 5227fb3..5097c0d 100644
--- a/src/tasks/vectorize.ts
+++ b/src/tasks/vectorize.ts
@@ -4,6 +4,7 @@
 import {
   PostgresPayload,
   KnowledgePoolName,
   KnowledgePoolDynamicConfig,
+  ToKnowledgePoolFn,
 } from '../types.js'

 type VectorizeTaskInput = {
@@ -80,7 +81,7 @@ async function runVectorizeTask(args: {
       `[payloadcms-vectorize] collection "${collection}" not configured in knowledge pool "${poolName}"`,
     )
   }
-  const fieldsConfig = collectionConfig.fields
+  const toKnowledgePoolFn: ToKnowledgePoolFn = collectionConfig.toKnowledgePool

   const isPostgres = isPostgresPayload(payload)
   if (!isPostgres) {
@@ -93,41 +94,39 @@
     throw new Error('[payloadcms-vectorize] Failed to persist vector column')
   }

-  const inputs: { chunkText: string; fieldPath: string; chunkIndex: number }[] = []
-  for (const [fieldPath, fieldCfg] of Object.entries(fieldsConfig)) {
-    // Delete existing embeddings for this doc/field combination to keep one set per doc/field
-    // The embeddingVersion is stored in each document and can be updated by re-vectorizing
-    await payload.delete({
-      collection: poolName,
-      where: {
-        and: [
-          { sourceCollection: { equals: collection } },
-          { docId: { equals: String(sourceDoc.id) } },
-          { fieldPath: { equals: fieldPath } },
-        ],
-      },
-    })
-    const value = getByPath(sourceDoc, fieldPath)
-    const chunker = fieldCfg.chunker
-    const chunks = await chunker(value, payload)
-    inputs.push(
-      ...chunks.map((chunk, index) => ({ chunkText: chunk, fieldPath, chunkIndex: index })),
-    )
-  }
-  const chunkTexts = inputs.map((input) => input.chunkText)
+  // Delete all existing embeddings for this document before creating new ones
+  // This ensures we replace old embeddings (potentially with a different embeddingVersion)
+  // and prevents duplicates when a document is updated
+  await payload.delete({
+    collection: poolName,
+    where: {
+      and: [
+        { sourceCollection: { equals: collection } },
+        { docId: { equals: String(sourceDoc.id) } },
+      ],
+    },
+  })
+
+  // Get chunks from toKnowledgePoolFn
+  const chunkData = await toKnowledgePoolFn(sourceDoc, payload)
+
+  // Extract chunk texts for embedding
+  const chunkTexts = chunkData.map((item) => item.chunk)
   const vectors = await dynamicConfig.embedDocs(chunkTexts)
+
+  // Create embedding documents with extension field values
   await Promise.all(
     vectors.map(async (vector, index) => {
-      const { fieldPath, chunkIndex, chunkText } = inputs[index]
+      const { chunk, ...extensionFields } = chunkData[index]
       const created = await payload.create({
         collection: poolName,
         data: {
           sourceCollection: collection,
           docId: String(sourceDoc.id),
-          fieldPath,
-          chunkIndex,
-          chunkText,
+          chunkIndex: index,
+          chunkText: chunk,
           embeddingVersion,
+          ...extensionFields,
           embedding: Array.isArray(vector) ? vector : Array.from(vector),
         },
       })
@@ -144,8 +143,3 @@
     }),
   )
 }
-
-function getByPath(obj: any, path: string): any {
-  if (!obj) return undefined
-  return path.split('.').reduce((acc, key) => (acc == null ? acc : acc[key]), obj)
-}
diff --git a/src/types.ts b/src/types.ts
index cb415d0..5ae5d6b 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -1,23 +1,18 @@
-import type { CollectionSlug, Payload } from 'payload'
-import type { SerializedEditorState } from '@payloadcms/richtext-lexical/lexical'
+import type { CollectionSlug, Payload, Field, Where } from 'payload'

 export type EmbedDocsFn = (texts: string[]) => Promise<number[][]>
 export type EmbedQueryFn = (text: string) => Promise<number[]>

-export type ChunkerFn =
-  | ((text: string, payload: Payload) => string[])
-  | ((text: string, payload: Payload) => Promise<string[]>)
-  | ((richText: SerializedEditorState, payload: Payload) => string[])
-  | ((richText: SerializedEditorState, payload: Payload) => Promise<string[]>)
-
-export type FieldVectorizeOption = {
-  /** Required per-field chunker override */
-  chunker: ChunkerFn
-}
+export type ToKnowledgePoolFn = (
+  doc: Record<string, any>,
+  payload: Payload,
+) => Promise<Array<{ chunk: string; [key: string]: any }>>

 export type CollectionVectorizeOption = {
-  /** Map of field paths to enable vectorization */
-  fields: Record<string, FieldVectorizeOption>
+  /** Function that converts a document to an array of chunks with optional extension field values */
+  toKnowledgePool: ToKnowledgePoolFn
+  /** Optional fields to extend the embeddings collection schema */
+  extensionFields?: Field[]
 }

 /** Knowledge pool name identifier */
@@ -92,7 +87,7 @@ export type VectorizeTaskArgs = {
   doc: Record<string, any>
   collection: string
   knowledgePool: KnowledgePoolName
-  fieldsConfig: Record<string, FieldVectorizeOption>
+  toKnowledgePoolFn: ToKnowledgePoolFn
 }

 export interface VectorSearchResult {
@@ -100,10 +95,10 @@
   similarity: number
   sourceCollection: string // The collection that this embedding belongs to
   docId: string // The ID of the source document
-  fieldPath: string // The field path that was vectorized (e.g., "title", "content")
-  chunkIndex: number // The index of this chunk within the field
+  chunkIndex: number // The index of this chunk
   chunkText: string // The original text that was vectorized
   embeddingVersion: string // The version of the embedding model used
+  [key: string]: any // Extension fields and other dynamic fields
 }

 export interface VectorSearchResponse {
@@ -115,8 +110,10 @@ export interface VectorSearchQuery {
   knowledgePool: KnowledgePoolName
   /** The search query string */
   query: string
-  // TODO(techiejd): Expand on query API
-  // add support for particular collections, fields, etc.
+  /** Optional Payload where clause to filter results. Can rely on embeddings collection fields or extension fields. */
+  where?: Where
+  /** Optional limit for number of results (default: 10) */
+  limit?: number
 }

 export type JobContext = {

From 12e3713b721e2854a748c33d7619d59c5353037e Mon Sep 17 00:00:00 2001
From: techiejd <62455039+techiejd@users.noreply.github.com>
Date: Wed, 19 Nov 2025 19:06:29 +0700
Subject: [PATCH 2/3] Vector search with where now works

---
 dev/payload-types.ts              |   7 +-
 dev/specs/extensionFields.spec.ts |  23 ++++--
 dev/specs/vectorSearch.spec.ts    |   5 +-
 src/drizzle/tables.ts             |  17 ++++
 src/endpoints/vectorSearch.ts     | 133 +++++++++++++++++-----------
 src/index.ts                      |  14 +++-
 6 files changed, 136 insertions(+), 63 deletions(-)
 create mode 100644 src/drizzle/tables.ts

diff --git a/dev/payload-types.ts b/dev/payload-types.ts
index a6fd1d1..f860f73 100644
--- a/dev/payload-types.ts
+++ b/dev/payload-types.ts
@@ -165,11 +165,7 @@ export interface Default {
    */
   docId: string;
   /**
-   * The field path that was vectorized (e.g., "title", "content")
-   */
-  fieldPath: string;
-  /**
-   * The index of this chunk within the field
+   * The index of this chunk
    */
   chunkIndex: number;
   /**
@@ -374,7 +370,6 @@ export interface PostsSelect<T extends boolean = true> {
 export interface DefaultSelect<T extends boolean = true> {
   sourceCollection?: T;
   docId?: T;
-  fieldPath?: T;
   chunkIndex?: T;
   chunkText?: T;
   embeddingVersion?: T;
diff --git a/dev/specs/extensionFields.spec.ts b/dev/specs/extensionFields.spec.ts
index ce967d4..ea76fa5 100644
--- a/dev/specs/extensionFields.spec.ts
+++ b/dev/specs/extensionFields.spec.ts
@@ -3,7 +3,7 @@
 import { getPayload } from 'payload'
 import { beforeAll, describe, expect, test } from 'vitest'
 import { postgresAdapter } from '@payloadcms/db-postgres'
 import { buildDummyConfig, integration, plugin } from './constants.js'
-import { createTestDb } from './utils.js'
+import { createTestDb, waitForVectorizationJobs } from './utils.js'
 import { PostgresPayload } from '../../src/types.js'
 import { chunkText, chunkRichText } from 'helpers/chunkers.js'
 import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js'
@@ -16,12 +16,23 @@
   beforeAll(async () => {
     await createTestDb({ dbName })
     const config = await buildDummyConfig({
+      jobs: {
+        tasks: [],
+        autoRun: [
+          {
+            cron: '*/5 * * * * *', // Run every 5 seconds
+            limit: 10,
+          },
+        ],
+      },
       collections: [
         {
           slug: 'posts',
           fields: [
             { name: 'title', type: 'text' },
             { name: 'content', type: 'richText' },
+            { name: 'category', type: 'text' },
+            { name: 'priority', type: 'number' },
           ],
         },
       ],
@@ -124,9 +135,9 @@
     // Check that extension fields exist
     expect(columnsByName.category).toBeDefined()
-    expect(columnsByName.category.data_type).toBe('text')
+    expect(columnsByName.category.data_type).toBe('character varying')
     expect(columnsByName.priority).toBeDefined()
-    expect(columnsByName.priority.data_type).toBe('numeric' || 'integer')
+    expect(['numeric', 'integer']).toContain(columnsByName.priority.data_type)
   })

   test('extension field values are stored with embeddings', async () => {
@@ -137,11 +148,11 @@
         content: null,
         category: 'tech',
         priority: 5,
-      },
+      } as unknown as any, // any type needed because generated types work off of payload.config.ts, and do not take into account our `buildDummyConfig`.
     })

-    // Wait for vectorization to complete
-    await new Promise((resolve) => setTimeout(resolve, 6000))
+    // Wait for vectorization jobs to complete
+    await waitForVectorizationJobs(payload)

     const embeddings = await payload.find({
       collection: 'default',
diff --git a/dev/specs/vectorSearch.spec.ts b/dev/specs/vectorSearch.spec.ts
index eb74d28..a76c8b0 100644
--- a/dev/specs/vectorSearch.spec.ts
+++ b/dev/specs/vectorSearch.spec.ts
@@ -216,13 +216,12 @@
       // Search without WHERE - should return both
       const responseAll = await performVectorSearch(payload, sharedText)
       const jsonAll = await responseAll.json()
+
       expect(jsonAll.results.length).toBeGreaterThanOrEqual(2)

       // Search with WHERE clause filtering by docId - should return only one
       const responseFiltered = await performVectorSearch(payload, sharedText, 'default', {
-        where: {
-          docId: { equals: String(post1.id) },
-        },
+        docId: { equals: String(post1.id) },
       })
       const jsonFiltered = await responseFiltered.json()
       expect(jsonFiltered.results.length).toBeGreaterThan(0)
diff --git a/src/drizzle/tables.ts b/src/drizzle/tables.ts
new file mode 100644
index 0000000..453fa57
--- /dev/null
+++ b/src/drizzle/tables.ts
@@ -0,0 +1,17 @@
+import type { KnowledgePoolName } from '../types.js'
+
+type DrizzleTable = Record<string, any>
+
+const embeddingsTables = new Map<KnowledgePoolName, DrizzleTable>()
+
+export function registerEmbeddingsTable(poolName: KnowledgePoolName, table: DrizzleTable): void {
+  embeddingsTables.set(poolName, table)
+}
+
+export function getEmbeddingsTable(poolName: KnowledgePoolName): DrizzleTable | undefined {
+  return embeddingsTables.get(poolName)
+}
+
+export function clearEmbeddingsTables(): void {
+  embeddingsTables.clear()
+}
diff --git a/src/endpoints/vectorSearch.ts b/src/endpoints/vectorSearch.ts
index 6c86ac1..00d3f64 100644
--- a/src/endpoints/vectorSearch.ts
+++ b/src/endpoints/vectorSearch.ts
@@ -24,6 +24,7 @@
   KnowledgePoolDynamicConfig,
   VectorSearchQuery,
 } from 'payloadcms-vectorize'
+import { getEmbeddingsTable } from '../drizzle/tables.js'

 export const vectorSearch = (
@@ -84,17 +85,19 @@
   limit: number = 10,
   whereClause?: Where,
 ): Promise<Array<VectorSearchResult>> {
-  const isPostgres = payload.db?.pool?.query || payload.db?.drizzle?.execute || payload.db?.adapter
+  const isPostgres = payload.db?.pool?.query || payload.db?.drizzle

   if (!isPostgres) {
     throw new Error('Only works with Postgres')
   }

-  // Access Drizzle adapter instance
-  const adapter = payload.db?.adapter
+  // In PayloadCMS, payload.db IS the adapter, and drizzle is at payload.db.drizzle
+  const adapter = payload.db
   if (!adapter) {
     throw new Error('Drizzle adapter not found')
   }
+
+  // Get drizzle instance
   const drizzle = adapter.drizzle
   if (!drizzle) {
     throw new Error('Drizzle instance not found in adapter')
   }
@@ -105,22 +108,22 @@
   if (!collectionConfig) {
     throw new Error(`Collection ${poolName} not found`)
   }
-  const tableName = adapter.tableNameMap?.get(toSnakeCase(collectionConfig.slug))
-  if (!tableName) {
-    throw new Error(
-      `[payloadcms-vectorize] Table name not found in adapter for collection "${poolName}" (slug: "${collectionConfig.slug}"). This typically indicates a configuration issue with the embeddings collection.`,
-    )
-  }
-  const table = adapter.tables[tableName]
-  if (!table) {
-    throw new Error(`Table ${tableName} not found in adapter`)
-  }
+
+  const table = getEmbeddingsTable(poolName)
+  if (!table) {
+    throw new Error(
+      `[payloadcms-vectorize] Embeddings table for knowledge pool "${poolName}" not registered. Ensure the plugin's afterSchemaInit hook ran and the pool exists.`,
+    )
+  }

   // Use Drizzle's query builder with cosineDistance function
   // cosineDistance returns distance, so we calculate similarity as 1 - distance
+  // The table from fullSchema should have columns as direct properties
   const embeddingColumn = table.embedding
   if (!embeddingColumn) {
-    throw new Error(`Embedding column not found in table ${tableName}`)
+    throw new Error(
+      `Embedding column not found in table for pool "${poolName}". Available properties: ${Object.keys(table).join(', ')}`,
+    )
   }

   // Convert WHERE clause to Drizzle conditions
@@ -143,16 +146,21 @@
   // Build query using Drizzle's query builder
+  // Column names in the table are camelCase (docId, chunkText, etc.)
+  // but their database names are snake_case (doc_id, chunk_text, etc.)
+  // The table from fullSchema should have columns as direct properties
+  // Calculate similarity: 1 - cosineDistance (distance)
+  // Need to cast 1 to numeric to avoid "integer - vector" error
+  const distanceExpr = cosineDistance(embeddingColumn, queryEmbedding)
   let query = drizzle
     .select({
       id: table.id,
-      docId: table.docId || (table as any).doc_id,
-      chunkText: table.chunkText || (table as any).chunk_text,
-      sourceCollection: table.sourceCollection || (table as any).source_collection,
-      chunkIndex: table.chunkIndex || (table as any).chunk_index,
-      embeddingVersion: table.embeddingVersion || (table as any).embedding_version,
-      // Calculate similarity: 1 - cosineDistance (distance)
-      similarity: sql`1 - ${cosineDistance(embeddingColumn, queryEmbedding)}`,
+      docId: table.docId,
+      chunkText: table.chunkText,
+      sourceCollection: table.sourceCollection,
+      chunkIndex: table.chunkIndex,
+      embeddingVersion: table.embeddingVersion,
+      similarity: sql`1 - (${distanceExpr})`,
     })
     .from(table)

@@ -162,7 +170,8 @@
   }

   // Order by cosine distance (ascending = most similar first) and limit
-  query = query.orderBy(cosineDistance(embeddingColumn, queryEmbedding)).limit(limit)
+  // Reuse the same distance expression for ordering
+  query = query.orderBy(distanceExpr).limit(limit)

   // Execute the query
   const result = await query
@@ -204,8 +213,24 @@
   for (const [fieldName, condition] of Object.entries(where)) {
     if (fieldName === 'and' || fieldName === 'or') continue

-    // Get the column from the table (handle both camelCase and snake_case)
-    const column = table[fieldName] || table[toSnakeCase(fieldName)]
+    // Get the column from the table
+    // Drizzle tables have columns as direct properties
+    // Try camelCase first, then snake_case as fallback
+    // Use 'in' operator to check existence, then access the property
+    let column: any = undefined
+    if (fieldName in table) {
+      column = table[fieldName]
+    } else if (toSnakeCase(fieldName) in table) {
+      column = table[toSnakeCase(fieldName)]
+    } else if (table.columns) {
+      // Fallback to table.columns if it exists
+      if (fieldName in table.columns) {
+        column = table.columns[fieldName]
+      } else if (toSnakeCase(fieldName) in table.columns) {
+        column = table.columns[toSnakeCase(fieldName)]
+      }
+    }
+
     if (!column) {
       // Field not found, skip (could be a nested field we don't support)
       continue
     }
@@ -298,31 +323,45 @@
 }

 function mapRowsToResults(rows: any[]): Array<VectorSearchResult> {
-  return rows.map((row: any) => ({
-    id: String(row.id),
-    docId: String(row.docId),
-    similarity:
-      typeof row.similarity === 'number' ? row.similarity : parseFloat(String(row.similarity)),
-    chunkText: row.chunkText || '',
-    sourceCollection: row.sourceCollection || '',
-    chunkIndex:
-      typeof row.chunkIndex === 'number' ? row.chunkIndex : parseInt(String(row.chunkIndex), 10),
-    embeddingVersion: row.embeddingVersion || '',
-    // Include any extension fields that might be in the row
-    ...Object.fromEntries(
-      Object.entries(row).filter(
-        ([key]) =>
-          ![
-            'id',
-            'docId',
-            'chunkText',
-            'sourceCollection',
-            'chunkIndex',
-            'embeddingVersion',
-            'similarity',
-            'embedding',
-          ].includes(key),
-      ),
-    ),
-  }))
+  return rows.map((row: any) => {
+    // Drizzle returns columns with the names we selected (camelCase)
+    // Handle both camelCase and snake_case for robustness
+    const docId = row.docId ?? row.doc_id
+    const chunkText = row.chunkText ?? row.chunk_text ?? ''
+    const sourceCollection = row.sourceCollection ?? row.source_collection ?? ''
+    const chunkIndex = row.chunkIndex ?? row.chunk_index
+    const embeddingVersion = row.embeddingVersion ?? row.embedding_version ?? ''
+
+    return {
+      id: String(row.id),
+      docId: String(docId),
+      similarity:
+        typeof row.similarity === 'number' ? row.similarity : parseFloat(String(row.similarity)),
+      chunkText,
+      sourceCollection,
+      chunkIndex: typeof chunkIndex === 'number' ? chunkIndex : parseInt(String(chunkIndex), 10),
+      embeddingVersion,
+      // Include any extension fields that might be in the row
+      ...Object.fromEntries(
+        Object.entries(row).filter(
+          ([key]) =>
+            ![
+              'id',
+              'docId',
+              'doc_id',
+              'chunkText',
+              'chunk_text',
+              'sourceCollection',
+              'source_collection',
+              'chunkIndex',
+              'chunk_index',
+              'embeddingVersion',
+              'embedding_version',
+              'similarity',
+              'embedding',
+            ].includes(key),
+        ),
+      ),
+    }
+  })
 }
diff --git a/src/index.ts b/src/index.ts
index 68f3f4c..7a46d2f 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -13,6 +13,7 @@
 import { isPostgresPayload } from './types.js'
 import type { PostgresAdapterArgs } from '@payloadcms/db-postgres'
 import { createVectorizeTask } from './tasks/vectorize.js'
 import { vectorSearch } from './endpoints/vectorSearch.js'
+import { clearEmbeddingsTables, registerEmbeddingsTable } from './drizzle/tables.js'

 export type * from './types.js'

@@ -74,6 +75,9 @@
       schema,
       extendTable,
     }) => {
+      // Ensure registry reflects the latest schema
+      clearEmbeddingsTables()
+
       // Extend schema for each knowledge pool
       for (const [poolName, staticConfig] of Object.entries(staticConfigs)) {
         const dims = staticConfig.dims
@@ -85,7 +89,13 @@
         })

         const table = schema?.tables?.[poolName]
-        if (table && typeof extendTable === 'function') {
+        if (!table) {
+          throw new Error(
+            `[payloadcms-vectorize] Embeddings table "${poolName}" not found during schema initialization. Ensure the collection has been registered.`,
+          )
+        }
+
+        if (typeof extendTable === 'function') {
           extendTable({
             table,
             columns: {
@@ -93,6 +103,8 @@
             },
           })
         }
+
+        registerEmbeddingsTable(poolName as KnowledgePoolName, table)
       }

       return schema

From b3c142aa3fdb8e0276e48b8a957008fc14440e98 Mon Sep 17 00:00:00 2001
From: techiejd <62455039+techiejd@users.noreply.github.com>
Date: Wed, 19 Nov 2025 22:08:06 +0700
Subject: [PATCH 3/3] Updates package version, readme and adds changelog

---
 CHANGELOG.md |  82 ++++++++++++++++++
 README.md    | 235 ++++++++++++++++++++++-----------------------------
 package.json |   2 +-
 3 files changed, 185 insertions(+), 134 deletions(-)
 create mode 100644 CHANGELOG.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..b992320
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,82 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+## 0.3.0 - 2025-11-19
+
+### Added
+
+- `extensionFields` option that lets each collection extend the embeddings table schema with arbitrary Payload fields (while protecting reserved column names).
+- `toKnowledgePool` functions replace field-based chunking and provide full control over how documents are chunked, including the ability to attach extension-field values per chunk.
+- Vector search endpoint now accepts optional `where` clauses (Payload syntax) and `limit`, enabling filtered queries against both default embedding columns and extension fields.
+- Expanded `vectorSearch` test coverage; filtering with a `where` clause is now covered.
+
+### Changed
+
+- Embedding deletion now occurs per document/collection pair (no `fieldPath` column).
+- Search results omit `fieldPath` and include any extension-field values that were stored with the embedding chunk.
+- Documentation updated to describe `toKnowledgePool`, extension fields, and the enhanced search API.
+
+## 0.2.0
+
+### Breaking
+
+- Introduced knowledge pools with separate static (schema) and dynamic (runtime) configurations.
+- The vector search endpoint requires a `knowledgePool` parameter to disambiguate results across pools.
+
+### Migration Notes
+
+**Before (≤0.1.x)**
+
+```ts
+const { afterSchemaInitHook, payloadcmsVectorize } = createVectorizeIntegration({
+  dims: 1536,
+  ivfflatLists: 100,
+})
+
+payloadcmsVectorize({
+  collections: {
+    posts: {
+      fields: {
+        /* ... */
+      },
+    },
+  },
+  embedDocs,
+  embedQuery,
+  embeddingVersion: 'v1.0.0',
+})
+```
+
+**After (0.2.0+)**
+
+```ts
+const { afterSchemaInitHook, payloadcmsVectorize } = createVectorizeIntegration({
+  main: {
+    dims: 1536,
+    ivfflatLists: 100,
+  },
+})
+
+payloadcmsVectorize({
+  knowledgePools: {
+    main: {
+      collections: {
+        posts: {
+          fields: {
+            /* ... */
+          },
+        },
+      },
+      embedDocs,
+      embedQuery,
+      embeddingVersion: 'v1.0.0',
+    },
+  },
+})
+```
+
+### Benefits Introduced
+
+- Multiple knowledge pools allow separate domains, embedding settings, and versioning per pool.
+- Collections can participate in multiple pools, enabling more flexible organization of embeddings.
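+
+### Migration Example: 0.2.x `fields` to 0.3.0 `toKnowledgePool`
+
+A minimal sketch of the equivalent 0.3.0 config (mirroring the change made to this repo's dev config); `chunkText` stands in for whatever chunker helper you already use:
+
+```ts
+// Before (0.2.x): per-field chunkers
+posts: {
+  fields: {
+    title: { chunker: chunkText },
+  },
+},
+
+// After (0.3.0): one function returning { chunk, ...extensionFieldValues }
+posts: {
+  toKnowledgePool: async (doc, payload) => {
+    const titleChunks = doc.title ? chunkText(doc.title) : []
+    return titleChunks.map((chunk) => ({ chunk }))
+  },
+},
+```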
diff --git a/README.md b/README.md
index e5124fd..e198fe4 100644
--- a/README.md
+++ b/README.md
@@ -4,14 +4,14 @@ A Payload CMS plugin that adds vector search capabilities to your collections us

 ## Features

-- 🔍 **Semantic Search**: Vectorize any collection field for intelligent content discovery
-- 🚀 **Automatic Vectorization**: Documents are automatically vectorized when created or updated
+- 🔍 **Semantic Search**: Vectorize any collection for intelligent content discovery
+- 🚀 **Automatic**: Documents are automatically vectorized when created or updated, and vectors are deleted as soon as the document is deleted.
 - 📊 **PostgreSQL Integration**: Built on pgvector for high-performance vector operations
 - ⚡ **Background Processing**: Uses Payload's job system for non-blocking vectorization
-- 🎯 **Flexible Chunking**: You provide the custom chunkers for different field types (text, rich text, etc.)
-- 🔧 **Configurable**: Choose which collections and fields to vectorize
-- 🌐 **REST API**: Built-in vector-search endpoint for querying vectorized content
-- 🏊 **Multiple Knowledge Pools**: Separate knowledge pools with independent configurations (dims, ivfflatLists, embedding functions)
+- 🎯 **Flexible Chunking**: Drive chunk creation yourself with `toKnowledgePool` functions so you can combine any fields or content types
+- 🧩 **Extensible Schema**: Attach custom `extensionFields` to the embeddings collection and persist per-chunk values that you can query later
+- 🌐 **REST API**: Built-in vector-search endpoint with Payload-style `where` filtering and configurable limits
+- 🏊 **Multiple Knowledge Pools**: Separate knowledge pools with independent configurations (dims, ivfflatLists, embedding functions)

 ## Prerequisites

@@ -27,58 +27,77 @@ pnpm add payloadcms-vectorize

 ## Quick Start

-### 1. Install pgvector
+### 0. Install pgvector

-Make sure your PostgreSQL database has the pgvector extension:
+The plugin automatically creates the `vector` extension when Payload initializes. However, your PostgreSQL database user must have permission to create extensions. If your user doesn't have these permissions, you may need to manually create the extension once:

 ```sql
 CREATE EXTENSION IF NOT EXISTS vector;
 ```

-### 2. Configure the Plugin
+**Note:** Most managed PostgreSQL services (like AWS RDS, Supabase, etc.) require superuser privileges or specific extension permissions. If you encounter permission errors, contact your database administrator or check your service's documentation.
+
+### 1.
Configure the Plugin

 ```typescript
 import { buildConfig } from 'payload'
+import type { Payload } from 'payload'
 import { postgresAdapter } from '@payloadcms/db-postgres'
 import { createVectorizeIntegration } from 'payloadcms-vectorize'
+import type { ToKnowledgePoolFn } from 'payloadcms-vectorize'

 // Configure your embedding functions
 const embedDocs = async (texts: string[]) => {
   // Your embedding logic here
-  return texts.map(text => /* vector array */)
+  return texts.map((text) => /* vector array */)
 }

-const embedQuery = async (text: string,
-  payload: Payload,) => {
+const embedQuery = async (text: string) => {
   // Your query embedding logic here
   return /* vector array */
 }

-// Configure your chunking functions
-const chunkText = async (text: string,
-  payload: Payload) => {
+// Optional chunker helpers (see dev/helpers/chunkers.ts for ideas)
+const chunkText = async (text: string, payload: Payload) => {
   return /* string array */
 }

-// See examples under chunkers.ts
-const chunkRichText = async (richText: SerializedEditorState,
-  payload: Payload) => {
+const chunkRichText = async (richText: any, payload: Payload) => {
   return /* string array */
 }

+// Convert a document into chunks + extension-field values
+const postsToKnowledgePool: ToKnowledgePoolFn = async (doc, payload) => {
+  const entries: Array<{ chunk: string; category?: string; priority?: number }> = []
+
+  const titleChunks = await chunkText(doc.title ?? '', payload)
+  titleChunks.forEach((chunk) =>
+    entries.push({
+      chunk,
+      category: doc.category ?? 'general',
+      priority: Number(doc.priority ?? 0),
+    }),
+  )
+
+  const contentChunks = await chunkRichText(doc.content, payload)
+  contentChunks.forEach((chunk) =>
+    entries.push({
+      chunk,
+      category: doc.category ?? 'general',
+      priority: Number(doc.priority ?? 0),
+    }),
+  )
+
+  return entries
+}
+
 // Create the integration with static configs (dims, ivfflatLists)
 const { afterSchemaInitHook, payloadcmsVectorize } = createVectorizeIntegration({
-  // Note limitation: Changing these values is currently not supported.
-  // Migration is necessary.
+  // Note limitation: Changing these values requires a migration.
   main: {
     dims: 1536, // Vector dimensions
     ivfflatLists: 100, // IVFFLAT index parameter
   },
-  // You can add more knowledge pools with different static configs
-  // products: {
-  //   dims: 384,
-  //   ivfflatLists: 50,
-  // },
 })

 export default buildConfig({
@@ -91,43 +110,32 @@ export default buildConfig({
   }),
   plugins: [
     payloadcmsVectorize({
-      // Knowledge pools - dynamic configs (collections, embedding functions)
       knowledgePools: {
         main: {
-          // The collection-fields you want vectorized in this pool
           collections: {
             posts: {
-              fields: {
-                title: { chunker: chunkText },
-                content: { chunker: chunkRichText },
-              },
+              toKnowledgePool: postsToKnowledgePool,
+              extensionFields: [
+                { name: 'category', type: 'text' },
+                { name: 'priority', type: 'number' },
+              ],
             },
           },
           embedDocs,
           embedQuery,
           embeddingVersion: 'v1.0.0',
         },
-        // You can add more knowledge pools with different dynamic configs
-        // products: {
-        //   collections: { ... },
-        //   embedDocs: differentEmbedDocs,
-        //   embedQuery: differentEmbedQuery,
-        //   embeddingVersion: 'v2.0.0',
-        // },
       },
       // Optional plugin options:
       // queueName: 'custom-queue',
-      // endpointOverrides: {
-      //   path: '/custom-vector-search',
-      //   enabled: true,
-      // },
+      // endpointOverrides: { path: '/custom-vector-search', enabled: true }, // will be /api/custom-vector-search
       // disabled: false,
     }),
   ],
 })
 ```
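+
+Each object returned by `toKnowledgePool` becomes one row in the embeddings collection: the `chunk` string is embedded, the array index becomes `chunkIndex`, and any extra keys (here `category` and `priority`) are stored in the matching `extensionFields` columns.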
-### 3. Search Your Content
+### 2. Search Your Content

 The plugin automatically creates a `/api/vector-search` endpoint:

@@ -136,13 +144,18 @@
 const response = await fetch('/api/vector-search', {
   method: 'POST',
   headers: { 'Content-Type': 'application/json' },
   body: JSON.stringify({
-    query: 'What is machine learning?', // Required: query
-    knowledgePool: 'main', // Required: specify which knowledge pool to search
+    query: 'What is machine learning?', // Required
+    knowledgePool: 'main', // Required
+    where: {
+      category: { equals: 'guides' }, // Optional Payload-style filter
+    },
+    limit: 5, // Optional (defaults to 10)
   }),
 })

-const results = await response.json()
-// Returns: { results: [{ id, similarity, sourceCollection, docId, fieldPath, chunkText, ... }] }
+const { results } = await response.json()
+// Each result contains: id, similarity, sourceCollection, docId, chunkIndex, chunkText,
+// embeddingVersion, and any extensionFields you attached (e.g., category, priority).
 ```

 ## Configuration Options

@@ -169,28 +182,35 @@ The embeddings collection name will be the same as the knowledge pool name.

 **2. Dynamic Config** (passed to `payloadcmsVectorize`):

-- `collections`: `Record` - Collections and fields to vectorize
+- `collections`: `Record<string, CollectionVectorizeOption>` - Collections and their chunking/extension configs
 - `embedDocs`: `EmbedDocsFn` - Function to embed multiple documents
 - `embedQuery`: `EmbedQueryFn` - Function to embed search queries
 - `embeddingVersion`: `string` - Version string for tracking model changes

-## Chunkers
+#### CollectionVectorizeOption

-The plugin includes examples chunkers for common field types:
-// Not yet provided publicly because maintenance is not guaranteed
+- `toKnowledgePool(doc, payload)` – returns an array of `{ chunk, ...extensionFieldValues }`. Each object becomes one embedding row, and its index in the array determines `chunkIndex`.
+- `extensionFields?` – standard Payload `Field[]` merged onto the embeddings collection. Their values are written from `toKnowledgePool` output and can be queried later (including via the `where` parameter).

-- `chunkText`: For plain text fields
-- `chunkRichText`: For Lexical rich text fields
+Reserved column names: `sourceCollection`, `docId`, `chunkIndex`, `chunkText`, `embeddingVersion`. Avoid reusing them in `extensionFields`.

-You must create (or copy) custom chunkers:
+## Chunkers
+
+Use chunker helpers (see `dev/helpers/chunkers.ts`) to keep `toKnowledgePool` implementations focused on orchestration. A `toKnowledgePool` can combine multiple chunkers, enrich each chunk with metadata, and return everything the embeddings collection needs.

 ```typescript
-const customChunker = async (value: any, payload: Payload) => {
-  // Your custom chunking logic
-  return ['chunk1', 'chunk2', 'chunk3']
+const postsToKnowledgePool: ToKnowledgePoolFn = async (doc, payload) => {
+  const chunks = await chunkText(doc.title ?? '', payload)
+
+  return chunks.map((chunk) => ({
+    chunk,
+    category: doc.category ?? 'general',
+  }))
 }
 ```

+Because you control the output, you can mix different field types, discard empty values, or inject any metadata that aligns with your `extensionFields`.
+
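+For instance, a `toKnowledgePool` that merges two fields and drops blank chunks might look like the following sketch (reusing the `chunkText`/`chunkRichText` helpers from the Quick Start):
+
+```typescript
+const mixedToKnowledgePool: ToKnowledgePoolFn = async (doc, payload) => {
+  const [titleChunks, contentChunks] = await Promise.all([
+    chunkText(doc.title ?? '', payload),
+    chunkRichText(doc.content, payload),
+  ])
+
+  return [...titleChunks, ...contentChunks]
+    .filter((chunk) => chunk.trim().length > 0) // discard empty chunks
+    .map((chunk) => ({ chunk, category: doc.category ?? 'general' }))
+}
+```
+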
 ## Example

 ### Using with Voyage AI

@@ -230,21 +250,28 @@ Search for similar content using vector similarity.

 **Request Body:**

-```json
+```jsonc
 {
   "query": "Your search query",
-  "knowledgePool": "main"
+  "knowledgePool": "main",
+  "where": {
+    "category": { "equals": "guides" },
+    "priority": { "greater_than_equal": 3 },
+  },
+  "limit": 5,
 }
 ```

 **Parameters:**

-- `query` (required): The search query string
-- `knowledgePool` (required): The knowledge pool identifier to search in
+- `query` (required): Search query string
+- `knowledgePool` (required): Knowledge pool identifier to search in
+- `where` (optional): Payload-style `Where` clause evaluated against the embeddings collection, including any `extensionFields`
+- `limit` (optional): Maximum number of results to return (defaults to `10`)

 **Response:**

-```json
+```jsonc
 {
   "results": [
     {
@@ -252,81 +279,19 @@ Search for similar content using vector similarity.
       "id": "embedding_id",
       "similarity": 0.85,
       "sourceCollection": "posts",
       "docId": "post_id",
-      "fieldPath": "content",
       "chunkIndex": 0,
       "chunkText": "Relevant text chunk",
-      "embeddingVersion": "v1.0.0"
-    }
-  ]
-}
-```
-
-## Migration from v0.1.0 to v0.2.0
-
-Version 0.2.0 introduces support for multiple knowledge pools. This is a **breaking change** that requires updating your configuration.
-
-### Before (v0.1.0):
-
-```typescript
-const { afterSchemaInitHook, payloadcmsVectorize } = createVectorizeIntegration({
-  dims: 1536,
-  ivfflatLists: 100,
-})
-
-payloadcmsVectorize({
-  collections: {
-    posts: { fields: { ... } },
-  },
-  embedDocs,
-  embedQuery,
-  embeddingVersion: 'v1.0.0',
-})
-```
-
-### After (v0.2.0):
-
-```typescript
-// Static configs (schema-related) passed to createVectorizeIntegration
-const { afterSchemaInitHook, payloadcmsVectorize } = createVectorizeIntegration({
-  main: {
-    dims: 1536,
-    ivfflatLists: 100,
-  },
-})
-
-// Dynamic configs (runtime behavior) passed to payloadcmsVectorize
-payloadcmsVectorize({
-  knowledgePools: {
-    main: {
-      collections: {
-        posts: { fields: { ... } },
-      },
-      embedDocs,
-      embedQuery,
-      embeddingVersion: 'v1.0.0',
+      "embeddingVersion": "v1.0.0",
+      "category": "guides", // example extension field
+      "priority": 4, // example extension field
     },
-  },
-})
-```
-
-### API Changes
-
-The vector search endpoint now requires a `knowledgePool` parameter:
-
-```typescript
-// Before
-{ query: 'search term' }
-
-// After
-{ query: 'search term', knowledgePool: 'main' }
+  ],
+}
 ```

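+If you want a typed view of the hits on the client, something like the sketch below matches the response shape documented above (extension fields vary per pool, so they stay loose here):
+
+```typescript
+type VectorSearchHit = {
+  id: string | number
+  similarity: number
+  sourceCollection: string
+  docId: string
+  chunkIndex: number
+  chunkText: string
+  embeddingVersion: string
+} & Record<string, unknown> // extension fields, e.g. category, priority
+
+const res = await fetch('/api/vector-search', {
+  method: 'POST',
+  headers: { 'Content-Type': 'application/json' },
+  body: JSON.stringify({ query: 'postgres tuning', knowledgePool: 'main' }),
+})
+const { results } = (await res.json()) as { results: VectorSearchHit[] }
+```
+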
-### Benefits of Multiple Knowledge Pools
+## Changelog

-- **Separate knowledge domains**: Keep different types of content in separate pools
-- **Different technical requirements**: Each pool can have different `dims`, `ivfflatLists`, and embedding functions
-- **Flexible organization**: Collections can appear in multiple pools if needed
-- **Independent versioning**: Each pool can track its own embedding model version
+See [CHANGELOG.md](./CHANGELOG.md) for release history, migration notes, and upgrade guides.

 ## Requirements

@@ -360,6 +325,11 @@ The more detailed your issue, the better I can understand and address your needs
 ## πŸ—ΊοΈ Roadmap

+Thank you for the stars! The following updates have been completed:
+
+- **Multiple Knowledge Pools**: Create separate knowledge pools with independent configurations (dims, ivfflatLists, embedding functions). Each pool operates independently, so you can organize vectorized content by domain, use case, or any other criteria that makes sense for your application.
+
+- **More expressive queries**: Added the ability to set a result limit and filter searches with Payload-style `where` clauses (e.g., on extension fields).
+
 The following features are planned for future releases based on community interest and stars:

 - **Migrations for vector dimensions**: Easy migration tools for changing vector dimensions and/or ivfflatLists after initial setup
@@ -367,6 +337,5 @@ The following features are planned for future releases based on community intere
 - **Vercel support**: Optimized deployment and configuration for Vercel hosting
 - **Batch embedding**: More efficient bulk embedding operations for large datasets
 - **'Embed all' button**: Admin UI button to re-embed all content after embeddingVersion changes
-- **More expressive queries**: Add ability to change query limit, search on certain collections or certain fields.

 **Want to see these features sooner?** Star this repository and open issues for the features you need most!
diff --git a/package.json b/package.json
index 7a880a2..3bb3706 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "payloadcms-vectorize",
-  "version": "0.2.0",
+  "version": "0.3.0",
   "description": "A plugin to vectorize collections for RAG in Payload 3.0",
   "license": "MIT",
   "type": "module",