From c7808d13b8564122fb49f353523b7d1bf8fd92e0 Mon Sep 17 00:00:00 2001 From: Jake Bromberg Date: Sat, 14 Feb 2026 14:14:25 -0800 Subject: [PATCH] feat: add unaccent to trigram GIN indexes for accent-insensitive search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Queries for "Bjork" now match "Björk" at the database level. Adds an immutable f_unaccent() wrapper (required because the built-in unaccent() is STABLE) and updates all 4 trigram index expressions from lower(col) to lower(f_unaccent(col)). - schema/create_database.sql: enable unaccent extension - schema/create_functions.sql: immutable f_unaccent() wrapper - schema/create_indexes.sql: accent-insensitive index expressions - scripts/dedup_releases.py: update hardcoded post-dedup indexes - scripts/run_pipeline.py: run create_functions.sql after schema - scripts/verify_cache.py: run create_functions.sql on target DB --- CLAUDE.md | 11 +++--- README.md | 13 +++---- schema/create_database.sql | 3 ++ schema/create_functions.sql | 13 +++++++ schema/create_indexes.sql | 16 ++++----- scripts/dedup_releases.py | 11 +++--- scripts/run_pipeline.py | 3 +- scripts/verify_cache.py | 12 ++++--- tests/integration/test_copy_to_target.py | 3 +- tests/integration/test_db_introspect.py | 3 ++ tests/integration/test_dedup.py | 3 +- tests/integration/test_schema.py | 43 ++++++++++++++++++++++-- 12 files changed, 100 insertions(+), 34 deletions(-) create mode 100644 schema/create_functions.sql diff --git a/CLAUDE.md b/CLAUDE.md index 9263874..8dd0454 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -16,9 +16,9 @@ ETL pipeline for building and maintaining a PostgreSQL cache of Discogs release 3. **Fix newlines** in CSV fields (`scripts/fix_csv_newlines.py`) 4. **Enrich** `library_artists.txt` with WXYC cross-references (`scripts/enrich_library_artists.py`, optional) 5. **Filter** CSVs to library-matching artists only (`scripts/filter_csv.py`) -- ~70% data reduction -6. **Create schema** (`schema/create_database.sql`) +6. **Create schema** (`schema/create_database.sql`) and **functions** (`schema/create_functions.sql`) 7. **Import** filtered CSVs into PostgreSQL (`scripts/import_csv.py`) -8. **Create indexes** including trigram GIN indexes (`schema/create_indexes.sql`) +8. **Create indexes** including accent-insensitive trigram GIN indexes (`schema/create_indexes.sql`) 9. **Deduplicate** by master_id (`scripts/dedup_releases.py`) 10. **Prune or Copy-to** -- one of: - `--prune`: delete non-matching releases in place (~89% data reduction, 3 GB -> 340 MB) @@ -41,15 +41,16 @@ The `release` table includes a `master_id` column used during import and dedup. The SQL files in `schema/` define the contract between this ETL pipeline and all consumers: -- `schema/create_database.sql` -- Tables: `release`, `release_artist`, `release_track`, `release_track_artist`, `cache_metadata` -- `schema/create_indexes.sql` -- Trigram GIN indexes for fuzzy text search (pg_trgm) +- `schema/create_database.sql` -- Tables: `release`, `release_artist`, `release_track`, `release_track_artist`, `cache_metadata`; extensions: pg_trgm, unaccent +- `schema/create_functions.sql` -- `f_unaccent()` immutable wrapper for accent-insensitive index expressions +- `schema/create_indexes.sql` -- Trigram GIN indexes for accent-insensitive fuzzy text search (pg_trgm + unaccent) Consumers connect via `DATABASE_URL_DISCOGS` environment variable. ### Docker Compose `docker-compose.yml` provides a self-contained environment: -- **`db`** service: PostgreSQL 16 with pg_trgm, port 5433:5432 +- **`db`** service: PostgreSQL 16 with pg_trgm + unaccent, port 5433:5432 - **`pipeline`** service: runs `scripts/run_pipeline.py` against the db ```bash diff --git a/README.md b/README.md index 07b65fd..e5356a2 100644 --- a/README.md +++ b/README.md @@ -7,13 +7,13 @@ ETL pipeline for building a PostgreSQL cache of Discogs release data, filtered t The pipeline processes monthly Discogs data dumps (~40 GB XML) into a focused PostgreSQL database (~3 GB) containing only releases by artists in the WXYC library catalog. This provides: - Fast local lookups instead of rate-limited Discogs API calls -- Trigram fuzzy text search via pg_trgm +- Accent-insensitive trigram fuzzy text search via pg_trgm + unaccent - Shared data resource for multiple consuming services ## Prerequisites - Python 3.11+ -- PostgreSQL with the `pg_trgm` extension (or use [Docker Compose](#docker-compose)) +- PostgreSQL with the `pg_trgm` and `unaccent` extensions (or use [Docker Compose](#docker-compose)) - Discogs monthly data dump (XML) from https://discogs-data-dumps.s3.us-west-2.amazonaws.com/index.html - [discogs-xml2db](https://github.com/philipmat/discogs-xml2db) -- clone separately; not a PyPI package - `library_artists.txt` and `library.db` (produced by request-parser's library sync) @@ -34,9 +34,9 @@ All 9 steps are automated by `run_pipeline.py` (or Docker Compose). The script s | 2. Fix newlines | `scripts/fix_csv_newlines.py` | Clean embedded newlines in CSV fields | | 2.5. Enrich | `scripts/enrich_library_artists.py` | Enrich artist list with cross-references (optional) | | 3. Filter | `scripts/filter_csv.py` | Keep only library artists (~70% reduction) | -| 4. Create schema | `schema/create_database.sql` | Set up tables and constraints | +| 4. Create schema | `schema/create_database.sql`, `schema/create_functions.sql` | Set up tables, extensions, and functions | | 5. Import | `scripts/import_csv.py` | Bulk load CSVs via psycopg COPY | -| 6. Create indexes | `schema/create_indexes.sql` | Trigram GIN indexes for fuzzy search | +| 6. Create indexes | `schema/create_indexes.sql` | Accent-insensitive trigram GIN indexes for fuzzy search | | 7. Deduplicate | `scripts/dedup_releases.py` | Keep best release per master_id (most tracks) | | 8. Prune/Copy | `scripts/verify_cache.py` | Remove non-library releases or copy matches to target DB | | 9. Vacuum | `VACUUM FULL` | Reclaim disk space | @@ -162,8 +162,9 @@ python scripts/fix_csv_newlines.py /path/to/raw/release.csv /path/to/cleaned/rel # 3. Filter to library artists python scripts/filter_csv.py /path/to/library_artists.txt /path/to/cleaned/ /path/to/filtered/ -# 4. Create schema +# 4. Create schema and functions psql -d discogs -f schema/create_database.sql +psql -d discogs -f schema/create_functions.sql # 5. Import CSVs python scripts/import_csv.py /path/to/filtered/ [database_url] @@ -201,7 +202,7 @@ The schema files in `schema/` define the shared contract between this ETL pipeli ### Indexes - Foreign key indexes on all child tables -- Trigram GIN indexes (`pg_trgm`) on `title` and `artist_name` columns for fuzzy text search +- Accent-insensitive trigram GIN indexes (`pg_trgm` + `unaccent`) on `title` and `artist_name` columns for fuzzy text search. Uses an immutable `f_unaccent()` wrapper to enable index expressions with `lower(f_unaccent(column))`. - Cache metadata indexes for freshness queries ### Consumer Integration diff --git a/schema/create_database.sql b/schema/create_database.sql index 4d9f86d..9007ae7 100644 --- a/schema/create_database.sql +++ b/schema/create_database.sql @@ -13,6 +13,9 @@ -- Enable trigram extension for fuzzy text search CREATE EXTENSION IF NOT EXISTS pg_trgm; +-- Enable unaccent extension for accent-insensitive search +CREATE EXTENSION IF NOT EXISTS unaccent; + -- ============================================ -- Core tables -- ============================================ diff --git a/schema/create_functions.sql b/schema/create_functions.sql new file mode 100644 index 0000000..d21cc49 --- /dev/null +++ b/schema/create_functions.sql @@ -0,0 +1,13 @@ +-- Create immutable wrapper for unaccent() to allow use in index expressions. +-- +-- PostgreSQL's built-in unaccent() is STABLE (depends on search_path), so it +-- can't be used directly in index expressions which require IMMUTABLE functions. +-- This wrapper pins the dictionary to public.unaccent, removing the search_path +-- variability. +-- +-- Run AFTER create_database.sql (which creates the unaccent extension). +-- Run BEFORE create_indexes.sql (which uses f_unaccent in index expressions). + +CREATE OR REPLACE FUNCTION f_unaccent(text) RETURNS text AS $$ + SELECT public.unaccent('public.unaccent', $1) +$$ LANGUAGE sql IMMUTABLE PARALLEL SAFE STRICT; diff --git a/schema/create_indexes.sql b/schema/create_indexes.sql index e79ea5f..59623d0 100644 --- a/schema/create_indexes.sql +++ b/schema/create_indexes.sql @@ -13,26 +13,26 @@ CREATE EXTENSION IF NOT EXISTS pg_trgm; -- 1. Track title search: "Find releases containing track 'Blue Monday'" -- Used by: search_releases_by_track() --- Query pattern: WHERE lower(title) % $1 OR lower(title) ILIKE '%' || $1 || '%' +-- Query pattern: WHERE lower(f_unaccent(title)) % $1 OR lower(f_unaccent(title)) ILIKE ... CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_release_track_title_trgm -ON release_track USING GIN (lower(title) gin_trgm_ops); +ON release_track USING GIN (lower(f_unaccent(title)) gin_trgm_ops); -- 2. Artist name search on releases: "Find releases by 'New Order'" -- Used by: search_releases_by_track() artist filter --- Query pattern: WHERE lower(artist_name) % $1 +-- Query pattern: WHERE lower(f_unaccent(artist_name)) % $1 CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_release_artist_name_trgm -ON release_artist USING GIN (lower(artist_name) gin_trgm_ops); +ON release_artist USING GIN (lower(f_unaccent(artist_name)) gin_trgm_ops); -- 3. Track artist search: "Find compilation tracks by 'Joy Division'" -- Used by: validate_track_on_release() for compilations --- Query pattern: WHERE lower(artist_name) % $1 +-- Query pattern: WHERE lower(f_unaccent(artist_name)) % $1 CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_release_track_artist_name_trgm -ON release_track_artist USING GIN (lower(artist_name) gin_trgm_ops); +ON release_track_artist USING GIN (lower(f_unaccent(artist_name)) gin_trgm_ops); -- 4. Release title search: "Find releases named 'Power, Corruption & Lies'" -- Used by: get_release searches CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_release_title_trgm -ON release USING GIN (lower(title) gin_trgm_ops); +ON release USING GIN (lower(f_unaccent(title)) gin_trgm_ops); -- ============================================ -- Verification queries @@ -49,5 +49,5 @@ ON release USING GIN (lower(title) gin_trgm_ops); -- Test trigram search (should use index) -- EXPLAIN ANALYZE -- SELECT * FROM release_track --- WHERE lower(title) % 'blue monday' +-- WHERE lower(f_unaccent(title)) % 'blue monday' -- LIMIT 10; diff --git a/scripts/dedup_releases.py b/scripts/dedup_releases.py index 24ffe82..39ef5f5 100644 --- a/scripts/dedup_releases.py +++ b/scripts/dedup_releases.py @@ -141,14 +141,15 @@ def add_constraints_and_indexes(conn) -> None: "CREATE INDEX idx_release_artist_release_id ON release_artist(release_id)", "CREATE INDEX idx_release_track_release_id ON release_track(release_id)", "CREATE INDEX idx_release_track_artist_release_id ON release_track_artist(release_id)", - # Trigram indexes for fuzzy search + # Trigram indexes for fuzzy search (accent-insensitive via f_unaccent) "CREATE INDEX idx_release_track_title_trgm ON release_track " - "USING gin (lower(title) gin_trgm_ops)", + "USING gin (lower(f_unaccent(title)) gin_trgm_ops)", "CREATE INDEX idx_release_artist_name_trgm ON release_artist " - "USING gin (lower(artist_name) gin_trgm_ops)", + "USING gin (lower(f_unaccent(artist_name)) gin_trgm_ops)", "CREATE INDEX idx_release_track_artist_name_trgm ON release_track_artist " - "USING gin (lower(artist_name) gin_trgm_ops)", - "CREATE INDEX idx_release_title_trgm ON release USING gin (lower(title) gin_trgm_ops)", + "USING gin (lower(f_unaccent(artist_name)) gin_trgm_ops)", + "CREATE INDEX idx_release_title_trgm ON release " + "USING gin (lower(f_unaccent(title)) gin_trgm_ops)", # Cache metadata indexes "CREATE INDEX idx_cache_metadata_cached_at ON cache_metadata(cached_at)", "CREATE INDEX idx_cache_metadata_source ON cache_metadata(source)", diff --git a/scripts/run_pipeline.py b/scripts/run_pipeline.py index f2fd019..2b8b21b 100644 --- a/scripts/run_pipeline.py +++ b/scripts/run_pipeline.py @@ -462,11 +462,12 @@ def _save_state() -> None: # Step 4: Wait for Postgres wait_for_postgres(db_url) - # Step 5: Create schema + # Step 5: Create schema and functions if state and state.is_completed("create_schema"): logger.info("Skipping create_schema (already completed)") else: run_sql_file(db_url, SCHEMA_DIR / "create_database.sql") + run_sql_file(db_url, SCHEMA_DIR / "create_functions.sql") if state: state.mark_completed("create_schema") _save_state() diff --git a/scripts/verify_cache.py b/scripts/verify_cache.py index 9f9e16c..c8c8d5c 100644 --- a/scripts/verify_cache.py +++ b/scripts/verify_cache.py @@ -728,15 +728,17 @@ def _create_target_schema(target_url: str) -> None: def _create_target_indexes(target_url: str) -> None: - """Create indexes on the target database (without CONCURRENTLY).""" - sql_text = SCHEMA_DIR.joinpath("create_indexes.sql").read_text() - sql_text = sql_text.replace(" CONCURRENTLY", "") + """Create functions and indexes on the target database (without CONCURRENTLY).""" + functions_sql = SCHEMA_DIR.joinpath("create_functions.sql").read_text() + indexes_sql = SCHEMA_DIR.joinpath("create_indexes.sql").read_text() + indexes_sql = indexes_sql.replace(" CONCURRENTLY", "") conn = psycopg.connect(target_url, autocommit=True) with conn.cursor() as cur: - cur.execute(sql_text) + cur.execute(functions_sql) + cur.execute(indexes_sql) conn.close() - logger.info("Created indexes on target database") + logger.info("Created functions and indexes on target database") def copy_releases_to_target( diff --git a/tests/integration/test_copy_to_target.py b/tests/integration/test_copy_to_target.py index 154c32d..015926a 100644 --- a/tests/integration/test_copy_to_target.py +++ b/tests/integration/test_copy_to_target.py @@ -55,12 +55,13 @@ def _fresh_import(db_url: str) -> None: - """Drop everything, apply schema, and import fixture CSVs.""" + """Drop everything, apply schema and functions, and import fixture CSVs.""" conn = psycopg.connect(db_url, autocommit=True) with conn.cursor() as cur: for table in ALL_TABLES: cur.execute(f"DROP TABLE IF EXISTS {table} CASCADE") cur.execute(SCHEMA_DIR.joinpath("create_database.sql").read_text()) + cur.execute(SCHEMA_DIR.joinpath("create_functions.sql").read_text()) conn.close() conn = psycopg.connect(db_url) diff --git a/tests/integration/test_db_introspect.py b/tests/integration/test_db_introspect.py index e4eb9ba..9990c5c 100644 --- a/tests/integration/test_db_introspect.py +++ b/tests/integration/test_db_introspect.py @@ -123,6 +123,7 @@ def test_true_after_index_creation(self, db_url) -> None: conn = psycopg.connect(db_url, autocommit=True) with conn.cursor() as cur: cur.execute(SCHEMA_DIR.joinpath("create_database.sql").read_text()) + cur.execute(SCHEMA_DIR.joinpath("create_functions.sql").read_text()) sql = SCHEMA_DIR.joinpath("create_indexes.sql").read_text() sql = sql.replace(" CONCURRENTLY", "") cur.execute(sql) @@ -168,6 +169,7 @@ def test_after_indexes(self, db_url) -> None: conn = psycopg.connect(db_url, autocommit=True) with conn.cursor() as cur: cur.execute(SCHEMA_DIR.joinpath("create_database.sql").read_text()) + cur.execute(SCHEMA_DIR.joinpath("create_functions.sql").read_text()) cur.execute("INSERT INTO release (id, title) VALUES (1, 'Test')") sql = SCHEMA_DIR.joinpath("create_indexes.sql").read_text() sql = sql.replace(" CONCURRENTLY", "") @@ -186,6 +188,7 @@ def test_after_dedup(self, db_url) -> None: conn = psycopg.connect(db_url, autocommit=True) with conn.cursor() as cur: cur.execute(SCHEMA_DIR.joinpath("create_database.sql").read_text()) + cur.execute(SCHEMA_DIR.joinpath("create_functions.sql").read_text()) cur.execute("INSERT INTO release (id, title) VALUES (1, 'Test')") sql = SCHEMA_DIR.joinpath("create_indexes.sql").read_text() sql = sql.replace(" CONCURRENTLY", "") diff --git a/tests/integration/test_dedup.py b/tests/integration/test_dedup.py index 776addd..f8446a3 100644 --- a/tests/integration/test_dedup.py +++ b/tests/integration/test_dedup.py @@ -57,11 +57,12 @@ def _drop_all_tables(conn) -> None: def _fresh_import(db_url: str) -> None: - """Drop everything, apply schema, and import fixture CSVs.""" + """Drop everything, apply schema and functions, and import fixture CSVs.""" conn = psycopg.connect(db_url, autocommit=True) _drop_all_tables(conn) with conn.cursor() as cur: cur.execute(SCHEMA_DIR.joinpath("create_database.sql").read_text()) + cur.execute(SCHEMA_DIR.joinpath("create_functions.sql").read_text()) conn.close() conn = psycopg.connect(db_url) diff --git a/tests/integration/test_schema.py b/tests/integration/test_schema.py index 576a26c..18b0eb6 100644 --- a/tests/integration/test_schema.py +++ b/tests/integration/test_schema.py @@ -17,11 +17,12 @@ class TestCreateDatabase: @pytest.fixture(autouse=True) def _apply_schema(self, db_url): - """Run create_database.sql against the test database.""" + """Run create_database.sql and create_functions.sql against the test database.""" self.db_url = db_url conn = psycopg.connect(db_url, autocommit=True) with conn.cursor() as cur: cur.execute(SCHEMA_DIR.joinpath("create_database.sql").read_text()) + cur.execute(SCHEMA_DIR.joinpath("create_functions.sql").read_text()) conn.close() def _connect(self): @@ -91,6 +92,23 @@ def test_pg_trgm_extension_enabled(self) -> None: conn.close() assert result is not None, "pg_trgm extension not installed" + def test_unaccent_extension_enabled(self) -> None: + conn = self._connect() + with conn.cursor() as cur: + cur.execute("SELECT extname FROM pg_extension WHERE extname = 'unaccent'") + result = cur.fetchone() + conn.close() + assert result is not None, "unaccent extension not installed" + + def test_f_unaccent_function_exists(self) -> None: + """Immutable f_unaccent() wrapper is available and strips diacritics.""" + conn = self._connect() + with conn.cursor() as cur: + cur.execute("SELECT f_unaccent('Björk')") + result = cur.fetchone()[0] + conn.close() + assert result == "Bjork" + def test_fk_constraints_with_cascade(self) -> None: """Child tables have ON DELETE CASCADE foreign keys to release.""" conn = self._connect() @@ -146,11 +164,12 @@ class TestCreateIndexes: @pytest.fixture(autouse=True) def _apply_schema_and_data(self, db_url): - """Set up schema and insert minimal sample data for index creation.""" + """Set up schema, functions, and insert minimal sample data for index creation.""" self.db_url = db_url conn = psycopg.connect(db_url, autocommit=True) with conn.cursor() as cur: cur.execute(SCHEMA_DIR.joinpath("create_database.sql").read_text()) + cur.execute(SCHEMA_DIR.joinpath("create_functions.sql").read_text()) # Insert minimal data so indexes have something to work with cur.execute( "INSERT INTO release (id, title) VALUES (1, 'Test Album') ON CONFLICT DO NOTHING" @@ -204,3 +223,23 @@ def test_trigram_indexes_exist(self) -> None: "idx_release_title_trgm", } assert expected.issubset(indexes) + + def test_trigram_indexes_use_unaccent(self) -> None: + """All four trigram indexes use f_unaccent() for accent-insensitive matching.""" + conn = psycopg.connect(self.db_url, autocommit=True) + with conn.cursor() as cur: + sql = SCHEMA_DIR.joinpath("create_indexes.sql").read_text() + sql = sql.replace(" CONCURRENTLY", "") + cur.execute(sql) + + cur.execute(""" + SELECT indexname, indexdef FROM pg_indexes + WHERE schemaname = 'public' + AND indexname LIKE '%trgm%' + """) + rows = cur.fetchall() + conn.close() + for indexname, indexdef in rows: + assert "f_unaccent" in indexdef, ( + f"Index {indexname} should use f_unaccent(): {indexdef}" + )