diff --git a/settings.js b/settings.js index 9dadf5f8..2380a460 100644 --- a/settings.js +++ b/settings.js @@ -3,19 +3,23 @@ const fs = require('fs'); const path = require('path'); const peliasConfig = require('pelias-config'); const punctuation = require('./punctuation'); -const synonymFile = require('./synonyms/parser'); +const synonymParser = require('./synonyms/parser'); +const synonymLinter = require('./synonyms/linter'); // load synonyms from disk const synonyms = fs.readdirSync(path.join(__dirname, 'synonyms')) .sort() .filter( f => f.match(/\.txt$/) ) .reduce(( acc, cur ) => { - acc[cur.replace('.txt','')] = synonymFile( + acc[cur.replace('.txt', '')] = synonymParser( path.join(__dirname, 'synonyms', cur) ); return acc; }, {}); +// emit synonym warnings +synonymLinter(synonyms); + require('./configValidation').validate(peliasConfig.generate()); function generate(){ diff --git a/synonyms/custom_name.txt b/synonyms/custom_name.txt index 72890fd9..ad9fe7d3 100644 --- a/synonyms/custom_name.txt +++ b/synonyms/custom_name.txt @@ -45,7 +45,7 @@ greater,grtr,gtr greens,grns groves,grvs heights,hghts,hgts,hieghts,ht,hts,hgths -international,intl,int'l +international,intl lake,lk lakes,lks little,ltl,lttl,littl,litl @@ -60,7 +60,7 @@ mount,mt,mnt mountain,mtn mountains,mtns municipal,mun,mpal -national,natl,nat'l +national,natl neck,nck orchard,orch paradise,pde,pdse @@ -115,7 +115,7 @@ wiese,ws # Spanish abril,abr,abl -agosto,ag,agto,ag.to,agt +agosto,ag,agto,agt altura,alt alturas,alts arboleda,arb @@ -132,9 +132,9 @@ corral,crral corralillo,crrlo diseminado,disem enero,en,eno,ene,en o -diciembre,dic,dicbre,dic.bre,dice,dic.e,dbre,d.bre,10bre,10.bre,10 bre,xbre,x.bre,x bre -febrero,febo,feb.o,febro,feb.ro,febr,feb -gobierno,gob,gobno,gob.no +diciembre,dic,dicbre,dice,dbre,10bre,10 bre,xbre,x bre +febrero,febo,febro,febr,feb +gobierno,gob,gobno grande,gr guerra,ga independencia,indep @@ -154,8 +154,8 @@ militar,milr monte,mt,mte,mnte montes,mts,mtes,mntes,mnts 
nacional,nal,nacl -noviembre,nbre,n.bre,nvre,n.vre,nove,nov.e,novre,nov.re,novbre,nov.bre,9bre,9.bre,9 bre -octubre,oct,octbre,oct.bre,octe,oct.e,8bre,8.bre,8 bre +noviembre,nbre,nvre,nove,novre,novbre,9bre,9 bre +octubre,oct,octbre,octe,8bre,8 bre portillo,ptilo,ptllo prado,prdo primeros,pros @@ -167,8 +167,8 @@ republica,rep revolucion,rev ribera,ribr río,rio -septiembre,setbre,set.bre,sepe,sep.e,sepbre,sep.bre,7bre,7 re,7re,7.re,7 bre,7.bre,sep,set +septiembre,setbre,sepe,sepbre,7bre,7 re,7re,7 bre,sep,set sierra,srra valle,vlle volcan,vlcn -voluntarios,voluntos \ No newline at end of file +voluntarios,voluntos diff --git a/synonyms/linter.js b/synonyms/linter.js new file mode 100644 index 00000000..930cb886 --- /dev/null +++ b/synonyms/linter.js @@ -0,0 +1,76 @@ +const _ = require('lodash'); +const logger = require('pelias-logger').get('schema-synonyms'); +const punctuation = require('../punctuation'); + +/** + * The synonyms linter attempts to warn the user when making + * common mistakes with synonyms. + * + * Warnings: + * - Punctuation: Synonyms should not contain characters in the punctuation blacklist + * - Letter Casing: Synonyms should be lowercase + * - Sanity Checks: At least one synonym should exist, duplicates should be removed + * - Multi Word: Multi-word synonyms can generate unexpected token positions (check currently disabled) + */ + +function linter(synonyms) { + _.each(synonyms, (lines, filename) => { + logger.debug(`[lint] ${filename}`); + + lines.forEach((line, idx) => { + const logprefix = `[${filename} line ${idx+1}]`; + logger.debug(`[line] ${line}`); + + // split the line into tokens on either delimiter (comma or =>) + let tokens = line.split(/,|=>/g).map(t => t.trim()); + + // strip blacklisted punctuation from synonyms + // the 'punctuation.blacklist' contains a list of characters which are + // stripped from the tokens before indexing. 
+ tokens = _.map(tokens, token => { + punctuation.blacklist.forEach(char => { + let replacement = token.split(char).join(''); + if(replacement.length != token.length){ + logger.warn(`${logprefix} punctunation removed: ${token} --> ${replacement}`); + } + token = replacement; + }); + return token + }); + + letterCasing(line, logprefix, tokens); + tokensSanityCheck(line, logprefix, tokens); + // multiWordCheck(line, logprefix, tokens); -- disabled; NOTE(review): multiWordCheck is declared as (line, tokens), so this argument list would bind logprefix to tokens + }) + }) +} + +function letterCasing(line, logprefix){ + if (line.toLowerCase() !== line) { + logger.warn(`${logprefix} should be lowercase:`, line); + } +} + +function tokensSanityCheck(line, logprefix, tokens) { + switch (tokens.length){ + case 0: + return logger.warn(`${logprefix} no tokens:`, line); + case 1: + return logger.warn(`${logprefix} only one token:`, line); + default: + let dupes = _.filter(tokens, (val, i, t) => _.includes(t, val, i + 1)); + if (dupes.length){ + logger.warn(`${logprefix} duplicate tokens:`, dupes); + } + } +} + +function multiWordCheck(line, tokens) { + _.each(tokens, token => { + if (/\s/.test(token)){ + logger.warn(`multi word synonyms may cause issues with phrase queries:`, token); + } + }); +} + +module.exports = linter diff --git a/test/fixtures/expected.json b/test/fixtures/expected.json index 68ebc5d0..35481abb 100644 --- a/test/fixtures/expected.json +++ b/test/fixtures/expected.json @@ -231,7 +231,7 @@ "greens,grns", "groves,grvs", "heights,hghts,hgts,hieghts,ht,hts,hgths", - "international,intl,int'l", + "international,intl", "lake,lk", "lakes,lks", "little,ltl,lttl,littl,litl", @@ -246,7 +246,7 @@ "mountain,mtn", "mountains,mtns", "municipal,mun,mpal", - "national,natl,nat'l", + "national,natl", "neck,nck", "orchard,orch", "paradise,pde,pdse", @@ -295,7 +295,7 @@ "vordere,vd,vord", "wiese,ws", "abril,abr,abl", - "agosto,ag,agto,ag.to,agt", + "agosto,ag,agto,agt", "altura,alt", "alturas,alts", "arboleda,arb", @@ -312,9 +312,9 @@ "corralillo,crrlo", "diseminado,disem", "enero,en,eno,ene,en o", 
- "diciembre,dic,dicbre,dic.bre,dice,dic.e,dbre,d.bre,10bre,10.bre,10 bre,xbre,x.bre,x bre", - "febrero,febo,feb.o,febro,feb.ro,febr,feb", - "gobierno,gob,gobno,gob.no", + "diciembre,dic,dicbre,dice,dbre,10bre,10 bre,xbre,x bre", + "febrero,febo,febro,febr,feb", + "gobierno,gob,gobno", "grande,gr", "guerra,ga", "independencia,indep", @@ -334,8 +334,8 @@ "monte,mt,mte,mnte", "montes,mts,mtes,mntes,mnts", "nacional,nal,nacl", - "noviembre,nbre,n.bre,nvre,n.vre,nove,nov.e,novre,nov.re,novbre,nov.bre,9bre,9.bre,9 bre", - "octubre,oct,octbre,oct.bre,octe,oct.e,8bre,8.bre,8 bre", + "noviembre,nbre,nvre,nove,novre,novbre,9bre,9 bre", + "octubre,oct,octbre,octe,8bre,8 bre", "portillo,ptilo,ptllo", "prado,prdo", "primeros,pros", @@ -347,7 +347,7 @@ "revolucion,rev", "ribera,ribr", "río,rio", - "septiembre,setbre,set.bre,sepe,sep.e,sepbre,sep.bre,7bre,7 re,7re,7.re,7 bre,7.bre,sep,set", + "septiembre,setbre,sepe,sepbre,7bre,7 re,7re,7 bre,sep,set", "sierra,srra", "valle,vlle", "volcan,vlcn", @@ -1226,4 +1226,4 @@ }, "dynamic": "strict" } -} +} \ No newline at end of file