// Levenshtein returns the Levenshtein (edit) distance between *a and *b:
// the minimum number of single-byte insertions, deletions and substitutions
// that turn one into the other. The strings are compared byte by byte, not
// rune by rune. Only one row of the edit matrix (len(*a)+1 ints) is kept.
func Levenshtein(a, b *string) int {
	s, t := *a, *b
	row := make([]int, len(s)+1)
	for j := range row {
		row[j] = j
	}
	for i := 1; i <= len(t); i++ {
		// diag carries the value of the cell up-and-left of the one being
		// filled; for the first column that is the previous row's row[0].
		diag := row[0]
		row[0] = i
		for j := 1; j <= len(s); j++ {
			above := row[j]
			cost := 1
			if s[j-1] == t[i-1] {
				cost = 0
			}
			best := above + 1 // deletion
			if v := row[j-1] + 1; v < best {
				best = v // insertion
			}
			if v := diag + cost; v < best {
				best = v // match or substitution
			}
			row[j] = best
			diag = above
		}
	}
	return row[len(s)]
}

// JaroWinkler returns the Jaro-Winkler similarity of s1 and s2. It is the
// Jaro similarity, boosted for a shared leading prefix of up to four bytes
// whenever the Jaro similarity is above 0.7. Identical strings score 1.0;
// higher means more similar.
func JaroWinkler(s1, s2 string) float64 {
	const (
		boostThreshold = 0.7 // only sufficiently similar strings get the prefix bonus
		scaling        = 0.1 // weight given to each shared prefix byte
		maxPrefix      = 4   // the prefix bonus is capped at this many bytes
	)

	sim := Jaro(s1, s2)
	if sim <= boostThreshold {
		return sim
	}

	limit := Min(maxPrefix, Min(len(s1), len(s2)))
	prefix := 0
	for prefix < limit && s1[prefix] == s2[prefix] {
		prefix++
	}
	return sim + scaling*float64(prefix)*(1-sim)
}

// Jaro returns the Jaro similarity of s1 and s2 in the range [0, 1].
// Identical strings score 1.0; if exactly one string is empty, or no bytes
// match, the score is 0.0. The strings are compared byte by byte.
//
// Two bytes match when they are equal and their positions differ by at most
// Max(len(s1), len(s2))/2 - 1.
func Jaro(s1, s2 string) float64 {
	if s1 == s2 {
		return 1.0
	}

	len1, len2 := len(s1), len(s2)
	if len1 == 0 || len2 == 0 {
		return 0.0
	}

	// Half-width of the match window. It would be -1 when the longer string
	// has a single byte; clamp to 0 so the window below is never inverted.
	window := Max(len1, len2)/2 - 1
	if window < 0 {
		window = 0
	}

	matched1 := make([]bool, len1)
	matched2 := make([]bool, len2)
	matches := 0

	for i := 0; i < len1; i++ {
		// The window is centred on i. The lower bound must move with i;
		// a fixed lower bound either skips byte 0 of short strings or
		// accepts matches that lie outside the window.
		lo := Max(0, i-window)
		hi := Min(len2, i+window+1)
		for j := lo; j < hi; j++ {
			if !matched2[j] && s1[i] == s2[j] {
				matched1[i] = true
				matched2[j] = true
				matches++
				break
			}
		}
	}

	if matches == 0 {
		return 0.0
	}

	// Walk the matched bytes of both strings in order and count the
	// positions where they disagree; each transposition is counted twice.
	outOfOrder := 0
	k := 0
	for i := 0; i < len1; i++ {
		if !matched1[i] {
			continue
		}
		for !matched2[k] {
			k++
		}
		if s1[i] != s2[k] {
			outOfOrder++
		}
		k++
	}

	m := float64(matches)
	t := float64(outOfOrder) / 2
	return (m/float64(len1) + m/float64(len2) + (m-t)/m) / 3
}

// Max returns the larger of x and y.
func Max(x, y int) int {
	if x < y {
		return y
	}
	return x
}

// Min returns the smaller of x and y.
func Min(x, y int) int {
	if x > y {
		return y
	}
	return x
}

// nearlyEqual reports whether a and b differ by less than 1e-12. The tests
// use it instead of == so that a last-bit rounding difference between
// platforms does not fail them.
func nearlyEqual(a, b float64) bool {
	d := a - b
	if d < 0 {
		d = -d
	}
	return d < 1e-12
}

// TestLevenshtein checks the edit distance on a substitution-plus-insertion case.
func TestLevenshtein(t *testing.T) {
	s1 := "hello"
	s2 := "hollaaaa"

	if lev := Levenshtein(&s1, &s2); lev != 5 {
		t.Errorf("Lev %v", lev)
	}
}

// TestJaro checks a mixed match/transposition case and a short-string case
// in which the first byte must be allowed to match.
func TestJaro(t *testing.T) {
	if j := Jaro("hello", "hollaaaa"); !nearlyEqual(j, 0.6833333333333332) {
		t.Errorf("J %v", j)
	}
	if j := Jaro("ab", "ac"); !nearlyEqual(j, 2.0/3.0) {
		t.Errorf("J %v", j)
	}
}

// TestJaroWinkler checks the prefix boost on two strings sharing "LA".
func TestJaroWinkler(t *testing.T) {
	if jw := JaroWinkler("LATE", "LACE"); !nearlyEqual(jw, 0.8666666666666667) {
		t.Errorf("JW %v", jw)
	}
}
- min = d[j-1] + 1 - } - if (*a)[j-1] == (*b)[i-1] { - temp = 0 - } else { - temp = 1 - } - if (lastdiag + temp) < min { - min = lastdiag + temp - } - d[j] = min - lastdiag = olddiag - } - } - return d[la] -} - // Add an array of words to train the model in bulk func (model *Model) Train(terms []string) { for _, term := range terms { @@ -328,6 +295,27 @@ func (model *Model) TrainQuery(term string) { } } +// Train using a search query term. This builds a second popularity +// index of terms used to search, as opposed to generally occurring +// in corpus text. It also adds a user define count (query count) to advice on ranking. +// see SetCount for inspiration. +// If the term exists in the model, advances it by `count`, otherwise count will be the +// starting point as opposed to `1` in the standard TrainQuery +func (model *Model) TrainQueryWithUserCount(term string, count int) { + model.Lock() + if t, ok := model.Data[term]; ok { + t.Query = t.Query + count + } else { + model.Data[term] = &Counts{count, 1} + } + model.SuffDivergence++ + update := model.SuffDivergence > model.SuffDivergenceThreshold + model.Unlock() + if update { + model.updateSuffixArr() + } +} + // For a given term, create the partially deleted lookup keys func (model *Model) createSuggestKeys(term string) { edits := model.EditsMulti(term, model.Depth) @@ -482,7 +470,7 @@ func (model *Model) suggestPotential(input string, exhaustive bool) map[string]* // 0 - If this is a dictionary term we're all good, no need to go further if model.corpusCount(input) > model.Threshold { - suggestions[input] = &Potential{Term: input, Score: model.corpusCount(input), Leven: 0, Method: MethodIsWord} + suggestions[input] = &Potential{Term: input, Score: model.corpusCount(input), Leven: 0, JaroWinkler: 0.0, Method: MethodIsWord} if !exhaustive { return suggestions } @@ -492,7 +480,7 @@ func (model *Model) suggestPotential(input string, exhaustive bool) map[string]* if sugg, ok := model.Suggest[input]; ok { for _, pot 
:= range sugg { if _, ok := suggestions[pot]; !ok { - suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: Levenshtein(&input, &pot), Method: MethodSuggestMapsToInput} + suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: Levenshtein(&input, &pot), JaroWinkler: JaroWinkler(input, pot), Method: MethodSuggestMapsToInput} } } @@ -508,7 +496,7 @@ func (model *Model) suggestPotential(input string, exhaustive bool) map[string]* score := model.corpusCount(edit) if score > 0 && len(edit) > 2 { if _, ok := suggestions[edit]; !ok { - suggestions[edit] = &Potential{Term: edit, Score: score, Leven: Levenshtein(&input, &edit), Method: MethodInputDeleteMapsToDict} + suggestions[edit] = &Potential{Term: edit, Score: score, Leven: Levenshtein(&input, &edit), JaroWinkler: JaroWinkler(input, edit), Method: MethodInputDeleteMapsToDict} } if score > max { max = score @@ -530,9 +518,10 @@ func (model *Model) suggestPotential(input string, exhaustive bool) map[string]* // Is this a real transpose or replace? for _, pot := range sugg { lev := Levenshtein(&input, &pot) + jw := JaroWinkler(input, pot) if lev <= model.Depth+1 { // The +1 doesn't seem to impact speed, but has greater coverage when the depth is not sufficient to make suggestions if _, ok := suggestions[pot]; !ok { - suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: lev, Method: MethodInputDeleteMapsToSuggest} + suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: lev, JaroWinkler: jw, Method: MethodInputDeleteMapsToSuggest} } } }