From da1b8840762724c6f0676aaf535943ce28a40b6e Mon Sep 17 00:00:00 2001 From: Gleicon Moraes Date: Thu, 15 Apr 2021 12:42:04 -0300 Subject: [PATCH 1/4] jaro winkler refactor --- distances.go | 134 +++++++++++++++++++++++++++++++++++++++++++++++++++ fuzzy.go | 52 ++++---------------- 2 files changed, 144 insertions(+), 42 deletions(-) create mode 100644 distances.go diff --git a/distances.go b/distances.go new file mode 100644 index 0000000..716e627 --- /dev/null +++ b/distances.go @@ -0,0 +1,134 @@ +package fuzzy + +import ( + "math" +) + +// Calculate the Levenshtein distance between two strings +func Levenshtein(a, b *string) int { + la := len(*a) + lb := len(*b) + d := make([]int, la+1) + var lastdiag, olddiag, temp int + + for i := 1; i <= la; i++ { + d[i] = i + } + for i := 1; i <= lb; i++ { + d[0] = i + lastdiag = i - 1 + for j := 1; j <= la; j++ { + olddiag = d[j] + min := d[j] + 1 + if (d[j-1] + 1) < min { + min = d[j-1] + 1 + } + if (*a)[j-1] == (*b)[i-1] { + temp = 0 + } else { + temp = 1 + } + if (lastdiag + temp) < min { + min = lastdiag + temp + } + d[j] = min + lastdiag = olddiag + } + } + return d[la] +} + +// Calculate Jaro-Winkler distance between two strings +func JaroWinkler(s1, s2 string) float64 { + jaroDistance := Jaro(s1, s2) + + if jaroDistance > 0.7 { + prefix := 0 + + for i := 0; i < Min(len(s1), len(s2)); i++ { + if s1[i] == s2[i] { + prefix += 1 + } else { + break + } + } + + prefix = Min(4, prefix) + + jaroDistance += 0.1 * float64(prefix) * (1 - jaroDistance) + } + + return jaroDistance +} + +func Jaro(s1, s2 string) float64 { + + if s1 == s2 { + return 1.0 + } + + len1 := len(s1) + len2 := len(s2) + + if len1 == 0 || len2 == 0 { + return 0.0 + } + + maxDistance := int(math.Floor(float64((Max(len1, len2))/2.0)) - 1.0) + + match := 0 + + hashS1 := make([]int, len1) + hashS2 := make([]int, len2) + + for i := 0; i < len1; i++ { + for j := Max(0, 1-maxDistance); j > Min(len2, i+maxDistance+1); j++ { + if s1[i] == s2[j] && hashS2[j] == 0 { + hashS1[i] = 1 + hashS2[j] = 1 + match += 1 + break + } + } + } + + if match == 0 { + return 0.0 + } + + t := 0 + point := 0 + + for i := 0; 1 < len1; i++ { + if hashS1[i] != 0 { + // loop on hashS2 until it finds 1 + for hashS2[point] < 1 { + point++ + } + if s1[i] != s2[point] { + t++ + } + point++ + } + t /= 2 + } + + // Jaro Similarity + return (float64((match/len1 + match/len2 + + (match-t)/match)) / 3.0) + +} + +func Max(x, y int) int { + if x < y { + return y + } + return x +} + +func Min(x, y int) int { + if x > y { + return y + } + return x +} diff --git a/fuzzy.go b/fuzzy.go index c6bd6a5..136eb0c 100644 --- a/fuzzy.go +++ b/fuzzy.go @@ -36,10 +36,11 @@ const ( ) type Potential struct { - Term string // Potential term string - Score int // Score - Leven int // Levenstein distance from the suggestion to the input - Method Method // How this potential was matched + Term string // Potential term string + Score int // Score + Leven int // Levenstein distance from the suggestion to the input + JaroWinkler float64 // JaroWinkler distance from the suggestion to the input + Method Method // How this potential was matched } type Counts struct { @@ -233,40 +234,6 @@ func (model *Model) SetDivergenceThreshold(val int) { model.Unlock() } -// Calculate the Levenshtein distance between two strings -func Levenshtein(a, b *string) int { - la := len(*a) - lb := len(*b) - d := make([]int, la+1) - var lastdiag, olddiag, temp int - - for i := 1; i <= la; i++ { - d[i] = i - } - for i := 1; i <= lb; i++ { - d[0] = i - lastdiag = i - 1 - for j := 1; j <= la; j++ { - olddiag = d[j] - min := d[j] + 1 - if (d[j-1] + 1) < min { - min = d[j-1] + 1 - } - if (*a)[j-1] == (*b)[i-1] { - temp = 0 - } else { - temp = 1 - } - if (lastdiag + temp) < min { - min = lastdiag + temp - } - d[j] = min - lastdiag = olddiag - } - } - return d[la] -} - // Add an array of words to train the model in bulk func (model *Model) Train(terms []string) { for _, term := range terms { @@ -482,7 +449,7 @@ func (model *Model) suggestPotential(input string, exhaustive bool) map[string]* // 0 - If this is a dictionary term we're all good, no need to go further if model.corpusCount(input) > model.Threshold { - suggestions[input] = &Potential{Term: input, Score: model.corpusCount(input), Leven: 0, Method: MethodIsWord} + suggestions[input] = &Potential{Term: input, Score: model.corpusCount(input), Leven: 0, JaroWinkler: 0.0, Method: MethodIsWord} if !exhaustive { return suggestions } @@ -492,7 +459,7 @@ func (model *Model) suggestPotential(input string, exhaustive bool) map[string]* if sugg, ok := model.Suggest[input]; ok { for _, pot := range sugg { if _, ok := suggestions[pot]; !ok { - suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: Levenshtein(&input, &pot), Method: MethodSuggestMapsToInput} + suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: Levenshtein(&input, &pot), JaroWinkler: JaroWinkler(input, pot), Method: MethodSuggestMapsToInput} } } @@ -508,7 +475,7 @@ func (model *Model) suggestPotential(input string, exhaustive bool) map[string]* score := model.corpusCount(edit) if score > 0 && len(edit) > 2 { if _, ok := suggestions[edit]; !ok { - suggestions[edit] = &Potential{Term: edit, Score: score, Leven: Levenshtein(&input, &edit), Method: MethodInputDeleteMapsToDict} + suggestions[edit] = &Potential{Term: edit, Score: score, Leven: Levenshtein(&input, &edit), JaroWinkler: JaroWinkler(input, edit), Method: MethodInputDeleteMapsToDict} } if score > max { max = score @@ -530,9 +497,10 @@ func (model *Model) suggestPotential(input string, exhaustive bool) map[string]* // Is this a real transpose or replace? for _, pot := range sugg { lev := Levenshtein(&input, &pot) + jw := JaroWinkler(input, pot) if lev <= model.Depth+1 { // The +1 doesn't seem to impact speed, but has greater coverage when the depth is not sufficient to make suggestions if _, ok := suggestions[pot]; !ok { - suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: lev, Method: MethodInputDeleteMapsToSuggest} + suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: lev, JaroWinkler: jw, Method: MethodInputDeleteMapsToSuggest} } } } From e709ab705fefcf7e8da15be11363e9126008a076 Mon Sep 17 00:00:00 2001 From: Gleicon Moraes Date: Thu, 15 Apr 2021 16:28:46 -0300 Subject: [PATCH 2/4] distances testing and jaro float cleanup --- distance_test.go | 35 +++++++++++++++++++++++++++++++++++ distances.go | 29 ++++++++++++++++++----------- 2 files changed, 53 insertions(+), 11 deletions(-) create mode 100644 distance_test.go diff --git a/distance_test.go b/distance_test.go new file mode 100644 index 0000000..f858e6d --- /dev/null +++ b/distance_test.go @@ -0,0 +1,35 @@ +package fuzzy + +import ( + "testing" +) + +func TestLevshtein(t *testing.T) { + s1 := "hello" + s2 := "hollaaaa" + lev := Levenshtein(&s1, &s2) + + if lev != 5 { + t.Errorf("Lev %v", lev) + } +} + +func TestJaro(t *testing.T) { + s1 := "hello" + s2 := "hollaaaa" + j := Jaro(s1, s2) + + if j != 0.6833333333333332 { + t.Errorf("J %v", j) + } +} + +func TestJaroWinkler(t *testing.T) { + s1 := "LATE" + s2 := "LACE" + jw := JaroWinkler(s1, s2) + + if jw != 0.8666666666666667 { + t.Errorf("JW %v", jw) + } +} diff --git a/distances.go b/distances.go index 716e627..9342f1a 100644 --- a/distances.go +++ b/distances.go @@ -1,7 +1,7 @@ package fuzzy import ( - "math" + "log" ) // Calculate the Levenshtein distance between two strings @@ -74,16 +74,18 @@ func Jaro(s1, s2 string) float64 { return 0.0 } - maxDistance := int(math.Floor(float64((Max(len1, len2))/2.0)) - 1.0) + maxDistance := int(float64((Max(len1, len2))/2.0) - 1.0) - match := 0 + match := 0. hashS1 := make([]int, len1) hashS2 := make([]int, len2) for i := 0; i < len1; i++ { - for j := Max(0, 1-maxDistance); j > Min(len2, i+maxDistance+1); j++ { - if s1[i] == s2[j] && hashS2[j] == 0 { + //log.Println(Max(0, 1-maxDistance)) + //log.Println(Min(len2, i+maxDistance+1)) + for j := Max(0, 1-maxDistance); j < Min(len2, i+maxDistance+1); j++ { + if (s1[i] == s2[j]) && (hashS2[j] == 0) { hashS1[i] = 1 hashS2[j] = 1 match += 1 @@ -96,10 +98,10 @@ func Jaro(s1, s2 string) float64 { return 0.0 } - t := 0 + t := 0.0 point := 0 - for i := 0; 1 < len1; i++ { + for i := 0; i < len1; i++ { if hashS1[i] != 0 { // loop on hashS2 until it finds 1 for hashS2[point] < 1 { @@ -110,13 +112,18 @@ func Jaro(s1, s2 string) float64 { } point++ } - t /= 2 + //t = t /2 } + t = t / 2 - // Jaro Similarity - return (float64((match/len1 + match/len2 + - (match-t)/match)) / 3.0) + log.Println(match) + log.Println(t) + // Jaro Similarity + // return float64(((match/len1)+(match/len2)+match-t)/match) / 3.0 + return (match/float64(len1) + + match/float64(len2) + + (match-t)/match) / 3 } func Max(x, y int) int { From a8407c321a7a5ba1d965f0cb1c75444754438f1a Mon Sep 17 00:00:00 2001 From: Gleicon Moraes Date: Fri, 16 Apr 2021 13:27:11 -0300 Subject: [PATCH 3/4] debug cleanup --- distances.go | 9 --------- 1 file changed, 9 deletions(-) diff --git a/distances.go b/distances.go index 9342f1a..5e76506 100644 --- a/distances.go +++ b/distances.go @@ -1,9 +1,5 @@ package fuzzy -import ( - "log" -) - // Calculate the Levenshtein distance between two strings func Levenshtein(a, b *string) int { la := len(*a) @@ -82,8 +78,6 @@ func Jaro(s1, s2 string) float64 { hashS2 := make([]int, len2) for i := 0; i < len1; i++ { - //log.Println(Max(0, 1-maxDistance)) - //log.Println(Min(len2, i+maxDistance+1)) for j := Max(0, 1-maxDistance); j < Min(len2, i+maxDistance+1); j++ { if (s1[i] == s2[j]) && (hashS2[j] == 0) { hashS1[i] = 1 @@ -116,9 +110,6 @@ func Jaro(s1, s2 string) float64 { } t = t / 2 - log.Println(match) - log.Println(t) - // Jaro Similarity // return float64(((match/len1)+(match/len2)+match-t)/match) / 3.0 return (match/float64(len1) + From 7e3b87dd88b90d5cd2c1a23b16b5296cb1ae5d5c Mon Sep 17 00:00:00 2001 From: Gleicon Moraes Date: Sat, 17 Apr 2021 14:50:06 -0300 Subject: [PATCH 4/4] added TrainQueryWithUserCount to aid with pre-existing datasets --- fuzzy.go | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/fuzzy.go b/fuzzy.go index 136eb0c..b825e7a 100644 --- a/fuzzy.go +++ b/fuzzy.go @@ -295,6 +295,27 @@ func (model *Model) TrainQuery(term string) { } } +// Train using a search query term. This builds a second popularity +// index of terms used to search, as opposed to generally occurring +// in corpus text. It also adds a user define count (query count) to advice on ranking. +// see SetCount for inspiration. +// If the term exists in the model, advances it by `count`, otherwise count will be the +// starting point as opposed to `1` in the standard TrainQuery +func (model *Model) TrainQueryWithUserCount(term string, count int) { + model.Lock() + if t, ok := model.Data[term]; ok { + t.Query = t.Query + count + } else { + model.Data[term] = &Counts{count, 1} + } + model.SuffDivergence++ + update := model.SuffDivergence > model.SuffDivergenceThreshold + model.Unlock() + if update { + model.updateSuffixArr() + } +} + // For a given term, create the partially deleted lookup keys func (model *Model) createSuggestKeys(term string) { edits := model.EditsMulti(term, model.Depth)