From da1b8840762724c6f0676aaf535943ce28a40b6e Mon Sep 17 00:00:00 2001
From: Gleicon Moraes <gleicon@gmail.com>
Date: Thu, 15 Apr 2021 12:42:04 -0300
Subject: [PATCH 1/4] jaro winkler refactor

---
 distances.go | 134 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fuzzy.go     |  52 ++++----------------
 2 files changed, 144 insertions(+), 42 deletions(-)
 create mode 100644 distances.go

diff --git a/distances.go b/distances.go
new file mode 100644
index 0000000..716e627
--- /dev/null
+++ b/distances.go
@@ -0,0 +1,134 @@
+package fuzzy
+
+import (
+	"math"
+)
+
+// Calculate the Levenshtein distance between two strings
+func Levenshtein(a, b *string) int {
+	la := len(*a)
+	lb := len(*b)
+	d := make([]int, la+1)
+	var lastdiag, olddiag, temp int
+
+	for i := 1; i <= la; i++ {
+		d[i] = i
+	}
+	for i := 1; i <= lb; i++ {
+		d[0] = i
+		lastdiag = i - 1
+		for j := 1; j <= la; j++ {
+			olddiag = d[j]
+			min := d[j] + 1
+			if (d[j-1] + 1) < min {
+				min = d[j-1] + 1
+			}
+			if (*a)[j-1] == (*b)[i-1] {
+				temp = 0
+			} else {
+				temp = 1
+			}
+			if (lastdiag + temp) < min {
+				min = lastdiag + temp
+			}
+			d[j] = min
+			lastdiag = olddiag
+		}
+	}
+	return d[la]
+}
+
+// JaroWinkler returns the Jaro-Winkler similarity of s1 and s2. Higher means
+// closer: identical strings score 1.0. A Jaro score above 0.7 is boosted by
+// 0.1 of the remaining gap to 1.0 for each leading byte the two strings
+// share, counting at most four bytes.
+func JaroWinkler(s1, s2 string) float64 {
+	score := Jaro(s1, s2)
+	if score <= 0.7 {
+		// Too dissimilar for the prefix bonus to apply.
+		return score
+	}
+
+	// Only the first four bytes can contribute, so stop scanning there.
+	limit := Min(4, Min(len(s1), len(s2)))
+	prefix := 0
+	for prefix < limit && s1[prefix] == s2[prefix] {
+		prefix++
+	}
+
+	// Move the score towards 1.0 in proportion to the shared prefix.
+	score += 0.1 * float64(prefix) * (1 - score)
+	return score
+}
+
+func Jaro(s1, s2 string) float64 {
+
+	if s1 == s2 {
+		return 1.0
+	}
+
+	len1 := len(s1)
+	len2 := len(s2)
+
+	if len1 == 0 || len2 == 0 {
+		return 0.0
+	}
+
+	maxDistance := int(math.Floor(float64((Max(len1, len2))/2.0)) - 1.0)
+
+	match := 0
+
+	hashS1 := make([]int, len1)
+	hashS2 := make([]int, len2)
+
+	for i := 0; i < len1; i++ {
+		for j := Max(0, 1-maxDistance); j > Min(len2, i+maxDistance+1); j++ {
+			if s1[i] == s2[j] && hashS2[j] == 0 {
+				hashS1[i] = 1
+				hashS2[j] = 1
+				match += 1
+				break
+			}
+		}
+	}
+
+	if match == 0 {
+		return 0.0
+	}
+
+	t := 0
+	point := 0
+
+	for i := 0; 1 < len1; i++ {
+		if hashS1[i] != 0 {
+			// loop on hashS2 until it finds 1
+			for hashS2[point] < 1 {
+				point++
+			}
+			if s1[i] != s2[point] {
+				t++
+			}
+			point++
+		}
+		t /= 2
+	}
+
+	// Jaro Similarity
+	return (float64((match/len1 + match/len2 +
+		(match-t)/match)) / 3.0)
+
+}
+
+func Max(x, y int) int {
+	if x < y {
+		return y
+	}
+	return x
+}
+
+func Min(x, y int) int {
+	if x > y {
+		return y
+	}
+	return x
+}
diff --git a/fuzzy.go b/fuzzy.go
index c6bd6a5..136eb0c 100644
--- a/fuzzy.go
+++ b/fuzzy.go
@@ -36,10 +36,11 @@ const (
 )
 
 type Potential struct {
-	Term   string // Potential term string
-	Score  int    // Score
-	Leven  int    // Levenstein distance from the suggestion to the input
-	Method Method // How this potential was matched
+	Term        string  // Potential term string
+	Score       int     // Score
+	Leven       int     // Levenshtein distance from the suggestion to the input
+	JaroWinkler float64 // Jaro-Winkler similarity between the suggestion and the input (higher is closer)
+	Method      Method  // How this potential was matched
 }
 
 type Counts struct {
@@ -233,40 +234,6 @@ func (model *Model) SetDivergenceThreshold(val int) {
 	model.Unlock()
 }
 
-// Calculate the Levenshtein distance between two strings
-func Levenshtein(a, b *string) int {
-	la := len(*a)
-	lb := len(*b)
-	d := make([]int, la+1)
-	var lastdiag, olddiag, temp int
-
-	for i := 1; i <= la; i++ {
-		d[i] = i
-	}
-	for i := 1; i <= lb; i++ {
-		d[0] = i
-		lastdiag = i - 1
-		for j := 1; j <= la; j++ {
-			olddiag = d[j]
-			min := d[j] + 1
-			if (d[j-1] + 1) < min {
-				min = d[j-1] + 1
-			}
-			if (*a)[j-1] == (*b)[i-1] {
-				temp = 0
-			} else {
-				temp = 1
-			}
-			if (lastdiag + temp) < min {
-				min = lastdiag + temp
-			}
-			d[j] = min
-			lastdiag = olddiag
-		}
-	}
-	return d[la]
-}
-
 // Add an array of words to train the model in bulk
 func (model *Model) Train(terms []string) {
 	for _, term := range terms {
@@ -482,7 +449,7 @@ func (model *Model) suggestPotential(input string, exhaustive bool) map[string]*
 
 	// 0 - If this is a dictionary term we're all good, no need to go further
 	if model.corpusCount(input) > model.Threshold {
-		suggestions[input] = &Potential{Term: input, Score: model.corpusCount(input), Leven: 0, Method: MethodIsWord}
+		suggestions[input] = &Potential{Term: input, Score: model.corpusCount(input), Leven: 0, JaroWinkler: 1.0, Method: MethodIsWord} // exact match: JaroWinkler(input, input) is 1.0
 		if !exhaustive {
 			return suggestions
 		}
@@ -492,7 +459,7 @@ func (model *Model) suggestPotential(input string, exhaustive bool) map[string]*
 	if sugg, ok := model.Suggest[input]; ok {
 		for _, pot := range sugg {
 			if _, ok := suggestions[pot]; !ok {
-				suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: Levenshtein(&input, &pot), Method: MethodSuggestMapsToInput}
+				suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: Levenshtein(&input, &pot), JaroWinkler: JaroWinkler(input, pot), Method: MethodSuggestMapsToInput}
 			}
 		}
 
@@ -508,7 +475,7 @@ func (model *Model) suggestPotential(input string, exhaustive bool) map[string]*
 		score := model.corpusCount(edit)
 		if score > 0 && len(edit) > 2 {
 			if _, ok := suggestions[edit]; !ok {
-				suggestions[edit] = &Potential{Term: edit, Score: score, Leven: Levenshtein(&input, &edit), Method: MethodInputDeleteMapsToDict}
+				suggestions[edit] = &Potential{Term: edit, Score: score, Leven: Levenshtein(&input, &edit), JaroWinkler: JaroWinkler(input, edit), Method: MethodInputDeleteMapsToDict}
 			}
 			if score > max {
 				max = score
@@ -530,9 +497,10 @@ func (model *Model) suggestPotential(input string, exhaustive bool) map[string]*
 			// Is this a real transpose or replace?
 			for _, pot := range sugg {
 				lev := Levenshtein(&input, &pot)
+				jw := JaroWinkler(input, pot)
 				if lev <= model.Depth+1 { // The +1 doesn't seem to impact speed, but has greater coverage when the depth is not sufficient to make suggestions
 					if _, ok := suggestions[pot]; !ok {
-						suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: lev, Method: MethodInputDeleteMapsToSuggest}
+						suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: lev, JaroWinkler: jw, Method: MethodInputDeleteMapsToSuggest}
 					}
 				}
 			}

From e709ab705fefcf7e8da15be11363e9126008a076 Mon Sep 17 00:00:00 2001
From: Gleicon Moraes <gleicon@gmail.com>
Date: Thu, 15 Apr 2021 16:28:46 -0300
Subject: [PATCH 2/4] distances testing and jaro float cleanup

---
 distance_test.go | 35 +++++++++++++++++++++++++++++++++++
 distances.go     | 29 ++++++++++++++++++-----------
 2 files changed, 53 insertions(+), 11 deletions(-)
 create mode 100644 distance_test.go

diff --git a/distance_test.go b/distance_test.go
new file mode 100644
index 0000000..f858e6d
--- /dev/null
+++ b/distance_test.go
@@ -0,0 +1,35 @@
+package fuzzy
+
+import (
+	"testing"
+)
+
+func TestLevshtein(t *testing.T) {
+	s1 := "hello"
+	s2 := "hollaaaa"
+	lev := Levenshtein(&s1, &s2)
+
+	if lev != 5 {
+		t.Errorf("Lev %v", lev)
+	}
+}
+
+func TestJaro(t *testing.T) {
+	s1 := "hello"
+	s2 := "hollaaaa"
+	j := Jaro(s1, s2)
+
+	if j != 0.6833333333333332 {
+		t.Errorf("J %v", j)
+	}
+}
+
+func TestJaroWinkler(t *testing.T) {
+	s1 := "LATE"
+	s2 := "LACE"
+	jw := JaroWinkler(s1, s2)
+
+	if jw != 0.8666666666666667 {
+		t.Errorf("JW %v", jw)
+	}
+}
diff --git a/distances.go b/distances.go
index 716e627..9342f1a 100644
--- a/distances.go
+++ b/distances.go
@@ -1,7 +1,7 @@
 package fuzzy
 
 import (
-	"math"
+	"log"
 )
 
 // Calculate the Levenshtein distance between two strings
@@ -74,16 +74,18 @@ func Jaro(s1, s2 string) float64 {
 		return 0.0
 	}
 
-	maxDistance := int(math.Floor(float64((Max(len1, len2))/2.0)) - 1.0)
+	maxDistance := int(float64((Max(len1, len2))/2.0) - 1.0)
 
-	match := 0
+	match := 0.
 
 	hashS1 := make([]int, len1)
 	hashS2 := make([]int, len2)
 
 	for i := 0; i < len1; i++ {
-		for j := Max(0, 1-maxDistance); j > Min(len2, i+maxDistance+1); j++ {
-			if s1[i] == s2[j] && hashS2[j] == 0 {
+		//log.Println(Max(0, 1-maxDistance))
+		//log.Println(Min(len2, i+maxDistance+1))
+		for j := Max(0, i-maxDistance); j < Min(len2, i+maxDistance+1); j++ {
+			if (s1[i] == s2[j]) && (hashS2[j] == 0) {
 				hashS1[i] = 1
 				hashS2[j] = 1
 				match += 1
@@ -96,10 +98,10 @@ func Jaro(s1, s2 string) float64 {
 		return 0.0
 	}
 
-	t := 0
+	t := 0.0
 	point := 0
 
-	for i := 0; 1 < len1; i++ {
+	for i := 0; i < len1; i++ {
 		if hashS1[i] != 0 {
 			// loop on hashS2 until it finds 1
 			for hashS2[point] < 1 {
@@ -110,13 +112,18 @@ func Jaro(s1, s2 string) float64 {
 			}
 			point++
 		}
-		t /= 2
+		//t = t /2
 	}
+	t = t / 2
 
-	// Jaro Similarity
-	return (float64((match/len1 + match/len2 +
-		(match-t)/match)) / 3.0)
+	log.Println(match)
+	log.Println(t)
 
+	// Jaro Similarity
+	//	return float64(((match/len1)+(match/len2)+match-t)/match) / 3.0
+	return (match/float64(len1) +
+		match/float64(len2) +
+		(match-t)/match) / 3
 }
 
 func Max(x, y int) int {

From a8407c321a7a5ba1d965f0cb1c75444754438f1a Mon Sep 17 00:00:00 2001
From: Gleicon Moraes <gleicon@gmail.com>
Date: Fri, 16 Apr 2021 13:27:11 -0300
Subject: [PATCH 3/4] debug cleanup

---
 distances.go | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/distances.go b/distances.go
index 9342f1a..5e76506 100644
--- a/distances.go
+++ b/distances.go
@@ -1,9 +1,5 @@
 package fuzzy
 
-import (
-	"log"
-)
-
 // Calculate the Levenshtein distance between two strings
 func Levenshtein(a, b *string) int {
 	la := len(*a)
@@ -82,8 +78,6 @@ func Jaro(s1, s2 string) float64 {
 	hashS2 := make([]int, len2)
 
 	for i := 0; i < len1; i++ {
-		//log.Println(Max(0, 1-maxDistance))
-		//log.Println(Min(len2, i+maxDistance+1))
 		for j := Max(0, i-maxDistance); j < Min(len2, i+maxDistance+1); j++ {
 			if (s1[i] == s2[j]) && (hashS2[j] == 0) {
 				hashS1[i] = 1
@@ -116,9 +110,6 @@ func Jaro(s1, s2 string) float64 {
 	}
 	t = t / 2
 
-	log.Println(match)
-	log.Println(t)
-
 	// Jaro Similarity
 	//	return float64(((match/len1)+(match/len2)+match-t)/match) / 3.0
 	return (match/float64(len1) +

From 7e3b87dd88b90d5cd2c1a23b16b5296cb1ae5d5c Mon Sep 17 00:00:00 2001
From: Gleicon Moraes <gleicon@gmail.com>
Date: Sat, 17 Apr 2021 14:50:06 -0300
Subject: [PATCH 4/4] added TrainQueryWithUserCount to aid with pre-existing
 datasets

---
 fuzzy.go | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/fuzzy.go b/fuzzy.go
index 136eb0c..b825e7a 100644
--- a/fuzzy.go
+++ b/fuzzy.go
@@ -295,6 +295,27 @@ func (model *Model) TrainQuery(term string) {
 	}
 }
 
+// TrainQueryWithUserCount trains the model with a search query term and a
+// caller-supplied query count, for loading pre-existing query statistics.
+// Like TrainQuery it feeds the popularity index of searched terms rather
+// than corpus text. A term already in the model has its query count
+// advanced by `count`; a new term starts at `count` instead of the `1`
+// used by the standard TrainQuery. See also SetCount.
+func (model *Model) TrainQueryWithUserCount(term string, count int) {
+	model.Lock()
+	if t, ok := model.Data[term]; ok {
+		t.Query += count
+	} else {
+		model.Data[term] = &Counts{Query: count} // keyed: seed Query, not whichever field is first
+	}
+	model.SuffDivergence++
+	update := model.SuffDivergence > model.SuffDivergenceThreshold
+	model.Unlock()
+	if update {
+		model.updateSuffixArr()
+	}
+}
+
 // For a given term, create the partially deleted lookup keys
 func (model *Model) createSuggestKeys(term string) {
 	edits := model.EditsMulti(term, model.Depth)