diff --git a/src/SimMetrics.Net/Extensions.cs b/src/SimMetrics.Net/Extensions.cs index 67c3272..31feb0a 100644 --- a/src/SimMetrics.Net/Extensions.cs +++ b/src/SimMetrics.Net/Extensions.cs @@ -15,7 +15,9 @@ public static List ApproximatelyEquals(this List list, string wo var num = l.ApproximatelyEquals(word, simMetricType); var thr = 1 - num; if (thr <= threshold) + { newList.Add(l); + } } return newList; } diff --git a/src/SimMetrics.Net/Metric/ChapmanLengthDeviation.cs b/src/SimMetrics.Net/Metric/ChapmanLengthDeviation.cs index 2a5811c..ebbb82a 100644 --- a/src/SimMetrics.Net/Metric/ChapmanLengthDeviation.cs +++ b/src/SimMetrics.Net/Metric/ChapmanLengthDeviation.cs @@ -35,7 +35,7 @@ public override double GetUnnormalisedSimilarity(string firstWord, string second return GetSimilarity(firstWord, secondWord); } - public override string LongDescriptionString => "Implements the Chapman Length Deviation algorithm whereby the length deviation of the word strings is used to determine if the strings are similar in size - This apporach is not intended to be used single handedly but rather alongside other approaches"; + public override string LongDescriptionString => "Implements the Chapman Length Deviation algorithm whereby the length deviation of the word strings is used to determine if the strings are similar in size - This approach is not intended to be used single handedly but rather alongside other approaches"; public override string ShortDescriptionString => "ChapmanLengthDeviation"; } diff --git a/src/SimMetrics.Net/Metric/ChapmanMeanLength.cs b/src/SimMetrics.Net/Metric/ChapmanMeanLength.cs index d6e764b..e8af3cf 100644 --- a/src/SimMetrics.Net/Metric/ChapmanMeanLength.cs +++ b/src/SimMetrics.Net/Metric/ChapmanMeanLength.cs @@ -3,6 +3,11 @@ namespace SimMetrics.Net.Metric { + /// + /// This method only looks at the lengths of the two words, not at the actual characters. 
+ /// It uses a cutoff (ChapmanMeanLengthMaxString) and a polynomial scaling (1 - num2^4) to produce a score. + /// That means it is really a length-based similarity heuristic, not the Chapman Mean Length algorithm. + /// public sealed class ChapmanMeanLength : AbstractStringMetric { private const int ChapmanMeanLengthMaxString = 500; @@ -39,9 +44,8 @@ public override double GetUnnormalisedSimilarity(string firstWord, string second return GetSimilarity(firstWord, secondWord); } - public override string LongDescriptionString => "Implements the Chapman Mean Length algorithm provides a similarity measure between two strings from size of the mean length of the vectors - this approach is suppossed to be used to determine which metrics may be best to apply rather than giveing a valid response itself"; + public override string LongDescriptionString => "Implements the Chapman Mean Length algorithm provides a similarity measure between two strings from size of the mean length of the vectors - this approach is suppossed to be used to determine which metrics may be best to apply rather than giving a valid response itself"; public override string ShortDescriptionString => "ChapmanMeanLength"; } -} - +} \ No newline at end of file diff --git a/src/SimMetrics.Net/Metric/ChapmanMeanLengthTrue.cs b/src/SimMetrics.Net/Metric/ChapmanMeanLengthTrue.cs new file mode 100644 index 0000000..3f2086b --- /dev/null +++ b/src/SimMetrics.Net/Metric/ChapmanMeanLengthTrue.cs @@ -0,0 +1,76 @@ +using System; +using SimMetrics.Net.API; + +namespace SimMetrics.Net.Metric; + +/// +/// Correct Chapman Mean Length implementation. 
+/// +public sealed class ChapmanMeanLengthTrue : AbstractStringMetric +{ + private const double DefaultMismatchScore = 0.0; + private const double DefaultPerfectScore = 1.0; + + public override double GetSimilarity(string firstWord, string secondWord) + { + if (string.IsNullOrEmpty(firstWord) || string.IsNullOrEmpty(secondWord)) + { + return DefaultMismatchScore; + } + + // Compute LCS length + var lcs = LongestCommonSubsequence(firstWord, secondWord); + + // Chapman Mean Length formula + var score = 2.0 * lcs / (firstWord.Length + secondWord.Length); + + return score switch + { + < DefaultMismatchScore => DefaultMismatchScore, + > DefaultPerfectScore => DefaultPerfectScore, + _ => score + }; + } + + public override string GetSimilarityExplained(string firstWord, string secondWord) + { + throw new NotImplementedException(); + } + + public override double GetSimilarityTimingEstimated(string firstWord, string secondWord) + { + return 0.0; + } + + public override double GetUnnormalisedSimilarity(string firstWord, string secondWord) + { + return GetSimilarity(firstWord, secondWord); + } + + public override string LongDescriptionString => "A true implementation of the Chapman Mean Length algorithm"; + + public override string ShortDescriptionString => nameof(ChapmanMeanLengthTrue); + + private static int LongestCommonSubsequence(string s1, string s2) + { + int m = s1.Length, n = s2.Length; + int[,] dp = new int[m + 1, n + 1]; + + for (var i = 0; i < m; i++) + { + for (var j = 0; j < n; j++) + { + if (s1[i] == s2[j]) + { + dp[i + 1, j + 1] = dp[i, j] + 1; + } + else + { + dp[i + 1, j + 1] = Math.Max(dp[i, j + 1], dp[i + 1, j]); + } + } + } + + return dp[m, n]; + } +} \ No newline at end of file diff --git a/src/SimMetrics.Net/SimMetrics.Net.csproj b/src/SimMetrics.Net/SimMetrics.Net.csproj index 5784119..afb5be9 100644 --- a/src/SimMetrics.Net/SimMetrics.Net.csproj +++ b/src/SimMetrics.Net/SimMetrics.Net.csproj @@ -6,6 +6,7 @@ 1.0.5.0 Hamed Fathi;Stef Heyenrath 
net20;net35;net40;net45;netstandard1.0;netstandard2.0 + 12 SimMetrics.Net SimMetrics.Net algorithms;artifical;intelligence diff --git a/tests/SimMetrics.Net.Tests/AssertUtil.cs b/tests/SimMetrics.Net.Tests/AssertUtil.cs index c51ec34..c157828 100644 --- a/tests/SimMetrics.Net.Tests/AssertUtil.cs +++ b/tests/SimMetrics.Net.Tests/AssertUtil.cs @@ -1,17 +1,16 @@ using Xunit; -namespace SimMetrics.Net.Tests +namespace SimMetrics.Net.Tests; + +internal static class AssertUtil { - internal static class AssertUtil + public static void Equal(T expected, T actual) { - public static void Equal(T expected, T actual) - { - Assert.Equal(expected, actual); - } + Assert.Equal(expected, actual); + } - public static void Equal(T expected, T actual, string message) - { - Assert.True(expected.Equals(actual), message); - } + public static void Equal(T expected, T actual, string message) + { + Assert.True(expected.Equals(actual), message); } } \ No newline at end of file diff --git a/tests/SimMetrics.Net.Tests/SimMetrics.Net.Tests.csproj b/tests/SimMetrics.Net.Tests/SimMetrics.Net.Tests.csproj index fa6625f..4a35483 100644 --- a/tests/SimMetrics.Net.Tests/SimMetrics.Net.Tests.csproj +++ b/tests/SimMetrics.Net.Tests/SimMetrics.Net.Tests.csproj @@ -3,8 +3,6 @@ Stef Heyenrath net8.0 - SimMetrics.Net.Tests - SimMetrics.Net.Tests true full @@ -14,8 +12,6 @@ - - diff --git a/tests/SimMetrics.Net.Tests/SimilarityClasses/LengthBased/ChapmanMeanLengthTrueTests.cs b/tests/SimMetrics.Net.Tests/SimilarityClasses/LengthBased/ChapmanMeanLengthTrueTests.cs new file mode 100644 index 0000000..f4f5f5a --- /dev/null +++ b/tests/SimMetrics.Net.Tests/SimilarityClasses/LengthBased/ChapmanMeanLengthTrueTests.cs @@ -0,0 +1,32 @@ +using SimMetrics.Net.Metric; +using Xunit; + +namespace SimMetrics.Net.Tests.SimilarityClasses.LengthBased; + +public sealed class ChapmanMeanLengthTrueTests +{ + private readonly ChapmanMeanLengthTrue _sut = new(); + + [Theory] + [InlineData("Davdi", 0.800000)] + 
[InlineData("david", 0.800000)] + [InlineData("David", 1.000000)] + [InlineData("Maday", 0.400000)] + [InlineData("Daves", 0.600000)] + [InlineData("divaD", 0.200000)] + [InlineData("Dave", 0.666667)] + [InlineData("Dovid", 0.800000)] + [InlineData("Dadiv", 0.600000)] + [InlineData("Da.v.id", 0.833333)] + [InlineData("Dav id", 0.909091)] + [InlineData("12345", 0.000000)] + [InlineData("Divad", 0.600000)] + [InlineData("D-avid", 0.909091)] + [InlineData("xxxxx", 0.000000)] + public void GetSimilarity(string test, double expected) + { + var result = _sut.GetSimilarity("David", test); + + Assert.Equal(expected, result, 5); + } +} \ No newline at end of file