From 6c32dec4d537bf4456e733f16dc716c67439bfec Mon Sep 17 00:00:00 2001
From: Roman Samarev <samarev@acm.org>
Date: Thu, 4 May 2023 11:53:24 -0700
Subject: [PATCH] BLEU: added token and character based implementations of
 ngram.

---
 src/NLP_Metrics/bleu.jl | 152 +++++++++++++++++++++++++++-------------
 test/nlp.jl             |  56 +++++++++++++--
 2 files changed, 154 insertions(+), 54 deletions(-)

diff --git a/src/NLP_Metrics/bleu.jl b/src/NLP_Metrics/bleu.jl
index a64f580..8dd6672 100644
--- a/src/NLP_Metrics/bleu.jl
+++ b/src/NLP_Metrics/bleu.jl
@@ -3,6 +3,19 @@
 
 # Example: bleu_score([["apple is apple"]], ["apple is appl"])
 
+# Julia implementation of BLEU and smooth-BLEU.
+
+# This module provides a Julia implementation of BLEU and smooth-BLEU.
+# Smooth BLEU is computed following the method outlined in the paper:
+# Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
+# evaluation metrics for machine translation. COLING 2004.
+
+
+const ListOfTokens = Vector{<:AbstractString}
+const DocumentWithTokenizedSentences = Vector{<:ListOfTokens}
+const DocumentWithStringSentences = Vector{<:AbstractString}
+const Document = Union{<:DocumentWithTokenizedSentences,<:DocumentWithStringSentences}
+
 """
     get_ngrams(segment, max_order)
 
@@ -14,16 +27,26 @@ with a count of how many times each n-gram occurred.
  - `max_order`: maximum length in tokens of the n-grams returned by this methods.
 
 """
-function get_ngrams(segment, max_order)
-    ngrams_count = OrderedDict()
+function get_ngrams(segment::ListOfTokens, max_order::Integer)
+    ngrams_count = Dict()
     for order in 1:max_order
-        for i in 1: (length(segment) - order+1)
-            ngram = tuple(segment[i:i+order-1]...)
-            if (ngram) in keys(ngrams_count)
-                ngrams_count[ngram] += 1
-            else 
-                ngrams_count[ngram] = 1
-            end
+        for i in 1:(length(segment)-order+1)
+            ngram = Symbol.(segment[i:i+order-1])
+            count = get(ngrams_count, ngram, 0)
+            ngrams_count[ngram] = count + 1
+        end
+    end
+    return ngrams_count
+end
+
+function get_ngrams(non_tokenized_string::AbstractString, max_order::Integer)
+    ngrams_count = Dict()
+    character_indices = eachindex(non_tokenized_string) |> collect
+    for order in 1:max_order
+        for i in 1:(length(character_indices)-order+1)
+            ngram = non_tokenized_string[character_indices[i:i+order-1]]
+            count = get(ngrams_count, ngram, 0)
+            ngrams_count[ngram] = count + 1
         end
     end
     return ngrams_count
@@ -41,8 +64,36 @@ geometric mean of n-gram precisions, translation_length and  reference_length
  - `max_order`: maximum n-gram order to use when computing BLEU score. 
  - `smooth=false`: whether or not to apply. Lin et al. 2004 smoothing.
 
+
+Example:
+```julia
+one_doc_references = [
+    ["apple", "is", "apple"],
+    ["apple", "is", "a", "fruit"]
+]
+one_doc_translation = [
+    "apple", "is", "appl"
+]
+bleu_score([one_doc_references], [one_doc_translation], smooth=true)
+```
 """
-function bleu_score(reference_corpus, translation_corpus; max_order=4, smooth=false)
+bleu_score(
+    reference_corpus::Vector{<:T}, translation_corpus::T;
+    max_order=4, smooth=false
+) where {T<:DocumentWithTokenizedSentences} =
+    _bleu_score(reference_corpus, translation_corpus, max_order=max_order, smooth=smooth)
+
+bleu_score(
+    reference_corpus::Vector{<:T}, translation_corpus::T;
+    max_order=4, smooth=false
+) where {T<:DocumentWithStringSentences} =
+    _bleu_score(reference_corpus, translation_corpus, max_order=max_order, smooth=smooth)
+
+
+function _bleu_score(
+    reference_corpus::Vector{<:T}, translation_corpus::T;
+    max_order=4, smooth=false
+) where {T<:Union{<:DocumentWithTokenizedSentences,<:DocumentWithStringSentences}}
     matches_by_order = zeros(max_order)
     possible_matches_by_order = zeros(max_order)
     reference_length = 0
@@ -50,31 +101,28 @@ function bleu_score(reference_corpus, translation_corpus; max_order=4, smooth=fa
     for (references, translation) in zip(reference_corpus, translation_corpus)
         reference_length += min([length(r) for r in references]...)
         translation_length += length(translation)
-        merged_ref_ngram_counts = OrderedDict()
+        merged_ref_ngram_counts = Dict()
         for reference in references
-          ref_ngrams = get_ngrams(reference, max_order)
-          keys_union = union(keys(merged_ref_ngram_counts), keys(ref_ngrams)) 
-          for key in keys_union
-              try (b[key])
-                  try (ref_ngrams[key])
-                      merged_ref_ngram_counts[key] = max(merged_ref_ngram_counts[key], ref_ngrams[i])
-                  catch error
-                      continue
-                  end
-              catch error
-                  merged_ref_ngram_counts[key] = ref_ngrams[key]
-              end
-           end
+            ref_ngrams = get_ngrams(reference, max_order)
+            for (k, v) in ref_ngrams
+                merged_count = get(merged_ref_ngram_counts, k, 0)
+                if v > merged_count
+                    merged_ref_ngram_counts[k] = v
+                end
+            end
         end
-        # print(length(merged_ref_ngram_counts),"\n")
+
         translation_ngram_counts = get_ngrams(translation, max_order)
-        overlap = OrderedDict()
-        keys_union = intersect(keys(merged_ref_ngram_counts), keys(translation_ngram_counts))
-        for key in keys_union
-                 overlap[key] = min(translation_ngram_counts[key],  merged_ref_ngram_counts[key])
+        overlap = Dict()
+        for (k, v) in translation_ngram_counts
+            new_counter = min(get(merged_ref_ngram_counts, k, 0), v)
+            if new_counter > 0
+                overlap[k] = new_counter
+            end
         end
-        for key in overlap
-            matches_by_order[length(key[1])] += key[2]
+
+        for (ngram, count) in overlap
+            matches_by_order[length(ngram)] += count
         end
         for order in 1:max_order
             possible_matches = length(translation) - order + 1
@@ -83,29 +131,37 @@ function bleu_score(reference_corpus, translation_corpus; max_order=4, smooth=fa
             end
         end
     end
-    precisions = zeros(max_order)
-    for i in 1:max_order
+
+    precisions = map(1:max_order) do i
         if smooth
-            precisions[i] = (matches_by_order[i] + 1.0) / (possible_matches_by_order[i] + 1.0)
-        else 
-            if possible_matches_by_order[i]>0
-                precisions[i] = (float(matches_by_order[i]) / possible_matches_by_order[i])
-            end
+            (matches_by_order[i] + 1.0) / (possible_matches_by_order[i] + 1.0)
+        elseif possible_matches_by_order[i] > 0
+            matches_by_order[i] / possible_matches_by_order[i]
+        else
+            0.0
         end
     end
 
     geo_mean = 0.0
-    if min(precisions...) > 0
-       p_log_sum = sum(log.(precisions)) / max_order
-       geo_mean = exp(p_log_sum)
-    end 
-     
-    ratio =  translation_length / reference_length 
+    if all(>(0), precisions)
+        p_log_sum = sum(log.(precisions)) / max_order
+        geo_mean = exp(p_log_sum)
+    end
+
+    ratio = translation_length / reference_length
     bp = 1.0
-    if ratio <1.0
-       bp = exp(1 - 1 /ratio)
+    if ratio < 1.0
+        bp = exp(1 - 1 / ratio)
     end
-    
+
     bleu = geo_mean * bp
-    return bleu, precisions, bp, geo_mean, translation_length, reference_length
+
+    return (
+        bleu=bleu,
+        precisions=precisions,
+        bp=bp,
+        geo_mean=geo_mean,
+        translation_length=translation_length,
+        reference_length=reference_length
+    )
 end
diff --git a/test/nlp.jl b/test/nlp.jl
index 6646a47..e2c52d9 100644
--- a/test/nlp.jl
+++ b/test/nlp.jl
@@ -1,14 +1,58 @@
 using Metrics
 using DataStructures: OrderedDict
 
-@testset "NLP" begin
+@testset "NLP/BLEU" begin
+    max_order = 4
+    # test token-based ngrams
+    ngrams = Metrics.get_ngrams(split("it is a dog "), max_order)
+    actual_orders = Set(length.(keys(ngrams)))
     
+    @test length(intersect(actual_orders, 1:max_order)) == max_order
+    @test length(setdiff(actual_orders, 1:max_order)) == 0
+
+    # test character-based ngrams
+    ngrams = Metrics.get_ngrams("it is a dog ", max_order)
+    actual_orders = Set(length.(keys(ngrams)))
+
+    @test length(intersect(actual_orders, 1:max_order)) == max_order
+    @test length(setdiff(actual_orders, 1:max_order)) == 0
+
+    # NLTK sample https://www.nltk.org/api/nltk.translate.bleu_score.html
+    reference1 = [
+        "It", "is", "a", "guide", "to", "action", "that",
+        "ensures", "that", "the", "military", "will", "forever",
+        "heed", "Party", "commands"
+    ]
+    reference2 = [
+        "It", "is", "the", "guiding", "principle", "which",
+        "guarantees", "the", "military", "forces", "always",
+        "being", "under", "the", "command", "of", "the",
+        "Party"
+    ]
+    reference3 = [
+        "It", "is", "the", "practical", "guide", "for", "the",
+        "army", "always", "to", "heed", "the", "directions",
+        "of", "the", "party"
+    ]
+
+    hypothesis1 = [
+        "It", "is", "a", "guide", "to", "action", "which",
+        "ensures", "that", "the", "military", "always",
+        "obeys", "the", "commands", "of", "the", "party"
+    ]
+
+    score = bleu_score([[reference1, reference2, reference3]], [hypothesis1])
+    @test isapprox(score.bleu, 0.5045, atol=1e-4) #(NLTK)
+
     ref_corpus = [["Example of bleu score"], ["This is an apple"]]
     translated_corpus = ["Example to bleu score", "This no a apple"]
-    
-    res = bleu_score(ref_corpus, translated_corpus) .≈ (0.7253666236200925, [0.9444444444444444, 0.7941176470588235, 0.6875, 0.6], 0.9726044771163485, 0.7457981540149954, 36, 37)
-    @test all(res)
-    
+
+    res = bleu_score(ref_corpus, translated_corpus)
+    @test collect(res) ≈ 
+      [0.7253666236200925, [0.9444444444444444, 0.7941176470588235, 0.6875, 0.6], 0.9726044771163485, 0.7457981540149954, 36, 37]
+end
+
+@testset "NLP/ROUGE" begin
     hypothesis = ["Example for bleu score", "This cz an apple"]
     ref_corpus = ["Example of bleu score", "This is an apple"]
     output = OrderedDict(
@@ -21,7 +65,7 @@ using DataStructures: OrderedDict
       "rouge_l / f_score"=> 0.75,
       "rouge_l / r_score"=> 0.75,
       "rouge_l / p_score"=> 0.75)
-    
+
     @testset "rouge" begin
         rouge_out = rouge(hypothesis, ref_corpus)
         for key in keys(output)