From 6c32dec4d537bf4456e733f16dc716c67439bfec Mon Sep 17 00:00:00 2001 From: Roman Samarev Date: Thu, 4 May 2023 11:53:24 -0700 Subject: [PATCH] BLEU: added token and character based implementations of ngram. --- src/NLP_Metrics/bleu.jl | 152 +++++++++++++++++++++++++++------------- test/nlp.jl | 56 +++++++++++++-- 2 files changed, 154 insertions(+), 54 deletions(-) diff --git a/src/NLP_Metrics/bleu.jl b/src/NLP_Metrics/bleu.jl index a64f580..8dd6672 100644 --- a/src/NLP_Metrics/bleu.jl +++ b/src/NLP_Metrics/bleu.jl @@ -3,6 +3,19 @@ # Example: bleu_score([["apple is apple"]], ["apple is appl"]) +# Julia implementation of BLEU and smooth-BLEU. + +# This module provides a Julia implementation of BLEU and smooth-BLEU. +# Smooth BLEU is computed following the method outlined in the paper: +# Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic +# evaluation metrics for machine translation. COLING 2004. + + +const ListOfTokens = Vector{<:AbstractString} +const DocumentWithTokenizedSentences = Vector{<:ListOfTokens} +const DocumentWithStringSentences = Vector{<:AbstractString} +const Document = Union{<:DocumentWithTokenizedSentences,<:DocumentWithStringSentences} + """ get_ngrams(segment, max_order) @@ -14,16 +27,26 @@ with a count of how many times each n-gram occurred. - `max_order`: maximum length in tokens of the n-grams returned by this methods. """ -function get_ngrams(segment, max_order) - ngrams_count = OrderedDict() +function get_ngrams(segment::ListOfTokens, max_order::Integer) + ngrams_count = Dict() for order in 1:max_order - for i in 1: (length(segment) - order+1) - ngram = tuple(segment[i:i+order-1]...) - if (ngram) in keys(ngrams_count) - ngrams_count[ngram] += 1 - else - ngrams_count[ngram] = 1 - end + for i in 1:(length(segment)-order+1) + ngram = Symbol.(segment[i:i+order-1]) + count = get(ngrams_count, ngram, 0) + ngrams_count[ngram] = count + 1 + end + end + return ngrams_count +end + +function get_ngrams(non_tokenized_string::AbstractString, max_order::Integer) + ngrams_count = Dict() + character_indices = eachindex(non_tokenized_string) |> collect + for order in 1:max_order + for i in 1:(length(character_indices)-order+1) + ngram = non_tokenized_string[character_indices[i:i+order-1]] + count = get(ngrams_count, ngram, 0) + ngrams_count[ngram] = count + 1 end end return ngrams_count @@ -41,8 +64,36 @@ geometric mean of n-gram precisions, translation_length and reference_length - `max_order`: maximum n-gram order to use when computing BLEU score. - `smooth=false`: whether or not to apply. Lin et al. 2004 smoothing. + +Example: +```julia +one_doc_references = [ + ["apple", "is", "apple"], + ["apple", "is", "a", "fruit"] +] +one_doc_translation = [ + "apple", "is", "appl" +] +bleu_score([one_doc_references], [one_doc_translation], smooth=true) +``` """ -function bleu_score(reference_corpus, translation_corpus; max_order=4, smooth=false) +bleu_score( + reference_corpus::Vector{<:T}, translation_corpus::T; + max_order=4, smooth=false +) where {T<:DocumentWithTokenizedSentences} = + _bleu_score(reference_corpus, translation_corpus, max_order=max_order, smooth=smooth) + +bleu_score( + reference_corpus::Vector{<:T}, translation_corpus::T; + max_order=4, smooth=false +) where {T<:DocumentWithStringSentences} = + _bleu_score(reference_corpus, translation_corpus, max_order=max_order, smooth=smooth) + + +function _bleu_score( + reference_corpus::Vector{<:T}, translation_corpus::T; + max_order=4, smooth=false +) where {T<:Union{<:DocumentWithTokenizedSentences,<:DocumentWithStringSentences}} matches_by_order = zeros(max_order) possible_matches_by_order = zeros(max_order) reference_length = 0 @@ -50,31 +101,28 @@ function bleu_score(reference_corpus, translation_corpus; max_order=4, smooth=fa for (references, translation) in zip(reference_corpus, translation_corpus) reference_length += min([length(r) for r in references]...) translation_length += length(translation) - merged_ref_ngram_counts = OrderedDict() + merged_ref_ngram_counts = Dict() for reference in references - ref_ngrams = get_ngrams(reference, max_order) - keys_union = union(keys(merged_ref_ngram_counts), keys(ref_ngrams)) - for key in keys_union - try (b[key]) - try (ref_ngrams[key]) - merged_ref_ngram_counts[key] = max(merged_ref_ngram_counts[key], ref_ngrams[i]) - catch error - continue - end - catch error - merged_ref_ngram_counts[key] = ref_ngrams[key] - end - end + ref_ngrams = get_ngrams(reference, max_order) + for (k, v) in ref_ngrams + merged_count = get(merged_ref_ngram_counts, k, 0) + if v > merged_count + merged_ref_ngram_counts[k] = v + end + end end - # print(length(merged_ref_ngram_counts),"\n") + translation_ngram_counts = get_ngrams(translation, max_order) - overlap = OrderedDict() - keys_union = intersect(keys(merged_ref_ngram_counts), keys(translation_ngram_counts)) - for key in keys_union - overlap[key] = min(translation_ngram_counts[key], merged_ref_ngram_counts[key]) + overlap = Dict() + for (k, v) in translation_ngram_counts + new_counter = min(get(merged_ref_ngram_counts, k, 0), v) + if new_counter > 0 + overlap[k] = new_counter + end end - for key in overlap - matches_by_order[length(key[1])] += key[2] + + for (ngram, count) in overlap + matches_by_order[length(ngram)] += count end for order in 1:max_order possible_matches = length(translation) - order + 1 @@ -83,29 +131,37 @@ function bleu_score(reference_corpus, translation_corpus; max_order=4, smooth=fa end end end - precisions = zeros(max_order) - for i in 1:max_order + + precisions = map(1:max_order) do i if smooth - precisions[i] = (matches_by_order[i] + 1.0) / (possible_matches_by_order[i] + 1.0) - else - if possible_matches_by_order[i]>0 - precisions[i] = (float(matches_by_order[i]) / possible_matches_by_order[i]) - end + (matches_by_order[i] + 1.0) / (possible_matches_by_order[i] + 1.0) + elseif possible_matches_by_order[i] > 0 + matches_by_order[i] / possible_matches_by_order[i] + else + 0.0 end end geo_mean = 0.0 - if min(precisions...) > 0 - p_log_sum = sum(log.(precisions)) / max_order - geo_mean = exp(p_log_sum) - end - - ratio = translation_length / reference_length + if all(>(0), precisions) + p_log_sum = sum(log.(precisions)) / max_order + geo_mean = exp(p_log_sum) + end + + ratio = translation_length / reference_length bp = 1.0 - if ratio <1.0 - bp = exp(1 - 1 /ratio) + if ratio < 1.0 + bp = exp(1 - 1 / ratio) end - + bleu = geo_mean * bp - return bleu, precisions, bp, geo_mean, translation_length, reference_length + + return ( + bleu=bleu, + precisions=precisions, + bp=bp, + geo_mean=geo_mean, + translation_length=translation_length, + reference_length=reference_length + ) end diff --git a/test/nlp.jl b/test/nlp.jl index 6646a47..e2c52d9 100644 --- a/test/nlp.jl +++ b/test/nlp.jl @@ -1,14 +1,58 @@ using Metrics using DataStructures: OrderedDict -@testset "NLP" begin +@testset "NLP/BLEU" begin + max_order = 4 + # test token-based ngrams + ngrams = Metrics.get_ngrams(split("it is a dog "), max_order) + actual_orders = Set(length.(keys(ngrams))) + @test length(intersect(actual_orders, 1:max_order)) == max_order + @test length(setdiff(actual_orders, 1:max_order)) == 0 + + # test character-based ngrams + ngrams = Metrics.get_ngrams("it is a dog ", max_order) + actual_orders = Set(length.(keys(ngrams))) + + @test length(intersect(actual_orders, 1:max_order)) == max_order + @test length(setdiff(actual_orders, 1:max_order)) == 0 + + # NLTK sample https://www.nltk.org/api/nltk.translate.bleu_score.html + reference1 = [ + "It", "is", "a", "guide", "to", "action", "that", + "ensures", "that", "the", "military", "will", "forever", + "heed", "Party", "commands" + ] + reference2 = [ + "It", "is", "the", "guiding", "principle", "which", + "guarantees", "the", "military", "forces", "always", + "being", "under", "the", "command", "of", "the", + "Party" + ] + reference3 = [ + "It", "is", "the", "practical", "guide", "for", "the", + "army", "always", "to", "heed", "the", "directions", + "of", "the", "party" + ] + + hypothesis1 = [ + "It", "is", "a", "guide", "to", "action", "which", + "ensures", "that", "the", "military", "always", + "obeys", "the", "commands", "of", "the", "party" + ] + + score = bleu_score([[reference1, reference2, reference3]], [hypothesis1]) + @test isapprox(score.bleu, 0.5045, atol=1e-4) #(NLTK) + ref_corpus = [["Example of bleu score"], ["This is an apple"]] translated_corpus = ["Example to bleu score", "This no a apple"] - - res = bleu_score(ref_corpus, translated_corpus) .≈ (0.7253666236200925, [0.9444444444444444, 0.7941176470588235, 0.6875, 0.6], 0.9726044771163485, 0.7457981540149954, 36, 37) - @test all(res) - + + res = bleu_score(ref_corpus, translated_corpus) + @test collect(res) ≈ + [0.7253666236200925, [0.9444444444444444, 0.7941176470588235, 0.6875, 0.6], 0.9726044771163485, 0.7457981540149954, 36, 37] +end + +@testset "NLP/ROUGE" begin hypothesis = ["Example for bleu score", "This cz an apple"] ref_corpus = ["Example of bleu score", "This is an apple"] output = OrderedDict( @@ -21,7 +65,7 @@ using DataStructures: OrderedDict "rouge_l / f_score"=> 0.75, "rouge_l / r_score"=> 0.75, "rouge_l / p_score"=> 0.75) - + @testset "rouge" begin rouge_out = rouge(hypothesis, ref_corpus) for key in keys(output)