152 changes: 104 additions & 48 deletions src/NLP_Metrics/bleu.jl
@@ -3,6 +3,19 @@

# Example: bleu_score([["apple is apple"]], ["apple is appl"])

# This module provides a Julia implementation of BLEU and smooth-BLEU.
# Smooth BLEU is computed following the method outlined in the paper:
# Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
# evaluation metrics for machine translation. COLING 2004.


const ListOfTokens = Vector{<:AbstractString}
const DocumentWithTokenizedSentences = Vector{<:ListOfTokens}
const DocumentWithStringSentences = Vector{<:AbstractString}
const Document = Union{<:DocumentWithTokenizedSentences,<:DocumentWithStringSentences}
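
For orientation, a minimal sketch of values that match each alias (the variable names are hypothetical):

```julia
tokenized_doc = [["apple", "is", "apple"], ["a", "fruit"]]  # DocumentWithTokenizedSentences
string_doc = ["apple is apple", "a fruit"]                  # DocumentWithStringSentences

tokenized_doc isa Document  # true
string_doc isa Document     # true
```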

"""
get_ngrams(segment, max_order)

@@ -14,16 +27,26 @@ with a count of how many times each n-gram occurred.
- `max_order`: maximum length, in tokens or characters, of the n-grams returned by this method.

"""
function get_ngrams(segment::ListOfTokens, max_order::Integer)
    ngrams_count = Dict()
    for order in 1:max_order
        for i in 1:(length(segment)-order+1)
            ngram = Symbol.(segment[i:i+order-1])
            count = get(ngrams_count, ngram, 0)
            ngrams_count[ngram] = count + 1
        end
    end
    return ngrams_count
end
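
A quick usage sketch for the two `get_ngrams` methods (the character-based method on raw strings follows just below); the inputs are invented for illustration:

```julia
# Token-based method: keys are Vector{Symbol} n-grams
counts = get_ngrams(["it", "is", "a", "dog"], 2)
counts[[:it, :is]]  # 1
counts[[:a]]        # 1

# Character-based method: keys are n-character strings
char_counts = get_ngrams("dog", 2)
char_counts["do"]   # 1
char_counts["g"]    # 1
```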

function get_ngrams(non_tokenized_string::AbstractString, max_order::Integer)
    ngrams_count = Dict()
    # Collect the byte index of each character so that slicing
    # respects multi-byte (UTF-8) character boundaries.
    character_indices = eachindex(non_tokenized_string) |> collect
    for order in 1:max_order
        for i in 1:(length(character_indices)-order+1)
            ngram = non_tokenized_string[character_indices[i:i+order-1]]
            count = get(ngrams_count, ngram, 0)
            ngrams_count[ngram] = count + 1
        end
    end
    return ngrams_count
@@ -41,40 +64,65 @@ geometric mean of n-gram precisions, translation_length and reference_length
- `max_order`: maximum n-gram order to use when computing BLEU score.
- `smooth=false`: whether or not to apply Lin et al. (2004) smoothing.


Example:
```julia
one_doc_references = [
["apple", "is", "apple"],
["apple", "is", "a", "fruit"]
]
one_doc_translation = [
"apple", "is", "appl"
]
bleu_score([one_doc_references], [one_doc_translation], smooth=true)
```
"""
bleu_score(
    reference_corpus::Vector{<:T}, translation_corpus::T;
    max_order=4, smooth=false
) where {T<:DocumentWithTokenizedSentences} =
    _bleu_score(reference_corpus, translation_corpus, max_order=max_order, smooth=smooth)

bleu_score(
    reference_corpus::Vector{<:T}, translation_corpus::T;
    max_order=4, smooth=false
) where {T<:DocumentWithStringSentences} =
    _bleu_score(reference_corpus, translation_corpus, max_order=max_order, smooth=smooth)
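
As a usage sketch of the string-sentence method (corpora invented for illustration; each translation carries its own list of reference sentences):

```julia
refs = [
    ["the cat sat on the mat", "a cat sat on a mat"],  # references for translation 1
]
hyps = ["the cat sat on mat"]
result = bleu_score(refs, hyps)  # dispatches on DocumentWithStringSentences
result.bleu                      # overall score in [0, 1]
```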


function _bleu_score(
    reference_corpus::Vector{<:T}, translation_corpus::T;
    max_order=4, smooth=false
) where {T<:Document}
    matches_by_order = zeros(max_order)
    possible_matches_by_order = zeros(max_order)
    reference_length = 0
    translation_length = 0
    for (references, translation) in zip(reference_corpus, translation_corpus)
        reference_length += minimum(length(r) for r in references)
        translation_length += length(translation)

        # For each n-gram, keep the maximum count observed in any single reference.
        merged_ref_ngram_counts = Dict()
        for reference in references
            ref_ngrams = get_ngrams(reference, max_order)
            for (k, v) in ref_ngrams
                merged_count = get(merged_ref_ngram_counts, k, 0)
                if v > merged_count
                    merged_ref_ngram_counts[k] = v
                end
            end
        end

        # Clipped counts: each translation n-gram counts at most as often
        # as it appears in the merged reference counts.
        translation_ngram_counts = get_ngrams(translation, max_order)
        overlap = Dict()
        for (k, v) in translation_ngram_counts
            new_counter = min(get(merged_ref_ngram_counts, k, 0), v)
            if new_counter > 0
                overlap[k] = new_counter
            end
        end

        for (ngram, count) in overlap
            matches_by_order[length(ngram)] += count
        end
        for order in 1:max_order
            possible_matches = length(translation) - order + 1
@@ -83,29 +131,37 @@ function bleu_score(reference_corpus, translation_corpus; max_order=4, smooth=fa
            end
        end
    end
    precisions = map(1:max_order) do i
        if smooth
            (matches_by_order[i] + 1.0) / (possible_matches_by_order[i] + 1.0)
        elseif possible_matches_by_order[i] > 0
            matches_by_order[i] / possible_matches_by_order[i]
        else
            0.0
        end
    end

    geo_mean = 0.0
    if all(>(0), precisions)
        p_log_sum = sum(log.(precisions)) / max_order
        geo_mean = exp(p_log_sum)
    end

    ratio = translation_length / reference_length
    bp = 1.0
    if ratio < 1.0
        bp = exp(1 - 1 / ratio)
    end

    bleu = geo_mean * bp

    return (
        bleu=bleu,
        precisions=precisions,
        bp=bp,
        geo_mean=geo_mean,
        translation_length=translation_length,
        reference_length=reference_length
    )
end
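
To make the final combination concrete, here is a worked check of the geometric mean and brevity penalty, using the precision and length values from the corpus-level test in `test/nlp.jl` below:

```julia
# Values taken from the test case below
precisions = [0.9444444444444444, 0.7941176470588235, 0.6875, 0.6]
geo_mean = exp(sum(log.(precisions)) / 4)  # ≈ 0.7458
ratio = 36 / 37                            # translation_length / reference_length
bp = ratio < 1 ? exp(1 - 1 / ratio) : 1.0  # ≈ 0.9726 (brevity penalty)
bleu = geo_mean * bp                       # ≈ 0.7254
```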
56 changes: 50 additions & 6 deletions test/nlp.jl
@@ -1,14 +1,58 @@
using Metrics
using DataStructures: OrderedDict

@testset "NLP" begin
@testset "NLP/BLEU" begin
    max_order = 4
    # test token-based n-grams
    ngrams = Metrics.get_ngrams(split("it is a dog "), max_order)
    actual_orders = Set(length.(keys(ngrams)))

    @test length(intersect(actual_orders, 1:max_order)) == max_order
    @test length(setdiff(actual_orders, 1:max_order)) == 0

    # test character-based n-grams
    ngrams = Metrics.get_ngrams("it is a dog ", max_order)
    actual_orders = Set(length.(keys(ngrams)))

    @test length(intersect(actual_orders, 1:max_order)) == max_order
    @test length(setdiff(actual_orders, 1:max_order)) == 0
    # NLTK sample https://www.nltk.org/api/nltk.translate.bleu_score.html
    reference1 = [
        "It", "is", "a", "guide", "to", "action", "that",
        "ensures", "that", "the", "military", "will", "forever",
        "heed", "Party", "commands"
    ]
    reference2 = [
        "It", "is", "the", "guiding", "principle", "which",
        "guarantees", "the", "military", "forces", "always",
        "being", "under", "the", "command", "of", "the",
        "Party"
    ]
    reference3 = [
        "It", "is", "the", "practical", "guide", "for", "the",
        "army", "always", "to", "heed", "the", "directions",
        "of", "the", "party"
    ]

    hypothesis1 = [
        "It", "is", "a", "guide", "to", "action", "which",
        "ensures", "that", "the", "military", "always",
        "obeys", "the", "commands", "of", "the", "party"
    ]

    score = bleu_score([[reference1, reference2, reference3]], [hypothesis1])
    @test isapprox(score.bleu, 0.5045, atol=1e-4)  # reference value from NLTK

    ref_corpus = [["Example of bleu score"], ["This is an apple"]]
    translated_corpus = ["Example to bleu score", "This no a apple"]

    res = bleu_score(ref_corpus, translated_corpus)
    @test collect(res) ≈
        [0.7253666236200925, [0.9444444444444444, 0.7941176470588235, 0.6875, 0.6], 0.9726044771163485, 0.7457981540149954, 36, 37]
end
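
One behavior worth a quick sketch: with `smooth=true`, add-one smoothing keeps higher-order precisions nonzero when a short hypothesis has no high-order matches. This reuses the tokenized corpus from the `bleu_score` docstring:

```julia
refs = [[["apple", "is", "apple"], ["apple", "is", "a", "fruit"]]]
hyp = [["apple", "is", "appl"]]
unsmoothed = bleu_score(refs, hyp)
smoothed = bleu_score(refs, hyp, smooth=true)
# With only 3 tokens there are no 4-gram matches, so the unsmoothed
# geometric mean collapses to zero while the smoothed score does not.
unsmoothed.bleu  # 0.0
smoothed.bleu    # > 0.0
```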

@testset "NLP/ROUGE" begin
    hypothesis = ["Example for bleu score", "This cz an apple"]
    ref_corpus = ["Example of bleu score", "This is an apple"]
    output = OrderedDict(
@@ -21,7 +65,7 @@ using DataStructures: OrderedDict
"rouge_l / f_score"=> 0.75,
"rouge_l / r_score"=> 0.75,
"rouge_l / p_score"=> 0.75)

@testset "rouge" begin
rouge_out = rouge(hypothesis, ref_corpus)
for key in keys(output)