From ca8344cafd38aab5a93bc3c94c81d87dbcdabf39 Mon Sep 17 00:00:00 2001 From: tmatha <33797174+tmatha@users.noreply.github.com> Date: Wed, 1 May 2019 21:53:49 -0700 Subject: [PATCH] de-dup vocab_list when lowercase=True fix issue https://github.com/tensorflow/datasets/issues/268 --- tensorflow_datasets/core/features/text/text_encoder.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow_datasets/core/features/text/text_encoder.py b/tensorflow_datasets/core/features/text/text_encoder.py index 37502bbcbf4..9de63bfef59 100644 --- a/tensorflow_datasets/core/features/text/text_encoder.py +++ b/tensorflow_datasets/core/features/text/text_encoder.py @@ -249,6 +249,8 @@ def __init__(self, self._lowercase = lowercase if self._lowercase: self._vocab_list = [t.lower() for t in self._vocab_list] + # Remove duplicates using method suggested by 'https://stackoverflow.com/a/39835527' + self._vocab_list = list(dict.fromkeys(self._vocab_list)) # Note that internally everything is 0-indexed. Padding is dealt with at the # end of encode and the beginning of decode. self._token_to_id = dict(