diff --git a/tensorflow_datasets/core/features/text/text_encoder.py b/tensorflow_datasets/core/features/text/text_encoder.py index 37502bbcbf4..9de63bfef59 100644 --- a/tensorflow_datasets/core/features/text/text_encoder.py +++ b/tensorflow_datasets/core/features/text/text_encoder.py @@ -249,6 +249,8 @@ def __init__(self, self._lowercase = lowercase if self._lowercase: self._vocab_list = [t.lower() for t in self._vocab_list] + # Remove duplicates using method suggested by 'https://stackoverflow.com/a/39835527' + self._vocab_list = list(dict.fromkeys(self._vocab_list)) # Note that internally everything is 0-indexed. Padding is dealt with at the # end of encode and the beginning of decode. self._token_to_id = dict(