diff --git a/chapters/zh-TW/chapter3/2.mdx b/chapters/zh-TW/chapter3/2.mdx
index 4cc8f883d..49039a4ea 100644
--- a/chapters/zh-TW/chapter3/2.mdx
+++ b/chapters/zh-TW/chapter3/2.mdx
@@ -164,8 +164,8 @@ from transformers import AutoTokenizer
 
 checkpoint = "bert-base-uncased"
 tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
-tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
+tokenized_sentences_1 = tokenizer(list(raw_datasets["train"]["sentence1"]))
+tokenized_sentences_2 = tokenizer(list(raw_datasets["train"]["sentence2"]))
 ```
 
 However, before passing the two sentences to the model to predict whether they are paraphrases, we need to preprocess them as a pair. Fortunately, the tokenizer can take not only a single sentence but also a pair of sentences, and prepare them exactly the way our BERT model expects:
@@ -221,8 +221,8 @@ tokenizer.convert_ids_to_tokens(inputs["input_ids"])
 
 ```py
 tokenized_dataset = tokenizer(
-    raw_datasets["train"]["sentence1"],
-    raw_datasets["train"]["sentence2"],
+    raw_datasets["train"]["sentence1"].to_pylist(),
+    raw_datasets["train"]["sentence2"].to_pylist(),
     padding=True,
     truncation=True,
 )
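
As a minimal end-to-end sketch of what the updated calls amount to (the `load_dataset("glue", "mrpc")` setup is assumed from earlier in the chapter rather than shown in these hunks, and the explicit conversion assumes a `datasets` release where indexing a column returns an Arrow-backed object instead of a plain Python list):

```py
from datasets import load_dataset
from transformers import AutoTokenizer

# Assumed setup from earlier in the chapter: the MRPC sentence pairs from GLUE.
raw_datasets = load_dataset("glue", "mrpc")

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Convert the columns explicitly so the tokenizer receives plain lists of
# strings, regardless of what the installed `datasets` version returns
# when a column is accessed directly.
tokenized_dataset = tokenizer(
    list(raw_datasets["train"]["sentence1"]),
    list(raw_datasets["train"]["sentence2"]),
    padding=True,
    truncation=True,
)
```

The diff uses both `list(...)` and `.to_pylist()`; either way the intent is the same, namely handing the tokenizer concrete lists of strings instead of whatever lazy column object the dataset returns.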