From f5813a8baac77cb481be753d7544edd4bfd7d4bf Mon Sep 17 00:00:00 2001 From: Li Xing Date: Mon, 26 Feb 2024 17:37:58 +0800 Subject: [PATCH] =?UTF-8?q?=E7=89=88=E6=9C=AC=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 162 +++++++++++++++++++++++++++++++++++++++++++++++++ data_helper.py | 31 ++++++---- 2 files changed, 181 insertions(+), 12 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..66efa11 --- /dev/null +++ b/.gitignore @@ -0,0 +1,162 @@ +cache + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ diff --git a/data_helper.py b/data_helper.py index 011e634..b9a4106 100644 --- a/data_helper.py +++ b/data_helper.py @@ -1,9 +1,9 @@ -from collections import Counter -from torchtext.vocab import Vocab +import re + +import torch from torch.utils.data import DataLoader from torchtext.data.utils import get_tokenizer -import torch -import re +from torchtext.vocab import build_vocab_from_iterator from tqdm import tqdm @@ -29,12 +29,18 @@ def build_vocab(tokenizer, filepath, min_freq, specials=None): """ if specials is None: specials = ['', ''] - counter = Counter() - with open(filepath, encoding='utf8') as f: - for string_ in tqdm(f): - string_ = string_.strip().split('","')[-1][:-1] # 新闻描述 - counter.update(tokenizer(clean_str(string_))) - return Vocab(counter, min_freq=min_freq, specials=specials) + + def yield_tokens(filepath): + with open(filepath, encoding='utf8') as f: + for string_ in f: + yield tokenizer(string_) + + vocab_obj = build_vocab_from_iterator(yield_tokens( + filepath), specials=specials, min_freq=min_freq) + + vocab_obj.set_default_index(vocab_obj['']) + + return vocab_obj def pad_sequence(sequences, batch_first=False, max_len=None, padding_value=0): @@ -96,7 +102,7 @@ def data_process(self, filepath): :return: """ - raw_iter = open(filepath,encoding='utf8').readlines() + raw_iter = open(filepath, encoding='utf8').readlines() data = [] max_len = 0 for raw in tqdm(raw_iter, ncols=80): @@ -111,7 +117,8 @@ def data_process(self, filepath): return data, max_len def load_train_val_test_data(self, train_file_paths, test_file_paths): - train_data, max_sen_len = self.data_process(train_file_paths) # 得到处理好的所有样本 + train_data, max_sen_len = self.data_process( + train_file_paths) # 得到处理好的所有样本 if self.max_sen_len == 'same': self.max_sen_len = max_sen_len test_data, _ = self.data_process(test_file_paths)