diff --git a/.gitignore b/.gitignore
index bdf74e8..4a61924 100644
--- a/.gitignore
+++ b/.gitignore
@@ -130,7 +130,6 @@ dmypy.json
 
 # temporary files
 data_prep/data
-*.csv
 *.sav
 data_prep/src/cc/log.txt
 data_prep/src/cc/inference_output.json
diff --git a/data_prep/cc/cc_net/Makefile b/data_prep/cc/cc_net/Makefile
index e887ac4..bf6d3c2 100644
--- a/data_prep/cc/cc_net/Makefile
+++ b/data_prep/cc/cc_net/Makefile
@@ -1,6 +1,7 @@
 # Makefile to install CC-Net and train the LMs.
 # `make` or `make help` to get some help.
 
+
 # Arguments:
 lang?=en
 process?=8
@@ -58,6 +59,7 @@ dl_lm:
 lm: data/lm_sp/$(lang).sp.model data/lm_sp/$(lang).arpa.bin
 	# Computes a 5-gram LM for the given language -> make lang=it lm
 	# Restricted to the first NDOC_FOR_LM documents
+	mkdir -p data/lm_sp
 
 sp: data/lm_sp/$(lang).sp.model
 	# Train a sentence piece model on Wikipedia -> make lang=it sp
@@ -111,20 +113,21 @@ data/lm_sp/%.sp.model: data/cirrus/txt/%.opening.txt
 	echo "Trained SentencePiece model with `wc -l $(basename $@).vocab` pieces"
 
 data/cirrus/sp/%.opening.txt: data/cirrus/gz/%.json.gz data/lm_sp/%.sp.model
+	mkdir -p data/cirrus/sp
 	$(SPM_ENCODE) \
 		--model=$(word 2,$^) \
 		--output_format=piece \
-		< <(python get_wiki_cirrus.py opening --file $< --n_docs $(NDOC_FOR_LM)) \
+		< <(python cc_net/get_wiki_cirrus.py opening --file $< --n_docs $(NDOC_FOR_LM)) \
 		> $@
 
 data/cirrus/txt/%.opening.txt: data/cirrus/gz/%.json.gz
-	python get_wiki_cirrus.py opening \
+	python cc_net/get_wiki_cirrus.py opening \
 		--n_docs $(NDOC_FOR_LM) \
 		--file $< --output $@
 
 data/cirrus/gz/%.json.gz:
-	mkdir $(@D)
-	python get_wiki_cirrus.py dl --lang $(call get_lang,$(@F)) --output_dir $(@D)
+	mkdir -p $(@D)
+	python cc_net/get_wiki_cirrus.py dl --lang $(call get_lang,$(@F)) --output_dir $(@D)
 
 clean:
 	# Remove intemediary files, dataset, third_party sources
@@ -155,11 +158,8 @@ bin/lmplz: third_party/kenlm
 third_party/sentencepiece:
 	# Download sentencepiece sources: https://github.com/google/sentencepiece
 	mkdir -p $(@D)
-	wget -c -O $(@D)/sentencepiece.zip https://github.com/google/sentencepiece/archive/v0.1.83.zip
-	unzip -o -d $(@D) $(@D)/sentencepiece.zip
-	rm $(@D)/sentencepiece.zip
-	# remove the version id from the folder name
-	mv $(@D)/sentencepiece-* $@
+	git clone https://github.com/google/sentencepiece.git $(@D)/sentencepiece
+
 
 bin/spm_train: third_party/sentencepiece
 	# Compiles sentencepiece binaries
@@ -172,7 +172,10 @@ bin/spm_train: third_party/sentencepiece
 	# $ cd $
diff --git a/data_prep/cc/cc_net/cc_net/get_wiki_cirrus.py b/data_prep/cc/cc_net/cc_net/get_wiki_cirrus.py
--- a/data_prep/cc/cc_net/cc_net/get_wiki_cirrus.py
+++ b/data_prep/cc/cc_net/cc_net/get_wiki_cirrus.py
@@ ... @@ Dict[str, str]:
 def wget(url: str, output: Path):
-    subprocess.run(["wget", url, "-O", tmp(output), "-q"], check=True)
-    tmp(output).replace(output)
+    if not os.path.isfile(output):
+        subprocess.run(["wget", url, "-O", tmp(output), "-q"], check=True)
+        tmp(output).replace(output)
+    else:
+        print(f"File {tmp(output)} already exists, skipping download")
+
     assert (
         output.stat().st_size > 10_000
     ), f"File {output} downloaded from {url} looks too small"
diff --git a/data_prep/cc/cc_net/cc_net/mine.py b/data_prep/cc/cc_net/cc_net/mine.py
index 907c588..02d40cf 100644
--- a/data_prep/cc/cc_net/cc_net/mine.py
+++ b/data_prep/cc/cc_net/cc_net/mine.py
@@ -43,7 +43,7 @@
     "drop",
     "split_by_lang",
 ]
-
+import logging
 
 class Config(NamedTuple):
     """
@@ -120,7 +120,7 @@ def get_cc_shard(self, shard: int) -> process_wet_file.CCShardReader:
             self.cache_dir.mkdir(exist_ok=True)
             dump_cache = self.cache_dir / self.dump
             dump_cache.mkdir(exist_ok=True)
-
+
         return process_wet_file.CCShardReader(
             self.dump,
             shard=shard,
@@ -265,8 +265,9 @@ def hashes(conf: Config) -> List[Path]:
     ex(_hashes_shard, repeat(conf), *_transpose(missing_outputs))
 
     # Wait a bit so that files appears on the disk.
+    logging.info("Waiting for hashes to be written on the disk...")
     time.sleep(20)
-    assert all(o.exists() for o in outputs)
+    assert all(o.exists() for o in outputs), f"Missing outputs: {outputs}"
 
     return outputs
 
diff --git a/data_prep/cc/cc_net/config/test_segment.json b/data_prep/cc/cc_net/config/test_segment.json
index 79b2613..9b3b9b0 100644
--- a/data_prep/cc/cc_net/config/test_segment.json
+++ b/data_prep/cc/cc_net/config/test_segment.json
@@ -1,10 +1,10 @@
 {
   "hash_in_mem": 2,
   "dump": "2019-09",
-  "num_shards": 4,
+  "num_shards": 1,
   "num_segments_per_shard": 1,
   "mine_num_processes": 0,
-  "lang_whitelist": ["de", "it", "fr"],
+  "lang_whitelist": ["af"],
   "pipeline": [
     "dedup",
     "lid",
@@ -19,5 +19,5 @@
   "output_dir": "test_data2",
   "mined_dir": "mined_by_segment",
   "target_size": "32M",
-  "cache_dir": "test_data/wet_cache"
+  "cache_dir": "test_data2/wet_cache"
 }
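
For context on the `wget()` change in `cc_net/get_wiki_cirrus.py` above: the patch makes repeated pipeline runs skip Wikipedia dumps that were already fetched, while still running the size sanity check. The sketch below illustrates that skip-if-present pattern in isolation; the helper name `download_once` is hypothetical and this is not cc_net's actual implementation.

```python
# Minimal sketch (assumed names, not cc_net's API) of the skip-if-present
# download guard introduced in the get_wiki_cirrus.py hunk above.
import subprocess
from pathlib import Path


def tmp(output: Path) -> Path:
    # Same convention as cc_net's wget(): fetch into a sibling "tmp." file,
    # then rename, so a partially downloaded file never shadows the target.
    return output.parent / ("tmp." + output.name)


def download_once(url: str, output: Path) -> None:
    if output.is_file():
        # A previous run already produced the final file: do nothing.
        print(f"File {output} already exists, skipping download")
    else:
        subprocess.run(["wget", url, "-O", str(tmp(output)), "-q"], check=True)
        tmp(output).replace(output)
    # Run the sanity check in both branches; a tiny file usually means a
    # truncated or failed download.
    assert output.stat().st_size > 10_000, f"File {output} looks too small"
```

One detail worth noting: the guard tests the final output path rather than the `tmp.` file, so an interrupted download (which leaves only the temporary file behind) is retried on the next run.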