From c7e362393a9b1612a2ca7e9c34cfcaa974bb2250 Mon Sep 17 00:00:00 2001 From: ljiangf <570055398@qq.com> Date: Mon, 28 Dec 2020 19:47:38 +0800 Subject: [PATCH] solve issue 56 --- preprocess/dump_binary.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/preprocess/dump_binary.cpp b/preprocess/dump_binary.cpp index 89d172a..a5152fe 100755 --- a/preprocess/dump_binary.cpp +++ b/preprocess/dump_binary.cpp @@ -457,7 +457,7 @@ int main(int argc, char* argv[]) char *endptr = nullptr; const int kBASE = 10; int doc_buf_idx; - + int32_t max_id = 0; double dump_start = get_time(); offset_buf[0] = 0; @@ -518,6 +518,7 @@ int main(int argc, char* argv[]) } // The input data may be already sorted std::sort(doc_tokens.begin(), doc_tokens.end(), Compare); + max_id = std::max(max_id, (doc_tokens.end()-1)->word_id); doc_buf_idx = 0; doc_buf[doc_buf_idx++] = 0; // cursor @@ -538,8 +539,10 @@ int main(int argc, char* argv[]) vocab_file.write(reinterpret_cast(&vocab_size), sizeof(int32_t)); int32_t non_zero_count = 0; + std::cout << "Max word id is: " << max_id << std::endl; + ++max_id; // write vocab - for (int i = 0; i < word_num; ++i) + for (int i = 0; i < max_id; ++i) { if (local_tf_map[i] > 0) { @@ -551,7 +554,7 @@ int main(int argc, char* argv[]) std::cout << "Local vocab_size for the output block is: " << non_zero_count << std::endl; // write global tf - for (int i = 0; i < word_num; ++i) + for (int i = 0; i < max_id; ++i) { if (local_tf_map[i] > 0) { @@ -559,7 +562,7 @@ int main(int argc, char* argv[]) } } // write local tf - for (int i = 0; i < word_num; ++i) + for (int i = 0; i < max_id; ++i) { if (local_tf_map[i] > 0) { @@ -571,7 +574,7 @@ int main(int argc, char* argv[]) vocab_file.close(); txt_vocab_file << non_zero_count << std::endl; - for (int i = 0; i < word_num; ++i) + for (int i = 0; i < max_id; ++i) { if (local_tf_map[i] > 0) {