@@ -327,24 +327,6 @@ def itemise_segment(self, original_segment:list, recovered_segment:list) -> list
327327 index_orig += 1
328328 index_rec += 1
329329
330- # # Verify all recovered words have been itemised
331- # try:
332- # assert index_rec == len(recovered_segment), \
333- # f"While reconstructing segment structure, one or more recovered words have been missed. \
334- # \n Original text: {[item.content for item in original_segment]} \
335- # \n Recovered text: {[item.content for item in recovered_segment]}"
336- # except AttributeError:
337- # assert index_rec == len(recovered_segment), \
338- # f"While reconstructing segment structure, one or more recovered words have been missed. \
339- # \n Original text: {[item.content for item in original_segment]} \
340- # \n Recovered text: {[item for item in recovered_segment]}"
341-
342- # # Verify that the reconstructed segment is the same length as original (excluding words removed by hyphenation)
343- # assert len(recovered_segment) == (len(original_segment) - total_fewer_words), \
344- # f"While reconstructing segment structure, a mistake has occurred. \
345- # \n Original text: {[item.content for item in original_segment]} \
346- # \n Recovered text: {[item.content for item in recovered_segment]}"
347-
348330 # Return new itemised segment to the list of segments
349331 return output_segment
350332
@@ -378,15 +360,6 @@ def calc_end_item_index(self, plaintext_items_lst, recovered_words_lst, position
378360 orig_text_removals = original_segment_words .index ('pence' )
379361 punct_text_removals = 0
380362
381- # elif recovered_word.startswith('£') and not original_segment_words[0].startswith('£'):
382- # numerical_removals = self.find_subword_index(['pound', 'pounds'], original_segment_words, recovered_words_lst, position)
383- # elif recovered_word.startswith('$') and not original_segment_words[0].startswith('$'):
384- # numerical_removals = self.find_subword_index(['dollar', 'dollars'], original_segment_words, recovered_words_lst, position)
385- # elif recovered_word.startswith('€') and not original_segment_words[0].startswith('€'):
386- # numerical_removals = self.find_subword_index(['euro', 'euros'], original_segment_words, recovered_words_lst, position)
387- # elif recovered_word.startswith('¥') and not original_segment_words[0].startswith('¥') and original_segment_words.count('yen') > 0:
388- # numerical_removals = original_segment_words.index('yen')
389-
390363 else :
391364 # Align original natural language numbers to recovered digits
392365 mapping = align_texts (original_segment_words , recovered_words_lst , position )
0 commit comments