def clean_copyright_comments(content: str, header_lines: int = 100) -> str:
    """Strip a leading comment header and an early copyright block comment.

    Two passes are applied to *content*:

    1. Every leading line that is blank (empty or whitespace-only) or that,
       after stripping leading whitespace, starts with ``//``, ``#`` or ``--``
       is dropped. Such leading runs are treated as copyright headers.
    2. Within the first *header_lines* remaining lines, the first match of the
       module-level ``PAT`` pattern is located; if that matched text also
       matches ``CPAT`` it is cut out. Only the first ``PAT`` match is ever
       considered, and text beyond the window is never searched.

    NOTE(review): the ``#`` prefix also matches shebang lines and C
    preprocessor directives (``#include``, ``#define``) at the top of a file,
    so those are removed together with real comments.

    Args:
        content: Full text of a source file.
        header_lines: Number of lines (counted after pass 1) that are searched
            for the block comment. Defaults to 100.

    Returns:
        The cleaned text, or ``""`` when nothing but header lines was present.
    """
    lines = content.split('\n')

    # Pass 1: count the leading run of blank lines and line comments.
    # Stripping before the blank test keeps whitespace-only lines (including
    # the '\r' left behind by CRLF line endings) from ending the run early.
    skip = 0
    for line in lines:
        stripped = line.strip()
        if stripped and not stripped.startswith(("//", "#", "--")):
            break
        skip += 1
    lines = lines[skip:]

    if not lines:
        # Nothing survived pass 1; a search on empty text could only cut an
        # empty span, so the result is the empty string either way.
        return ""

    # Pass 2: restrict the regex search to the top of the file. Slicing works
    # for files shorter than the window as well (the tail is simply empty).
    head = '\n'.join(lines[:header_lines])
    tail = lines[header_lines:]

    r = PAT.search(head)
    if r:
        # found one, now see if it contains "copyright", if so strip it
        start, end = r.span()
        if CPAT.search(head[start:end]):
            head = head[:start] + head[end:]

    if tail:
        return head + '\n' + '\n'.join(tail)
    return head