
Commit 29b8c5a

Add backup cli arguments
--backup enables saving the scraper output to a file; --backup_path specifies the file path.
1 parent 934a734 commit 29b8c5a
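
The flags are threaded the same way through every scraper: argparse parses them, they land in the config dict, and cleanup_records() only writes the backup file when asked. A minimal runnable sketch of that flow, with a hypothetical run_scraper() standing in for the real entry points in data_scraper/main.py:

    import argparse

    import pandas as pd


    def run_scraper(argv=None):
        # Hypothetical condensed entry point; mirrors the pattern added in this commit.
        parser = argparse.ArgumentParser()
        parser.add_argument("--backup", action="store_true", default=False,
                            help="Save a scraper output to file")
        parser.add_argument("--backup_path", type=str, default="all_data.pickle")
        args = parser.parse_args(argv)

        # Placeholder records; the real scrapers build these from Jira, errata, CI logs, etc.
        df = pd.DataFrame([{"url": "https://example.com", "text": "..."}])

        # The backup is opt-in: nothing is written unless --backup was passed.
        if args.backup:
            df.to_pickle(args.backup_path)


    run_scraper(["--backup", "--backup_path", "/tmp/scrape_backup.pickle"])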

5 files changed, +55 -18 lines


data_scraper/core/ci_logs_scraper.py

Lines changed: 4 additions & 3 deletions
@@ -59,7 +59,7 @@ def record_postprocessing(self, record):
 
     # pylint: disable=R0801
     def cleanup_records(
-        self, records: list, backup_path: str = "ci_logs_all_data.pickle"
+        self, records: list, backup: bool, backup_path: str
     ) -> list:
         df = pd.DataFrame(records)
 
@@ -72,8 +72,9 @@ def cleanup_records(
         LOG.info("Records stats AFTER cleanup:")
         LOG.info(df.info())
 
-        LOG.info("Saving backup to: %s", backup_path)
-        df.to_pickle(backup_path)
+        if backup:
+            LOG.info("Saving backup to: %s", backup_path)
+            df.to_pickle(backup_path)
 
         return [CILogsRecord(**row) for row in df.to_dict(orient="records")]
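
The same change repeats in each scraper below: the unconditional pickle dump becomes gated on the new backup argument, and the default file name moves out of the method signature up to the CLI layer. Restoring records from such a backup is a short pandas round trip (path assumed to match the old default):

    import pandas as pd

    # Read back a backup written by cleanup_records(); use whatever path was
    # passed via --backup_path.
    df = pd.read_pickle("ci_logs_all_data.pickle")
    records = df.to_dict(orient="records")
    print(len(records), "records restored")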

data_scraper/core/errata_scraper.py

Lines changed: 4 additions & 3 deletions
@@ -149,7 +149,7 @@ def record_postprocessing(self, record):
         pass
 
     def cleanup_records(
-        self, records: list, backup_path: str = "errata_all_data.pickle"
+        self, records: list, backup: bool, backup_path: str
     ) -> list:
         df = pd.DataFrame(records)
 
@@ -162,7 +162,8 @@ def cleanup_records(
         LOG.info("Records stats AFTER cleanup:")
         LOG.info(df.info())
 
-        LOG.info("Saving backup to: %s", backup_path)
-        df.to_pickle(backup_path)
+        if backup:
+            LOG.info("Saving backup to: %s", backup_path)
+            df.to_pickle(backup_path)
 
         return [ErrataRecord(**row) for row in df.to_dict(orient="records")]

data_scraper/core/scraper.py

Lines changed: 12 additions & 8 deletions
@@ -74,6 +74,8 @@ def __init__(self, config: Dict):
             organization="",
             api_key=config["llm_api_key"],
         )
+        self.backup = config["backup"]
+        self.backup_path = config["backup_path"]
 
     def get_embedding_dimension(self) -> int:
         """Get embedding dimension for the model."""
@@ -142,7 +144,7 @@ def store_records(self,
         self.db_manager.upsert_data(self.config["db_collection_name"], [point])
 
     def cleanup_records(
-        self, records: list, backup_path: str = "all_data.pickle"
+        self, records: list, backup: bool, backup_path: str
     ) -> list:
         """Cleanup Records"""
 
@@ -164,7 +166,7 @@ def run(self, record_fields_for_key: tuple[str,...] = ("url",)):
             return
 
         records = self.get_records(documents)
-        records = self.cleanup_records(records)
+        records = self.cleanup_records(records, backup=self.backup, backup_path=self.backup_path)
 
         # Process and store embeddings
         self.store_records(records, record_fields_for_key, self.config["recreate_collection"])
@@ -295,7 +297,7 @@ def get_chunks(self, record: dict) -> list[str]:
         return chunks
 
     def cleanup_records(
-        self, records: list[JiraRecord], backup_path: str = "jira_all_bugs.pickle"
+        self, records: list[JiraRecord], backup: bool, backup_path: str
     ) -> list[JiraRecord]:
         """Cleanup Jira Records"""
         df = pd.DataFrame(records)
@@ -309,8 +311,9 @@ def cleanup_records(
         LOG.info("Jira records stats AFTER cleanup:")
         LOG.info(df.info())
 
-        LOG.info("Saving backup to: %s", backup_path)
-        df.to_pickle(backup_path)
+        if backup:
+            LOG.info("Saving backup to: %s", backup_path)
+            df.to_pickle(backup_path)
 
         return [JiraRecord(**row) for row in df.to_dict(orient="records")]
 
@@ -410,7 +413,7 @@ def get_chunks(self, record: dict) -> list[str]:
         return self.text_processor.split_text(record["text"])
 
     def cleanup_records(
-        self, records: list[dict], backup_path: str = "osp_all_docs.pickle"
+        self, records: list[dict], backup: bool, backup_path: str
     ) -> list[dict]:
         """Cleanup document records"""
         df = pd.DataFrame(records)
@@ -424,7 +427,8 @@ def cleanup_records(
         LOG.info("Document records stats AFTER cleanup:")
         LOG.info(df.info())
 
-        LOG.info("Saving backup to: %s", backup_path)
-        df.to_pickle(backup_path)
+        if backup:
+            LOG.info("Saving backup to: %s", backup_path)
+            df.to_pickle(backup_path)
 
         return [JiraRecord(**row) for row in df.to_dict(orient="records")]
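
The base class is the one place that caches the config values (self.backup, self.backup_path) and forwards them from run(), so every subclass override of cleanup_records() receives them uniformly instead of carrying its own default path. A stripped-down sketch of that contract (class names here are illustrative, not the real hierarchy):

    import pandas as pd


    class BaseScraper:
        def __init__(self, config: dict):
            self.backup = config["backup"]
            self.backup_path = config["backup_path"]

        def cleanup_records(self, records: list, backup: bool, backup_path: str) -> list:
            raise NotImplementedError

        def run(self, records: list) -> list:
            # Every subclass gets the same keyword arguments from the cached config.
            return self.cleanup_records(records, backup=self.backup,
                                        backup_path=self.backup_path)


    class DemoScraper(BaseScraper):
        def cleanup_records(self, records: list, backup: bool, backup_path: str) -> list:
            df = pd.DataFrame(records).drop_duplicates()
            if backup:
                df.to_pickle(backup_path)
            return df.to_dict(orient="records")


    scraper = DemoScraper({"backup": False, "backup_path": "demo.pickle"})
    print(scraper.run([{"url": "a"}, {"url": "a"}]))  # deduplicated, no file written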

data_scraper/core/solutions_scraper.py

Lines changed: 4 additions & 3 deletions
@@ -76,7 +76,7 @@ def record_postprocessing(self, record):
         pass
 
     def cleanup_records(
-        self, records: list, backup_path: str = "solutions_all_data.csv"
+        self, records: list, backup: bool, backup_path: str
     ) -> list:
         df = pd.DataFrame(records)
 
@@ -87,7 +87,8 @@ def cleanup_records(
 
         LOG.info("Records stats AFTER cleanup: %d", df.shape[0])
 
-        LOG.info("Saving backup to: %s", backup_path)
-        df.to_csv(backup_path)
+        if backup:
+            LOG.info("Saving backup to: %s", backup_path)
+            df.to_csv(backup_path)
 
         return [SolutionsRecord(**row) for row in df.to_dict(orient="records")]
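
This scraper is the odd one out: its backup goes through df.to_csv rather than to_pickle, so the file is human-readable but column dtypes are not preserved on reload. One caveat worth knowing: to_csv writes the DataFrame index as an extra column by default, so a round trip needs index_col=0:

    import pandas as pd

    df = pd.DataFrame([{"id": 1, "title": "example solution"}])
    df.to_csv("solutions_all_data.csv")

    # index_col=0 drops the index column that to_csv added on write.
    restored = pd.read_csv("solutions_all_data.csv", index_col=0)
    print(restored.equals(df))  # True for simple int/string data like this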

data_scraper/main.py

Lines changed: 31 additions & 1 deletion
@@ -48,6 +48,10 @@ def jira_scraper():
     )
     parser.add_argument("--recreate_collection", action='store_true', default=False,
                         help="Recreate database collection from scratch.")
+    parser.add_argument("--backup", action='store_true', default=False,
+                        help="Save a scraper output to file")
+    parser.add_argument("--backup_path", type=str,
+                        default="all_data.pickle")
     args = parser.parse_args()
 
     config_args = {
@@ -65,6 +69,8 @@ def jira_scraper():
         "date_cutoff": args.date_cutoff,
         "scraper_processes": args.scraper_processes,
         "recreate_collection": args.recreate_collection,
+        "backup": args.backup,
+        "backup_path": args.backup_path,
     }
 
     scraper = JiraScraper(config_args)
@@ -100,6 +106,10 @@ def osp_doc_scraper():
     parser.add_argument(
         "--rhoso_docs_path", type=str, default="",
         help="Path to downstream RHOSO docs generated by get_rhoso_plaintext_docs.sh")
+    parser.add_argument("--backup", action='store_true', default=False,
+                        help="Save a scraper output to file")
+    parser.add_argument("--backup_path", type=str,
+                        default="osp_all_docs.pickle")
     args = parser.parse_args()
 
     config_args = {
@@ -114,6 +124,8 @@ def osp_doc_scraper():
         "osp_version": args.osp_version,
         "recreate_collection": args.recreate_collection,
         "rhoso_docs_path": args.rhoso_docs_path,
+        "backup": args.backup,
+        "backup_path": args.backup_path,
     }
 
     scraper = OSPDocScraper(config_args)
@@ -156,6 +168,10 @@ def errata_scraper() -> None:
     )
     parser.add_argument("--recreate_collection", action='store_true', default=False,
                         help="Recreate database collection from scratch.")
+    parser.add_argument("--backup", action='store_true', default=False,
+                        help="Save a scraper output to file")
+    parser.add_argument("--backup_path", type=str,
+                        default="errata_all_data.pickle")
     args = parser.parse_args()
 
     config_args = {
@@ -176,6 +192,8 @@ def errata_scraper() -> None:
         "scraper_processes": args.scraper_processes,
         "date_cutoff": args.date_cutoff,
         "recreate_collection": args.recreate_collection,
+        "backup": args.backup,
+        "backup_path": args.backup_path,
     }
 
     scraper = ErrataScraper(config_args)
@@ -215,6 +233,10 @@ def ci_logs_scraper() -> None:
                         default=constants.DEFAULT_ZULL_TENANTS)
     parser.add_argument("--populate_db_from_json", type=bool, default=False,
                         help="Used from Zuul jobs that create json file at the end of their runs.")
+    parser.add_argument("--backup", action='store_true', default=False,
+                        help="Save a scraper output to file")
+    parser.add_argument("--backup_path", type=str,
+                        default="aci_logs_all_data.pickle")
     args = parser.parse_args()
 
     config_args = {
@@ -230,7 +252,9 @@ def ci_logs_scraper() -> None:
         "recreate_collection": args.recreate_collection,
         "pipelines": args.pipelines,
         "tenants": args.tenants,
-        "tracebacks_json": "/tmp/tracebacks.json"
+        "tracebacks_json": "/tmp/tracebacks.json",
+        "backup": args.backup,
+        "backup_path": args.backup_path,
     }
 
 
@@ -277,6 +301,10 @@ def solutions_scraper() -> None:
                         default=constants.SOLUTIONS_PRODUCT_NAME)
     parser.add_argument("--recreate_collection", action='store_true', default=False,
                         help="Recreate database collection from scratch.")
+    parser.add_argument("--backup", action='store_true', default=False,
+                        help="Save a scraper output to file")
+    parser.add_argument("--backup_path", type=str,
+                        default="solutions_all_data.csv")
     args = parser.parse_args()
 
     config_args = {
@@ -292,6 +320,8 @@ def solutions_scraper() -> None:
         "product_name": args.product_name,
         "max_results": args.max_results,
         "recreate_collection": args.recreate_collection,
+        "backup": args.backup,
+        "backup_path": args.backup_path,
     }
 
     scraper = SolutionsScraper(config_args)
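
Every entry point registers the identical pair of flags; only the --backup_path default differs per scraper (all_data.pickle, osp_all_docs.pickle, errata_all_data.pickle, aci_logs_all_data.pickle, solutions_all_data.csv). A quick sketch of the resulting namespace for one of them:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--backup", action="store_true", default=False,
                        help="Save a scraper output to file")
    parser.add_argument("--backup_path", type=str, default="errata_all_data.pickle")

    # Without --backup the flag stays False and the default path is simply unused.
    print(parser.parse_args([]))
    print(parser.parse_args(["--backup", "--backup_path", "/tmp/errata.pickle"]))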
