From 7d9cb3a1c4ed7e9e2391e61fb34f27814991ab7e Mon Sep 17 00:00:00 2001
From: efried130
Date: Thu, 31 Oct 2024 16:55:23 +0000
Subject: [PATCH 1/3] Filter raw downloaded RiverSP or PIXC data for latest version-initial commit

---
 src/filter_version_PIXC_ex.py    | 80 ++++++++++++++++++++++++++
 src/filter_version_riverSP_ex.py | 99 ++++++++++++++++++++++++++++++++
 2 files changed, 179 insertions(+)
 create mode 100644 src/filter_version_PIXC_ex.py
 create mode 100644 src/filter_version_riverSP_ex.py

diff --git a/src/filter_version_PIXC_ex.py b/src/filter_version_PIXC_ex.py
new file mode 100644
index 0000000..99c8dcd
--- /dev/null
+++ b/src/filter_version_PIXC_ex.py
@@ -0,0 +1,80 @@
+"""
+Takes a list of directories of SWOT PIXC data, filters
+them to find the best version of each granule, and writes
+out a JSON containing the best files for each directory.
+
+Currently built for PGC0 and PIC0 versions
+
+Usage:
+    python3 filter_version_PIXC_ex.py
+
+Authors: Fiona Bennitt and Elisa Friedmann
+Date: 2024-10-31
+"""
+
+import json
+import os
+
+import pandas as pd
+
+def filterVersionPIXC(directories, outpath):
+    """
+    Reads in all filenames, sorts them, and retrieves the best version/
+    counter for each file (e.g. PGC0 over PIC0, PIC0_03 over PIC0_02).
+    Writes a JSON of those filenames to the outpath directory, for
+    filtering the files upon read-in.
+
+    Parameters:
+        directories (list): List of directories to search for best files.
+        outpath (string): Where to save the output JSON.
+
+    Returns:
+        None
+    """
+
+    # Get all file names from directories
+    for directory in directories:
+        # List to store all file paths
+        files = []
+        for file in os.listdir(directory):
+            files.append(file)
+
+        print(f"There are {str(len(files))} original files in directory.")
+
+        # Make DataFrame of filenames
+        granules = pd.DataFrame({'files': files})
+        granules['cycle'] = granules['files'].str.slice(16, 19)
+        granules['pass'] = granules['files'].str.slice(20, 23)
+        granules['tile'] = granules['files'].str.slice(24, 28)
+        granules['version'] = granules['files'].str.slice(-10, -6)
+        granules['counter'] = granules['files'].str.slice(-5, -3)
+
+        # Sort the files
+        granules = granules.sort_values(by=['cycle', 'pass', 'tile', 'version', 'counter'],
+                                        ascending=[True, True, True, True, False])
+
+        # Keep only the best version of each granule
+        granules = granules.drop_duplicates(subset=['cycle', 'pass', 'tile'],
+                                            keep='first')
+
+        # Extract the file names of files passing the test
+        best_files = list(granules['files'])
+
+        print(f"There are {str(len(best_files))} best files in directory.")
+
+
+        # Split filepath for naming json
+        pieces = dirs[0].split('/')
+
+        # Write out best files as json
+        with open(os.path.join(outpath, pieces[5] + '_filtered.json'), 'w', encoding='utf-8') as f:
+            json.dump(best_files, f)
+
+        print(f"Wrote out the unique and most recently processed {str(len(best_files))} files to {outpath}{pieces[5]}_filtered.json")
+
+# Directories to filter
+dirs = ['/path_to_data_download/']
+# Outpath for json
+out = '/path_out/'
+
+filterVersionPIXC(directories=dirs, outpath=out)
\ No newline at end of file
diff --git a/src/filter_version_riverSP_ex.py b/src/filter_version_riverSP_ex.py
new file mode 100644
index 0000000..a41f6ff
--- /dev/null
+++ b/src/filter_version_riverSP_ex.py
@@ -0,0 +1,99 @@
+"""
+Takes a directory of SWOT RiverSP data (Reach OR Node),
+filters it to find the best version of each granule, and
+writes out a JSON containing the best files for each directory.
+
+Currently built for PGC0 and PIC0 versions
+
+Usage:
+    python3 filter_version_riverSP_ex.py
+
+Authors: Elisa Friedmann and Fiona Bennitt
+Date: 2024-10-31
+"""
+
+import json
+import os
+
+import pandas as pd
+
+def filterVersionRiverSP(directories, outpath):
+    """
+    Reads in all filenames, sorts them, and retrieves the best version/
+    counter for each file (e.g. PGC0 over PIC0, PIC0_03 over PIC0_02).
+    Writes a JSON of those filenames to the outpath directory, for
+    filtering the files upon read-in.
+
+    Parameters:
+        directories (list): List of directories to search for best files.
+        outpath (string): Where to save the output JSONs.
+
+    Returns:
+        None
+    """
+
+    # Get all .shp file names from directories
+    for directory in directories:
+        # List to store all .shp file paths
+        shp_files = []
+        for file in os.listdir(directory):
+            if file.endswith(".shp"):
+                shp_files.append(file)
+
+        print(f"There are {str(len(shp_files))} original .shp files in directory.")
+
+        # Make DataFrame of filenames
+        granules = pd.DataFrame({'files': shp_files})
+        granules['cycle'] = granules['files'].str.slice(25, 28)
+        granules['pass'] = granules['files'].str.slice(29, 32)
+        granules['version'] = granules['files'].str.slice(-11, -7)
+        granules['counter'] = granules['files'].str.slice(-6, -4)
+
+        # Sort the files
+        granules = granules.sort_values(by=['cycle', 'pass', 'version', 'counter'],
+                                        ascending=[True, True, True, False])
+
+        # Keep only the best version of each granule
+        granules = granules.drop_duplicates(subset=['cycle', 'pass'],
+                                            keep='first')
+
+        # Extract the file names of files passing the test
+        best_files = list(granules['files'])
+
+        print(f"There are {str(len(best_files))} best .shp files in directory.")
+
+        # Extract base names (file name without extensions) from the list
+        base_names = set(os.path.splitext(os.path.basename(file))[0] for file in best_files)
+
+        all_best_files = []
+        # Loop over the directories to find all files with matching base names
+        # for directory in directories:
+        for file in os.listdir(directory):
+            # Get the file's base name (split on the first os.extsep rather
+            # than using os.path.splitext, which would strip only .xml from
+            # .shp.xml and cause those sidecar files to be missed)
+            base_name, extension = os.path.basename(file).split(os.extsep, 1)
+
+            # If the base name is in the list of best files,
+            # append that file to the list of files to keep
+            if base_name in base_names:
+                all_best_files.append(file)
+
+        # Split filepath for naming json
+        pieces = directory.split('/')
+
+        # Write out best files as json
+        with open(os.path.join(outpath, pieces[6] + '_' + pieces[7] + '_' +
+                               pieces[8] + '_filtered.json'),
+                  'w', encoding='utf-8') as f:
+            json.dump(all_best_files, f)
+
+        print(f"Wrote out the unique and most recently processed {str(len(all_best_files))} files to {outpath}{pieces[6]}_{pieces[7]}_{pieces[8]}_filtered.json")
+
+# Directories to filter
+dirs = ['/path/SA/Reach',
+        '/path/GR/Reach']
+# Outpath for json
+out = '/path/'
+
+filterVersionRiverSP(directories=dirs, outpath=out)
\ No newline at end of file

From 366fb163cf25448db892eebd5e4f103954243387 Mon Sep 17 00:00:00 2001
From: efried130
Date: Thu, 31 Oct 2024 17:01:58 +0000
Subject: [PATCH 2/3] Filter raw downloaded RiverSP or PIXC data for latest version-initial commit

---
 src/filter_version_PIXC_ex.py => filter_version_PIXC_ex.py       | 0
 src/filter_version_riverSP_ex.py => filter_version_riverSP_ex.py | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename src/filter_version_PIXC_ex.py => filter_version_PIXC_ex.py (100%)
 rename src/filter_version_riverSP_ex.py => filter_version_riverSP_ex.py (100%)

diff --git a/src/filter_version_PIXC_ex.py b/filter_version_PIXC_ex.py
similarity index 100%
rename from src/filter_version_PIXC_ex.py
rename to filter_version_PIXC_ex.py
diff --git a/src/filter_version_riverSP_ex.py b/filter_version_riverSP_ex.py
similarity index 100%
rename from src/filter_version_riverSP_ex.py
rename to filter_version_riverSP_ex.py
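Note on the selection step shared by filter_version_PIXC_ex.py and filter_version_riverSP_ex.py above: each granule field (cycle, pass, tile, processing version, counter) is cut from fixed character positions in the filename, the table is sorted ascending on every field except the counter, and drop_duplicates(keep='first') then retains one row per granule, preferring the later processing version (PGC0 over PIC0) and, within a version, the highest counter. The sketch below reproduces just that step on hypothetical PIXC-style filenames; the names and timestamps are invented for illustration, while the slice positions are the ones hard-coded in the script.

import pandas as pd

# Hypothetical PIXC-style filenames: same cycle/pass/tile, three candidate
# version/counter combinations. Only the fixed-width layout matters here.
files = [
    "SWOT_L2_HR_PIXC_016_086_120L_20240101T000000_20240101T000100_PIC0_01.nc",
    "SWOT_L2_HR_PIXC_016_086_120L_20240101T000000_20240101T000100_PIC0_02.nc",
    "SWOT_L2_HR_PIXC_016_086_120L_20240101T000000_20240101T000100_PGC0_01.nc",
]

granules = pd.DataFrame({'files': files})
granules['cycle'] = granules['files'].str.slice(16, 19)
granules['pass'] = granules['files'].str.slice(20, 23)
granules['tile'] = granules['files'].str.slice(24, 28)
granules['version'] = granules['files'].str.slice(-10, -6)
granules['counter'] = granules['files'].str.slice(-5, -3)

# Ascending version puts PGC0 before PIC0; descending counter puts _02 before _01.
best = (granules
        .sort_values(by=['cycle', 'pass', 'tile', 'version', 'counter'],
                     ascending=[True, True, True, True, False])
        .drop_duplicates(subset=['cycle', 'pass', 'tile'], keep='first'))

print(best['files'].tolist())  # keeps only the PGC0_01 file for this granule

With only PIC0 files present, the descending counter sort would keep the _02 file instead, matching the "PIC0_03 over PIC0_02" behaviour described in the docstrings.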
From 354da9897a2c0177262d0c8dc619956fa1360fc2 Mon Sep 17 00:00:00 2001
From: Ellie <90273239+efried130@users.noreply.github.com>
Date: Wed, 12 Mar 2025 17:35:03 -0400
Subject: [PATCH 3/3] Update filter_version_PIXC_ex.py

Updated directories/filepaths
---
 filter_version_PIXC_ex.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/filter_version_PIXC_ex.py b/filter_version_PIXC_ex.py
index 99c8dcd..78fa3c0 100644
--- a/filter_version_PIXC_ex.py
+++ b/filter_version_PIXC_ex.py
@@ -62,19 +62,20 @@ def filterVersionPIXC(directories, outpath):
 
         print(f"There are {str(len(best_files))} best files in directory.")
 
 
-        # Split filepath for naming json
-        pieces = dirs[0].split('/')
+        pieces = directory.split('/')
 
         # Write out best files as json
-        with open(os.path.join(outpath, pieces[5] + '_filtered.json'), 'w', encoding='utf-8') as f:
+        with open(os.path.join(outpath, pieces[-2] + '_filtered.json'), 'w', encoding='utf-8') as f:
             json.dump(best_files, f)
 
-        print(f"Wrote out the unique and most recently processed {str(len(best_files))} files to {outpath}{pieces[5]}_filtered.json")
+        print(f"Wrote out the unique and most recently processed {str(len(best_files))} files to {outpath}{pieces[-2]}_filtered.json\n")
 
-# Directories to filter
-dirs = ['/path_to_data_download/']
-# Outpath for json
-out = '/path_out/'
+if __name__ == "__main__":
+    # Directories to filter
+    dirs = ['/yourPath/file1',
+            '/yourPath/file2']
+    # Outpath for json
+    out = '/yourPath/'
 
-filterVersionPIXC(directories=dirs, outpath=out)
\ No newline at end of file
+    filterVersionPIXC(directories=dirs, outpath=out)
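A closing note on filter_version_riverSP_ex.py: once the best .shp files are chosen, every sidecar with the same base name (.dbf, .prj, .shp.xml, and so on) is carried along by splitting each filename on the first os.extsep, so that granule.shp.xml maps to the base granule rather than to granule.shp as os.path.splitext would give. A minimal sketch of that matching step, on made-up filenames rather than real RiverSP granules:

import os

# Made-up shapefile set: one selected granule ("granule") plus an unselected one.
names = ["granule.shp", "granule.shp.xml", "granule.dbf", "other.shp", "other.dbf"]
base_names = {"granule"}  # base names of the best .shp files

kept = []
for name in names:
    # Split on the first extension separator so .shp.xml sidecars keep the
    # same base name as their .shp file.
    base, _ext = name.split(os.extsep, 1)
    if base in base_names:
        kept.append(name)

print(kept)  # ['granule.shp', 'granule.shp.xml', 'granule.dbf']

A file with no extension would make the two-element unpacking raise ValueError, so the script implicitly assumes every file in the download directory carries at least one extension.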