From 7d9cb3a1c4ed7e9e2391e61fb34f27814991ab7e Mon Sep 17 00:00:00 2001
From: efried130
Date: Thu, 31 Oct 2024 16:55:23 +0000
Subject: [PATCH 1/3] Filter raw downloaded RiverSP or PIXC data for latest version-initial commit

---
 src/filter_version_PIXC_ex.py    | 80 ++++++++++++++++++++++++++
 src/filter_version_riverSP_ex.py | 99 ++++++++++++++++++++++++++++++++
 2 files changed, 179 insertions(+)
 create mode 100644 src/filter_version_PIXC_ex.py
 create mode 100644 src/filter_version_riverSP_ex.py

diff --git a/src/filter_version_PIXC_ex.py b/src/filter_version_PIXC_ex.py
new file mode 100644
index 0000000..99c8dcd
--- /dev/null
+++ b/src/filter_version_PIXC_ex.py
@@ -0,0 +1,80 @@
+"""
+Takes a list of directories of SWOT PIXC data, filters
+them to find the best version of each granule, and writes
+out a JSON containing the best files for each directory.
+
+Currently built for PGC0 and PIC0 versions
+
+Usage:
+    python3 filter_version_PIXC_ex.py
+
+Authors: Fiona Bennitt and Elisa Friedmann
+Date: 2024-10-31
+"""
+
+import json
+import os
+
+import pandas as pd
+
+def filterVersionPIXC(directories, outpath):
+    """
+    Reads in all filenames, sorts them, and retrieves the best version/
+    counter for each file (e.g. PGC0 over PIC0, PIC0_03 over PIC0_02).
+    Writes a JSON of those filenames to the outpath directory, for
+    filtering the files upon read-in.
+
+    Parameters:
+        directories (list): List of directories to search for best files.
+        outpath (string): Where to save the output JSON.
+
+    Returns:
+        None
+    """
+
+    # Get all file names from directories
+    for directory in directories:
+        # List to store all file paths
+        files = []
+        for file in os.listdir(directory):
+            files.append(file)
+
+        print(f"There are {str(len(files))} original files in directory.")
+
+        # Make DataFrame of filenames
+        granules = pd.DataFrame({'files': files})
+        granules['cycle'] = granules['files'].str.slice(16, 19)
+        granules['pass'] = granules['files'].str.slice(20, 23)
+        granules['tile'] = granules['files'].str.slice(24, 28)
+        granules['version'] = granules['files'].str.slice(-10, -6)
+        granules['counter'] = granules['files'].str.slice(-5, -3)
+
+        # Sort the files
+        granules = granules.sort_values(by=['cycle', 'pass', 'tile', 'version', 'counter'],
+                                        ascending=[True, True, True, True, False])
+
+        # Keep only the best version of each granule
+        granules = granules.drop_duplicates(subset=['cycle', 'pass', 'tile'],
+                                            keep='first')
+
+        # Extract the file names of files passing the test
+        best_files = list(granules['files'])
+
+        print(f"There are {str(len(best_files))} best files in directory.")
+
+
+        # Split filepath for naming json
+        pieces = dirs[0].split('/')
+
+        # Write out best files as json
+        with open(os.path.join(outpath, pieces[5] + '_filtered.json'), 'w', encoding='utf-8') as f:
+            json.dump(best_files, f)
+
+        print(f"Wrote out the unique and most recently processed {str(len(best_files))} files to {outpath}{pieces[5]}_filtered.json")
+
+# Directories to filter
+dirs = ['/path_to_data_download/']
+# Outpath for json
+out = '/path_out/'
+
+filterVersionPIXC(directories=dirs, outpath=out)
\ No newline at end of file
diff --git a/src/filter_version_riverSP_ex.py b/src/filter_version_riverSP_ex.py
new file mode 100644
index 0000000..a41f6ff
--- /dev/null
+++ b/src/filter_version_riverSP_ex.py
@@ -0,0 +1,99 @@
+"""
+Takes a directory of SWOT RiverSP data (Reach OR Node),
+filters it to find the best version of each granule, and
+writes out a JSON containing the best files for each directory.
+
+Currently built for PGC0 and PIC0 versions
+
+Usage:
+    python3 filter_version_riverSP_ex.py
+
+Authors: Elisa Friedmann and Fiona Bennitt
+Date: 2024-10-31
+"""
+
+import json
+import os
+
+import pandas as pd
+
+def filterVersionRiverSP(directories, outpath):
+    """
+    Reads in all filenames, sorts them, and retrieves the best version/
+    counter for each file (e.g. PGC0 over PIC0, PIC0_03 over PIC0_02).
+    Writes a JSON of those filenames to the outpath directory, for
+    filtering the files upon read-in.
+
+    Parameters:
+        directories (list): List of directories to search for best files.
+        outpath (string): Where to save the output JSONs.
+
+    Returns:
+        None
+    """
+
+    # Get all .shp file names from directories
+    for directory in directories:
+        # List to store all .shp file paths
+        shp_files = []
+        for file in os.listdir(directory):
+            if file.endswith(".shp"):
+                shp_files.append(file)
+
+        print(f"There are {str(len(shp_files))} original .shp files in directory.")
+
+        # Make DataFrame of filenames
+        granules = pd.DataFrame({'files': shp_files})
+        granules['cycle'] = granules['files'].str.slice(25, 28)
+        granules['pass'] = granules['files'].str.slice(29, 32)
+        granules['version'] = granules['files'].str.slice(-11, -7)
+        granules['counter'] = granules['files'].str.slice(-6, -4)
+
+        # Sort the files
+        granules = granules.sort_values(by=['cycle', 'pass', 'version', 'counter'],
+                                        ascending=[True, True, True, False])
+
+        # Keep only the best version of each granule
+        granules = granules.drop_duplicates(subset=['cycle', 'pass'],
+                                            keep='first')
+
+        # Extract the file names of files passing the test
+        best_files = list(granules['files'])
+
+        print(f"There are {str(len(best_files))} best .shp files in directory.")
+
+        # Extract base names (file name without extensions) from the list
+        base_names = set(os.path.splitext(os.path.basename(file))[0] for file in best_files)
+
+        all_best_files = []
+        # Loop over the directories to find all files with matching base names
+        # for directory in directories:
+        for file in os.listdir(directory):
+            # Get the file's base name (split on the first os.extsep rather
+            # than using os.path.splitext, which would strip only .xml from
+            # .shp.xml and cause those sidecar files to be missed)
+            base_name, extension = os.path.basename(file).split(os.extsep, 1)
+
+            # If the base name is in the list of best files,
+            # append that file to the list of files to keep
+            if base_name in base_names:
+                all_best_files.append(file)
+
+        # Split filepath for naming json
+        pieces = directory.split('/')
+
+        # Write out best files as json
+        with open(os.path.join(outpath, pieces[6] + '_' + pieces[7] + '_' +
+                               pieces[8] + '_filtered.json'),
+                  'w', encoding='utf-8') as f:
+            json.dump(all_best_files, f)
+
+        print(f"Wrote out the unique and most recently processed {str(len(all_best_files))} files to {outpath}{pieces[6]}_{pieces[7]}_{pieces[8]}_filtered.json")
+
+# Directories to filter
+dirs = ['/path/SA/Reach',
+        '/path/GR/Reach']
+# Outpath for json
+out = '/path/'
+
+filterVersionRiverSP(directories=dirs, outpath=out)
\ No newline at end of file

From 366fb163cf25448db892eebd5e4f103954243387 Mon Sep 17 00:00:00 2001
From: efried130
Date: Thu, 31 Oct 2024 17:01:58 +0000
Subject: [PATCH 2/3] Filter raw downloaded RiverSP or PIXC data for latest version-initial commit

---
 src/filter_version_PIXC_ex.py => filter_version_PIXC_ex.py       | 0
 src/filter_version_riverSP_ex.py => filter_version_riverSP_ex.py | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename src/filter_version_PIXC_ex.py => filter_version_PIXC_ex.py (100%)
 rename src/filter_version_riverSP_ex.py => filter_version_riverSP_ex.py (100%)

diff --git a/src/filter_version_PIXC_ex.py b/filter_version_PIXC_ex.py
similarity index 100%
rename from src/filter_version_PIXC_ex.py
rename to filter_version_PIXC_ex.py
diff --git a/src/filter_version_riverSP_ex.py b/filter_version_riverSP_ex.py
similarity index 100%
rename from src/filter_version_riverSP_ex.py
rename to filter_version_riverSP_ex.py
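Note on the selection step shared by filter_version_PIXC_ex.py and filter_version_riverSP_ex.py above: each granule field (cycle, pass, tile, processing version, counter) is cut from fixed character positions in the filename, the table is sorted ascending on every field except the counter, and drop_duplicates(keep='first') then retains one row per granule, preferring the later processing version (PGC0 over PIC0) and, within a version, the highest counter. The sketch below reproduces just that step on hypothetical PIXC-style filenames; the names and timestamps are invented for illustration, while the slice positions are the ones hard-coded in the script.

import pandas as pd

# Hypothetical PIXC-style filenames: same cycle/pass/tile, three candidate
# version/counter combinations. Only the fixed-width layout matters here.
files = [
    "SWOT_L2_HR_PIXC_016_086_120L_20240101T000000_20240101T000100_PIC0_01.nc",
    "SWOT_L2_HR_PIXC_016_086_120L_20240101T000000_20240101T000100_PIC0_02.nc",
    "SWOT_L2_HR_PIXC_016_086_120L_20240101T000000_20240101T000100_PGC0_01.nc",
]

granules = pd.DataFrame({'files': files})
granules['cycle'] = granules['files'].str.slice(16, 19)
granules['pass'] = granules['files'].str.slice(20, 23)
granules['tile'] = granules['files'].str.slice(24, 28)
granules['version'] = granules['files'].str.slice(-10, -6)
granules['counter'] = granules['files'].str.slice(-5, -3)

# Ascending version puts PGC0 before PIC0; descending counter puts _02 before _01.
best = (granules
        .sort_values(by=['cycle', 'pass', 'tile', 'version', 'counter'],
                     ascending=[True, True, True, True, False])
        .drop_duplicates(subset=['cycle', 'pass', 'tile'], keep='first'))

print(best['files'].tolist())  # keeps only the PGC0_01 file for this granule

With only PIC0 files present, the descending counter sort would keep the _02 file instead, matching the "PIC0_03 over PIC0_02" behaviour described in the docstrings.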
From 354da9897a2c0177262d0c8dc619956fa1360fc2 Mon Sep 17 00:00:00 2001
From: Ellie <90273239+efried130@users.noreply.github.com>
Date: Wed, 12 Mar 2025 17:35:03 -0400
Subject: [PATCH 3/3] Update filter_version_PIXC_ex.py

Updated directories/filepaths
---
 filter_version_PIXC_ex.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/filter_version_PIXC_ex.py b/filter_version_PIXC_ex.py
index 99c8dcd..78fa3c0 100644
--- a/filter_version_PIXC_ex.py
+++ b/filter_version_PIXC_ex.py
@@ -62,19 +62,20 @@ def filterVersionPIXC(directories, outpath):
 
         print(f"There are {str(len(best_files))} best files in directory.")
 
 
-        # Split filepath for naming json
-        pieces = dirs[0].split('/')
+        pieces = directory.split('/')
 
         # Write out best files as json
-        with open(os.path.join(outpath, pieces[5] + '_filtered.json'), 'w', encoding='utf-8') as f:
+        with open(os.path.join(outpath, pieces[-2] + '_filtered.json'), 'w', encoding='utf-8') as f:
             json.dump(best_files, f)
 
-        print(f"Wrote out the unique and most recently processed {str(len(best_files))} files to {outpath}{pieces[5]}_filtered.json")
+        print(f"Wrote out the unique and most recently processed {str(len(best_files))} files to {outpath}{pieces[-2]}_filtered.json\n")
 
-# Directories to filter
-dirs = ['/path_to_data_download/']
-# Outpath for json
-out = '/path_out/'
+if __name__ == "__main__":
+    # Directories to filter
+    dirs = ['/yourPath/file1',
+            '/yourPath/file2']
+    # Outpath for json
+    out = '/yourPath/'
 
-filterVersionPIXC(directories=dirs, outpath=out)
\ No newline at end of file
+    filterVersionPIXC(directories=dirs, outpath=out)
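A closing note on filter_version_riverSP_ex.py: once the best .shp files are chosen, every sidecar with the same base name (.dbf, .prj, .shp.xml, and so on) is carried along by splitting each filename on the first os.extsep, so that granule.shp.xml maps to the base granule rather than to granule.shp as os.path.splitext would give. A minimal sketch of that matching step, on made-up filenames rather than real RiverSP granules:

import os

# Made-up shapefile set: one selected granule ("granule") plus an unselected one.
names = ["granule.shp", "granule.shp.xml", "granule.dbf", "other.shp", "other.dbf"]
base_names = {"granule"}  # base names of the best .shp files

kept = []
for name in names:
    # Split on the first extension separator so .shp.xml sidecars keep the
    # same base name as their .shp file.
    base, _ext = name.split(os.extsep, 1)
    if base in base_names:
        kept.append(name)

print(kept)  # ['granule.shp', 'granule.shp.xml', 'granule.dbf']

A file with no extension would make the two-element unpacking raise ValueError, so the script implicitly assumes every file in the download directory carries at least one extension.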