1+ """Module for working with the ESGF database through their API.
2+
3+ # https://docs.google.com/document/d/1pxz1Kd3JHfFp8vR2JCVBfApbsHmbUQQstifhGNdc6U0/edit?usp=sharing
4+ # API AT: https://github.com/ESGF/esgf.github.io/wiki/ESGF_Search_REST_API#results-pagination
5+ """
6+
7+ from __future__ import print_function
8+ import requests
9+ import xml .etree .ElementTree as ET
10+ import pandas as pd
11+ import xarray as xr
12+ import numpy as np
13+
14+
15+ # Author: Unknown
16+ # I got the original version from a word document published by ESGF
17+ # https://docs.google.com/document/d/1pxz1Kd3JHfFp8vR2JCVBfApbsHmbUQQstifhGNdc6U0/edit?usp=sharing
18+ # API AT: https://github.com/ESGF/esgf.github.io/wiki/ESGF_Search_REST_API#results-pagination
def esgf_search(server="https://esgf-node.llnl.gov/esg-search/search",
                files_type="OPENDAP", local_node=True, project="CMIP6",
                verbose=False, format="application%2Fsolr%2Bjson",
                use_csrf=False, **search):
    """
    Query an ESGF search endpoint and collect the file URLs of one access type.

    The search is paginated: requests are repeated with an increasing
    ``offset`` until ``numFound`` results (as reported by the server) have
    been read, or until the server returns an empty page.

    Parameters:
    - server: base URL of the ESGF search service
    - files_type: access type to keep (compared upper-cased against the last
      ``|``-separated field of each entry in a document's ``url`` list)
    - local_node: if true, ``distrib=false`` is added to the query
    - project: value sent as the ``project`` query parameter
    - verbose: if true, print every key/value of every returned document
    - format: value sent as the ``format`` query parameter; it is inserted
      into the URL as-is, so it must already be URL-encoded
    - use_csrf: if true, fetch ``server`` once first and forward the CSRF
      cookie as ``csrfmiddlewaretoken``
    - **search: additional query parameters, inserted into the URL as-is
    Returns:
    - sorted list of matching URLs, each truncated at the first ".html"
    Raises:
    - requests.HTTPError (via ``raise_for_status``) on an HTTP error status
    - KeyError if ``use_csrf`` is set and neither CSRF cookie is present
    """
    payload = search
    payload["project"] = project
    payload["type"] = "File"
    if local_node:
        payload["distrib"] = "false"

    files_type = files_type.upper()
    all_files = []

    # The context manager guarantees the session (and its pooled connections)
    # is closed even when a request raises.
    with requests.Session() as client:
        if use_csrf:
            client.get(server)
            if 'csrftoken' in client.cookies:
                # Django 1.6 and up
                csrftoken = client.cookies['csrftoken']
            else:
                # older versions
                csrftoken = client.cookies['csrf']
            payload["csrfmiddlewaretoken"] = csrftoken

        payload["format"] = format

        offset = 0
        # Placeholder so the loop is entered; replaced by the server's count
        # after the first response.
        numFound = 10000
        while offset < numFound:
            payload["offset"] = offset
            # The query string is assembled by hand (no URL-encoding) because
            # values such as the default `format` are already encoded.
            query = "&".join("{}={}".format(k, v) for k, v in payload.items())
            url = "{}/?{}".format(server, query)
            print(f'\t \t - url: {url} ', flush=True)
            r = client.get(url)
            r.raise_for_status()
            resp = r.json()["response"]
            numFound = int(resp["numFound"])
            docs = resp["docs"]
            if not docs:
                # An empty page would leave `offset` unchanged and loop
                # forever; stop with whatever has been collected.
                break
            offset += len(docs)
            for d in docs:
                if verbose:
                    for k in d:
                        print("{}: {}".format(k, d[k]))
                for f in d["url"]:
                    sp = f.split("|")
                    if sp[-1] == files_type:
                        all_files.append(sp[0].split(".html")[0])

    return sorted(all_files)
69+
70+
def get_df_from_esgf(result_df, archive_start_year, archive_end_year, time_chunk=100):
    """
    Download data from an ESGF search using Xarray OpenDAP functionality

    Data nodes are tried in the order they first appear in `result_df`; the
    first node whose files span the requested period and can be opened wins.

    Parameters:
    - result_df: formatted pandas dataframe of results from ESGF search
      (uses the 'node', 'start', 'end' and 'dap_link' columns)
    - archive_start_year: start of period which must be contained in result
    - archive_end_year: end of period which must be contained in result
    - time_chunk: chunk size along 'time' passed to xarray.open_mfdataset
    Returns:
    - df: Xarray.DataSet associated to the ESGF search results
    Raises:
    - Exception: if no node both covers the period and can be opened; the
      error from the last failed node (if any) is attached as the cause
    """
    last_error = None

    for data_node in result_df['node'].unique():
        node_df = result_df[result_df['node'] == data_node]
        # Convert before taking min/max so years held as strings are compared
        # numerically rather than lexicographically.
        min_year = int(node_df['start'].astype(int).min())
        max_year = int(node_df['end'].astype(int).max())

        # Skip nodes whose files do not cover the desired period
        if min_year > archive_start_year or max_year < archive_end_year:
            continue

        try:
            print(f'\t \t - Trying to access data from: {data_node} ')
            df = xr.open_mfdataset(node_df['dap_link'].values, chunks={'time': time_chunk})
        except Exception as e:
            # Best effort: remember why this node failed and try the next one
            print(f'\t \t - failed using node: {data_node} ')
            last_error = e
            continue

        print(f'\t \t - success using node: {data_node} ')
        return df

    raise Exception("Could not find appropriate data using ESGF API") from last_error
109+
110+
def format_esgf_result(result):
    """
    Turn the list of links returned by the ESGF API search call into a table
    of links plus metadata parsed out of each link.

    Parameters:
    - result: list of OpenDAP links resulting from ESGF API search call
    Returns:
    - result_df: pandas dataframe with one row per link and the columns
      'node', 'file', 'start', 'end', 'dap_link', 'ensemble_member'
      ('start' and 'end' are the 4-character year prefixes, kept as strings)
    """
    column_names = ['node', 'file', 'start', 'end', 'dap_link', 'ensemble_member']

    def _describe(link):
        # Host is the third '/'-separated piece of the link, the file name the last one
        pieces = link.split('/')
        host = pieces[2]
        fname = pieces[-1]
        # Fifth '_'-separated field of the file name
        member = fname.split('_')[4]
        # Trailing '_' field of the name (before the first '.') holds "<start>-<end>"
        span = fname.split('.')[0].split('_')[-1].split('-')
        first_year = span[0][0:4]
        last_year = span[1][0:4]
        return [host, fname, first_year, last_year, link, member]

    records = [_describe(link) for link in result]
    return pd.DataFrame(records, columns=column_names)
133+
134+
def get_recipe_entry_data(row, res='day', variable='tas'):
    """
    Fetch the dataset that corresponds to one entry of the recipe

    Parameters:
    - row: pandas dataframe row which IDs the data we're currently interested in
      (reads archive_experiment, archive_model, archive_ensemble,
      archive_start_yr, archive_end_yr, target_start_yr, target_end_yr)
    - res: value sent to the ESGF search as `table_id`
    - variable: value sent to the ESGF search as `variable_id`
    Returns:
    - df: Xarray.Dataset associated to the given recipe entry
    """
    # Progress message
    print(f'\t - Searching ESGF for archive data {row.archive_start_yr} -{row.archive_end_yr} to use as target period {row.target_start_yr} -{row.target_end_yr} ', flush=True)

    # Search constraints identifying the archive data for this entry
    query = {
        'table_id': res,
        'variable_id': variable,
        'experiment_id': row.archive_experiment,
        'source_id': row.archive_model,
        'member_id': row.archive_ensemble,
    }

    # Search, tabulate the links, then open a node covering the archive period
    links = esgf_search(**query)
    catalogue = format_esgf_result(links)
    return get_df_from_esgf(catalogue, row.archive_start_yr, row.archive_end_yr)
0 commit comments