Skip to content

Commit 33cb775

Browse files
committed
experimental method avoiding pangeo
1 parent 7b42ebd commit 33cb775

File tree

3 files changed

+472
-55
lines changed

3 files changed

+472
-55
lines changed

stitches/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from .make_pangeo_table import make_pangeo_comparison, make_pangeo_table
1818
from .make_tas_archive import make_tas_archive
1919
from .package_data import fetch_quickstarter_data
20+
from .fx_esgf_api import get_recipe_entry_data
2021

2122
__all__ = [
2223
"match_neighborhood",
@@ -34,5 +35,6 @@
3435
"make_pangeo_table",
3536
"make_tas_archive",
3637
"fetch_quickstarter_data",
38+
"get_recipe_entry_data",
3739
"__version__",
3840
]

stitches/fx_esgf_api.py

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
"""Module for working with the ESGF database through their API.
2+
3+
# https://docs.google.com/document/d/1pxz1Kd3JHfFp8vR2JCVBfApbsHmbUQQstifhGNdc6U0/edit?usp=sharing
4+
# API AT: https://github.com/ESGF/esgf.github.io/wiki/ESGF_Search_REST_API#results-pagination
5+
"""
6+
7+
from __future__ import print_function
8+
import requests
9+
import xml.etree.ElementTree as ET
10+
import pandas as pd
11+
import xarray as xr
12+
import numpy as np
13+
14+
15+
# Author: Unknown
16+
# I got the original version from a word document published by ESGF
17+
# https://docs.google.com/document/d/1pxz1Kd3JHfFp8vR2JCVBfApbsHmbUQQstifhGNdc6U0/edit?usp=sharing
18+
# API AT: https://github.com/ESGF/esgf.github.io/wiki/ESGF_Search_REST_API#results-pagination
19+
def esgf_search(server="https://esgf-node.llnl.gov/esg-search/search",
                files_type="OPENDAP", local_node=True, project="CMIP6",
                verbose=False, format="application%2Fsolr%2Bjson",
                use_csrf=False, **search):
    """
    Query the ESGF search API and collect the URLs of all matching files.

    The search is paged: requests are repeated with an increasing ``offset``
    until the number of documents reported by the server (``numFound``) has
    been consumed, or the server returns an empty page.

    Parameters:
    - server: base URL of the ESGF search endpoint
    - files_type: access type to keep (compared, upper-cased, against the last
      ``|``-separated field of each entry in a document's ``url`` list)
    - local_node: if True, ``distrib=false`` is added to the query
      (NOTE(review): presumably restricts the search to the queried node -
      confirm against the ESGF search API documentation)
    - project: value of the ``project`` search facet
    - verbose: if True, print every key/value of every returned document
    - format: value of the ``format`` query parameter; it is inserted into the
      URL as-is, so it must already be URL-encoded
    - use_csrf: if True, fetch a CSRF token from the server first and send it
      as ``csrfmiddlewaretoken``
    - **search: additional facets, added to the query string verbatim
    Returns:
    - sorted list of file URLs of the requested access type, with any
      ``.html`` suffix (and whatever follows it) removed
    Raises:
    - requests.HTTPError: if the server answers a page request with an error status
    """
    payload = search
    payload["project"] = project
    payload["type"] = "File"
    if local_node:
        payload["distrib"] = "false"

    files_type = files_type.upper()
    all_files = []

    # The context manager releases the pooled connections even when a
    # request fails part-way through the paging loop.
    with requests.session() as client:
        if use_csrf:
            client.get(server)
            if 'csrftoken' in client.cookies:
                # Django 1.6 and up
                csrftoken = client.cookies['csrftoken']
            else:
                # older versions
                csrftoken = client.cookies['csrf']
            payload["csrfmiddlewaretoken"] = csrftoken

        payload["format"] = format

        offset = 0
        # Placeholder so the loop is entered; replaced by the server-reported
        # total after the first response.
        numFound = 10000
        while offset < numFound:
            payload["offset"] = offset
            # The query string is assembled by hand (no re-encoding) because
            # the default `format` value is already percent-encoded.
            query = "&".join("{}={}".format(k, v) for k, v in payload.items())
            url = "{}/?{}".format(server, query)
            print(f'\t\t - url: {url}', flush=True)
            r = client.get(url)
            r.raise_for_status()
            resp = r.json()["response"]
            numFound = int(resp["numFound"])
            docs = resp["docs"]
            if not docs:
                # An empty page would leave `offset` unchanged and make the
                # loop spin forever; stop with what has been collected so far.
                break
            offset += len(docs)
            for d in docs:
                if verbose:
                    for k in d:
                        print("{}: {}".format(k, d[k]))
                for f in d["url"]:
                    sp = f.split("|")
                    if sp[-1] == files_type:
                        all_files.append(sp[0].split(".html")[0])

    return sorted(all_files)
69+
70+
71+
def get_df_from_esgf(result_df, archive_start_year, archive_end_year, time_chunk = 100 ):
    """
    Download data from an ESGF search using Xarray OpenDAP functionality

    Data nodes are tried in the order they appear in ``result_df``. A node is
    only tried when the files it hosts span the requested period, and the
    first node that opens successfully is used.

    Parameters:
    - result_df: formatted pandas dataframe of results from ESGF search
      (columns used here: 'node', 'start', 'end', 'dap_link')
    - archive_start_year: start of period which must be contained in result
    - archive_end_year: end of period which must be contained in result
    - time_chunk: chunk size along the 'time' dimension passed to
      xarray.open_mfdataset
    Returns:
    - df: Xarray.DataSet associated to the ESGF search results
    Raises:
    - Exception: if no node both covers the period and could be opened; when
      an open attempt failed, the last such error is attached as the cause
    """
    df = None
    used_node = None
    last_error = None

    for data_node in result_df['node'].unique():
        node_df = result_df[result_df['node'] == data_node]
        # Cast before taking min/max so years compare numerically even when
        # the columns hold strings.
        min_year = int(node_df['start'].astype(int).min())
        max_year = int(node_df['end'].astype(int).max())

        # Skip nodes whose files do not cover our desired period
        if min_year > archive_start_year or max_year < archive_end_year:
            continue

        # Try downloading
        print(f'\t\t - Trying to access data from: {data_node}')
        try:
            df = xr.open_mfdataset(node_df['dap_link'].values, chunks={'time': time_chunk})
        except Exception as e:
            # Best effort: remember why this node failed and move on to the next one.
            last_error = e
            print(f'\t\t - failed using node: {data_node}')
            print(f'\t\t   reason: {e!r}')
            continue

        used_node = data_node
        break

    if df is None:
        raise Exception("Could not find appropriate data using ESGF API") from last_error

    print(f'\t\t - success using node: {used_node}')
    return df
109+
110+
111+
def format_esgf_result(result):
    """
    Formats and extracts metadata from the list of links returned by the ESGF API search call.

    Each link is split on '/' to get the host (third component) and the file
    name (last component). The file name must have at least five
    '_'-separated fields (the fifth is taken as the ensemble member) and end,
    before its first '.', with a ``<start>-<end>`` field whose first four
    characters on each side are taken as the years. Links that do not fit
    this layout are skipped with a printed notice instead of aborting the
    whole listing.

    Parameters:
    - result: list of OpenDAP links resulting from ESGF API search call
    Returns:
    - result_df: formatted pandas dataframe of OpenDAP links + extracted metadata like year info and host data node
      (columns 'node', 'file', 'start', 'end', 'dap_link', 'ensemble_member';
      'start' and 'end' are four-character strings, not integers)
    """
    columns = ['node', 'file', 'start', 'end', 'dap_link', 'ensemble_member']
    rows = []
    for dap_link in result:
        url_parts = dap_link.split('/')
        filename = url_parts[-1]
        name_fields = filename.split('_')
        year_string = filename.split('.')[0].split('_')[-1]
        start_stamp, dash, remainder = year_string.partition('-')

        if len(url_parts) < 3 or len(name_fields) < 5 or not dash:
            # No host, no ensemble-member field or no start-end range: the
            # entry cannot be described, so leave it out of the table.
            print(f'\t\t - skipping link with unexpected format: {dap_link}')
            continue

        # Only the text up to the next '-' belongs to the end stamp.
        end_stamp = remainder.split('-')[0]
        rows.append([
            url_parts[2],
            filename,
            start_stamp[0:4],
            end_stamp[0:4],
            dap_link,
            name_fields[4],
        ])

    result_df = pd.DataFrame(rows, columns=columns)

    return result_df
133+
134+
135+
def get_recipe_entry_data(row, res='day', variable = 'tas'):
    """
    Downloads the data associated to a given entry in the recipe

    Runs an ESGF search for the archive model, experiment and ensemble named
    in the recipe entry, tabulates the returned links, and opens the data
    from a node covering the entry's archive period.

    Parameters:
    - row: pandas dataframe row which IDs the data we're currently interested in
      (attributes read: archive_start_yr, archive_end_yr, target_start_yr,
      target_end_yr, archive_experiment, archive_model, archive_ensemble)
    - res: value sent as the ``table_id`` search facet
    - variable: value sent as the ``variable_id`` search facet
    Returns:
    - df: Xarray.Dataset associated to the given recipe entry
    """
    # Progress message
    print(f'\t - Searching ESGF for archive data {row.archive_start_yr}-{row.archive_end_yr} to use as target period {row.target_start_yr}-{row.target_end_yr}', flush=True)

    # Search facets, in the order they are placed in the query
    facets = {
        "table_id": res,
        "variable_id": variable,
        "experiment_id": row.archive_experiment,
        "source_id": row.archive_model,
        "member_id": row.archive_ensemble,
    }
    links = esgf_search(**facets)

    # Tabulate the links, then open the data from a node that contains the
    # period required by the recipe
    link_table = format_esgf_result(links)
    return get_df_from_esgf(link_table, row.archive_start_yr, row.archive_end_yr)

0 commit comments

Comments
 (0)