From a8033cdb5324116fe59664c65c0af2d2114c0fe6 Mon Sep 17 00:00:00 2001 From: rileykk Date: Mon, 16 Jun 2025 12:56:11 -0700 Subject: [PATCH 01/21] git ignore --- cmr_s3_subscriber/.gitignore | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 cmr_s3_subscriber/.gitignore diff --git a/cmr_s3_subscriber/.gitignore b/cmr_s3_subscriber/.gitignore new file mode 100644 index 0000000..5adeb77 --- /dev/null +++ b/cmr_s3_subscriber/.gitignore @@ -0,0 +1,7 @@ +lambda/*.zip +subscriptions/ +terraform/.terraform/ +terraform/.terraform.lock.hcl +terraform/config.yaml +terraform/terraform.tfstate* +terraform/*.tfvars From acf8d0408b028c5fb34fa0987cebd2c0e08e7659 Mon Sep 17 00:00:00 2001 From: rileykk Date: Mon, 16 Jun 2025 12:57:23 -0700 Subject: [PATCH 02/21] initial commit --- cmr_s3_subscriber/delete.py | 90 ++++ cmr_s3_subscriber/lambda/Makefile | 26 ++ cmr_s3_subscriber/lambda/lambda_function.py | 394 ++++++++++++++++++ cmr_s3_subscriber/requirements.txt | 5 + cmr_s3_subscriber/requirements_maap.txt | 38 ++ cmr_s3_subscriber/subscriber.py | 113 +++++ cmr_s3_subscriber/terraform/iam.tf | 100 +++++ cmr_s3_subscriber/terraform/main.tf | 145 +++++++ cmr_s3_subscriber/terraform/outputs.tf | 32 ++ cmr_s3_subscriber/terraform/provider.tf | 44 ++ cmr_s3_subscriber/terraform/s3.tf | 16 + .../terraform/sqs_backfill_queue.tf | 56 +++ .../terraform/subscription/main.tf | 73 ++++ .../terraform/subscription/outputs.tf | 7 + .../terraform/subscription/variables.tf | 15 + .../terraform/subscription/versions.tf | 14 + cmr_s3_subscriber/terraform/variables.tf | 72 ++++ cmr_s3_subscriber/terraform/versions.tf | 13 + cmr_s3_subscriber/test_messages/all_four.json | 26 ++ .../test_messages/sample_umm.json | 363 ++++++++++++++++ .../test_messages/single_conf.json | 8 + .../test_messages/single_invalid.json | 11 + .../single_invalid_no_retry.json | 11 + .../test_messages/single_record.json | 8 + 24 files changed, 1680 insertions(+) create mode 100644 cmr_s3_subscriber/delete.py create 
"""Delete a CMR ingest subscription.

The subscription is identified either directly by its native ID or through the
XML response that ``subscriber.py`` stored when the subscription was created.
"""
import argparse
import os
from xml.etree import ElementTree as ET

import earthaccess
import requests
import yaml

CMR_SUBSCRIPTIONS_URL = 'https://cmr.earthdata.nasa.gov/ingest/subscriptions'

# Seconds to wait for CMR before giving up; requests has no default timeout.
REQUEST_TIMEOUT = 60


def parse_args():
    """Parse the command line.

    Returns:
        argparse.Namespace with ``config``, ``native_id``, ``response_xml``
        and ``dryrun`` attributes.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        'config',
        # nargs='?' is what makes the default reachable: a plain positional
        # is always required, so its default would never be used.
        nargs='?',
        default='config.yaml',
        help='Configuration file'
    )

    sub_id = parser.add_mutually_exclusive_group(required=True)

    sub_id.add_argument(
        '--native_id',
        help='Native ID to delete'
    )

    sub_id.add_argument(
        '--response-xml',
        help='XML file from subscriber.py from which to get native ID to delete'
    )

    parser.add_argument(
        '--dryrun',
        action='store_true',
        help='Do not make CMR API calls except for auth'
    )

    return parser.parse_args()


def _read_native_id(xml_path):
    """Return the text of the ``native-id`` element of a stored CMR response.

    Raises:
        ValueError: the file has no (or an empty) ``native-id`` element.
    """
    node = ET.parse(xml_path).getroot().find('native-id')

    if node is None or not node.text:
        raise ValueError(f'No native-id element found in {xml_path}')

    return node.text


def _redacted(headers):
    """Return a copy of *headers* that is safe to print (no bearer token)."""
    return {k: ('Bearer ***' if k == 'Authorization' else v) for k, v in headers.items()}


def main(args):
    """Log in to Earthdata and delete the requested subscription.

    Args:
        args: namespace produced by :func:`parse_args`.

    Outcomes are reported on stdout; HTTP and connection errors are printed
    rather than raised.
    """
    with open(args.config, 'r') as f:
        config = yaml.safe_load(f)

    # earthaccess' "environment" strategy reads the credentials from these.
    os.environ['EARTHDATA_USERNAME'] = config['edl_username']
    os.environ['EARTHDATA_PASSWORD'] = config['edl_password']

    auth = earthaccess.login(strategy='environment')
    bearer_token = auth.token['access_token']

    if args.native_id is not None:
        nid = args.native_id
    else:
        nid = _read_native_id(args.response_xml)

    url = f'{CMR_SUBSCRIPTIONS_URL}/{nid}'

    headers = {
        "Authorization": f"Bearer {bearer_token}",
    }

    if args.dryrun:
        print(f"Would issue a DELETE request to {url}")
        # Never echo the live token: dry-run output ends up in terminal
        # scrollback and CI/Terraform logs.
        print(f'headers={_redacted(headers)!r}')
        return

    try:
        response = requests.delete(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        print(f"Error making subscription delete request: {e}")
        return

    if not response.ok:
        print(f"Error deleting subscription. Status code: {response.status_code}")
        print(f"Response: {response.text}")
        return

    print("Successfully deleted CMR subscription")
    print(f"Response: {response.text}")

    if args.response_xml is not None:
        # Best effort: the subscription is already gone, so a stale XML file
        # is only a nuisance and must not turn success into failure.
        try:
            os.unlink(args.response_xml)
        except OSError as e:
            print(f'Could not delete XML file: {e}')


if __name__ == '__main__':
    main(parse_args())
"""AWS Lambda that stages granules announced by CMR subscriptions.

Each SQS record carries an SNS envelope from CMR. Notifications are resolved
to UMM-G metadata, the granule's files are downloaded (direct S3 access first,
HTTPS as fallback), checksum-verified and uploaded to the destination bucket,
or handed to MAAP when MAAP settings are present. Subscription confirmations
and unprocessable messages are forwarded to an SNS topic.
"""
import hashlib
import json
import logging
import os
from urllib.parse import urlparse

import boto3
import earthaccess
import requests

logging.basicConfig(level=logging.DEBUG)

s3 = boto3.client('s3')
sns = boto3.client('sns')
ddb = boto3.client('dynamodb')


SNS_ARN = os.environ['SNS_ARN']
DDB_ARN = os.environ['DDB_ARN']


CHECKSUMS = {
    "MD5": hashlib.md5,
    "SHA-1": hashlib.sha1,
    "SHA-256": hashlib.sha256,
    "SHA-384": hashlib.sha384,
    "SHA-512": hashlib.sha512,
}

# Notification bodies are cut to this many characters before publishing
# (json.dumps emits ASCII by default, so characters == bytes here).
SNS_MAX_MESSAGE_LENGTH = 262144

# A record is reported back to SQS for redelivery until it has been received
# more than this many times; after that it is failed out to SNS instead.
MAX_RECEIVE_COUNT = 5

# (connect, read) timeouts in seconds; requests never times out by default.
HTTP_TIMEOUT = (10, 300)

# Prefix marking environment variables that are forwarded to MAAP as kwargs.
MAAP_KWARG_PREFIX = '_maap_kwarg_'


try:
    import requests_mock  # noqa: F401  (optional, only used for local testing)
except ImportError:
    print('Request mocking disabled')


def _set_edl_environ():
    """Load the secret named by ``SECRET_ARN`` into the process environment.

    ``EARTHDATA_USER`` / ``EARTHDATA_PASSWORD`` become the variables the
    earthaccess "environment" login strategy reads; every other key of the
    secret is exported under its own name.
    """
    client = boto3.client('secretsmanager')
    secret = client.get_secret_value(SecretId=os.environ['SECRET_ARN'])
    secret_json = json.loads(secret['SecretString'])

    os.environ['EARTHDATA_USERNAME'] = secret_json.pop('EARTHDATA_USER')
    os.environ['EARTHDATA_PASSWORD'] = secret_json.pop('EARTHDATA_PASSWORD')

    for secret_k, secret_v in secret_json.items():
        # os.environ only accepts strings; the secret is free-form JSON.
        os.environ[secret_k] = secret_v if isinstance(secret_v, str) else str(secret_v)


def _process_record(record, bearer_token):
    """Dispatch one decoded SNS envelope according to its ``Type`` field."""
    print(f'Processing record: {record}')

    message_type = record['Type']

    if message_type == 'Notification':
        _handle_cmr_notification(json.loads(record['Message']), bearer_token)
    elif message_type == 'SubscriptionConfirmation':
        _forward_confirmation(record)
    else:
        print(f'Unknown message type: {message_type}')
        _fail_out_record(record, reason='Unknown message type')


def _sns_send(subject, message):
    """Publish *message* (JSON-encoded and length-capped) to the notify topic."""
    resp = sns.publish(
        TopicArn=SNS_ARN,
        Subject=subject,
        Message=json.dumps(message, indent=2)[:SNS_MAX_MESSAGE_LENGTH],
    )

    print(resp)


def _forward_confirmation(record):
    """Forward a subscription confirmation so that a human can confirm it."""
    print('Forwarding subscription confirmation by SNS')
    _sns_send(
        '[FireAlarm] Confirm forward ingest CMR subscription',
        record
    )


def _fail_out_record(record, reason='exceeded retries'):
    """Report a record that will not be processed (again) to the notify topic."""
    print('Publishing record failure to SNS')
    _sns_send(
        f'[FireAlarm] CMR forward ingest job failed ({reason})',
        record
    )


def _submit_maap_job(short_name, granule_ur):
    """Submit a MAAP job for the granule and return the job ID.

    The algorithm, version and queue come from ``MAAP_ALGO_ID``,
    ``MAAP_ALGO_VERSION`` and ``MAAP_QUEUE``; environment variables starting
    with ``_maap_kwarg_`` are passed through as extra job arguments.

    Raises:
        RuntimeError: the granule could not be found through MAAP.
    """
    # Imported lazily: maap-py is only bundled in the MAAP flavour of the package.
    from maap.maap import MAAP

    maap = MAAP()

    granules = maap.searchGranule(
        short_name=short_name,
        readable_granule_name=granule_ur,
        cmr_host="cmr.earthdata.nasa.gov",
        limit=1
    )

    if not granules:
        raise RuntimeError(f'Granule {granule_ur} of collection {short_name} not found through MAAP')

    ccid = granules[0]['concept-id']

    algo = os.environ['MAAP_ALGO_ID']
    algo_version = os.environ['MAAP_ALGO_VERSION']
    queue = os.environ['MAAP_QUEUE']

    kwargs = {
        name.removeprefix(MAAP_KWARG_PREFIX): value
        for name, value in os.environ.items()
        if name.startswith(MAAP_KWARG_PREFIX)
    }

    job = maap.submitJob(
        identifier=granule_ur,
        algo_id=algo,
        version=algo_version,
        queue=queue,
        granule_id=granule_ur,
        collection_id=ccid,
        **kwargs
    )

    print(f'Submitted job {job.id}')

    return job.id


def _remove_quietly(path):
    """Delete *path* if it exists; /tmp is small and survives warm starts."""
    try:
        os.unlink(path)
    except OSError:
        pass


def _get_daac_s3_client(creds_url, bearer_token):
    """Return an S3 client using temporary DAAC credentials, or None."""
    if creds_url is None:
        return None

    print('Attempting to get temporary S3 credentials from the DAAC...')
    creds = _try_get_s3_creds(creds_url, bearer_token)

    if creds is None:
        print('Could not get temporary S3 credentials, using HTTP')
        return None

    print(f'Got temporary S3 credentials: {creds["accessKeyId"]} (exp: {creds["expiration"]})')
    return boto3.client(
        's3',
        aws_access_key_id=creds['accessKeyId'],
        aws_secret_access_key=creds['secretAccessKey'],
        aws_session_token=creds['sessionToken'],
    )


def _handle_cmr_notification(message, bearer_token):
    """Stage (or submit to MAAP) the granule a CMR notification refers to.

    Args:
        message: decoded CMR notification; ``location`` and ``granule-ur``
            are read from it.
        bearer_token: Earthdata Login token used for protected downloads.

    Returns:
        The MAAP job ID when ``MAAP_PGT`` is set, otherwise None.

    Raises:
        requests.HTTPError: the metadata request failed (other than 404).
        RuntimeError: a file could not be downloaded or failed verification.
    """
    print(f'Handling CMR notification: {message}')

    granule_metadata_url = f'{message["location"]}.umm_json'

    print(f'Getting UMM metadata for granule {message["granule-ur"]} from {granule_metadata_url}')

    umm_response = requests.get(granule_metadata_url, timeout=HTTP_TIMEOUT)

    if umm_response.status_code == 404:
        print(f'No record exists for granule {message["granule-ur"]} ({granule_metadata_url}). It was likely '
              f'superseded and will thus be skipped')
        return

    umm_response.raise_for_status()

    processed_umm = _process_umm(umm_response.json())

    print(json.dumps(processed_umm, indent=2))

    if 'MAAP_PGT' in os.environ:
        print('Submitting job through MAAP instead of staging in this function')
        return _submit_maap_job(processed_umm['collection'], message["granule-ur"])

    print(f'Downloading files for granule {processed_umm["granule"]}')

    s3_client = _get_daac_s3_client(processed_umm['s3_credentials_url'], bearer_token)

    # Stop trying direct S3 access after three failures within one granule;
    # it usually means the function runs outside the DAAC's region.
    s3_tries = 3 if s3_client is not None else 0

    dst_bucket = os.environ['DST_BUCKET']

    for file, file_info in processed_umm['files'].items():
        dl_path = None

        if s3_tries > 0 and 's3' in file_info:
            dl_path = _try_download_s3(file_info['s3'], s3_client)
            if dl_path is None:
                s3_tries -= 1

        if dl_path is None:
            dl_path = _try_download_http(file_info['http'], bearer_token)

        if dl_path is None:
            raise RuntimeError(f'Could not download file {file}')

        try:
            checksum = file_info['checksum']

            if checksum is not None and checksum['Algorithm'] in CHECKSUMS:
                print(f'Verifying checksum [{checksum["Algorithm"]}]...')
                if not _try_validate_checksum(dl_path, checksum):
                    raise RuntimeError(f'Checksum mismatch for file {file}')
                print('Checksum verified')

            dst_key = processed_umm['s3_prefix'] + file

            print(f'Uploading file {file} to s3://{dst_bucket}/{dst_key}')

            s3.upload_file(dl_path, dst_bucket, dst_key)
        finally:
            # Also clean up when verification or upload fails, otherwise
            # retries gradually fill the ephemeral storage.
            _remove_quietly(dl_path)


def _process_umm(umm):
    """Condense a UMM-G record into what is needed to stage the granule.

    Returns:
        dict with ``granule``, ``collection``, ``files`` (file name ->
        ``{'http': url, 's3': url (optional), 'checksum': dict or None}``),
        ``s3_prefix`` (always ending in ``/``) and ``s3_credentials_url``.

    Raises:
        RuntimeError: the record lists no data files, or a file has no HTTPS URL.
    """
    # NOTE(review): ShortName is not guaranteed by UMM-G (EntryTitle may be
    # used instead) -- confirm how such records should be handled.
    collection = umm['CollectionReference']['ShortName']
    granule = umm['GranuleUR']
    links = umm['RelatedUrls']

    archive_info = umm.get('DataGranule', {}).get('ArchiveAndDistributionInformation', [])
    checksums = {i['Name']: i['Checksum'] for i in archive_info if 'Checksum' in i}

    file_map = {}

    for link in links:
        if link['Type'] == 'GET DATA':
            file_map.setdefault(os.path.basename(link['URL']), {})['http'] = link['URL']

    for link in links:
        if link['Type'] == 'GET DATA VIA DIRECT ACCESS':
            file_map.setdefault(os.path.basename(link['URL']), {})['s3'] = link['URL']

    if not file_map:
        raise RuntimeError('No data granules found')

    if any('http' not in info for info in file_map.values()):
        raise RuntimeError('Granule without HTTP fallback method found')

    for name, info in file_map.items():
        info['checksum'] = checksums.get(name)

    creds_url = next(
        (link['URL'] for link in links if link['URL'].endswith('/s3credentials')),
        None
    )

    collection_query = ddb.query(
        TableName=DDB_ARN,
        KeyConditionExpression='#collection_short_name = :c',
        ExpressionAttributeValues={
            ':c': {'S': collection}
        },
        ExpressionAttributeNames={
            '#collection_short_name': 'collection'
        }
    )

    # Collections without a lookup entry are staged under their short name.
    if len(collection_query['Items']) == 0:
        s3_prefix = collection
    else:
        s3_prefix = collection_query['Items'][0]['s3_prefix']['S']

    # endswith() instead of [-1] so that an empty prefix cannot raise.
    if not s3_prefix.endswith('/'):
        s3_prefix += '/'

    return dict(
        granule=granule,
        collection=collection,
        files=file_map,
        s3_prefix=s3_prefix,
        s3_credentials_url=creds_url
    )


def _try_get_s3_creds(endpoint, bearer_token):
    """Return temporary S3 credentials from *endpoint*, or None on any failure."""
    headers = {
        "Authorization": f"Bearer {bearer_token}",
    }

    # Kept outside the try body so the handler can always report a status,
    # even when the request never produced a response.
    status = 'no response'

    try:
        resp = requests.get(endpoint, headers=headers, timeout=HTTP_TIMEOUT)
        status = resp.status_code
        resp.raise_for_status()
        return resp.json()
    except Exception as e:
        print(f'Failed to get S3 credentials [{status}]: {e}')
        return None


def _try_download_s3(url, client):
    """Download ``s3://bucket/key`` to /tmp; return the local path or None."""
    dst = None

    try:
        parsed_url = urlparse(url)

        bucket = parsed_url.netloc
        key = parsed_url.path.lstrip('/')
        dst = os.path.join('/tmp', os.path.basename(key))

        print(f'Attempting S3 download: {url}')
        client.download_file(bucket, key, dst)
        return dst
    except Exception as e:
        print(f'S3 download failed: {e}')
        if dst is not None:
            _remove_quietly(dst)
        return None


def _try_download_http(url, bearer_token):
    """Stream *url* to /tmp over HTTPS; return the local path or None."""
    headers = {
        "Authorization": f"Bearer {bearer_token}",
    }

    dst = os.path.join('/tmp', os.path.basename(url))

    try:
        # The context manager returns the connection to the pool even when
        # the transfer is interrupted half way.
        with requests.get(url, headers=headers, stream=True, timeout=HTTP_TIMEOUT) as resp:
            resp.raise_for_status()

            with open(dst, 'wb') as f:
                for chunk in resp.iter_content(chunk_size=1024 ** 2):
                    if chunk:
                        f.write(chunk)
        return dst
    except Exception as e:
        print(f'Failed to download file [{url}]: {e}')
        _remove_quietly(dst)  # do not leave a truncated file behind
        return None


def _try_validate_checksum(path, checksum):
    """Return True when the file's digest matches ``checksum['Value']``.

    ``checksum['Algorithm']`` must be a key of ``CHECKSUMS``; the comparison
    is case-insensitive.
    """
    hash_fn = CHECKSUMS[checksum['Algorithm']]()

    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1024 * hash_fn.block_size), b''):
            hash_fn.update(chunk)

    return hash_fn.hexdigest().lower() == checksum['Value'].lower()


def _receive_count(record):
    """Return how often SQS has delivered *record* (1 when unknown).

    Lambda's SQS event source puts the counter under the lowercase
    ``attributes`` key; the capitalised spelling is accepted too so that
    hand-written test events keep working.
    """
    attributes = record.get('attributes') or record.get('Attributes') or {}
    return int(attributes.get('ApproximateReceiveCount', '1'))


def lambda_handler(event, context):
    """Process a batch of SQS records.

    Returns:
        dict with ``batchItemFailures`` listing the records SQS should
        redeliver. Records that have already been received more than
        ``MAX_RECEIVE_COUNT`` times are reported to SNS and dropped instead.
    """
    _set_edl_environ()

    auth = earthaccess.login(strategy='environment')
    # The token is a credential: it is deliberately never logged.
    bearer_token = auth.token['access_token']

    batch_item_failures = []

    for record in event['Records']:
        try:
            _process_record(json.loads(record['body']), bearer_token)
        except Exception as e:
            if _receive_count(record) <= MAX_RECEIVE_COUNT:
                batch_item_failures.append({"itemIdentifier": record['messageId']})
                print(f'Failed to process record {record["messageId"]} and will retry: {e}')
            else:
                print(f'Failed to process record {record["messageId"]} and will not retry: {e}')
                _fail_out_record(record)

    return {
        'statusCode': 200,
        'batchItemFailures': batch_item_failures
    }


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()

    parser.add_argument(
        'event_file'
    )

    args = parser.parse_args()

    with open(args.event_file) as f:
        event = json.load(f)

    ret = lambda_handler(event, {})

    print(json.dumps(ret))
"""Create a CMR ingest subscription that delivers granule events to an SQS queue."""
import argparse
import os

import earthaccess
import requests
import yaml

CMR_SUBSCRIPTIONS_URL = 'https://cmr.earthdata.nasa.gov/ingest/subscriptions/'

# Seconds to wait for CMR before giving up; requests has no default timeout.
REQUEST_TIMEOUT = 60


def parse_args():
    """Parse the command line.

    Returns:
        argparse.Namespace with ``config``, ``ccid``, ``queue``, ``dir`` and
        ``dryrun`` attributes.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        'config',
        # nargs='?' is what makes the default reachable: a plain positional
        # is always required, so its default would never be used.
        nargs='?',
        default='config.yaml',
        help='Configuration file'
    )

    parser.add_argument(
        'ccid',
        help='Collection concept ID to subscribe to'
    )

    parser.add_argument(
        '-q', '--queue',
        default=None,
        help='Subscriber queue ARN',
        required=False,
        dest='queue'
    )

    parser.add_argument(
        '--dir',
        default='subscriptions',
        help='Directory where CMR subscription XML responses are stored'
    )

    parser.add_argument(
        '--dryrun',
        action='store_true',
        help='Do not make CMR API calls except for auth'
    )

    return parser.parse_args()


def _redacted(headers):
    """Return a copy of *headers* that is safe to print (no bearer token)."""
    return {k: ('Bearer ***' if k == 'Authorization' else v) for k, v in headers.items()}


def main(args):
    """Log in to Earthdata and create the subscription for ``args.ccid``.

    On success the CMR response is stored as
    ``<dir>/<ccid>-subscription.xml`` so the subscription can be deleted
    later. Outcomes are reported on stdout; HTTP and connection errors are
    printed rather than raised.

    Raises:
        SystemExit: no queue ARN was given on the command line or in the
            configuration file.
    """
    with open(args.config, 'r') as f:
        config = yaml.safe_load(f)

    # The command line wins over the configuration file.
    queue_arn = args.queue if args.queue is not None else config.get('queue_arn')

    if not queue_arn:
        # Checked before logging in so a misconfiguration fails fast.
        raise SystemExit('No queue ARN given: pass --queue or set queue_arn in the configuration file')

    # earthaccess' "environment" strategy reads the credentials from these.
    os.environ['EARTHDATA_USERNAME'] = config['edl_username']
    os.environ['EARTHDATA_PASSWORD'] = config['edl_password']

    auth = earthaccess.login(strategy='environment')
    bearer_token = auth.token['access_token']

    ccid = args.ccid
    os.makedirs(args.dir, exist_ok=True)

    subscription_request = {
        "Name": f"{config['edl_username']}-{ccid}-subscription",
        "CollectionConceptId": ccid,
        "Type": "granule",
        "Query": "*",
        "EndPoint": queue_arn,
        "Mode": ["New", "Update"],
        "Method": "ingest",
        "MetadataSpecification": {
            "URL": "https://cdn.earthdata.nasa.gov/umm/subscription/v1.1.1",
            "Name": "UMM-Sub",
            "Version": "1.1.1"
        }
    }

    headers = {
        "Authorization": f"Bearer {bearer_token}",
        "Content-Type": "application/vnd.nasa.cmr.umm+json"
    }

    if args.dryrun:
        print(f"Would issue the following POST request to {CMR_SUBSCRIPTIONS_URL}")
        print(subscription_request)
        # Never echo the live token: dry-run output ends up in terminal
        # scrollback and CI/Terraform logs.
        print(f'headers={_redacted(headers)!r}')
        return

    try:
        response = requests.post(
            CMR_SUBSCRIPTIONS_URL,
            headers=headers,
            json=subscription_request,
            timeout=REQUEST_TIMEOUT
        )
    except requests.RequestException as e:
        print(f"Error making subscription request: {e}")
        return

    if not response.ok:
        print(f"Error creating subscription. Status code: {response.status_code}")
        print(f"Response: {response.text}")
        return

    print("Successfully created CMR subscription")
    print(f"Response: {response.text}")

    # Kept so that delete.py can look up the native ID later.
    with open(os.path.join(args.dir, f'{ccid}-subscription.xml'), 'w') as f:
        f.write(response.text)


if __name__ == '__main__':
    main(parse_args())
["arn:${data.aws_partition.current.id}:sqs:${data.aws_region.current.id}:${data.aws_caller_identity.current.account_id}:cmr-subscriber-*"] + } + + statement { + sid = "S3" + actions = [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:ListBucket" + ] + effect = "Allow" + resources = [ + "arn:${data.aws_partition.current.id}:s3:::${local.bucket}", + "arn:${data.aws_partition.current.id}:s3:::${local.bucket}/*" + ] + } +} + +resource "aws_iam_policy" "lambda_policy" { + name = "cmr-subscriber-lambda-policy" + description = "Custom permissions for CMR subscriber lambda function" + policy = data.aws_iam_policy_document.lambda_policy.json +} + +resource "aws_iam_role" "lambda_role" { + assume_role_policy = data.aws_iam_policy_document.assume_policy.json + name = "cmr-subscriber-lambda-role" +} + +resource "aws_iam_role_policy_attachment" "lambda_role_policy_attach" { + for_each = tomap({ + main_policy = aws_iam_policy.lambda_policy.arn, + aws_policy_0 = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole", + aws_policy_1 = "arn:aws:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole" + }) + policy_arn = each.value + role = aws_iam_role.lambda_role.name +} + diff --git a/cmr_s3_subscriber/terraform/main.tf b/cmr_s3_subscriber/terraform/main.tf new file mode 100644 index 0000000..0f49da2 --- /dev/null +++ b/cmr_s3_subscriber/terraform/main.tf @@ -0,0 +1,145 @@ + +resource "aws_dynamodb_table" "collection_lookup" { + name = "cmr-subscriber-collections" + billing_mode = "PAY_PER_REQUEST" + hash_key = "collection" + + attribute { + name = "collection" + type = "S" + } +} + +resource "aws_sns_topic" "subscriber_notify" { + name = "cmr-subscriber-notify" +} + +resource "aws_sns_topic_policy" "topic_policy" { + arn = aws_sns_topic.subscriber_notify.arn + policy = data.aws_iam_policy_document.sns_topic_policy.json +} + +data "aws_iam_policy_document" "sns_topic_policy" { + policy_id = "__default_policy_ID" + version = "2008-10-17" + + 
statement { + actions = [ + "SNS:GetTopicAttributes", + "SNS:SetTopicAttributes", + "SNS:AddPermission", + "SNS:RemovePermission", + "SNS:DeleteTopic", + "SNS:Subscribe", + "SNS:ListSubscriptionsByTopic", + "SNS:Publish" + ] + + condition { + test = "StringEquals" + values = [ + data.aws_caller_identity.current.account_id + ] + variable = "AWS:SourceOwner" + } + + effect = "Allow" + + principals { + identifiers = ["*"] + type = "AWS" + } + + resources = [ + aws_sns_topic.subscriber_notify.arn + ] + + sid = "__default_statement_ID" + } +} + + +resource "aws_secretsmanager_secret" "edl_secret" { + name = "cmr-subscription-secrets" + recovery_window_in_days = 0 +} + +resource "aws_secretsmanager_secret_version" "edl_secret" { + secret_id = aws_secretsmanager_secret.edl_secret.id + secret_string = jsonencode(merge( + { + EARTHDATA_USER = var.edl_username + EARTHDATA_PASSWORD = var.edl_password + }, + var.maap_config != null ? var.maap_config : {}, + var.maap_kwargs != null ? { for key, value in var.maap_kwargs : "_maap_kwarg_${key}" => value } : {}, + )) +} + +data "local_file" "lambda_package" { + filename = var.maap_config == null ? "../lambda/package.zip" : "../lambda/package_maap.zip" +} + +resource "aws_lambda_function" "lambda" { + function_name = "cmr-subscriber" + role = aws_iam_role.lambda_role.arn + architectures = ["x86_64"] + filename = data.local_file.lambda_package.filename + source_code_hash = data.local_file.lambda_package.content_base64sha256 + handler = "lambda_function.lambda_handler" + runtime = "python3.12" + memory_size = 512 + ephemeral_storage { + size = 2048 + } + timeout = 900 + description = "Function to handle incoming messages from CMR subscriptions." 
+ + environment { + variables = { + SNS_ARN = aws_sns_topic.subscriber_notify.arn + DDB_ARN = aws_dynamodb_table.collection_lookup.arn + SECRET_ARN = aws_secretsmanager_secret.edl_secret.arn + DST_BUCKET = local.bucket + } + } + + vpc_config { + security_group_ids = var.lambda_vpc.security_groups + subnet_ids = var.lambda_vpc.subnet_ids + } + + reserved_concurrent_executions = 16 +} + +resource "local_file" "subscriber_config_file" { + filename = "config.yaml" + content = yamlencode({ + edl_username = var.edl_username + edl_password = var.edl_password + }) +} + +module "subscriptions" { + source = "./subscription" + for_each = toset(var.ccids) + + ccid = each.value + config_file = local_file.subscriber_config_file.filename + script_dir = "/Users/rileykk/FireAlarm/subscriber" +} + +resource "aws_lambda_event_source_mapping" "sqs_triggers" { + function_name = aws_lambda_function.lambda.arn + for_each = module.subscriptions + event_source_arn = each.value.queue_arn + + enabled = var.enable_triggers + + batch_size = 3 + function_response_types = ["ReportBatchItemFailures"] + + scaling_config { + maximum_concurrency = 16 + } +} diff --git a/cmr_s3_subscriber/terraform/outputs.tf b/cmr_s3_subscriber/terraform/outputs.tf new file mode 100644 index 0000000..839e32b --- /dev/null +++ b/cmr_s3_subscriber/terraform/outputs.tf @@ -0,0 +1,32 @@ + +output "notification_topic" { + value = aws_sns_topic.subscriber_notify.arn + description = "SNS topic for forwarding CMR subscription confirmations + error messages for unprocessable messages. Ensure this is subscribed to before creating subscriptions and/or enabling triggers." 
+} + +output "backfill_queue" { + value = aws_sqs_queue.backfill_queue.arn + description = "Special SQS queue that can be used to stage backfill data with the fill_queue_by_query.py script" +} + +output "collection_path_lookup_table" { + value = aws_dynamodb_table.collection_lookup.arn + description = "Lookup table to map collection short names to S3 paths to stage data to" +} + +output "staging_bucket" { + value = local.bucket + description = "S3 bucket data will be staged to" +} + +output "subscription_info" { + value = {for ccid, sub in module.subscriptions: ccid => sub.queue_arn} +} + +output "triggers_enabled" { + value = var.enable_triggers +} + +output "maap_enabled" { + value = var.maap_config != null +} diff --git a/cmr_s3_subscriber/terraform/provider.tf b/cmr_s3_subscriber/terraform/provider.tf new file mode 100644 index 0000000..c4bb9d4 --- /dev/null +++ b/cmr_s3_subscriber/terraform/provider.tf @@ -0,0 +1,44 @@ +provider "aws" { + region = "us-west-2" + +# access_key = "test" +# secret_key = "test" +# s3_use_path_style = false +# skip_credentials_validation = true +# skip_metadata_api_check = true +# skip_requesting_account_id = true +# +# endpoints { +# apigateway = "http://localhost:4566" +# apigatewayv2 = "http://localhost:4566" +# cloudformation = "http://localhost:4566" +# cloudwatch = "http://localhost:4566" +# dynamodb = "http://localhost:4566" +# ec2 = "http://localhost:4566" +# es = "http://localhost:4566" +# elasticache = "http://localhost:4566" +# firehose = "http://localhost:4566" +# iam = "http://localhost:4566" +# kinesis = "http://localhost:4566" +# lambda = "http://localhost:4566" +# rds = "http://localhost:4566" +# redshift = "http://localhost:4566" +# route53 = "http://localhost:4566" +# s3 = "http://s3.localhost.localstack.cloud:4566" +# secretsmanager = "http://localhost:4566" +# ses = "http://localhost:4566" +# sns = "http://localhost:4566" +# sqs = "http://localhost:4566" +# ssm = "http://localhost:4566" +# stepfunctions = 
"http://localhost:4566" +# sts = "http://localhost:4566" +# logs = "http://localhost:4566" +# } +} + +data "aws_caller_identity" "current" {} + +data "aws_region" "current" {} + +data "aws_partition" "current" {} + diff --git a/cmr_s3_subscriber/terraform/s3.tf b/cmr_s3_subscriber/terraform/s3.tf new file mode 100644 index 0000000..6da8008 --- /dev/null +++ b/cmr_s3_subscriber/terraform/s3.tf @@ -0,0 +1,16 @@ + +resource "aws_s3_bucket" "provisioned_bucket" { + bucket_prefix = var.bucket_name_prefix + count = var.existing_bucket == null ? 1 : 0 +} + + +data "aws_s3_bucket" "existing_bucket" { + bucket = var.existing_bucket + count = var.existing_bucket != null ? 1 : 0 +} + +locals { + bucket = var.existing_bucket == null ? aws_s3_bucket.provisioned_bucket[0].id : data.aws_s3_bucket.existing_bucket[0].id +} + diff --git a/cmr_s3_subscriber/terraform/sqs_backfill_queue.tf b/cmr_s3_subscriber/terraform/sqs_backfill_queue.tf new file mode 100644 index 0000000..95800a4 --- /dev/null +++ b/cmr_s3_subscriber/terraform/sqs_backfill_queue.tf @@ -0,0 +1,56 @@ + +resource "aws_sqs_queue" "backfill_queue" { + name = "cmr-subscriber-backfill-queue" + message_retention_seconds = 1209600 + policy = data.aws_iam_policy_document.queue_policy.json + visibility_timeout_seconds = 1800 +} + +data "aws_iam_policy_document" "queue_policy" { + statement { + sid = "__owner_statement" + effect = "Allow" + principals { + identifiers = ["arn:${data.aws_partition.current.id}:iam::${data.aws_caller_identity.current.account_id}:root"] + type = "AWS" + } + actions = ["SQS:*"] + resources = ["arn:${data.aws_partition.current.id}:sqs:${data.aws_region.current.id}:${data.aws_caller_identity.current.account_id}:cmr-subscriber-backfill-queue"] + } + statement { + sid = "CMR_subscription" + effect = "Allow" + principals { + identifiers = ["sns.amazonaws.com"] + type = "Service" + } + actions = ["SQS:SendMessage"] + resources = 
["arn:${data.aws_partition.current.id}:sqs:${data.aws_region.current.id}:${data.aws_caller_identity.current.account_id}:cmr-subscriber-backfill-queue"] + + condition { + test = "StringEquals" + values = ["621933553860"] + variable = "aws:SourceAccount" + } + + condition { + test = "ArnLike" + values = ["arn:aws:sns:us-east-1:621933553860:cmr-subscriptions-prod"] + variable = "aws:SourceArn" + } + } +} + +resource "aws_lambda_event_source_mapping" "backfill_trigger" { + function_name = aws_lambda_function.lambda.arn + event_source_arn = aws_sqs_queue.backfill_queue.arn + + enabled = var.enable_triggers + + batch_size = 3 + function_response_types = ["ReportBatchItemFailures"] + + scaling_config { + maximum_concurrency = 16 + } +} diff --git a/cmr_s3_subscriber/terraform/subscription/main.tf b/cmr_s3_subscriber/terraform/subscription/main.tf new file mode 100644 index 0000000..ea3ff4a --- /dev/null +++ b/cmr_s3_subscriber/terraform/subscription/main.tf @@ -0,0 +1,73 @@ + +data "aws_caller_identity" "current" {} + +data "aws_region" "current" {} + +data "aws_partition" "current" {} + + +resource "aws_sqs_queue" "queue" { + name = "cmr-subscriber-${var.ccid}" + message_retention_seconds = 1209600 + policy = data.aws_iam_policy_document.queue_policy.json + visibility_timeout_seconds = 1800 +} + +data "aws_iam_policy_document" "queue_policy" { + statement { + sid = "__owner_statement" + effect = "Allow" + principals { + identifiers = ["arn:${data.aws_partition.current.id}:iam::${data.aws_caller_identity.current.account_id}:root"] + type = "AWS" + } + actions = ["SQS:*"] + resources = ["arn:${data.aws_partition.current.id}:sqs:${data.aws_region.current.id}:${data.aws_caller_identity.current.account_id}:cmr-subscriber-${var.ccid}"] + } + statement { + sid = "CMR_subscription" + effect = "Allow" + principals { + identifiers = ["sns.amazonaws.com"] + type = "Service" + } + actions = ["SQS:SendMessage"] + resources = 
["arn:${data.aws_partition.current.id}:sqs:${data.aws_region.current.id}:${data.aws_caller_identity.current.account_id}:cmr-subscriber-${var.ccid}"] + + condition { + test = "StringEquals" + values = ["621933553860"] + variable = "aws:SourceAccount" + } + + condition { + test = "ArnLike" + values = ["arn:aws:sns:us-east-1:621933553860:cmr-subscriptions-prod"] + variable = "aws:SourceArn" + } + } +} + +resource "null_resource" "create_subscription" { + depends_on = [aws_sqs_queue.queue] + + triggers = { + script_dir = var.script_dir + ccid = var.ccid + config_file = var.config_file + } + + provisioner "local-exec" { + command = "source venv/bin/activate; python subscriber.py ${var.config_file} ${var.ccid} --queue ${aws_sqs_queue.queue.arn} --dryrun; cp subscriptions/C2930763263-LARC_CLOUD-subscription.xml subscriptions/${var.ccid}-subscription.xml" + working_dir = var.script_dir + interpreter = ["/bin/bash", "-c"] + } + + provisioner "local-exec" { + command = "source venv/bin/activate; python delete.py ${self.triggers.config_file} --response-xml subscriptions/${self.triggers.ccid}-subscription.xml --dryrun" + working_dir = self.triggers.script_dir + interpreter = ["/bin/bash", "-c"] + when = destroy + on_failure = continue + } +} diff --git a/cmr_s3_subscriber/terraform/subscription/outputs.tf b/cmr_s3_subscriber/terraform/subscription/outputs.tf new file mode 100644 index 0000000..f6760ee --- /dev/null +++ b/cmr_s3_subscriber/terraform/subscription/outputs.tf @@ -0,0 +1,7 @@ +output "queue_arn" { + value = aws_sqs_queue.queue.arn +} + +output "ccid" { + value = null_resource.create_subscription.triggers.ccid +} diff --git a/cmr_s3_subscriber/terraform/subscription/variables.tf b/cmr_s3_subscriber/terraform/subscription/variables.tf new file mode 100644 index 0000000..d21cd74 --- /dev/null +++ b/cmr_s3_subscriber/terraform/subscription/variables.tf @@ -0,0 +1,15 @@ + +variable "ccid" { + type = string + description = "Collection concept ID for this subscription" +} 
+ +variable "config_file" { + type = string + description = "Script config file path" +} + +variable "script_dir" { + type = string + description = "Path to directory containing CMR scripts and their venv" +} diff --git a/cmr_s3_subscriber/terraform/subscription/versions.tf b/cmr_s3_subscriber/terraform/subscription/versions.tf new file mode 100644 index 0000000..5b52042 --- /dev/null +++ b/cmr_s3_subscriber/terraform/subscription/versions.tf @@ -0,0 +1,14 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.34.0" + } + null = { + source = "hashicorp/null" + } + } + + required_version = ">= 1.2.0" +} + diff --git a/cmr_s3_subscriber/terraform/variables.tf b/cmr_s3_subscriber/terraform/variables.tf new file mode 100644 index 0000000..e7e7290 --- /dev/null +++ b/cmr_s3_subscriber/terraform/variables.tf @@ -0,0 +1,72 @@ + +variable "existing_bucket" { + type = string + description = "Existing S3 bucket to use instead of provisioning a new one" + nullable = true +} + +variable "bucket_name_prefix" { + type = string + description = "Prefix of the S3 bucket name to provision. Will be appended with a random string" +} + +variable "edl_username" { + type = string + description = "Username for Earthdata Login" +} + +variable "edl_password" { + type = string + description = "Password for Earthdata Login" + sensitive = true +} + +variable "ccids" { + type = list(string) + description = "List of CMR collection-concept-IDs to subscribe to. Note: Initial apply should have either no CCIDs listed or triggers disabled until the notification SNS topic is subscribed to" +} + +variable "enable_triggers" { + type = bool + description = "Whether to enable the SQS -> Lambda triggers. 
Note: Initial apply should have either no CCIDs listed or triggers disabled until the notification SNS topic is subscribed to" + + default = false +} + +variable "lambda_vpc" { + type = object({ + subnet_ids = list(string) + security_groups = list(string) + }) + default = { + subnet_ids = [] + security_groups = [] + } + + description = "If VPC is required for the lambda function, list desired subnets & security groups here." + + validation { + condition = (length(var.lambda_vpc.subnet_ids) == 0 && length(var.lambda_vpc.security_groups) == 0) || (length(var.lambda_vpc.subnet_ids) > 0 && length(var.lambda_vpc.security_groups) > 0) + error_message = "Must provide at least one of both SG & subnet or neither" + } +} + +variable "maap_config" { + type = object({ + MAAP_ALGO_ID = string + MAAP_ALGO_VERSION = string + MAAP_QUEUE = string + MAAP_PGT = string + }) + default = null + + description = "If a MAAP job is desired to stage the data rather than the bundled lambda function, specify the necessary config values for MAAP access and the localization algorithm." +} + +variable "maap_kwargs" { + type = map(string) + default = null + + description = "Define any additional kwargs needed by the MAAP job here. 
kwargs already defined in code are granule_id = GranuleUR & collection_id = CCID" +} + diff --git a/cmr_s3_subscriber/terraform/versions.tf b/cmr_s3_subscriber/terraform/versions.tf new file mode 100644 index 0000000..43c2b5f --- /dev/null +++ b/cmr_s3_subscriber/terraform/versions.tf @@ -0,0 +1,13 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.34.0" + } + random = { + source = "hashicorp/random" + } + } + + required_version = ">= 1.2.0" +} \ No newline at end of file diff --git a/cmr_s3_subscriber/test_messages/all_four.json b/cmr_s3_subscriber/test_messages/all_four.json new file mode 100644 index 0000000..fbbe43e --- /dev/null +++ b/cmr_s3_subscriber/test_messages/all_four.json @@ -0,0 +1,26 @@ +{ + "Records": [ + { + "messageId": "b0574ee3-336a-4753-bb33-766eca3dbf92", + "body": "{\"Type\": \"Notification\", \"MessageId\": \"bc8511b9-9e4d-5130-9ff9-710989b0531e\", \"TopicArn\": \"arn:aws:sns:us-east-1:621933553860:cmr-subscriptions-prod\", \"Subject\": \"Update Notification\", \"Message\": \"{\\\"concept-id\\\": \\\"G3550282859-LARC_CLOUD\\\", \\\"granule-ur\\\": \\\"TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc\\\", \\\"location\\\": \\\"https://cmr.earthdata.nasa.gov:443/concepts/G3550282859-LARC_CLOUD/2\\\", \\\"producer-granule-id\\\": \\\"TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc\\\"}\", \"Timestamp\": \"2025-05-27T19:42:53.639Z\", \"SignatureVersion\": \"1\", \"Signature\": \"eDDx7tBu5YTU+ltKuHNjTsLraBSGSZTBazEVcmnpKJL3yS0pM7795FheN/u/xTyr3StzgQVxjPUQV5UK9rulZACH2QmNT4ZXKWOfAB8EAZDI/+/0EnTkRx5bWPLbNKRdcA99aPdlhcU62FfCSTHHHrichvV2gn/R5P2h7fR/+G5/1RJWPuplI5BNUvh8XdG0zpn8IAu8vFzH8BhNGGtGQJ+tZicflYPrZ4ZlF4SsEGm02l36FTe8ccpXCNswDbsK5Z8tRejMUntHMoEUOp4BmLUex70kpgjcciDQH4jRRmLmf6jkKSo4IaMl6M88WXcHvM+pWr7jDANSCxK2lv977A==\", \"SigningCertURL\": \"https://sns.us-east-1.amazonaws.com/SimpleNotificationService-9c6465fa7f48f5cacd23014631ec1136.pem\", \"UnsubscribeURL\": 
\"https://sns.us-east-1.amazonaws.com/?Action=Unsubscribe&SubscriptionArn=arn:aws:sns:us-east-1:621933553860:cmr-subscriptions-prod:b8d71539-6e1f-4ba9-90de-0b6cac50352b\", \"MessageAttributes\": {\"collection-concept-id\": {\"Type\": \"String\", \"Value\": \"C2930763263-LARC_CLOUD\"}, \"mode\": {\"Type\": \"String\", \"Value\": \"Update\"}, \"subscriber\": {\"Type\": \"String\", \"Value\": \"aqacf\"}}}" + }, + { + "messageId": "03cc98c5-20ad-44c5-82e3-e49ffbf804d8", + "body": "{\"Type\": \"SubscriptionConfirmation\", \"MessageId\": \"105d55c5-5970-4c3d-ad4e-3222095e3df2\", \"Token\": \"2336412f37fb687f5d51e6e2425a8a5875c3b7972f7c1552c44e8e9ef5fa79a1edadd32d49d594c91004d7b35ba5b85b59eefd73040470d671272eb92d14a1dfe95ed9e20d747bada2f017957feee2a55214f303644ea43e3b62eb4b0adddb3c424f08af718582351ee033782cd2344fcbc95b44b50d974e0484936fd4932bef\", \"TopicArn\": \"arn:aws:sns:us-east-1:621933553860:cmr-subscriptions-prod\", \"Message\": \"You have chosen to subscribe to the topic arn:aws:sns:us-east-1:621933553860:cmr-subscriptions-prod.\\nTo confirm the subscription, visit the SubscribeURL included in this message.\", \"SubscribeURL\": \"https://sns.us-east-1.amazonaws.com/?Action=ConfirmSubscription&TopicArn=arn:aws:sns:us-east-1:621933553860:cmr-subscriptions-prod&Token=2336412f37fb687f5d51e6e2425a8a5875c3b7972f7c1552c44e8e9ef5fa79a1edadd32d49d594c91004d7b35ba5b85b59eefd73040470d671272eb92d14a1dfe95ed9e20d747bada2f017957feee2a55214f303644ea43e3b62eb4b0adddb3c424f08af718582351ee033782cd2344fcbc95b44b50d974e0484936fd4932bef\", \"Timestamp\": \"2025-05-27T15:07:54.165Z\", \"SignatureVersion\": \"1\", \"Signature\": \"JjUTOnF42LeLGuJ+o/0M2LoeV87Nf4cNLQvYBW035FRnyMwQ2RbxrIFLFpFSS+zNjJJcFrmSF3GylJCyelWuPtsCYOQ2sNo+E24HvozbISY6UmhOyv5xDDtuB3uOFAeN2RgSuNjh/rZBehv/YwovX5mPGgYJfDVyRp3Swb8xTLuQ9oZXLf0bxGjXCdf6hnPI6Wj/FgWi/XxLDmrENKEmK1YT6u9vnvhMcqbHqzlpfPdNOFlVezYSHLVzhm26y/jAzIpfqrxMcBbyrtERnYW27T7jcpRfsFXwf6G5xXQXtxfFfig3CCaZ/6jUZvW1RQTghtbhT1jcA1MTWd9hQ++dyw==\", 
\"SigningCertURL\": \"https://sns.us-east-1.amazonaws.com/SimpleNotificationService-9c6465fa7f48f5cacd23014631ec1136.pem\"}" + }, + { + "messageId": "85cb8eeb-3446-48a6-841c-134b0c083cb1", + "body": "{\"Type\": \"Notification\", \"MessageId\": \"bc8511b9-9e4d-5130-9ff9-710989b0531e\", \"TopicArn\": \"arn:aws:sns:us-east-1:621933553860:cmr-subscriptions-prod\", \"Subject\": \"Update Notification\", \"Message\": \"{\\\"concept-id\\\": \\\"G3550282859-LARC_CLOUD\\\", \\\"granule-ur\\\": \\\"TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc\\\", \\\"location\\\": \\\"https://localhost:443/concepts/G3550282859-LARC_CLOUD/2\\\", \\\"producer-granule-id\\\": \\\"TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc\\\"}\", \"Timestamp\": \"2025-05-27T19:42:53.639Z\", \"SignatureVersion\": \"1\", \"Signature\": \"eDDx7tBu5YTU+ltKuHNjTsLraBSGSZTBazEVcmnpKJL3yS0pM7795FheN/u/xTyr3StzgQVxjPUQV5UK9rulZACH2QmNT4ZXKWOfAB8EAZDI/+/0EnTkRx5bWPLbNKRdcA99aPdlhcU62FfCSTHHHrichvV2gn/R5P2h7fR/+G5/1RJWPuplI5BNUvh8XdG0zpn8IAu8vFzH8BhNGGtGQJ+tZicflYPrZ4ZlF4SsEGm02l36FTe8ccpXCNswDbsK5Z8tRejMUntHMoEUOp4BmLUex70kpgjcciDQH4jRRmLmf6jkKSo4IaMl6M88WXcHvM+pWr7jDANSCxK2lv977A==\", \"SigningCertURL\": \"https://sns.us-east-1.amazonaws.com/SimpleNotificationService-9c6465fa7f48f5cacd23014631ec1136.pem\", \"UnsubscribeURL\": \"https://sns.us-east-1.amazonaws.com/?Action=Unsubscribe&SubscriptionArn=arn:aws:sns:us-east-1:621933553860:cmr-subscriptions-prod:b8d71539-6e1f-4ba9-90de-0b6cac50352b\", \"MessageAttributes\": {\"collection-concept-id\": {\"Type\": \"String\", \"Value\": \"C2930763263-LARC_CLOUD\"}, \"mode\": {\"Type\": \"String\", \"Value\": \"Update\"}, \"subscriber\": {\"Type\": \"String\", \"Value\": \"aqacf\"}}}", + "Attributes": { + "ApproximateReceiveCount": "1" + } + }, + { + "messageId": "cdcdb386-bb19-4379-bfd6-ef716aec05c4", + "body": "{\"Type\": \"Notification\", \"MessageId\": \"bc8511b9-9e4d-5130-9ff9-710989b0531e\", \"TopicArn\": \"arn:aws:sns:us-east-1:621933553860:cmr-subscriptions-prod\", 
\"Subject\": \"Update Notification\", \"Message\": \"{\\\"concept-id\\\": \\\"G3550282859-LARC_CLOUD\\\", \\\"granule-ur\\\": \\\"TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc\\\", \\\"location\\\": \\\"https://localhost:443/concepts/G3550282859-LARC_CLOUD/2\\\", \\\"producer-granule-id\\\": \\\"TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc\\\"}\", \"Timestamp\": \"2025-05-27T19:42:53.639Z\", \"SignatureVersion\": \"1\", \"Signature\": \"eDDx7tBu5YTU+ltKuHNjTsLraBSGSZTBazEVcmnpKJL3yS0pM7795FheN/u/xTyr3StzgQVxjPUQV5UK9rulZACH2QmNT4ZXKWOfAB8EAZDI/+/0EnTkRx5bWPLbNKRdcA99aPdlhcU62FfCSTHHHrichvV2gn/R5P2h7fR/+G5/1RJWPuplI5BNUvh8XdG0zpn8IAu8vFzH8BhNGGtGQJ+tZicflYPrZ4ZlF4SsEGm02l36FTe8ccpXCNswDbsK5Z8tRejMUntHMoEUOp4BmLUex70kpgjcciDQH4jRRmLmf6jkKSo4IaMl6M88WXcHvM+pWr7jDANSCxK2lv977A==\", \"SigningCertURL\": \"https://sns.us-east-1.amazonaws.com/SimpleNotificationService-9c6465fa7f48f5cacd23014631ec1136.pem\", \"UnsubscribeURL\": \"https://sns.us-east-1.amazonaws.com/?Action=Unsubscribe&SubscriptionArn=arn:aws:sns:us-east-1:621933553860:cmr-subscriptions-prod:b8d71539-6e1f-4ba9-90de-0b6cac50352b\", \"MessageAttributes\": {\"collection-concept-id\": {\"Type\": \"String\", \"Value\": \"C2930763263-LARC_CLOUD\"}, \"mode\": {\"Type\": \"String\", \"Value\": \"Update\"}, \"subscriber\": {\"Type\": \"String\", \"Value\": \"aqacf\"}}}", + "Attributes": { + "ApproximateReceiveCount": "6" + } + } + ] +} \ No newline at end of file diff --git a/cmr_s3_subscriber/test_messages/sample_umm.json b/cmr_s3_subscriber/test_messages/sample_umm.json new file mode 100644 index 0000000..b68fab4 --- /dev/null +++ b/cmr_s3_subscriber/test_messages/sample_umm.json @@ -0,0 +1,363 @@ +{ + "PGEVersionClass": { + "PGEVersion": "1.0.0" + }, + "RelatedUrls": [ + { + "URL": "https://data.asdc.earthdata.nasa.gov/asdc-prod-public/TEMPO/TEMPO_NO2_L3_V03/2025.05.27/thumb-TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc.png", + "Description": "Download thumb-TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc.png", + "Type": "GET 
RELATED VISUALIZATION" + }, + { + "URL": "s3://asdc-prod-public/TEMPO/TEMPO_NO2_L3_V03/2025.05.27/thumb-TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc.png", + "Description": "This link provides direct download access via S3 to the granule", + "Type": "GET RELATED VISUALIZATION" + }, + { + "URL": "https://data.asdc.earthdata.nasa.gov/asdc-prod-protected/TEMPO/TEMPO_NO2_L3_V03/2025.05.27/TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc", + "Description": "Download TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc", + "Type": "GET DATA" + }, + { + "URL": "https://data.asdc.earthdata.nasa.gov/asdc-prod-protected/TEMPO/TEMPO_NO2_L3_V03/2025.05.27/TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc.met", + "Description": "Download TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc.met", + "Type": "EXTENDED METADATA" + }, + { + "URL": "s3://asdc-prod-protected/TEMPO/TEMPO_NO2_L3_V03/2025.05.27/TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc", + "Description": "This link provides direct download access via S3 to the granule", + "Type": "GET DATA VIA DIRECT ACCESS" + }, + { + "URL": "s3://asdc-prod-protected/TEMPO/TEMPO_NO2_L3_V03/2025.05.27/TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc.met", + "Description": "This link provides direct download access via S3 to the granule", + "Type": "EXTENDED METADATA" + }, + { + "URL": "https://data.asdc.earthdata.nasa.gov/s3credentials", + "Description": "api endpoint to retrieve temporary credentials valid for same-region direct s3 access", + "Type": "VIEW RELATED INFORMATION" + }, + { + "URL": "https://opendap.earthdata.nasa.gov/collections/C2930763263-LARC_CLOUD/granules/TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc", + "Type": "USE SERVICE API", + "Subtype": "OPENDAP DATA", + "Description": "OPeNDAP request URL" + } + ], + "AccessConstraints": { + "Description": "WORLD", + "Value": 4 + }, + "SpatialExtent": { + "HorizontalSpatialDomain": { + "Geometry": { + "GPolygons": [ + { + "Boundary": { + "Points": [ + { + "Latitude": 57.74, + "Longitude": -111.2 + }, + { + "Latitude": 54.3, + 
"Longitude": -109.22 + }, + { + "Latitude": 50.02, + "Longitude": -107.3 + }, + { + "Latitude": 45.06, + "Longitude": -105.6 + }, + { + "Latitude": 39.74, + "Longitude": -104.22 + }, + { + "Latitude": 34.34, + "Longitude": -103.16 + }, + { + "Latitude": 28.48, + "Longitude": -102.3 + }, + { + "Latitude": 23.54, + "Longitude": -101.76 + }, + { + "Latitude": 17.26, + "Longitude": -101.26 + }, + { + "Latitude": 17.24, + "Longitude": -83.54 + }, + { + "Latitude": 17.48, + "Longitude": -65.14 + }, + { + "Latitude": 21.82, + "Longitude": -64.16 + }, + { + "Latitude": 25.66, + "Longitude": -63.08 + }, + { + "Latitude": 29.38, + "Longitude": -61.8 + }, + { + "Latitude": 32.8, + "Longitude": -60.38 + }, + { + "Latitude": 36.06, + "Longitude": -58.76 + }, + { + "Latitude": 39.2, + "Longitude": -56.9 + }, + { + "Latitude": 41.96, + "Longitude": -54.96 + }, + { + "Latitude": 44.7, + "Longitude": -52.68 + }, + { + "Latitude": 47.66, + "Longitude": -49.7 + }, + { + "Latitude": 50.44, + "Longitude": -46.24001 + }, + { + "Latitude": 52.82, + "Longitude": -42.56001 + }, + { + "Latitude": 54.88, + "Longitude": -38.60001 + }, + { + "Latitude": 56.66, + "Longitude": -34.3 + }, + { + "Latitude": 58.2, + "Longitude": -29.52 + }, + { + "Latitude": 59.34, + "Longitude": -24.88 + }, + { + "Latitude": 60.22, + "Longitude": -19.98 + }, + { + "Latitude": 60.4, + "Longitude": -19.94 + }, + { + "Latitude": 60.42, + "Longitude": -19.68001 + }, + { + "Latitude": 61.1, + "Longitude": -19.68001 + }, + { + "Latitude": 61.04, + "Longitude": -20.06 + }, + { + "Latitude": 61.14, + "Longitude": -20.14 + }, + { + "Latitude": 61.1, + "Longitude": -20.48 + }, + { + "Latitude": 61.92, + "Longitude": -20.5 + }, + { + "Latitude": 61.88, + "Longitude": -20.76001 + }, + { + "Latitude": 61.98, + "Longitude": -20.78 + }, + { + "Latitude": 61.94, + "Longitude": -21.08 + }, + { + "Latitude": 63.3, + "Longitude": -21.12001 + }, + { + "Latitude": 63.28, + "Longitude": -21.26001 + }, + { + "Latitude": 63.78, + 
"Longitude": -21.28 + }, + { + "Latitude": 63.62, + "Longitude": -22.34 + }, + { + "Latitude": 63.76, + "Longitude": -22.46001 + }, + { + "Latitude": 63.58, + "Longitude": -23.96001 + }, + { + "Latitude": 63.62, + "Longitude": -24.06 + }, + { + "Latitude": 62.4, + "Longitude": -32.58 + }, + { + "Latitude": 60.84, + "Longitude": -44.2 + }, + { + "Latitude": 59.24, + "Longitude": -59.24001 + }, + { + "Latitude": 58.8, + "Longitude": -63.98 + }, + { + "Latitude": 58.34, + "Longitude": -70.64 + }, + { + "Latitude": 57.88, + "Longitude": -80.52 + }, + { + "Latitude": 57.68, + "Longitude": -91.44 + }, + { + "Latitude": 57.72, + "Longitude": -96.82 + }, + { + "Latitude": 57.84, + "Longitude": -101.06 + }, + { + "Latitude": 58.08, + "Longitude": -106.94 + }, + { + "Latitude": 58.34, + "Longitude": -111.2 + }, + { + "Latitude": 57.74, + "Longitude": -111.2 + } + ] + } + } + ] + } + } + }, + "ProviderDates": [ + { + "Date": "2025-05-27T18:15:31.800Z", + "Type": "Create" + }, + { + "Date": "2025-05-27T18:15:31.800Z", + "Type": "Update" + } + ], + "CollectionReference": { + "ShortName": "TEST_COLLECTION", + "Version": "V03" + }, + "DataGranule": { + "ArchiveAndDistributionInformation": [ + { + "Checksum": { + "Algorithm": "MD5", + "Value": "a4c1d04490342e8c4dc13e45959802c1" + }, + "Name": "TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc", + "Size": 549.5006294250488, + "SizeUnit": "MB" + }, + { + "Checksum": { + "Algorithm": "MD5", + "Value": "8bc2d5580be3703bad25e37642050767" + }, + "Name": "TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc.met", + "Size": 5.275390625, + "SizeUnit": "KB" + } + ], + "DayNightFlag": "Unspecified", + "Identifiers": [ + { + "Identifier": "TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc", + "IdentifierType": "ProducerGranuleId" + }, + { + "Identifier": "RFC1321 MD5 = not yet calculated", + "IdentifierType": "LocalVersionId" + } + ], + "ProductionDateTime": "2025-05-27T17:48:09+00:00" + }, + "TemporalExtent": { + "RangeDateTime": { + "BeginningDateTime": 
"2025-05-27T12:35:04+00:00", + "EndingDateTime": "2025-05-27T13:15:02+00:00" + } + }, + "GranuleUR": "TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc", + "Projects": [ + { + "ShortName": "TEMPO" + } + ], + "MetadataSpecification": { + "URL": "https://cdn.earthdata.nasa.gov/umm/granule/v1.6.6", + "Name": "UMM-G", + "Version": "1.6.6" + }, + "InputGranules": [ + "TEMPO_NO2_L2_V03_20250527T123504Z_S004G01.nc", + "TEMPO_NO2_L2_V03_20250527T124144Z_S004G02.nc", + "TEMPO_NO2_L2_V03_20250527T124824Z_S004G03.nc", + "TEMPO_NO2_L2_V03_20250527T125505Z_S004G04.nc", + "TEMPO_NO2_L2_V03_20250527T130145Z_S004G05.nc", + "TEMPO_NO2_L2_V03_20250527T130825Z_S004G06.nc" + ] +} diff --git a/cmr_s3_subscriber/test_messages/single_conf.json b/cmr_s3_subscriber/test_messages/single_conf.json new file mode 100644 index 0000000..0b08db1 --- /dev/null +++ b/cmr_s3_subscriber/test_messages/single_conf.json @@ -0,0 +1,8 @@ +{ + "Records": [ + { + "messageId": "b0574ee3-336a-4753-bb33-766eca3dbf92", + "body": "{\"Type\": \"SubscriptionConfirmation\", \"MessageId\": \"105d55c5-5970-4c3d-ad4e-3222095e3df2\", \"Token\": \"2336412f37fb687f5d51e6e2425a8a5875c3b7972f7c1552c44e8e9ef5fa79a1edadd32d49d594c91004d7b35ba5b85b59eefd73040470d671272eb92d14a1dfe95ed9e20d747bada2f017957feee2a55214f303644ea43e3b62eb4b0adddb3c424f08af718582351ee033782cd2344fcbc95b44b50d974e0484936fd4932bef\", \"TopicArn\": \"arn:aws:sns:us-east-1:621933553860:cmr-subscriptions-prod\", \"Message\": \"You have chosen to subscribe to the topic arn:aws:sns:us-east-1:621933553860:cmr-subscriptions-prod.\\nTo confirm the subscription, visit the SubscribeURL included in this message.\", \"SubscribeURL\": 
\"https://sns.us-east-1.amazonaws.com/?Action=ConfirmSubscription&TopicArn=arn:aws:sns:us-east-1:621933553860:cmr-subscriptions-prod&Token=2336412f37fb687f5d51e6e2425a8a5875c3b7972f7c1552c44e8e9ef5fa79a1edadd32d49d594c91004d7b35ba5b85b59eefd73040470d671272eb92d14a1dfe95ed9e20d747bada2f017957feee2a55214f303644ea43e3b62eb4b0adddb3c424f08af718582351ee033782cd2344fcbc95b44b50d974e0484936fd4932bef\", \"Timestamp\": \"2025-05-27T15:07:54.165Z\", \"SignatureVersion\": \"1\", \"Signature\": \"JjUTOnF42LeLGuJ+o/0M2LoeV87Nf4cNLQvYBW035FRnyMwQ2RbxrIFLFpFSS+zNjJJcFrmSF3GylJCyelWuPtsCYOQ2sNo+E24HvozbISY6UmhOyv5xDDtuB3uOFAeN2RgSuNjh/rZBehv/YwovX5mPGgYJfDVyRp3Swb8xTLuQ9oZXLf0bxGjXCdf6hnPI6Wj/FgWi/XxLDmrENKEmK1YT6u9vnvhMcqbHqzlpfPdNOFlVezYSHLVzhm26y/jAzIpfqrxMcBbyrtERnYW27T7jcpRfsFXwf6G5xXQXtxfFfig3CCaZ/6jUZvW1RQTghtbhT1jcA1MTWd9hQ++dyw==\", \"SigningCertURL\": \"https://sns.us-east-1.amazonaws.com/SimpleNotificationService-9c6465fa7f48f5cacd23014631ec1136.pem\"}" + } + ] +} \ No newline at end of file diff --git a/cmr_s3_subscriber/test_messages/single_invalid.json b/cmr_s3_subscriber/test_messages/single_invalid.json new file mode 100644 index 0000000..17bf0da --- /dev/null +++ b/cmr_s3_subscriber/test_messages/single_invalid.json @@ -0,0 +1,11 @@ +{ + "Records": [ + { + "messageId": "b0574ee3-336a-4753-bb33-766eca3dbf92", + "body": "{\"Type\": \"Notification\", \"MessageId\": \"bc8511b9-9e4d-5130-9ff9-710989b0531e\", \"TopicArn\": \"arn:aws:sns:us-east-1:621933553860:cmr-subscriptions-prod\", \"Subject\": \"Update Notification\", \"Message\": \"{\\\"concept-id\\\": \\\"G3550282859-LARC_CLOUD\\\", \\\"granule-ur\\\": \\\"TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc\\\", \\\"location\\\": \\\"https://localhost:443/concepts/G3550282859-LARC_CLOUD/2\\\", \\\"producer-granule-id\\\": \\\"TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc\\\"}\", \"Timestamp\": \"2025-05-27T19:42:53.639Z\", \"SignatureVersion\": \"1\", \"Signature\": 
\"eDDx7tBu5YTU+ltKuHNjTsLraBSGSZTBazEVcmnpKJL3yS0pM7795FheN/u/xTyr3StzgQVxjPUQV5UK9rulZACH2QmNT4ZXKWOfAB8EAZDI/+/0EnTkRx5bWPLbNKRdcA99aPdlhcU62FfCSTHHHrichvV2gn/R5P2h7fR/+G5/1RJWPuplI5BNUvh8XdG0zpn8IAu8vFzH8BhNGGtGQJ+tZicflYPrZ4ZlF4SsEGm02l36FTe8ccpXCNswDbsK5Z8tRejMUntHMoEUOp4BmLUex70kpgjcciDQH4jRRmLmf6jkKSo4IaMl6M88WXcHvM+pWr7jDANSCxK2lv977A==\", \"SigningCertURL\": \"https://sns.us-east-1.amazonaws.com/SimpleNotificationService-9c6465fa7f48f5cacd23014631ec1136.pem\", \"UnsubscribeURL\": \"https://sns.us-east-1.amazonaws.com/?Action=Unsubscribe&SubscriptionArn=arn:aws:sns:us-east-1:621933553860:cmr-subscriptions-prod:b8d71539-6e1f-4ba9-90de-0b6cac50352b\", \"MessageAttributes\": {\"collection-concept-id\": {\"Type\": \"String\", \"Value\": \"C2930763263-LARC_CLOUD\"}, \"mode\": {\"Type\": \"String\", \"Value\": \"Update\"}, \"subscriber\": {\"Type\": \"String\", \"Value\": \"aqacf\"}}}", + "Attributes": { + "ApproximateReceiveCount": "1" + } + } + ] +} \ No newline at end of file diff --git a/cmr_s3_subscriber/test_messages/single_invalid_no_retry.json b/cmr_s3_subscriber/test_messages/single_invalid_no_retry.json new file mode 100644 index 0000000..ed79db0 --- /dev/null +++ b/cmr_s3_subscriber/test_messages/single_invalid_no_retry.json @@ -0,0 +1,11 @@ +{ + "Records": [ + { + "messageId": "b0574ee3-336a-4753-bb33-766eca3dbf92", + "body": "{\"Type\": \"Notification\", \"MessageId\": \"bc8511b9-9e4d-5130-9ff9-710989b0531e\", \"TopicArn\": \"arn:aws:sns:us-east-1:621933553860:cmr-subscriptions-prod\", \"Subject\": \"Update Notification\", \"Message\": \"{\\\"concept-id\\\": \\\"G3550282859-LARC_CLOUD\\\", \\\"granule-ur\\\": \\\"TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc\\\", \\\"location\\\": \\\"https://localhost:443/concepts/G3550282859-LARC_CLOUD/2\\\", \\\"producer-granule-id\\\": \\\"TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc\\\"}\", \"Timestamp\": \"2025-05-27T19:42:53.639Z\", \"SignatureVersion\": \"1\", \"Signature\": 
\"eDDx7tBu5YTU+ltKuHNjTsLraBSGSZTBazEVcmnpKJL3yS0pM7795FheN/u/xTyr3StzgQVxjPUQV5UK9rulZACH2QmNT4ZXKWOfAB8EAZDI/+/0EnTkRx5bWPLbNKRdcA99aPdlhcU62FfCSTHHHrichvV2gn/R5P2h7fR/+G5/1RJWPuplI5BNUvh8XdG0zpn8IAu8vFzH8BhNGGtGQJ+tZicflYPrZ4ZlF4SsEGm02l36FTe8ccpXCNswDbsK5Z8tRejMUntHMoEUOp4BmLUex70kpgjcciDQH4jRRmLmf6jkKSo4IaMl6M88WXcHvM+pWr7jDANSCxK2lv977A==\", \"SigningCertURL\": \"https://sns.us-east-1.amazonaws.com/SimpleNotificationService-9c6465fa7f48f5cacd23014631ec1136.pem\", \"UnsubscribeURL\": \"https://sns.us-east-1.amazonaws.com/?Action=Unsubscribe&SubscriptionArn=arn:aws:sns:us-east-1:621933553860:cmr-subscriptions-prod:b8d71539-6e1f-4ba9-90de-0b6cac50352b\", \"MessageAttributes\": {\"collection-concept-id\": {\"Type\": \"String\", \"Value\": \"C2930763263-LARC_CLOUD\"}, \"mode\": {\"Type\": \"String\", \"Value\": \"Update\"}, \"subscriber\": {\"Type\": \"String\", \"Value\": \"aqacf\"}}}", + "Attributes": { + "ApproximateReceiveCount": "6" + } + } + ] +} \ No newline at end of file diff --git a/cmr_s3_subscriber/test_messages/single_record.json b/cmr_s3_subscriber/test_messages/single_record.json new file mode 100644 index 0000000..2b2c258 --- /dev/null +++ b/cmr_s3_subscriber/test_messages/single_record.json @@ -0,0 +1,8 @@ +{ + "Records": [ + { + "messageId": "b0574ee3-336a-4753-bb33-766eca3dbf92", + "body": "{\"Type\": \"Notification\", \"MessageId\": \"bc8511b9-9e4d-5130-9ff9-710989b0531e\", \"TopicArn\": \"arn:aws:sns:us-east-1:621933553860:cmr-subscriptions-prod\", \"Subject\": \"Update Notification\", \"Message\": \"{\\\"concept-id\\\": \\\"G3550282859-LARC_CLOUD\\\", \\\"granule-ur\\\": \\\"TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc\\\", \\\"location\\\": \\\"https://cmr.earthdata.nasa.gov:443/concepts/G3550282859-LARC_CLOUD/2\\\", \\\"producer-granule-id\\\": \\\"TEMPO_NO2_L3_V03_20250527T123504Z_S004.nc\\\"}\", \"Timestamp\": \"2025-05-27T19:42:53.639Z\", \"SignatureVersion\": \"1\", \"Signature\": 
\"eDDx7tBu5YTU+ltKuHNjTsLraBSGSZTBazEVcmnpKJL3yS0pM7795FheN/u/xTyr3StzgQVxjPUQV5UK9rulZACH2QmNT4ZXKWOfAB8EAZDI/+/0EnTkRx5bWPLbNKRdcA99aPdlhcU62FfCSTHHHrichvV2gn/R5P2h7fR/+G5/1RJWPuplI5BNUvh8XdG0zpn8IAu8vFzH8BhNGGtGQJ+tZicflYPrZ4ZlF4SsEGm02l36FTe8ccpXCNswDbsK5Z8tRejMUntHMoEUOp4BmLUex70kpgjcciDQH4jRRmLmf6jkKSo4IaMl6M88WXcHvM+pWr7jDANSCxK2lv977A==\", \"SigningCertURL\": \"https://sns.us-east-1.amazonaws.com/SimpleNotificationService-9c6465fa7f48f5cacd23014631ec1136.pem\", \"UnsubscribeURL\": \"https://sns.us-east-1.amazonaws.com/?Action=Unsubscribe&SubscriptionArn=arn:aws:sns:us-east-1:621933553860:cmr-subscriptions-prod:b8d71539-6e1f-4ba9-90de-0b6cac50352b\", \"MessageAttributes\": {\"collection-concept-id\": {\"Type\": \"String\", \"Value\": \"C2930763263-LARC_CLOUD\"}, \"mode\": {\"Type\": \"String\", \"Value\": \"Update\"}, \"subscriber\": {\"Type\": \"String\", \"Value\": \"aqacf\"}}}" + } + ] +} \ No newline at end of file From 0c082ffd79597aa02ee79f0b5628db1f09e7e2cb Mon Sep 17 00:00:00 2001 From: rileykk Date: Mon, 16 Jun 2025 12:58:32 -0700 Subject: [PATCH 03/21] ignore update --- cmr_s3_subscriber/.gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cmr_s3_subscriber/.gitignore b/cmr_s3_subscriber/.gitignore index 5adeb77..4a6586d 100644 --- a/cmr_s3_subscriber/.gitignore +++ b/cmr_s3_subscriber/.gitignore @@ -2,6 +2,9 @@ lambda/*.zip subscriptions/ terraform/.terraform/ terraform/.terraform.lock.hcl -terraform/config.yaml +terraform/config.yaml* terraform/terraform.tfstate* terraform/*.tfvars +terraform/credentials.env +config.yaml +credentials.env \ No newline at end of file From e1ceedfa9305b6069fc31ace253ac45f45531cbd Mon Sep 17 00:00:00 2001 From: rileykk Date: Mon, 16 Jun 2025 13:00:38 -0700 Subject: [PATCH 04/21] remove cmr api dryrun --- cmr_s3_subscriber/terraform/subscription/main.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmr_s3_subscriber/terraform/subscription/main.tf 
b/cmr_s3_subscriber/terraform/subscription/main.tf index ea3ff4a..35527fc 100644 --- a/cmr_s3_subscriber/terraform/subscription/main.tf +++ b/cmr_s3_subscriber/terraform/subscription/main.tf @@ -58,13 +58,13 @@ resource "null_resource" "create_subscription" { } provisioner "local-exec" { - command = "source venv/bin/activate; python subscriber.py ${var.config_file} ${var.ccid} --queue ${aws_sqs_queue.queue.arn} --dryrun; cp subscriptions/C2930763263-LARC_CLOUD-subscription.xml subscriptions/${var.ccid}-subscription.xml" + command = "source venv/bin/activate; python subscriber.py ${var.config_file} ${var.ccid} --queue ${aws_sqs_queue.queue.arn}" working_dir = var.script_dir interpreter = ["/bin/bash", "-c"] } provisioner "local-exec" { - command = "source venv/bin/activate; python delete.py ${self.triggers.config_file} --response-xml subscriptions/${self.triggers.ccid}-subscription.xml --dryrun" + command = "source venv/bin/activate; python delete.py ${self.triggers.config_file} --response-xml subscriptions/${self.triggers.ccid}-subscription.xml" working_dir = self.triggers.script_dir interpreter = ["/bin/bash", "-c"] when = destroy From f6e91e03992ff5978c84eaaeaebc1827390b8169 Mon Sep 17 00:00:00 2001 From: rileykk Date: Tue, 24 Jun 2025 14:48:42 -0700 Subject: [PATCH 05/21] small fix --- cmr_s3_subscriber/lambda/lambda_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmr_s3_subscriber/lambda/lambda_function.py b/cmr_s3_subscriber/lambda/lambda_function.py index c9c3e98..9cc6da6 100644 --- a/cmr_s3_subscriber/lambda/lambda_function.py +++ b/cmr_s3_subscriber/lambda/lambda_function.py @@ -105,7 +105,7 @@ def _submit_maap_job(short_name, granule_ur): readable_granule_name=granule_ur, cmr_host="cmr.earthdata.nasa.gov", limit=1 - )[0]['concept-id'] + )[0]['collection-concept-id'] algo = os.environ['MAAP_ALGO_ID'] algo_version = os.environ['MAAP_ALGO_VERSION'] From a09903321ef19212e98afac66f6dbf668d2714cf Mon Sep 17 00:00:00 2001 From: 
rileykk Date: Tue, 24 Jun 2025 14:50:55 -0700 Subject: [PATCH 06/21] added backfill/reconciliation script --- cmr_s3_subscriber/fill_queue_by_query.py | 297 +++++++++++++++++++++++ 1 file changed, 297 insertions(+) create mode 100644 cmr_s3_subscriber/fill_queue_by_query.py diff --git a/cmr_s3_subscriber/fill_queue_by_query.py b/cmr_s3_subscriber/fill_queue_by_query.py new file mode 100644 index 0000000..f0dbd3b --- /dev/null +++ b/cmr_s3_subscriber/fill_queue_by_query.py @@ -0,0 +1,297 @@ +import argparse +import json +import os +from datetime import datetime +from urllib.parse import urlparse +from uuid import uuid4 + +import boto3 +import earthaccess +import requests +import yaml + +CMR_MIN_PAGE = 10 +CMR_MAX_PAGE = 2000 + + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument( + 'config', + default='config.yaml', + help='Configuration file for Earthdata login and SQS ARN' + ) + + parser.add_argument( + 'ccid', + help='Collection concept ID to fill data for' + ) + + parser.add_argument( + '--start-time', + required=False, + default=None, + type=lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ'), + help='Optional filter collection granules to those at or after this time. Must be in yyyy-mm-ddThh:mm:ssZ ' + 'format' + ) + + parser.add_argument( + '--end-time', + required=False, + default=None, + type=lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ'), + help='Optional filter collection granules to those at or before this time. 
Must be in yyyy-mm-ddThh:mm:ssZ ' + 'format' + ) + + def __validate_bbox(s): + coords = [float(c) for c in s.split(',')] + + if len(coords) != 4: + raise ValueError(f'Expected 4 coordinates but got {len(coords)}') + + if not (-180 <= coords[0] <= 180 and -180 <= coords[2] <= 180): + raise ValueError('A longitude coordinate is out of range') + + if not (-90 <= coords[1] <= 90 and -90 <= coords[3] <= 90): + raise ValueError('A latitude coordinate is out of range') + + if coords[2] <= coords[0] or coords[3] <= coords[1]: + raise ValueError('One of min/max lat/lon are flipped') + + return s + + parser.add_argument( + '--bbox', + required=False, + type=__validate_bbox, + default=None, + help='Optional filter to filter collection granule to those matching a bounding box. Defined as 4 ' + 'comma-separated numbers in order of: ' + 'minimum_longitude,minimum_latitude,maximum_longitude,maximum_latitude. Longitudes must be between +/-180 ' + 'and latitudes must be between +/- 90' + ) + + def __validate_s3_url(s): + parsed = urlparse(s) + + if parsed.scheme != 's3': + raise ValueError(f'Expected S3 URL but got {parsed.scheme}') + + return s + + parser.add_argument( + '--staged-data', + required=False, + default=None, + type=__validate_s3_url, + help='S3 URL prefix for location of data already staged. If provided, granules with all data files already ' + 'staged in S3 will be filtered out' + ) + + def __positive_integer(s): + i = int(s) + + if i <= 0: + raise ValueError(f'Expected positive integer but got {i}') + + return i + + parser.add_argument( + '--limit', + required=False, + default=None, + type=__positive_integer, + help='Limit the number of generated SQS messages' + ) + + parser.add_argument( + '--page-size', + required=False, + default=1000, + type=__positive_integer, + help='Page size for CMR queries. Default: 1000, min: 10, max: 2000. 
Values outside the min/max will be ' + 'overridden' + ) + + return parser.parse_args() + + +def filter_granules(granules, s3_url): + parsed_url = urlparse(s3_url) + s3 = boto3.client('s3') + + paginator = s3.get_paginator('list_objects_v2') + + staged_files = [] + + for page in paginator.paginate(Bucket=parsed_url.netloc, Prefix=parsed_url.path.lstrip('/')): + staged_files.extend([os.path.basename(o['Key']) for o in page.get('Contents', [])]) + + staged_files = set(staged_files) + + print(f'Found {len(staged_files):,} files staged in S3') + + filtered_granules = [] + + for granule in granules: + links = granule['umm']['RelatedUrls'] + + data_urls = [u['URL'] for u in links if u['Type'] == 'GET DATA'] + data_files = set([os.path.basename(url) for url in data_urls]) + + if data_files.intersection(staged_files) != data_files: + filtered_granules.append(granule) + + print(f'Filtered {len(granules):,} granules to {len(filtered_granules):,}') + return filtered_granules + + +def main(args): + with open(args.config, 'r') as f: + config = yaml.safe_load(f) + + os.environ['EARTHDATA_USERNAME'] = config['edl_username'] + os.environ['EARTHDATA_PASSWORD'] = config['edl_password'] + + auth = earthaccess.login(strategy='environment') + bearer_token = auth.token['access_token'] + + queue_arn = config['queue_arn'] + + ccid = args.ccid + + cmr_search_url = 'https://cmr.earthdata.nasa.gov/search/granules.umm_json_v1_4' + + search_query = { + 'collection_concept_id': ccid, + 'page_size': min(max(args.page_size, CMR_MIN_PAGE), CMR_MAX_PAGE) + } + + if args.start_time is not None or args.end_time is not None: + start_q_str = args.start_time.strftime('%Y-%m-%dT%H:%M:%SZ') if args.start_time is not None else '' + end_q_str = args.end_time.strftime('%Y-%m-%dT%H:%M:%SZ') if args.end_time is not None else '' + + search_query['temporal[]'] = f'{start_q_str},{end_q_str}' + + if args.bbox is not None: + search_query['bounding_box[]'] = args.bbox + + matched_granules = [] + + print(f'Querying 
{cmr_search_url} with params {search_query}') + response = requests.get(cmr_search_url, params=search_query) + response.raise_for_status() + + response_json = response.json() + + n_hits = response_json['hits'] + matched_granules.extend(response_json['items']) + search_after = response.headers.get('CMR-Search-After', None) + + while search_after is not None: + headers = {'CMR-Search-After': search_after} + print(f'Querying {cmr_search_url} with params {search_query} and headers {headers}') + response = requests.get(cmr_search_url, params=search_query, headers=headers) + response.raise_for_status() + + response_json = response.json() + + matched_granules.extend(response_json['items']) + search_after = response.headers.get('CMR-Search-After', None) + + print(f'Finished CMR query. Found {len(matched_granules):,} granules.') + + if len(matched_granules) != n_hits: + print('Mismatch between number of granules and initial number of hits') + exit(1) + + print(json.dumps(matched_granules[0], indent=2)) + + if args.staged_data is not None: + matched_granules = filter_granules(matched_granules, args.staged_data) + + if args.limit is not None: + print(f'Limiting {len(matched_granules)} granules to {args.limit}') + matched_granules = matched_granules[:args.limit] + + print('Converting to CMR notifications') + + def _try_get_pgid_from_umm(umm): + try: + identifiers = umm['DataGranule']['Identifiers'] + for i in identifiers: + if i['IdentifierType'] == 'ProducerGranuleId': + return i['Identifier'] + return umm['GranuleUR'] + except: + return umm['GranuleUR'] + + matched_granules = [ + { + 'concept-id': m['meta']['concept-id'], + 'granule-ur': m['umm']['GranuleUR'], + 'location': f"https://cmr.earthdata.nasa.gov:443/concepts/" + f"{m['meta']['concept-id']}/{m['meta']['revision-id']}", + 'producer-granule-id': _try_get_pgid_from_umm(m['umm']) + } for m in matched_granules + ] + + print(json.dumps(matched_granules[0], indent=2)) + + print('Converting to SQS messages') + + 
matched_granules = [ + { + 'Type': 'Notification', + 'MessageId': str(uuid4()), + 'TopicArn': 'arn:aws:sns:us-east-1:621933553860:cmr-subscriptions-prod', + 'Subject': 'New Notification', + 'Message': json.dumps(m), + 'Timestamp': datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ'), + 'MessageAttributes': { + 'collection-concept-id': { + 'Type': 'String', + 'Value': ccid + }, + 'mode': { + 'Type': 'String', + 'Value': 'New' + }, + 'subscriber': { + 'Type': 'String', + 'Value': config['edl_username'] + } + } + + } for m in matched_granules + ] + + print(json.dumps(matched_granules[0], indent=2)) + + sqs = boto3.client('sqs') + + queue_url = sqs.get_queue_url(QueueName=queue_arn.split(':')[-1])['QueueUrl'] + + message_batches = [[{ + 'Id': str(uuid4()), + 'MessageBody': json.dumps(m), + } for m in matched_granules[i:i+10]] for i in range(0, len(matched_granules), 10)] + + failed = [] + + for batch in message_batches: + print(f'Sending batch of {len(batch)} messages to queue {queue_url}') + response = sqs.send_message_batch(QueueUrl=queue_url, Entries=batch) + failed.extend(response.get('Failed', [])) + + if len(failed) > 0: + print(f'Failed {len(failed):,} failed messages:\n{json.dumps(failed, indent=2)}') + + +if __name__ == '__main__': + main(parse_arguments()) + # filter_granules([], 's3://aqacf-nexus-stage/TEMPO/TEMPO_NO2_L3_V03/') + From 8ba7e3c9aeb1c6d92cc7a8549cac42b0c6cd9915 Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 26 Jun 2025 08:26:10 -0700 Subject: [PATCH 07/21] update terraform --- cmr_s3_subscriber/terraform/main.tf | 2 +- cmr_s3_subscriber/terraform/outputs.tf | 2 +- cmr_s3_subscriber/terraform/provider.tf | 68 ++++++++++++------------ cmr_s3_subscriber/terraform/variables.tf | 20 +++++++ 4 files changed, 56 insertions(+), 36 deletions(-) diff --git a/cmr_s3_subscriber/terraform/main.tf b/cmr_s3_subscriber/terraform/main.tf index 0f49da2..55033cd 100644 --- a/cmr_s3_subscriber/terraform/main.tf +++ b/cmr_s3_subscriber/terraform/main.tf @@ -126,7 
+126,7 @@ module "subscriptions" { ccid = each.value config_file = local_file.subscriber_config_file.filename - script_dir = "/Users/rileykk/FireAlarm/subscriber" + script_dir = var.script_dir } resource "aws_lambda_event_source_mapping" "sqs_triggers" { diff --git a/cmr_s3_subscriber/terraform/outputs.tf b/cmr_s3_subscriber/terraform/outputs.tf index 839e32b..c5b55a2 100644 --- a/cmr_s3_subscriber/terraform/outputs.tf +++ b/cmr_s3_subscriber/terraform/outputs.tf @@ -20,7 +20,7 @@ output "staging_bucket" { } output "subscription_info" { - value = {for ccid, sub in module.subscriptions: ccid => sub.queue_arn} + value = { for ccid, sub in module.subscriptions : ccid => sub.queue_arn } } output "triggers_enabled" { diff --git a/cmr_s3_subscriber/terraform/provider.tf b/cmr_s3_subscriber/terraform/provider.tf index c4bb9d4..4572d5f 100644 --- a/cmr_s3_subscriber/terraform/provider.tf +++ b/cmr_s3_subscriber/terraform/provider.tf @@ -1,39 +1,39 @@ provider "aws" { - region = "us-west-2" + region = "us-west-2" -# access_key = "test" -# secret_key = "test" -# s3_use_path_style = false -# skip_credentials_validation = true -# skip_metadata_api_check = true -# skip_requesting_account_id = true -# -# endpoints { -# apigateway = "http://localhost:4566" -# apigatewayv2 = "http://localhost:4566" -# cloudformation = "http://localhost:4566" -# cloudwatch = "http://localhost:4566" -# dynamodb = "http://localhost:4566" -# ec2 = "http://localhost:4566" -# es = "http://localhost:4566" -# elasticache = "http://localhost:4566" -# firehose = "http://localhost:4566" -# iam = "http://localhost:4566" -# kinesis = "http://localhost:4566" -# lambda = "http://localhost:4566" -# rds = "http://localhost:4566" -# redshift = "http://localhost:4566" -# route53 = "http://localhost:4566" -# s3 = "http://s3.localhost.localstack.cloud:4566" -# secretsmanager = "http://localhost:4566" -# ses = "http://localhost:4566" -# sns = "http://localhost:4566" -# sqs = "http://localhost:4566" -# ssm = 
"http://localhost:4566" -# stepfunctions = "http://localhost:4566" -# sts = "http://localhost:4566" -# logs = "http://localhost:4566" -# } + # access_key = "test" + # secret_key = "test" + # s3_use_path_style = false + # skip_credentials_validation = true + # skip_metadata_api_check = true + # skip_requesting_account_id = true + # + # endpoints { + # apigateway = "http://localhost:4566" + # apigatewayv2 = "http://localhost:4566" + # cloudformation = "http://localhost:4566" + # cloudwatch = "http://localhost:4566" + # dynamodb = "http://localhost:4566" + # ec2 = "http://localhost:4566" + # es = "http://localhost:4566" + # elasticache = "http://localhost:4566" + # firehose = "http://localhost:4566" + # iam = "http://localhost:4566" + # kinesis = "http://localhost:4566" + # lambda = "http://localhost:4566" + # rds = "http://localhost:4566" + # redshift = "http://localhost:4566" + # route53 = "http://localhost:4566" + # s3 = "http://s3.localhost.localstack.cloud:4566" + # secretsmanager = "http://localhost:4566" + # ses = "http://localhost:4566" + # sns = "http://localhost:4566" + # sqs = "http://localhost:4566" + # ssm = "http://localhost:4566" + # stepfunctions = "http://localhost:4566" + # sts = "http://localhost:4566" + # logs = "http://localhost:4566" + # } } data "aws_caller_identity" "current" {} diff --git a/cmr_s3_subscriber/terraform/variables.tf b/cmr_s3_subscriber/terraform/variables.tf index e7e7290..24c3cda 100644 --- a/cmr_s3_subscriber/terraform/variables.tf +++ b/cmr_s3_subscriber/terraform/variables.tf @@ -21,6 +21,26 @@ variable "edl_password" { sensitive = true } +variable "script_dir" { + type = string + description = "Path to directory containing CMR subscription scripts and python venv" + + validation { + condition = fileexists(join("/", [trimsuffix(pathexpand(var.script_dir), "/"), "subscriber.py"])) + error_message = "Cannot find subscriber script in script_dir" + } + + validation { + condition = fileexists(join("/", 
[trimsuffix(pathexpand(var.script_dir), "/"), "delete.py"])) + error_message = "Cannot find subscription deletion script in script_dir" + } + + validation { + condition = fileexists(join("/", [trimsuffix(pathexpand(var.script_dir), "/"), "venv", "bin", "python"])) + error_message = "Cannot find python venv in script_dir (expected to be in script_dir/venv)" + } +} + variable "ccids" { type = list(string) description = "List of CMR collection-concept-IDs to subscribe to. Note: Initial apply should have either no CCIDs listed or triggers disabled until the notification SNS topic is subscribed to" From ec105248e7c2b8af8cae42533def788bbc628976 Mon Sep 17 00:00:00 2001 From: rileykk Date: Wed, 2 Jul 2025 16:07:42 -0700 Subject: [PATCH 08/21] update maap tag --- cmr_s3_subscriber/lambda/lambda_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmr_s3_subscriber/lambda/lambda_function.py b/cmr_s3_subscriber/lambda/lambda_function.py index 9cc6da6..cc7b939 100644 --- a/cmr_s3_subscriber/lambda/lambda_function.py +++ b/cmr_s3_subscriber/lambda/lambda_function.py @@ -114,7 +114,7 @@ def _submit_maap_job(short_name, granule_ur): kwargs = {v.removeprefix('_maap_kwarg_'): os.environ[v] for v in os.environ.keys() if v.startswith('_maap_kwarg_')} job = maap.submitJob( - identifier=granule_ur, + identifier=f"CMR_subscriber_ingest_{granule_ur}", algo_id=algo, version=algo_version, queue=queue, From cadaa1371f14d8dcbd9da38b15127099a372b39e Mon Sep 17 00:00:00 2001 From: rileykk Date: Wed, 2 Jul 2025 16:10:48 -0700 Subject: [PATCH 09/21] add ASF header to all source files --- cmr_s3_subscriber/delete.py | 16 ++++++++++++++++ cmr_s3_subscriber/fill_queue_by_query.py | 16 ++++++++++++++++ cmr_s3_subscriber/lambda/Makefile | 15 +++++++++++++++ cmr_s3_subscriber/lambda/lambda_function.py | 16 ++++++++++++++++ cmr_s3_subscriber/subscriber.py | 16 ++++++++++++++++ cmr_s3_subscriber/terraform/iam.tf | 15 +++++++++++++++ cmr_s3_subscriber/terraform/main.tf | 15 
+++++++++++++++ cmr_s3_subscriber/terraform/outputs.tf | 15 +++++++++++++++ cmr_s3_subscriber/terraform/provider.tf | 18 ++++++++++++++++++ cmr_s3_subscriber/terraform/s3.tf | 15 +++++++++++++++ .../terraform/sqs_backfill_queue.tf | 15 +++++++++++++++ .../terraform/subscription/main.tf | 15 +++++++++++++++ .../terraform/subscription/outputs.tf | 16 ++++++++++++++++ .../terraform/subscription/variables.tf | 15 +++++++++++++++ .../terraform/subscription/versions.tf | 16 ++++++++++++++++ cmr_s3_subscriber/terraform/variables.tf | 15 +++++++++++++++ cmr_s3_subscriber/terraform/versions.tf | 16 ++++++++++++++++ 17 files changed, 265 insertions(+) diff --git a/cmr_s3_subscriber/delete.py b/cmr_s3_subscriber/delete.py index ab7964b..66ce428 100644 --- a/cmr_s3_subscriber/delete.py +++ b/cmr_s3_subscriber/delete.py @@ -1,3 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + import argparse import requests import earthaccess diff --git a/cmr_s3_subscriber/fill_queue_by_query.py b/cmr_s3_subscriber/fill_queue_by_query.py index f0dbd3b..8953a8e 100644 --- a/cmr_s3_subscriber/fill_queue_by_query.py +++ b/cmr_s3_subscriber/fill_queue_by_query.py @@ -1,3 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + import argparse import json import os diff --git a/cmr_s3_subscriber/lambda/Makefile b/cmr_s3_subscriber/lambda/Makefile index 37338d8..f2dedf0 100644 --- a/cmr_s3_subscriber/lambda/Makefile +++ b/cmr_s3_subscriber/lambda/Makefile @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + package.zip: lambda_function.py ../requirements.txt rm -f package.zip diff --git a/cmr_s3_subscriber/lambda/lambda_function.py b/cmr_s3_subscriber/lambda/lambda_function.py index cc7b939..092e8a8 100644 --- a/cmr_s3_subscriber/lambda/lambda_function.py +++ b/cmr_s3_subscriber/lambda/lambda_function.py @@ -1,3 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + import hashlib import json import os diff --git a/cmr_s3_subscriber/subscriber.py b/cmr_s3_subscriber/subscriber.py index de2be32..274145e 100644 --- a/cmr_s3_subscriber/subscriber.py +++ b/cmr_s3_subscriber/subscriber.py @@ -1,3 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + import argparse import requests import earthaccess diff --git a/cmr_s3_subscriber/terraform/iam.tf b/cmr_s3_subscriber/terraform/iam.tf index aea4d62..103dbac 100644 --- a/cmr_s3_subscriber/terraform/iam.tf +++ b/cmr_s3_subscriber/terraform/iam.tf @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ data "aws_iam_policy_document" "assume_policy" { statement { diff --git a/cmr_s3_subscriber/terraform/main.tf b/cmr_s3_subscriber/terraform/main.tf index 55033cd..b7283fd 100644 --- a/cmr_s3_subscriber/terraform/main.tf +++ b/cmr_s3_subscriber/terraform/main.tf @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + resource "aws_dynamodb_table" "collection_lookup" { name = "cmr-subscriber-collections" diff --git a/cmr_s3_subscriber/terraform/outputs.tf b/cmr_s3_subscriber/terraform/outputs.tf index c5b55a2..eed8ed8 100644 --- a/cmr_s3_subscriber/terraform/outputs.tf +++ b/cmr_s3_subscriber/terraform/outputs.tf @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + output "notification_topic" { value = aws_sns_topic.subscriber_notify.arn diff --git a/cmr_s3_subscriber/terraform/provider.tf b/cmr_s3_subscriber/terraform/provider.tf index 4572d5f..717be5f 100644 --- a/cmr_s3_subscriber/terraform/provider.tf +++ b/cmr_s3_subscriber/terraform/provider.tf @@ -1,6 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + provider "aws" { region = "us-west-2" + # Uncomment the below rows to test using localstack + # access_key = "test" # secret_key = "test" # s3_use_path_style = false diff --git a/cmr_s3_subscriber/terraform/s3.tf b/cmr_s3_subscriber/terraform/s3.tf index 6da8008..eca1320 100644 --- a/cmr_s3_subscriber/terraform/s3.tf +++ b/cmr_s3_subscriber/terraform/s3.tf @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + resource "aws_s3_bucket" "provisioned_bucket" { bucket_prefix = var.bucket_name_prefix diff --git a/cmr_s3_subscriber/terraform/sqs_backfill_queue.tf b/cmr_s3_subscriber/terraform/sqs_backfill_queue.tf index 95800a4..59ace5a 100644 --- a/cmr_s3_subscriber/terraform/sqs_backfill_queue.tf +++ b/cmr_s3_subscriber/terraform/sqs_backfill_queue.tf @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ resource "aws_sqs_queue" "backfill_queue" { name = "cmr-subscriber-backfill-queue" diff --git a/cmr_s3_subscriber/terraform/subscription/main.tf b/cmr_s3_subscriber/terraform/subscription/main.tf index 35527fc..1a6105d 100644 --- a/cmr_s3_subscriber/terraform/subscription/main.tf +++ b/cmr_s3_subscriber/terraform/subscription/main.tf @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + data "aws_caller_identity" "current" {} diff --git a/cmr_s3_subscriber/terraform/subscription/outputs.tf b/cmr_s3_subscriber/terraform/subscription/outputs.tf index f6760ee..322b9f0 100644 --- a/cmr_s3_subscriber/terraform/subscription/outputs.tf +++ b/cmr_s3_subscriber/terraform/subscription/outputs.tf @@ -1,3 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + output "queue_arn" { value = aws_sqs_queue.queue.arn } diff --git a/cmr_s3_subscriber/terraform/subscription/variables.tf b/cmr_s3_subscriber/terraform/subscription/variables.tf index d21cd74..18af560 100644 --- a/cmr_s3_subscriber/terraform/subscription/variables.tf +++ b/cmr_s3_subscriber/terraform/subscription/variables.tf @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + variable "ccid" { type = string diff --git a/cmr_s3_subscriber/terraform/subscription/versions.tf b/cmr_s3_subscriber/terraform/subscription/versions.tf index 5b52042..6385e6c 100644 --- a/cmr_s3_subscriber/terraform/subscription/versions.tf +++ b/cmr_s3_subscriber/terraform/subscription/versions.tf @@ -1,3 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + terraform { required_providers { aws = { diff --git a/cmr_s3_subscriber/terraform/variables.tf b/cmr_s3_subscriber/terraform/variables.tf index 24c3cda..722c9c4 100644 --- a/cmr_s3_subscriber/terraform/variables.tf +++ b/cmr_s3_subscriber/terraform/variables.tf @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ variable "existing_bucket" { type = string diff --git a/cmr_s3_subscriber/terraform/versions.tf b/cmr_s3_subscriber/terraform/versions.tf index 43c2b5f..0b206f0 100644 --- a/cmr_s3_subscriber/terraform/versions.tf +++ b/cmr_s3_subscriber/terraform/versions.tf @@ -1,3 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + terraform { required_providers { aws = { From 2d08f092de98ed18019add128af098b7f7add952 Mon Sep 17 00:00:00 2001 From: rileykk Date: Mon, 7 Jul 2025 08:06:47 -0700 Subject: [PATCH 10/21] cache temp s3 credentials --- cmr_s3_subscriber/lambda/lambda_function.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmr_s3_subscriber/lambda/lambda_function.py b/cmr_s3_subscriber/lambda/lambda_function.py index 092e8a8..17e5212 100644 --- a/cmr_s3_subscriber/lambda/lambda_function.py +++ b/cmr_s3_subscriber/lambda/lambda_function.py @@ -17,6 +17,7 @@ import hashlib import json import os +from functools import cache from urllib.parse import urlparse import boto3 @@ -298,6 +299,7 @@ def _process_umm(umm): ) +@cache def _try_get_s3_creds(endpoint, bearer_token): try: headers = { From 47847b56cf9acec344770033da41fb13244ca5b8 Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 10 Jul 2025 09:02:20 -0700 Subject: [PATCH 11/21] Update to support collection-level options --- cmr_s3_subscriber/lambda/lambda_function.py | 37 ++++++++++++++++--- cmr_s3_subscriber/terraform/main.tf | 2 + .../terraform/subscription/main.tf | 30 ++++++++++++++- .../terraform/subscription/variables.tf | 19 ++++++++++ cmr_s3_subscriber/terraform/variables.tf | 15 ++++++++ 5 files changed, 97 insertions(+), 6 deletions(-) diff --git a/cmr_s3_subscriber/lambda/lambda_function.py b/cmr_s3_subscriber/lambda/lambda_function.py index 17e5212..8074513 100644 --- a/cmr_s3_subscriber/lambda/lambda_function.py +++ b/cmr_s3_subscriber/lambda/lambda_function.py @@ -17,6 +17,7 @@ import hashlib import json import os +import traceback from functools import cache from urllib.parse import urlparse @@ -112,11 +113,14 @@ def _fail_out_record(record, reason='exceeded retries'): ) -def _submit_maap_job(short_name, granule_ur): +def _submit_maap_job(short_name, granule_ur, maap_config=None): from maap.maap import MAAP maap = MAAP() + if maap_config is None: + raise Exception('MAAP config not provided') + ccid = 
maap.searchGranule( short_name=short_name, readable_granule_name=granule_ur, @@ -130,13 +134,21 @@ def _submit_maap_job(short_name, granule_ur): kwargs = {v.removeprefix('_maap_kwarg_'): os.environ[v] for v in os.environ.keys() if v.startswith('_maap_kwarg_')} - job = maap.submitJob( + kwargs.update(maap_config) + + job_kwargs = dict( identifier=f"CMR_subscriber_ingest_{granule_ur}", algo_id=algo, version=algo_version, queue=queue, granule_id=granule_ur, collection_id=ccid, + ) + + print(f'Submitting MAAP job with parameters: {dict(**job_kwargs, **kwargs)}') + + job = maap.submitJob( + **job_kwargs, **kwargs ) @@ -170,7 +182,11 @@ def _handle_cmr_notification(message, bearer_token): if 'MAAP_PGT' in os.environ: print('Submitting job through MAAP instead of staging in this function') - return _submit_maap_job(processed_umm['collection'], message["granule-ur"]) + return _submit_maap_job( + processed_umm['collection'], + message["granule-ur"], + processed_umm['maap_config'] + ) print(f'Downloading files for granule {processed_umm["granule"]}') @@ -282,10 +298,18 @@ def _process_umm(umm): } ) + print(f'Checked lookup table for {collection}') + print(collection_query['Items']) + if len(collection_query['Items']) == 0: s3_prefix = collection + maap_config = None else: - s3_prefix = collection_query['Items'][0]['s3_prefix']['S'] + collection_entry = collection_query['Items'][0] + + s3_prefix = collection_entry['s3_prefix']['S'] if 's3_prefix' in collection_entry else collection + maap_config = collection_entry['maap_config']['M'] if 'maap_config' in collection_entry else {} + maap_config = {k: list(v.values())[0] for k, v in maap_config.items()} if s3_prefix[-1] != '/': s3_prefix += '/' @@ -295,7 +319,8 @@ def _process_umm(umm): collection=collection, files=file_map, s3_prefix=s3_prefix, - s3_credentials_url=creds_url + s3_credentials_url=creds_url, + maap_config=maap_config, ) @@ -386,6 +411,8 @@ def lambda_handler(event, context): print(f'Failed to process record 
{record["messageId"]} and will not retry: {e}') _fail_out_record(record) + print(traceback.format_exc()) + # TODO implement return { 'statusCode': 200, diff --git a/cmr_s3_subscriber/terraform/main.tf b/cmr_s3_subscriber/terraform/main.tf index b7283fd..1f08875 100644 --- a/cmr_s3_subscriber/terraform/main.tf +++ b/cmr_s3_subscriber/terraform/main.tf @@ -142,6 +142,8 @@ module "subscriptions" { ccid = each.value config_file = local_file.subscriber_config_file.filename script_dir = var.script_dir + ddb_table = aws_dynamodb_table.collection_lookup.name + options = lookup(var.collection_options, each.value, null) } resource "aws_lambda_event_source_mapping" "sqs_triggers" { diff --git a/cmr_s3_subscriber/terraform/subscription/main.tf b/cmr_s3_subscriber/terraform/subscription/main.tf index 1a6105d..27fef53 100644 --- a/cmr_s3_subscriber/terraform/subscription/main.tf +++ b/cmr_s3_subscriber/terraform/subscription/main.tf @@ -28,6 +28,34 @@ resource "aws_sqs_queue" "queue" { visibility_timeout_seconds = 1800 } +data "aws_dynamodb_table" "table" { + name = var.ddb_table +} + +locals { + options_json = var.options == null ? {} : merge( + var.options.s3_path == null ? {} : {s3_prefix = {S = var.options.s3_path}}, + var.options.maap_config == null ? {} : {maap_config = { + M = merge( + { zarr_config_url = { S = var.options.maap_config.zarr_config_url } }, + { variables = { S = var.options.maap_config.variables } }, + var.options.maap_config.polygon == null ? {} : {polygon = { S = var.options.maap_config.polygon }} + ) + }} + ) +} + +resource "aws_dynamodb_table_item" "collection_options" { + hash_key = data.aws_dynamodb_table.table.hash_key + item = jsonencode(merge( + zipmap([data.aws_dynamodb_table.table.hash_key], [{S = var.options.shortname}]), + local.options_json + )) + table_name = data.aws_dynamodb_table.table.name + + count = var.options != null ? 
1 : 0 +} + data "aws_iam_policy_document" "queue_policy" { statement { sid = "__owner_statement" @@ -64,7 +92,7 @@ data "aws_iam_policy_document" "queue_policy" { } resource "null_resource" "create_subscription" { - depends_on = [aws_sqs_queue.queue] + depends_on = [aws_sqs_queue.queue, aws_dynamodb_table_item.collection_options] triggers = { script_dir = var.script_dir diff --git a/cmr_s3_subscriber/terraform/subscription/variables.tf b/cmr_s3_subscriber/terraform/subscription/variables.tf index 18af560..95533ad 100644 --- a/cmr_s3_subscriber/terraform/subscription/variables.tf +++ b/cmr_s3_subscriber/terraform/subscription/variables.tf @@ -28,3 +28,22 @@ variable "script_dir" { type = string description = "Path to directory containing CMR scripts and their venv" } + +variable "ddb_table" { + type = string + description = "DynamoDB table for CCID-options mappings" +} + +variable "options" { + type = object({ + shortname = string + s3_path = optional(string) + maap_config = optional(object({ + zarr_config_url = string + variables = optional(string, "*") + polygon = optional(string) + })) + }) + description = "Collection options" + nullable = true +} diff --git a/cmr_s3_subscriber/terraform/variables.tf b/cmr_s3_subscriber/terraform/variables.tf index 722c9c4..c28bb24 100644 --- a/cmr_s3_subscriber/terraform/variables.tf +++ b/cmr_s3_subscriber/terraform/variables.tf @@ -61,6 +61,21 @@ variable "ccids" { description = "List of CMR collection-concept-IDs to subscribe to. Note: Initial apply should have either no CCIDs listed or triggers disabled until the notification SNS topic is subscribed to" } +variable "collection_options" { + type = map(object({ + shortname = string + s3_path = optional(string) + maap_config = optional(object({ + zarr_config_url = string + variables = optional(string, "*") + polygon = optional(string) + })) + })) + description = "Mapping of CCID to collection options. 
If specified, must provide the short name of the collection plus an s3 path and/or MAAP options. MAAP options consist of an S3 URL for job configuration and an optional list of variables (either '*' or a space-separated list wrapped in quotes)" + + # TODO: Validations +} + variable "enable_triggers" { type = bool description = "Whether to enable the SQS -> Lambda triggers. Note: Initial apply should have either no CCIDs listed or triggers disabled until the notification SNS topic is subscribed to" From c5f5b802331dd8cb7340d4933dbea4187dc77862 Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 10 Jul 2025 09:02:41 -0700 Subject: [PATCH 12/21] Updates to queue fill script --- cmr_s3_subscriber/fill_queue_by_query.py | 34 ++++++++++++++++++++---- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/cmr_s3_subscriber/fill_queue_by_query.py b/cmr_s3_subscriber/fill_queue_by_query.py index 8953a8e..8b993ca 100644 --- a/cmr_s3_subscriber/fill_queue_by_query.py +++ b/cmr_s3_subscriber/fill_queue_by_query.py @@ -132,6 +132,18 @@ def __positive_integer(s): 'overridden' ) + parser.add_argument( + '--dryrun', + action='store_true', + help='Do not push generated messages to SQS' + ) + + parser.add_argument( + '--samples', + action='store_true', + help='Print samples of transformed data to stdout' + ) + return parser.parse_args() @@ -223,12 +235,20 @@ def main(args): if len(matched_granules) != n_hits: print('Mismatch between number of granules and initial number of hits') exit(1) + elif len(matched_granules) == 0: + print('No granules returned from CMR') + return - print(json.dumps(matched_granules[0], indent=2)) + if args.samples: + print(json.dumps(matched_granules[0], indent=2)) if args.staged_data is not None: matched_granules = filter_granules(matched_granules, args.staged_data) + if len(matched_granules) == 0: + print('No new granules returned from CMR') + return + if args.limit is not None: print(f'Limiting {len(matched_granules)} granules to {args.limit}') 
matched_granules = matched_granules[:args.limit] @@ -255,7 +275,8 @@ def _try_get_pgid_from_umm(umm): } for m in matched_granules ] - print(json.dumps(matched_granules[0], indent=2)) + if args.samples: + print(json.dumps(matched_granules[0], indent=2)) print('Converting to SQS messages') @@ -285,7 +306,12 @@ def _try_get_pgid_from_umm(umm): } for m in matched_granules ] - print(json.dumps(matched_granules[0], indent=2)) + if args.samples: + print(json.dumps(matched_granules[0], indent=2)) + + if args.dryrun: + print(f'Produced {len(matched_granules)} messages that would be sent to SQS.') + return sqs = boto3.client('sqs') @@ -309,5 +335,3 @@ def _try_get_pgid_from_umm(umm): if __name__ == '__main__': main(parse_arguments()) - # filter_granules([], 's3://aqacf-nexus-stage/TEMPO/TEMPO_NO2_L3_V03/') - From 51fb3e7fc710d414a2b586d20b8e12d027b4af87 Mon Sep 17 00:00:00 2001 From: rileykk Date: Tue, 19 Aug 2025 07:21:05 -0700 Subject: [PATCH 13/21] New options: - Separate trigger enable for backfill queue and main queues - Add delay for sqs messages - Add triggering for new granules only --- cmr_s3_subscriber/lambda/lambda_function.py | 98 +++++++++++++++---- cmr_s3_subscriber/subscriber.py | 8 +- .../terraform/sqs_backfill_queue.tf | 2 +- .../terraform/subscription/main.tf | 12 ++- .../terraform/subscription/variables.tf | 4 +- cmr_s3_subscriber/terraform/variables.tf | 15 ++- 6 files changed, 109 insertions(+), 30 deletions(-) diff --git a/cmr_s3_subscriber/lambda/lambda_function.py b/cmr_s3_subscriber/lambda/lambda_function.py index 8074513..c95619d 100644 --- a/cmr_s3_subscriber/lambda/lambda_function.py +++ b/cmr_s3_subscriber/lambda/lambda_function.py @@ -17,6 +17,7 @@ import hashlib import json import os +import re import traceback from functools import cache from urllib.parse import urlparse @@ -169,13 +170,19 @@ def _handle_cmr_notification(message, bearer_token): umm_response.raise_for_status() except: if umm_response.status_code == 404: - print(f'No record 
exists for granule {message["granule-ur"]} ({granule_metadata_url}). It was likely ' - f'superseded and will thus be skipped') - return + print(f'Got a 404 for granule {message["granule-ur"]} at URL {granule_metadata_url}, it was likely ' + f'superseded. Trying to pull the latest revision.') + + maybe_umm = _maybe_get_latest_revision(message['location']) + + if maybe_umm is None: + return + else: + umm_response = maybe_umm else: raise - umm = umm_response.json() + umm = umm_response.json() processed_umm = _process_umm(umm) print(json.dumps(processed_umm, indent=2)) @@ -245,6 +252,71 @@ def _handle_cmr_notification(message, bearer_token): os.unlink(dl_path) +def _maybe_get_latest_revision(location_url): + match = re.search(r'/\d+$', location_url) + + if match is None: + return None + + latest_rev_url = location_url[:match.start()] + '.umm_json' + + umm_response = requests.get(latest_rev_url) + try: + umm_response.raise_for_status() + except: + if umm_response.status_code == 404: + print(f'No record exists for latest revision URL ({latest_rev_url}). 
It was may no longer exist ' + f'and will thus be skipped') + return None + else: + raise + + umm = umm_response.json() + collection = umm['CollectionReference']['ShortName'] + + print('Got latest revision metadata, now checking if it is allowed to use this for this collection') + + collection_options = _search_collection_options(collection) + + if collection_options is None: + print('Skipping this granule because there is no configuration to allow using latest revision metadata') + return None + + if 'use_latest_rev' not in collection_options: + print('Skipping this granule because there is no configuration to allow using latest revision metadata') + return None + + if collection_options['use_latest_rev']['BOOL']: + print('Collection supports using latest revision metadata, continuing...') + return umm_response + else: + print('Skipping this granule because its collection configuration disallows using latest revision metadata') + return None + + +@cache +def _search_collection_options(collection): + collection_query = ddb.query( + TableName=DDB_ARN, + KeyConditionExpression='#collection_short_name = :c', + ExpressionAttributeValues={ + ':c': {'S': collection} + }, + ExpressionAttributeNames={ + '#collection_short_name': 'collection' + } + ) + + print(f'Checked lookup table for {collection}') + print(collection_query['Items']) + + if len(collection_query['Items']) == 0: + return None + else: + collection_entry = collection_query['Items'][0] + return collection_entry + + def _process_umm(umm): # I don't think this is guaranteed (this or EntryTitle). What does other field look like? 
collection = umm['CollectionReference']['ShortName'] @@ -287,26 +359,12 @@ def _process_umm(umm): creds_url = url['URL'] break - collection_query = ddb.query( - TableName=DDB_ARN, - KeyConditionExpression='#collection_short_name = :c', - ExpressionAttributeValues={ - ':c': {'S': collection} - }, - ExpressionAttributeNames={ - '#collection_short_name': 'collection' - } - ) - - print(f'Checked lookup table for {collection}') - print(collection_query['Items']) + collection_entry = _search_collection_options(collection) - if len(collection_query['Items']) == 0: + if collection_entry is None: s3_prefix = collection maap_config = None else: - collection_entry = collection_query['Items'][0] - s3_prefix = collection_entry['s3_prefix']['S'] if 's3_prefix' in collection_entry else collection maap_config = collection_entry['maap_config']['M'] if 'maap_config' in collection_entry else {} maap_config = {k: list(v.values())[0] for k, v in maap_config.items()} diff --git a/cmr_s3_subscriber/subscriber.py b/cmr_s3_subscriber/subscriber.py index 274145e..bb38d94 100644 --- a/cmr_s3_subscriber/subscriber.py +++ b/cmr_s3_subscriber/subscriber.py @@ -55,6 +55,12 @@ def parse_args(): help='Do not make CMR API calls except for auth' ) + parser.add_argument( + '--new-only', + action='store_true', + help='Only subscribe to new new granules, ignoring revisions' + ) + return parser.parse_args() @@ -82,7 +88,7 @@ def main(args): "Type": "granule", "Query": "*", "EndPoint": queue_arn, - "Mode": ["New", "Update"], + "Mode": ["New"] if args.new_only else ["New", "Update"], "Method": "ingest", "MetadataSpecification": { "URL": "https://cdn.earthdata.nasa.gov/umm/subscription/v1.1.1", diff --git a/cmr_s3_subscriber/terraform/sqs_backfill_queue.tf b/cmr_s3_subscriber/terraform/sqs_backfill_queue.tf index 59ace5a..e5d3e8c 100644 --- a/cmr_s3_subscriber/terraform/sqs_backfill_queue.tf +++ b/cmr_s3_subscriber/terraform/sqs_backfill_queue.tf @@ -60,7 +60,7 @@ resource "aws_lambda_event_source_mapping" 
"backfill_trigger" { function_name = aws_lambda_function.lambda.arn event_source_arn = aws_sqs_queue.backfill_queue.arn - enabled = var.enable_triggers + enabled = var.enable_backfill_trigger batch_size = 3 function_response_types = ["ReportBatchItemFailures"] diff --git a/cmr_s3_subscriber/terraform/subscription/main.tf b/cmr_s3_subscriber/terraform/subscription/main.tf index 27fef53..83d72ce 100644 --- a/cmr_s3_subscriber/terraform/subscription/main.tf +++ b/cmr_s3_subscriber/terraform/subscription/main.tf @@ -26,6 +26,7 @@ resource "aws_sqs_queue" "queue" { message_retention_seconds = 1209600 policy = data.aws_iam_policy_document.queue_policy.json visibility_timeout_seconds = 1800 + delay_seconds = var.options == null ? 0 : var.options.delay } data "aws_dynamodb_table" "table" { @@ -34,22 +35,25 @@ data "aws_dynamodb_table" "table" { locals { options_json = var.options == null ? {} : merge( - var.options.s3_path == null ? {} : {s3_prefix = {S = var.options.s3_path}}, + var.options.s3_path == null ? {} : {s3_prefix = { S = var.options.s3_path }}, + var.options.polygon == null ? {} : {polygon = { S = var.options.polygon }}, var.options.maap_config == null ? {} : {maap_config = { M = merge( { zarr_config_url = { S = var.options.maap_config.zarr_config_url } }, { variables = { S = var.options.maap_config.variables } }, - var.options.maap_config.polygon == null ? {} : {polygon = { S = var.options.maap_config.polygon }} ) }} ) + + trigger_on_revisions = var.options == null ? 
true : var.options.trigger_on_revisions } resource "aws_dynamodb_table_item" "collection_options" { hash_key = data.aws_dynamodb_table.table.hash_key item = jsonencode(merge( zipmap([data.aws_dynamodb_table.table.hash_key], [{S = var.options.shortname}]), - local.options_json + local.options_json, + {use_latest_rev = { BOOL = !var.options.trigger_on_revisions }} )) table_name = data.aws_dynamodb_table.table.name @@ -101,7 +105,7 @@ resource "null_resource" "create_subscription" { } provisioner "local-exec" { - command = "source venv/bin/activate; python subscriber.py ${var.config_file} ${var.ccid} --queue ${aws_sqs_queue.queue.arn}" + command = "source venv/bin/activate; python subscriber.py ${var.config_file} ${var.ccid} --queue ${aws_sqs_queue.queue.arn} ${local.trigger_on_revisions ? "" : "--new-only"}" working_dir = var.script_dir interpreter = ["/bin/bash", "-c"] } diff --git a/cmr_s3_subscriber/terraform/subscription/variables.tf b/cmr_s3_subscriber/terraform/subscription/variables.tf index 95533ad..2cab8be 100644 --- a/cmr_s3_subscriber/terraform/subscription/variables.tf +++ b/cmr_s3_subscriber/terraform/subscription/variables.tf @@ -38,10 +38,12 @@ variable "options" { type = object({ shortname = string s3_path = optional(string) + polygon = optional(string) + trigger_on_revisions = optional(bool, true) + delay = optional(number, 0) maap_config = optional(object({ zarr_config_url = string variables = optional(string, "*") - polygon = optional(string) })) }) description = "Collection options" diff --git a/cmr_s3_subscriber/terraform/variables.tf b/cmr_s3_subscriber/terraform/variables.tf index c28bb24..11832ef 100644 --- a/cmr_s3_subscriber/terraform/variables.tf +++ b/cmr_s3_subscriber/terraform/variables.tf @@ -63,12 +63,14 @@ variable "ccids" { variable "collection_options" { type = map(object({ - shortname = string - s3_path = optional(string) + shortname = string + s3_path = optional(string) + polygon = optional(string) + trigger_on_revisions = 
optional(bool, true) + delay = optional(number, 0) maap_config = optional(object({ zarr_config_url = string variables = optional(string, "*") - polygon = optional(string) })) })) description = "Mapping of CCID to collection options. If specified, must provide the short name of the collection plus an s3 path and/or MAAP options. MAAP options consist of an S3 URL for job configuration and an optional list of variables (either '*' or a space-separated list wrapped in quotes)" @@ -83,6 +85,13 @@ variable "enable_triggers" { default = false } +variable "enable_backfill_trigger" { + type = bool + description = "Whether to enable the SQS -> Lambda trigger for the backfill queue" + + default = true +} + variable "lambda_vpc" { type = object({ subnet_ids = list(string) From 475470cbfd15668b4fce2b0d0c65c8e39939df32 Mon Sep 17 00:00:00 2001 From: rileykk Date: Tue, 19 Aug 2025 07:22:32 -0700 Subject: [PATCH 14/21] New options for queue filling script: - Force granules to be on first revision (simulating new-only triggering) - Option to dump generated sns and sqs messages to files --- cmr_s3_subscriber/fill_queue_by_query.py | 34 +++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/cmr_s3_subscriber/fill_queue_by_query.py b/cmr_s3_subscriber/fill_queue_by_query.py index 8b993ca..8644049 100644 --- a/cmr_s3_subscriber/fill_queue_by_query.py +++ b/cmr_s3_subscriber/fill_queue_by_query.py @@ -144,6 +144,22 @@ def __positive_integer(s): help='Print samples of transformed data to stdout' ) + parser.add_argument( + '--force-first-rev', + action='store_true', + help='For testing purposes, force the message granule URLs to be for revision 1 to better simulate new granule ' + 'notifications' + ) + + parser.add_argument( + '--dump', + choices=['sns', 'sqs'], + default=[], + help='If set, dump all generated messages to a file. sns == dump mocked CMR SNS subscription messages. 
sqs == ' + 'dump wrapped SQS messages', + nargs='*' + ) + return parser.parse_args() @@ -270,7 +286,7 @@ def _try_get_pgid_from_umm(umm): 'concept-id': m['meta']['concept-id'], 'granule-ur': m['umm']['GranuleUR'], 'location': f"https://cmr.earthdata.nasa.gov:443/concepts/" - f"{m['meta']['concept-id']}/{m['meta']['revision-id']}", + f"{m['meta']['concept-id']}/{1 if args.force_first_rev else m['meta']['revision-id']}", 'producer-granule-id': _try_get_pgid_from_umm(m['umm']) } for m in matched_granules ] @@ -278,6 +294,14 @@ def _try_get_pgid_from_umm(umm): if args.samples: print(json.dumps(matched_granules[0], indent=2)) + if 'sns' in args.dump: + dump_fname = f'{args.ccid}_dump.sns.json' + + with open(dump_fname, 'w') as f: + json.dump(matched_granules, f, indent=2) + + print(f'Dumped generated SNS messages to {dump_fname}') + print('Converting to SQS messages') matched_granules = [ @@ -309,6 +333,14 @@ def _try_get_pgid_from_umm(umm): if args.samples: print(json.dumps(matched_granules[0], indent=2)) + if 'sqs' in args.dump: + dump_fname = f'{args.ccid}_dump.sqs.json' + + with open(dump_fname, 'w') as f: + json.dump(matched_granules, f, indent=2) + + print(f'Dumped generated SQS messages to {dump_fname}') + if args.dryrun: print(f'Produced {len(matched_granules)} messages that would be sent to SQS.') return From 4fbc894021e8a0d4202ddf6aae07239ca7ff3471 Mon Sep 17 00:00:00 2001 From: rileykk Date: Mon, 8 Sep 2025 10:47:20 -0700 Subject: [PATCH 15/21] New option: Geo-filter for granules --- cmr_s3_subscriber/lambda/Makefile | 2 +- cmr_s3_subscriber/lambda/lambda_function.py | 48 +++++++++++++++++++-- cmr_s3_subscriber/requirements.txt | 1 + cmr_s3_subscriber/requirements_maap.txt | 2 +- 4 files changed, 48 insertions(+), 5 deletions(-) diff --git a/cmr_s3_subscriber/lambda/Makefile b/cmr_s3_subscriber/lambda/Makefile index f2dedf0..b807fb2 100644 --- a/cmr_s3_subscriber/lambda/Makefile +++ b/cmr_s3_subscriber/lambda/Makefile @@ -33,7 +33,7 @@ package_maap.zip: 
lambda_function.py ../requirements.txt --platform manylinux2014_x86_64 \ --implementation cp \ -r ../requirements_maap.txt \ - --python-version 3.9 \ + --python-version 3.12 \ --only-binary=:all: --upgrade cd package_maap && zip -9r ../package_maap.zip . zip -9 package_maap.zip lambda_function.py diff --git a/cmr_s3_subscriber/lambda/lambda_function.py b/cmr_s3_subscriber/lambda/lambda_function.py index c95619d..c5833bd 100644 --- a/cmr_s3_subscriber/lambda/lambda_function.py +++ b/cmr_s3_subscriber/lambda/lambda_function.py @@ -16,6 +16,7 @@ import hashlib import json +import logging import os import re import traceback @@ -25,7 +26,8 @@ import boto3 import earthaccess import requests -import logging +from shapely import from_wkt, intersects +from shapely.geometry import box logging.basicConfig(level=logging.DEBUG) @@ -46,6 +48,8 @@ "SHA-512": hashlib.sha512, } +GLOBAL = box(-180, -90, 180, 90) + try: import requests_mock @@ -115,7 +119,11 @@ def _fail_out_record(record, reason='exceeded retries'): def _submit_maap_job(short_name, granule_ur, maap_config=None): - from maap.maap import MAAP + try: + from maap.maap import MAAP + except ImportError: + print('FATAL: MAAP py package not installed. Try rebuilding zip package with the package_maap.zip target') + raise maap = MAAP() @@ -185,7 +193,15 @@ def _handle_cmr_notification(message, bearer_token): umm = umm_response.json() processed_umm = _process_umm(umm) - print(json.dumps(processed_umm, indent=2)) + print(json.dumps(processed_umm, indent=2, default=lambda o: repr(o))) + + if processed_umm['collection_geo'] is not None: + desired_geo = processed_umm['collection_geo'] + granule_geo = processed_umm['spatial_extent'] + + if not intersects(granule_geo, desired_geo): + print(f'Granule {message["granule-ur"]} does not intersect geo filter. 
Skipping.') + return if 'MAAP_PGT' in os.environ: print('Submitting job through MAAP instead of staging in this function') @@ -359,16 +375,40 @@ def _process_umm(umm): creds_url = url['URL'] break + try: + bounding_rectangles = umm['SpatialExtent']['HorizontalSpatialDomain']['Geometry']['BoundingRectangles'] + + if len(bounding_rectangles) > 1: + raise ValueError('Multiple bounding rectangles given when one expected') + + bbox_dict = bounding_rectangles[0] + + bbox = box( + bbox_dict['WestBoundingCoordinate'], + bbox_dict['SouthBoundingCoordinate'], + bbox_dict['EastBoundingCoordinate'], + bbox_dict['NorthBoundingCoordinate'], + ) + except Exception as e: + print(f'WARN: Unable to get bbox from umm: {e!r}. Using global extent instead') + bbox = GLOBAL + collection_entry = _search_collection_options(collection) if collection_entry is None: s3_prefix = collection + desired_geo = None maap_config = None else: s3_prefix = collection_entry['s3_prefix']['S'] if 's3_prefix' in collection_entry else collection + desired_geo = from_wkt(collection_entry['polygon']['S']) if 'polygon' in collection_entry else None maap_config = collection_entry['maap_config']['M'] if 'maap_config' in collection_entry else {} maap_config = {k: list(v.values())[0] for k, v in maap_config.items()} + if desired_geo.geom_type != 'Polygon': + print(f'WARN: Collection settings define incorrect geo filter geometry type. Must be POLYGON. 
Disabling filter') + desired_geo = None + if s3_prefix[-1] != '/': s3_prefix += '/' @@ -379,6 +419,8 @@ def _process_umm(umm): s3_prefix=s3_prefix, s3_credentials_url=creds_url, maap_config=maap_config, + spatial_extent=bbox, + collection_geo=desired_geo, ) diff --git a/cmr_s3_subscriber/requirements.txt b/cmr_s3_subscriber/requirements.txt index 2e50c60..b3c2a5c 100644 --- a/cmr_s3_subscriber/requirements.txt +++ b/cmr_s3_subscriber/requirements.txt @@ -1,5 +1,6 @@ earthaccess requests pyyaml +shapely # maap-py @ git+https://github.com/MAAP-Project/maap-py.git diff --git a/cmr_s3_subscriber/requirements_maap.txt b/cmr_s3_subscriber/requirements_maap.txt index 2578b76..f9900c4 100644 --- a/cmr_s3_subscriber/requirements_maap.txt +++ b/cmr_s3_subscriber/requirements_maap.txt @@ -32,7 +32,7 @@ six==1.17.0 tinynetrc==1.3.1 tqdm==4.67.1 typing-extensions==4.14.0 -# urllib3==2.4.0 urllib3 wrapt==1.17.2 yarl==1.20.1 +shapely From 3d1387393cdcb33a7579a0300a7ea5e0a3b2fde5 Mon Sep 17 00:00:00 2001 From: rileykk Date: Mon, 29 Sep 2025 16:01:50 -0700 Subject: [PATCH 16/21] fixes and updated MAAP submission --- cmr_s3_subscriber/lambda/lambda_function.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cmr_s3_subscriber/lambda/lambda_function.py b/cmr_s3_subscriber/lambda/lambda_function.py index c5833bd..c2c4db4 100644 --- a/cmr_s3_subscriber/lambda/lambda_function.py +++ b/cmr_s3_subscriber/lambda/lambda_function.py @@ -152,6 +152,7 @@ def _submit_maap_job(short_name, granule_ur, maap_config=None): queue=queue, granule_id=granule_ur, collection_id=ccid, + concept_id=ccid, ) print(f'Submitting MAAP job with parameters: {dict(**job_kwargs, **kwargs)}') @@ -161,6 +162,11 @@ def _submit_maap_job(short_name, granule_ur, maap_config=None): **kwargs ) + if job.id is None or job.id == '': + print(f'MAAP job submission failed: {job.error_details}') + print(job) + raise Exception('MAAP job submission failed') + print(f'Submitted job {job.id}') return job.id @@ 
-405,7 +411,7 @@ def _process_umm(umm): maap_config = collection_entry['maap_config']['M'] if 'maap_config' in collection_entry else {} maap_config = {k: list(v.values())[0] for k, v in maap_config.items()} - if desired_geo.geom_type != 'Polygon': + if desired_geo is not None and desired_geo.geom_type != 'Polygon': print(f'WARN: Collection settings define incorrect geo filter geometry type. Must be POLYGON. Disabling filter') desired_geo = None From 2d83bbd21e8630df78c8f925ee706e7e59704a74 Mon Sep 17 00:00:00 2001 From: rileykk Date: Mon, 29 Sep 2025 16:02:22 -0700 Subject: [PATCH 17/21] add option to set trigger enabled status on a per-collection basis --- cmr_s3_subscriber/terraform/main.tf | 2 +- cmr_s3_subscriber/terraform/variables.tf | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/cmr_s3_subscriber/terraform/main.tf b/cmr_s3_subscriber/terraform/main.tf index 1f08875..90c9288 100644 --- a/cmr_s3_subscriber/terraform/main.tf +++ b/cmr_s3_subscriber/terraform/main.tf @@ -151,7 +151,7 @@ resource "aws_lambda_event_source_mapping" "sqs_triggers" { for_each = module.subscriptions event_source_arn = each.value.queue_arn - enabled = var.enable_triggers + enabled = var.collection_options[each.value.ccid].trigger_enable_override != null ? 
var.collection_options[each.value.ccid].trigger_enable_override : var.enable_triggers batch_size = 3 function_response_types = ["ReportBatchItemFailures"] diff --git a/cmr_s3_subscriber/terraform/variables.tf b/cmr_s3_subscriber/terraform/variables.tf index 11832ef..628bbb2 100644 --- a/cmr_s3_subscriber/terraform/variables.tf +++ b/cmr_s3_subscriber/terraform/variables.tf @@ -63,11 +63,12 @@ variable "ccids" { variable "collection_options" { type = map(object({ - shortname = string - s3_path = optional(string) - polygon = optional(string) - trigger_on_revisions = optional(bool, true) - delay = optional(number, 0) + shortname = string + s3_path = optional(string) + polygon = optional(string) + trigger_on_revisions = optional(bool, true) + trigger_enable_override = optional(bool) + delay = optional(number, 0) maap_config = optional(object({ zarr_config_url = string variables = optional(string, "*") @@ -86,7 +87,7 @@ variable "enable_triggers" { } variable "enable_backfill_trigger" { - type = bool + type = bool description = "Whether to enable the SQS -> Lambda trigger for the backfill queue" default = true From 0c9b370b7e8f27a17982140289a3f40aae81382c Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 4 Dec 2025 12:28:11 -0800 Subject: [PATCH 18/21] update maap_config Can now be an arbitrary set of maap params/overrides --- cmr_s3_subscriber/terraform/subscription/main.tf | 15 +++++++-------- .../terraform/subscription/variables.tf | 5 +---- cmr_s3_subscriber/terraform/variables.tf | 5 +---- 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/cmr_s3_subscriber/terraform/subscription/main.tf b/cmr_s3_subscriber/terraform/subscription/main.tf index 83d72ce..96cfb69 100644 --- a/cmr_s3_subscriber/terraform/subscription/main.tf +++ b/cmr_s3_subscriber/terraform/subscription/main.tf @@ -34,16 +34,15 @@ data "aws_dynamodb_table" "table" { } locals { - options_json = var.options == null ? {} : merge( + // Need to use this weird ternary. 
Apparently terraform can't handle {} or {k = v,...}, + // so it's [{}, {k = v, ...}][condition ? 0 : 1] + options_json = [{}, merge( var.options.s3_path == null ? {} : {s3_prefix = { S = var.options.s3_path }}, var.options.polygon == null ? {} : {polygon = { S = var.options.polygon }}, - var.options.maap_config == null ? {} : {maap_config = { - M = merge( - { zarr_config_url = { S = var.options.maap_config.zarr_config_url } }, - { variables = { S = var.options.maap_config.variables } }, - ) - }} - ) + [{}, {maap_config = { + M = {for key in keys(var.options.maap_config) : key => {S = var.options.maap_config[key]}} + }}][var.options.maap_config == null ? 0 : 1] + )][var.options == null ? 0 : 1] trigger_on_revisions = var.options == null ? true : var.options.trigger_on_revisions } diff --git a/cmr_s3_subscriber/terraform/subscription/variables.tf b/cmr_s3_subscriber/terraform/subscription/variables.tf index 2cab8be..b2cb66d 100644 --- a/cmr_s3_subscriber/terraform/subscription/variables.tf +++ b/cmr_s3_subscriber/terraform/subscription/variables.tf @@ -41,10 +41,7 @@ variable "options" { polygon = optional(string) trigger_on_revisions = optional(bool, true) delay = optional(number, 0) - maap_config = optional(object({ - zarr_config_url = string - variables = optional(string, "*") - })) + maap_config = optional(map(string)) }) description = "Collection options" nullable = true diff --git a/cmr_s3_subscriber/terraform/variables.tf b/cmr_s3_subscriber/terraform/variables.tf index 628bbb2..dea37d5 100644 --- a/cmr_s3_subscriber/terraform/variables.tf +++ b/cmr_s3_subscriber/terraform/variables.tf @@ -69,10 +69,7 @@ variable "collection_options" { trigger_on_revisions = optional(bool, true) trigger_enable_override = optional(bool) delay = optional(number, 0) - maap_config = optional(object({ - zarr_config_url = string - variables = optional(string, "*") - })) + maap_config = optional(map(string)) })) description = "Mapping of CCID to collection options. 
If specified, must provide the short name of the collection plus an s3 path and/or MAAP options. MAAP options consist of an S3 URL for job configuration and an optional list of variables (either '*' or a space-separated list wrapped in quotes)" From d0ac6b1a351bcdf21f6430386ef103d63da2d7f5 Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 4 Dec 2025 12:28:35 -0800 Subject: [PATCH 19/21] update maap_config Can now be an arbitrary set of maap params/overrides --- cmr_s3_subscriber/lambda/lambda_function.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/cmr_s3_subscriber/lambda/lambda_function.py b/cmr_s3_subscriber/lambda/lambda_function.py index c2c4db4..d19ba90 100644 --- a/cmr_s3_subscriber/lambda/lambda_function.py +++ b/cmr_s3_subscriber/lambda/lambda_function.py @@ -20,6 +20,7 @@ import os import re import traceback +from copy import deepcopy from functools import cache from urllib.parse import urlparse @@ -155,12 +156,19 @@ def _submit_maap_job(short_name, granule_ur, maap_config=None): concept_id=ccid, ) - print(f'Submitting MAAP job with parameters: {dict(**job_kwargs, **kwargs)}') + final_maap_args = deepcopy(job_kwargs) + final_maap_args.update(kwargs) - job = maap.submitJob( - **job_kwargs, - **kwargs - ) + # print(f'Submitting MAAP job with parameters: {dict(**job_kwargs, **kwargs)}') + # + # job = maap.submitJob( + # **job_kwargs, + # **kwargs + # ) + + print(f'Submitting MAAP job with parameters: {final_maap_args}') + + job = maap.submitJob(**final_maap_args) if job.id is None or job.id == '': print(f'MAAP job submission failed: {job.error_details}') From 8ae506bc73d78c7650918024dfbbad17bed056ae Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 4 Dec 2025 12:29:05 -0800 Subject: [PATCH 20/21] fix getting of delivery count in lambda --- cmr_s3_subscriber/lambda/lambda_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmr_s3_subscriber/lambda/lambda_function.py 
b/cmr_s3_subscriber/lambda/lambda_function.py index d19ba90..c30a7a6 100644 --- a/cmr_s3_subscriber/lambda/lambda_function.py +++ b/cmr_s3_subscriber/lambda/lambda_function.py @@ -516,7 +516,7 @@ def lambda_handler(event, context): try: _process_record(json.loads(record['body']), bearer_token) except Exception as e: - rec_count = int(record.get('Attributes', {}).get("ApproximateReceiveCount", "5")) + rec_count = int(record.get('attributes', {}).get("ApproximateReceiveCount", "5")) if rec_count <= 5: batch_item_failures.append({"itemIdentifier": record['messageId']}) From 26a935f55546bcd483a529180b8c7e1ef90a3af3 Mon Sep 17 00:00:00 2001 From: rileykk Date: Tue, 9 Dec 2025 09:13:40 -0800 Subject: [PATCH 21/21] process polygonal extents in UMM --- cmr_s3_subscriber/lambda/lambda_function.py | 38 +++++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/cmr_s3_subscriber/lambda/lambda_function.py b/cmr_s3_subscriber/lambda/lambda_function.py index c30a7a6..3f5e41d 100644 --- a/cmr_s3_subscriber/lambda/lambda_function.py +++ b/cmr_s3_subscriber/lambda/lambda_function.py @@ -28,7 +28,7 @@ import earthaccess import requests from shapely import from_wkt, intersects -from shapely.geometry import box +from shapely.geometry import box, Polygon, MultiPolygon logging.basicConfig(level=logging.DEBUG) @@ -390,19 +390,35 @@ def _process_umm(umm): break try: - bounding_rectangles = umm['SpatialExtent']['HorizontalSpatialDomain']['Geometry']['BoundingRectangles'] + spatial_extent = umm['SpatialExtent']['HorizontalSpatialDomain'] - if len(bounding_rectangles) > 1: - raise ValueError('Multiple bounding rectangles given when one expected') + if 'Geometry' in spatial_extent: + if 'BoundingRectangles' in spatial_extent['Geometry']: + bounding_rectangles = spatial_extent['Geometry']['BoundingRectangles'] - bbox_dict = bounding_rectangles[0] + if len(bounding_rectangles) > 1: + raise ValueError('Multiple bounding rectangles given when one expected') - bbox = box( 
- bbox_dict['WestBoundingCoordinate'], - bbox_dict['SouthBoundingCoordinate'], - bbox_dict['EastBoundingCoordinate'], - bbox_dict['NorthBoundingCoordinate'], - ) + bbox_dict = bounding_rectangles[0] + + bbox = box( + bbox_dict['WestBoundingCoordinate'], + bbox_dict['SouthBoundingCoordinate'], + bbox_dict['EastBoundingCoordinate'], + bbox_dict['NorthBoundingCoordinate'], + ) + elif 'GPolygons' in spatial_extent['Geometry']: + polygons = [] + + for polygon in spatial_extent['Geometry']['GPolygons']: + polygons.append(Polygon([(p['Longitude'], p['Latitude']) for p in polygon['Boundary']['Points']])) + + if len(polygons) == 1: + bbox = polygons[0] + else: + bbox = MultiPolygon(polygons) + else: + raise ValueError('Spatial extent geometry not provided') except Exception as e: print(f'WARN: Unable to get bbox from umm: {e!r}. Using global extent instead') bbox = GLOBAL