From 0645a26a7dd6acd9b925cba92a7e5812065a90a4 Mon Sep 17 00:00:00 2001 From: Martin Carlsson Date: Wed, 7 Jan 2026 19:59:07 +0100 Subject: [PATCH 1/2] Parse multiple CELEX numbers as YAML list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for handling multiple CELEX numbers (comma or space-separated) in both main documents and amendments. When multiple CELEX numbers are present, they are now stored as a YAML list instead of a single string. Changes: - Add parse_celex_numbers() function to split CELEX strings - Update YAML front matter to output single value or list - Apply same parsing to amendment CELEX numbers 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 --- sfs_processor.py | 43 +++++++++++++++++++++++++++++++++++++++++- temporal/amendments.py | 16 ++++++++++++++-- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/sfs_processor.py b/sfs_processor.py index c2b37262..8d2cea4a 100644 --- a/sfs_processor.py +++ b/sfs_processor.py @@ -44,6 +44,27 @@ from formatters.predocs_parser import parse_predocs_string +def parse_celex_numbers(celex_string: str) -> List[str]: + """ + Parse CELEX numbers from a string into a list. + + Handles multiple CELEX numbers separated by commas or spaces. + + Args: + celex_string (str): One or more CELEX numbers (e.g., "32001L0083, 32004L0027") + + Returns: + List[str]: List of individual CELEX numbers + """ + if not celex_string or not celex_string.strip(): + return [] + + # Split by comma and/or whitespace, filter out empty strings + celex_list = [celex.strip() for celex in re.split(r'[,\s]+', celex_string) if celex.strip()] + + return celex_list + + def create_safe_filename(beteckning: str, preserve_selex_tags: bool = False) -> str: """ Create a safe filename from beteckning. 
@@ -399,7 +420,17 @@ def convert_to_markdown(data: Dict[str, Any], fetch_predocs_from_api: bool = Fal # Fallback to original string yaml_front_matter += f"forarbeten: {format_yaml_value(predocs)}\n" if celex_nummer: - yaml_front_matter += f"celex: {format_yaml_value(celex_nummer)}\n" + # Parse CELEX numbers (can be comma-separated or space-separated) + celex_list = parse_celex_numbers(celex_nummer) + + if len(celex_list) == 1: + # Single CELEX number - write as string + yaml_front_matter += f"celex: {format_yaml_value(celex_list[0])}\n" + elif len(celex_list) > 1: + # Multiple CELEX numbers - write as list + yaml_front_matter += "celex:\n" + for celex in celex_list: + yaml_front_matter += f" - {format_yaml_value(celex)}\n" # Add eu_direktiv only if it's true if eu_direktiv: @@ -415,6 +446,16 @@ def convert_to_markdown(data: Dict[str, Any], fetch_predocs_from_api: bool = Fal yaml_front_matter += f" rubrik: {format_yaml_value(amendment['rubrik'])}\n" if amendment['ikraft_datum']: yaml_front_matter += f" ikraft_datum: {format_yaml_value(amendment['ikraft_datum'])}\n" + if amendment.get('celex'): + celex = amendment['celex'] + if isinstance(celex, list): + # Multiple CELEX numbers + yaml_front_matter += " celex:\n" + for celex_num in celex: + yaml_front_matter += f" - {format_yaml_value(celex_num)}\n" + else: + # Single CELEX number + yaml_front_matter += f" celex: {format_yaml_value(celex)}\n" if amendment['anteckningar']: yaml_front_matter += f" anteckningar: {format_yaml_value(amendment['anteckningar'])}\n" diff --git a/temporal/amendments.py b/temporal/amendments.py index 42b22d62..82dc6cd3 100644 --- a/temporal/amendments.py +++ b/temporal/amendments.py @@ -10,7 +10,8 @@ def extract_amendments(andringsforfattningar: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Extract and format amendment information, sorted chronologically by ikraft_datum.""" from util.datetime_utils import format_datetime # Import to avoid circular imports - + import re + amendments = [] 
for amendment in andringsforfattningar: @@ -21,10 +22,21 @@ def extract_amendments(andringsforfattningar: List[Dict[str, Any]]) -> List[Dict 'anteckningar': clean_text(amendment.get('anteckningar')) } + # Handle CELEX numbers (can be comma-separated or space-separated) + celex_nummer = amendment.get('celexnummer') + if celex_nummer: + # Parse CELEX numbers - split by comma and/or whitespace + celex_list = [celex.strip() for celex in re.split(r'[,\s]+', celex_nummer) if celex.strip()] + + if len(celex_list) == 1: + amendment_data['celex'] = celex_list[0] + elif len(celex_list) > 1: + amendment_data['celex'] = celex_list + # Only include non-empty amendments if amendment_data['beteckning']: amendments.append(amendment_data) - + # Sort amendments chronologically by ikraft_datum # Amendments without ikraft_datum will be sorted to the end amendments.sort(key=lambda x: x['ikraft_datum'] or '9999-12-31') From e7c6f9db69315e5aa421469e7cffff7434cca62d Mon Sep 17 00:00:00 2001 From: Martin Carlsson Date: Wed, 7 Jan 2026 22:10:17 +0100 Subject: [PATCH 2/2] Update frontmatter sorting to support CELEX lists MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The frontmatter sorter now properly handles CELEX numbers in amendments: - Add 'celex' to AMENDMENT_ORDER for proper field ordering - Parse nested YAML lists (e.g., multiple CELEX numbers) - Generate correct YAML output for both single values and lists This completes the CELEX number support by ensuring that frontmatter sorting preserves CELEX data in the correct format. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 --- formatters/sort_frontmatter.py | 127 +++++++++++++++++++++++---------- 1 file changed, 88 insertions(+), 39 deletions(-) diff --git a/formatters/sort_frontmatter.py b/formatters/sort_frontmatter.py index e6d0c4ef..3266b0eb 100644 --- a/formatters/sort_frontmatter.py +++ b/formatters/sort_frontmatter.py @@ -14,14 +14,14 @@ def sort_amendments_list(amendment_lines: list) -> str: """ Sorterar innehållet i en andringsforfattningar-lista. - + Args: amendment_lines: Lista med rader som representerar andringsforfattningar - + Returns: str: Sorterad YAML-representation av andringsforfattningar """ - AMENDMENT_ORDER = ['beteckning', 'rubrik', 'ikraft_datum', 'anteckningar'] + AMENDMENT_ORDER = ['beteckning', 'rubrik', 'ikraft_datum', 'celex', 'anteckningar'] # Hantera det felaktiga formatet där första raden börjar direkt efter kolon processed_lines = [] @@ -35,33 +35,51 @@ def sort_amendments_list(amendment_lines: list) -> str: # Parsa amendment items amendments = [] current_amendment = {} - + current_list_key = None # Track if we're parsing a nested list + for line in processed_lines: stripped = line.strip() - + # Ny amendment item (börjar med -) if stripped.startswith('-'): - # Spara föregående amendment om den finns - if current_amendment: - amendments.append(current_amendment) - - # Starta ny amendment - current_amendment = {} - - # Kolla om det finns data på samma rad som - - if ':' in stripped: - parts = stripped[1:].split(':', 1) # Ta bort - först - key = parts[0].strip() - value = parts[1].strip() if len(parts) > 1 else '' - current_amendment[key] = value - + # Check if this is a list item within a nested list (indented with 6 spaces) + if line.startswith(' -'): + # This is a nested list item (e.g., for celex) + if current_list_key: + list_value = stripped[1:].strip() # Remove the '-' and trim + if current_list_key not in current_amendment: + 
current_amendment[current_list_key] = [] + current_amendment[current_list_key].append(list_value) + else: + # This is a new amendment item + # Spara föregående amendment om den finns + if current_amendment: + amendments.append(current_amendment) + + # Starta ny amendment + current_amendment = {} + current_list_key = None + + # Kolla om det finns data på samma rad som - + if ':' in stripped: + parts = stripped[1:].split(':', 1) # Ta bort - först + key = parts[0].strip() + value = parts[1].strip() if len(parts) > 1 else '' + current_amendment[key] = value + # Property inom amendment item elif ':' in line and (line.startswith(' ') or line.startswith(' ')): parts = line.strip().split(':', 1) key = parts[0].strip() value = parts[1].strip() if len(parts) > 1 else '' if key: - current_amendment[key] = value + if value: + # Simple key-value pair + current_amendment[key] = value + current_list_key = None + else: + # Empty value, might be start of a nested list + current_list_key = key # Spara sista amendment if current_amendment: @@ -70,7 +88,7 @@ def sort_amendments_list(amendment_lines: list) -> str: # Bygg sorterad YAML med korrekt indentation if not amendments: return '' - + result_lines = [] for i, amendment in enumerate(amendments): # Lägg till första property med - prefix @@ -78,31 +96,62 @@ def sort_amendments_list(amendment_lines: list) -> str: for prop in AMENDMENT_ORDER: if prop in amendment: value = amendment[prop] - # Lägg till citattecken runt värden som innehåller kolon eller speciella tecken - if ':' in value or value.startswith('"') or '"' in value: - if not (value.startswith('"') and value.endswith('"')): - value = f'"{value}"' - - if first_prop: - result_lines.append(f" - {prop}: {value}") - first_prop = False + + # Handle lists (e.g., celex with multiple values) + if isinstance(value, list): + if first_prop: + result_lines.append(f" - {prop}:") + first_prop = False + else: + result_lines.append(f" {prop}:") + + for item in value: + # Add quotes if 
needed + if ':' in str(item) or (isinstance(item, str) and (item.startswith('"') or '"' in item)): + if not (str(item).startswith('"') and str(item).endswith('"')): + item = f'"{item}"' + result_lines.append(f" - {item}") else: - result_lines.append(f" {prop}: {value}") + # Lägg till citattecken runt värden som innehåller kolon eller speciella tecken + if ':' in str(value) or (isinstance(value, str) and (value.startswith('"') or '"' in value)): + if not (str(value).startswith('"') and str(value).endswith('"')): + value = f'"{value}"' + + if first_prop: + result_lines.append(f" - {prop}: {value}") + first_prop = False + else: + result_lines.append(f" {prop}: {value}") # Lägg till okända properties sist unknown_props = [k for k in amendment.keys() if k not in AMENDMENT_ORDER] for prop in unknown_props: value = amendment[prop] - # Lägg till citattecken runt värden som innehåller kolon eller speciella tecken - if ':' in value or value.startswith('"') or '"' in value: - if not (value.startswith('"') and value.endswith('"')): - value = f'"{value}"' - - if first_prop: - result_lines.append(f" - {prop}: {value}") - first_prop = False + + # Handle lists + if isinstance(value, list): + if first_prop: + result_lines.append(f" - {prop}:") + first_prop = False + else: + result_lines.append(f" {prop}:") + + for item in value: + if ':' in str(item) or (isinstance(item, str) and (item.startswith('"') or '"' in item)): + if not (str(item).startswith('"') and str(item).endswith('"')): + item = f'"{item}"' + result_lines.append(f" - {item}") else: - result_lines.append(f" {prop}: {value}") + # Lägg till citattecken runt värden som innehåller kolon eller speciella tecken + if ':' in str(value) or (isinstance(value, str) and (value.startswith('"') or '"' in value)): + if not (str(value).startswith('"') and str(value).endswith('"')): + value = f'"{value}"' + + if first_prop: + result_lines.append(f" - {prop}: {value}") + first_prop = False + else: + result_lines.append(f" 
{prop}: {value}") return '\n' + '\n'.join(result_lines)