diff --git a/formatters/sort_frontmatter.py b/formatters/sort_frontmatter.py index e6d0c4e..3266b0e 100644 --- a/formatters/sort_frontmatter.py +++ b/formatters/sort_frontmatter.py @@ -14,14 +14,14 @@ def sort_amendments_list(amendment_lines: list) -> str: """ Sorterar innehållet i en andringsforfattningar-lista. - + Args: amendment_lines: Lista med rader som representerar andringsforfattningar - + Returns: str: Sorterad YAML-representation av andringsforfattningar """ - AMENDMENT_ORDER = ['beteckning', 'rubrik', 'ikraft_datum', 'anteckningar'] + AMENDMENT_ORDER = ['beteckning', 'rubrik', 'ikraft_datum', 'celex', 'anteckningar'] # Hantera det felaktiga formatet där första raden börjar direkt efter kolon processed_lines = [] @@ -35,33 +35,51 @@ def sort_amendments_list(amendment_lines: list) -> str: # Parsa amendment items amendments = [] current_amendment = {} - + current_list_key = None # Track if we're parsing a nested list + for line in processed_lines: stripped = line.strip() - + # Ny amendment item (börjar med -) if stripped.startswith('-'): - # Spara föregående amendment om den finns - if current_amendment: - amendments.append(current_amendment) - - # Starta ny amendment - current_amendment = {} - - # Kolla om det finns data på samma rad som - - if ':' in stripped: - parts = stripped[1:].split(':', 1) # Ta bort - först - key = parts[0].strip() - value = parts[1].strip() if len(parts) > 1 else '' - current_amendment[key] = value - + # Check if this is a list item within a nested list (indented with 6 spaces) + if line.startswith(' -'): + # This is a nested list item (e.g., for celex) + if current_list_key: + list_value = stripped[1:].strip() # Remove the '-' and trim + if current_list_key not in current_amendment: + current_amendment[current_list_key] = [] + current_amendment[current_list_key].append(list_value) + else: + # This is a new amendment item + # Spara föregående amendment om den finns + if current_amendment: + amendments.append(current_amendment) + 
+ # Starta ny amendment + current_amendment = {} + current_list_key = None + + # Kolla om det finns data på samma rad som - + if ':' in stripped: + parts = stripped[1:].split(':', 1) # Ta bort - först + key = parts[0].strip() + value = parts[1].strip() if len(parts) > 1 else '' + current_amendment[key] = value + # Property inom amendment item elif ':' in line and (line.startswith(' ') or line.startswith(' ')): parts = line.strip().split(':', 1) key = parts[0].strip() value = parts[1].strip() if len(parts) > 1 else '' if key: - current_amendment[key] = value + if value: + # Simple key-value pair + current_amendment[key] = value + current_list_key = None + else: + # Empty value, might be start of a nested list + current_list_key = key # Spara sista amendment if current_amendment: @@ -70,7 +88,7 @@ def sort_amendments_list(amendment_lines: list) -> str: # Bygg sorterad YAML med korrekt indentation if not amendments: return '' - + result_lines = [] for i, amendment in enumerate(amendments): # Lägg till första property med - prefix @@ -78,31 +96,62 @@ def sort_amendments_list(amendment_lines: list) -> str: for prop in AMENDMENT_ORDER: if prop in amendment: value = amendment[prop] - # Lägg till citattecken runt värden som innehåller kolon eller speciella tecken - if ':' in value or value.startswith('"') or '"' in value: - if not (value.startswith('"') and value.endswith('"')): - value = f'"{value}"' - - if first_prop: - result_lines.append(f" - {prop}: {value}") - first_prop = False + + # Handle lists (e.g., celex with multiple values) + if isinstance(value, list): + if first_prop: + result_lines.append(f" - {prop}:") + first_prop = False + else: + result_lines.append(f" {prop}:") + + for item in value: + # Add quotes if needed + if ':' in str(item) or (isinstance(item, str) and (item.startswith('"') or '"' in item)): + if not (str(item).startswith('"') and str(item).endswith('"')): + item = f'"{item}"' + result_lines.append(f" - {item}") else: - result_lines.append(f" 
def parse_celex_numbers(celex_string: str) -> List[str]:
    """
    Split a raw CELEX field into a list of individual CELEX numbers.

    Numbers may be separated by commas, whitespace, or any mix of the two
    (e.g. "32001L0083, 32004L0027" or "32001L0083 32004L0027").

    Args:
        celex_string: One or more CELEX numbers in a single string. For
            robustness, None/empty input and an already-split list or tuple
            of strings are accepted as well.

    Returns:
        List[str]: The individual CELEX numbers in their original order.
        Empty when the input is None, empty, or contains only separators.
    """
    if not celex_string:
        return []

    if isinstance(celex_string, (list, tuple)):
        # An already-split value: join it so every element goes through the
        # same separator handling (an element may itself hold several numbers).
        celex_string = ' '.join(str(part) for part in celex_string)
    elif not isinstance(celex_string, str):
        # Avoid AttributeError/TypeError on unexpected scalar input.
        celex_string = str(celex_string)

    # re.split leaves empty strings at the edges when the input starts or
    # ends with a separator; dropping them also covers separator-only input.
    # Tokens cannot contain whitespace here, so no per-token strip is needed.
    return [token for token in re.split(r'[,\s]+', celex_string) if token]
{format_yaml_value(celex_num)}\n" + else: + # Single CELEX number + yaml_front_matter += f" celex: {format_yaml_value(celex)}\n" if amendment['anteckningar']: yaml_front_matter += f" anteckningar: {format_yaml_value(amendment['anteckningar'])}\n" diff --git a/temporal/amendments.py b/temporal/amendments.py index 42b22d6..82dc6cd 100644 --- a/temporal/amendments.py +++ b/temporal/amendments.py @@ -10,7 +10,8 @@ def extract_amendments(andringsforfattningar: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Extract and format amendment information, sorted chronologically by ikraft_datum.""" from util.datetime_utils import format_datetime # Import to avoid circular imports - + import re + amendments = [] for amendment in andringsforfattningar: @@ -21,10 +22,21 @@ def extract_amendments(andringsforfattningar: List[Dict[str, Any]]) -> List[Dict 'anteckningar': clean_text(amendment.get('anteckningar')) } + # Handle CELEX numbers (can be comma-separated or space-separated) + celex_nummer = amendment.get('celexnummer') + if celex_nummer: + # Parse CELEX numbers - split by comma and/or whitespace + celex_list = [celex.strip() for celex in re.split(r'[,\s]+', celex_nummer) if celex.strip()] + + if len(celex_list) == 1: + amendment_data['celex'] = celex_list[0] + elif len(celex_list) > 1: + amendment_data['celex'] = celex_list + # Only include non-empty amendments if amendment_data['beteckning']: amendments.append(amendment_data) - + # Sort amendments chronologically by ikraft_datum # Amendments without ikraft_datum will be sorted to the end amendments.sort(key=lambda x: x['ikraft_datum'] or '9999-12-31')