From 0645a26a7dd6acd9b925cba92a7e5812065a90a4 Mon Sep 17 00:00:00 2001
From: Martin Carlsson <martin@marca.se>
Date: Wed, 7 Jan 2026 19:59:07 +0100
Subject: [PATCH 1/2] Parse multiple CELEX numbers as YAML list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for handling multiple CELEX numbers (separated by commas
and/or whitespace) in both main documents and amendments. When multiple
CELEX numbers are present, they are now stored as a YAML list instead of
a single string; a single CELEX number is still written as a plain string.

Changes:
- Add parse_celex_numbers() function to split CELEX strings
- Update YAML front matter to output single value or list
- Apply same parsing to amendment CELEX numbers

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 sfs_processor.py       | 43 +++++++++++++++++++++++++++++++++++++++++-
 temporal/amendments.py | 16 ++++++++++++++--
 2 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/sfs_processor.py b/sfs_processor.py
index c2b37262..8d2cea4a 100644
--- a/sfs_processor.py
+++ b/sfs_processor.py
@@ -44,6 +44,27 @@
 from formatters.predocs_parser import parse_predocs_string
 
 
+def parse_celex_numbers(celex_string: str) -> List[str]:
+    """
+    Parse CELEX numbers from a string into a list.
+
+    Handles multiple CELEX numbers separated by commas or spaces.
+
+    Args:
+        celex_string (str): One or more CELEX numbers (e.g., "32001L0083, 32004L0027")
+
+    Returns:
+        List[str]: List of individual CELEX numbers
+    """
+    if not celex_string or not celex_string.strip():
+        return []
+
+    # Split by comma and/or whitespace, filter out empty strings
+    celex_list = [celex.strip() for celex in re.split(r'[,\s]+', celex_string) if celex.strip()]
+
+    return celex_list
+
+
 def create_safe_filename(beteckning: str, preserve_selex_tags: bool = False) -> str:
     """
     Create a safe filename from beteckning.
@@ -399,7 +420,17 @@ def convert_to_markdown(data: Dict[str, Any], fetch_predocs_from_api: bool = Fal
             # Fallback to original string
             yaml_front_matter += f"forarbeten: {format_yaml_value(predocs)}\n"
     if celex_nummer:
-        yaml_front_matter += f"celex: {format_yaml_value(celex_nummer)}\n"
+        # Parse CELEX numbers (can be comma-separated or space-separated)
+        celex_list = parse_celex_numbers(celex_nummer)
+
+        if len(celex_list) == 1:
+            # Single CELEX number - write as string
+            yaml_front_matter += f"celex: {format_yaml_value(celex_list[0])}\n"
+        elif len(celex_list) > 1:
+            # Multiple CELEX numbers - write as list
+            yaml_front_matter += "celex:\n"
+            for celex in celex_list:
+                yaml_front_matter += f"  - {format_yaml_value(celex)}\n"
 
     # Add eu_direktiv only if it's true
     if eu_direktiv:
@@ -415,6 +446,16 @@ def convert_to_markdown(data: Dict[str, Any], fetch_predocs_from_api: bool = Fal
                 yaml_front_matter += f"    rubrik: {format_yaml_value(amendment['rubrik'])}\n"
             if amendment['ikraft_datum']:
                 yaml_front_matter += f"    ikraft_datum: {format_yaml_value(amendment['ikraft_datum'])}\n"
+            if amendment.get('celex'):
+                celex = amendment['celex']
+                if isinstance(celex, list):
+                    # Multiple CELEX numbers
+                    yaml_front_matter += "    celex:\n"
+                    for celex_num in celex:
+                        yaml_front_matter += f"      - {format_yaml_value(celex_num)}\n"
+                else:
+                    # Single CELEX number
+                    yaml_front_matter += f"    celex: {format_yaml_value(celex)}\n"
             if amendment['anteckningar']:
                 yaml_front_matter += f"    anteckningar: {format_yaml_value(amendment['anteckningar'])}\n"
 
diff --git a/temporal/amendments.py b/temporal/amendments.py
index 42b22d62..82dc6cd3 100644
--- a/temporal/amendments.py
+++ b/temporal/amendments.py
@@ -10,7 +10,8 @@
 def extract_amendments(andringsforfattningar: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     """Extract and format amendment information, sorted chronologically by ikraft_datum."""
     from util.datetime_utils import format_datetime  # Import to avoid circular imports
-    
+    import re
+
     amendments = []
 
     for amendment in andringsforfattningar:
@@ -21,10 +22,21 @@ def extract_amendments(andringsforfattningar: List[Dict[str, Any]]) -> List[Dict
             'anteckningar': clean_text(amendment.get('anteckningar'))
         }
 
+        # Handle CELEX numbers (can be comma-separated or space-separated)
+        celex_nummer = amendment.get('celexnummer')
+        if celex_nummer:
+            # Parse CELEX numbers - split by comma and/or whitespace
+            celex_list = [celex.strip() for celex in re.split(r'[,\s]+', celex_nummer) if celex.strip()]
+
+            if len(celex_list) == 1:
+                amendment_data['celex'] = celex_list[0]
+            elif len(celex_list) > 1:
+                amendment_data['celex'] = celex_list
+
         # Only include non-empty amendments
         if amendment_data['beteckning']:
             amendments.append(amendment_data)
-    
+
     # Sort amendments chronologically by ikraft_datum
     # Amendments without ikraft_datum will be sorted to the end
     amendments.sort(key=lambda x: x['ikraft_datum'] or '9999-12-31')

From e7c6f9db69315e5aa421469e7cffff7434cca62d Mon Sep 17 00:00:00 2001
From: Martin Carlsson <martin@marca.se>
Date: Wed, 7 Jan 2026 22:10:17 +0100
Subject: [PATCH 2/2] Update frontmatter sorting to support CELEX lists
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The frontmatter sorter now handles CELEX numbers in amendments:
- Add 'celex' to AMENDMENT_ORDER (between 'ikraft_datum' and 'anteckningar')
- Parse nested YAML lists inside an amendment item (e.g., multiple CELEX numbers)
- Generate correct YAML output for both single values and lists

This completes the CELEX number support by ensuring that frontmatter
sorting preserves CELEX data in the correct format.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 formatters/sort_frontmatter.py | 127 +++++++++++++++++++++++----------
 1 file changed, 88 insertions(+), 39 deletions(-)

diff --git a/formatters/sort_frontmatter.py b/formatters/sort_frontmatter.py
index e6d0c4ef..3266b0eb 100644
--- a/formatters/sort_frontmatter.py
+++ b/formatters/sort_frontmatter.py
@@ -14,14 +14,14 @@
 def sort_amendments_list(amendment_lines: list) -> str:
     """
     Sorterar innehållet i en andringsforfattningar-lista.
-    
+
     Args:
         amendment_lines: Lista med rader som representerar andringsforfattningar
-        
+
     Returns:
         str: Sorterad YAML-representation av andringsforfattningar
     """
-    AMENDMENT_ORDER = ['beteckning', 'rubrik', 'ikraft_datum', 'anteckningar']
+    AMENDMENT_ORDER = ['beteckning', 'rubrik', 'ikraft_datum', 'celex', 'anteckningar']
     
     # Hantera det felaktiga formatet där första raden börjar direkt efter kolon
     processed_lines = []
@@ -35,33 +35,51 @@ def sort_amendments_list(amendment_lines: list) -> str:
     # Parsa amendment items
     amendments = []
     current_amendment = {}
-    
+    current_list_key = None  # Track if we're parsing a nested list
+
     for line in processed_lines:
         stripped = line.strip()
-        
+
         # Ny amendment item (börjar med -)
         if stripped.startswith('-'):
-            # Spara föregående amendment om den finns
-            if current_amendment:
-                amendments.append(current_amendment)
-            
-            # Starta ny amendment
-            current_amendment = {}
-            
-            # Kolla om det finns data på samma rad som -
-            if ':' in stripped:
-                parts = stripped[1:].split(':', 1)  # Ta bort - först
-                key = parts[0].strip()
-                value = parts[1].strip() if len(parts) > 1 else ''
-                current_amendment[key] = value
-        
+            # Check if this is a list item within a nested list (indented with 6 spaces)
+            if line.startswith('      -'):
+                # This is a nested list item (e.g., for celex)
+                if current_list_key:
+                    list_value = stripped[1:].strip()  # Remove the '-' and trim
+                    if current_list_key not in current_amendment:
+                        current_amendment[current_list_key] = []
+                    current_amendment[current_list_key].append(list_value)
+            else:
+                # This is a new amendment item
+                # Spara föregående amendment om den finns
+                if current_amendment:
+                    amendments.append(current_amendment)
+
+                # Starta ny amendment
+                current_amendment = {}
+                current_list_key = None
+
+                # Kolla om det finns data på samma rad som -
+                if ':' in stripped:
+                    parts = stripped[1:].split(':', 1)  # Ta bort - först
+                    key = parts[0].strip()
+                    value = parts[1].strip() if len(parts) > 1 else ''
+                    current_amendment[key] = value
+
         # Property inom amendment item
         elif ':' in line and (line.startswith('    ') or line.startswith('  ')):
             parts = line.strip().split(':', 1)
             key = parts[0].strip()
             value = parts[1].strip() if len(parts) > 1 else ''
             if key:
-                current_amendment[key] = value
+                if value:
+                    # Simple key-value pair
+                    current_amendment[key] = value
+                    current_list_key = None
+                else:
+                    # Empty value, might be start of a nested list
+                    current_list_key = key
     
     # Spara sista amendment
     if current_amendment:
@@ -70,7 +88,7 @@ def sort_amendments_list(amendment_lines: list) -> str:
     # Bygg sorterad YAML med korrekt indentation
     if not amendments:
         return ''
-    
+
     result_lines = []
     for i, amendment in enumerate(amendments):
         # Lägg till första property med - prefix
@@ -78,31 +96,62 @@ def sort_amendments_list(amendment_lines: list) -> str:
         for prop in AMENDMENT_ORDER:
             if prop in amendment:
                 value = amendment[prop]
-                # Lägg till citattecken runt värden som innehåller kolon eller speciella tecken
-                if ':' in value or value.startswith('"') or '"' in value:
-                    if not (value.startswith('"') and value.endswith('"')):
-                        value = f'"{value}"'
-                
-                if first_prop:
-                    result_lines.append(f"  - {prop}: {value}")
-                    first_prop = False
+
+                # Handle lists (e.g., celex with multiple values)
+                if isinstance(value, list):
+                    if first_prop:
+                        result_lines.append(f"  - {prop}:")
+                        first_prop = False
+                    else:
+                        result_lines.append(f"    {prop}:")
+
+                    for item in value:
+                        # Add quotes if needed
+                        if ':' in str(item) or (isinstance(item, str) and (item.startswith('"') or '"' in item)):
+                            if not (str(item).startswith('"') and str(item).endswith('"')):
+                                item = f'"{item}"'
+                        result_lines.append(f"      - {item}")
                 else:
-                    result_lines.append(f"    {prop}: {value}")
+                    # Lägg till citattecken runt värden som innehåller kolon eller speciella tecken
+                    if ':' in str(value) or (isinstance(value, str) and (value.startswith('"') or '"' in value)):
+                        if not (str(value).startswith('"') and str(value).endswith('"')):
+                            value = f'"{value}"'
+
+                    if first_prop:
+                        result_lines.append(f"  - {prop}: {value}")
+                        first_prop = False
+                    else:
+                        result_lines.append(f"    {prop}: {value}")
         
         # Lägg till okända properties sist
         unknown_props = [k for k in amendment.keys() if k not in AMENDMENT_ORDER]
         for prop in unknown_props:
             value = amendment[prop]
-            # Lägg till citattecken runt värden som innehåller kolon eller speciella tecken
-            if ':' in value or value.startswith('"') or '"' in value:
-                if not (value.startswith('"') and value.endswith('"')):
-                    value = f'"{value}"'
-            
-            if first_prop:
-                result_lines.append(f"  - {prop}: {value}")
-                first_prop = False
+
+            # Handle lists
+            if isinstance(value, list):
+                if first_prop:
+                    result_lines.append(f"  - {prop}:")
+                    first_prop = False
+                else:
+                    result_lines.append(f"    {prop}:")
+
+                for item in value:
+                    if ':' in str(item) or (isinstance(item, str) and (item.startswith('"') or '"' in item)):
+                        if not (str(item).startswith('"') and str(item).endswith('"')):
+                            item = f'"{item}"'
+                    result_lines.append(f"      - {item}")
             else:
-                result_lines.append(f"    {prop}: {value}")
+                # Lägg till citattecken runt värden som innehåller kolon eller speciella tecken
+                if ':' in str(value) or (isinstance(value, str) and (value.startswith('"') or '"' in value)):
+                    if not (str(value).startswith('"') and str(value).endswith('"')):
+                        value = f'"{value}"'
+
+                if first_prop:
+                    result_lines.append(f"  - {prop}: {value}")
+                    first_prop = False
+                else:
+                    result_lines.append(f"    {prop}: {value}")
     
     return '\n' + '\n'.join(result_lines)