Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 88 additions & 39 deletions formatters/sort_frontmatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@
def sort_amendments_list(amendment_lines: list) -> str:
"""
Sorterar innehållet i en andringsforfattningar-lista.

Args:
amendment_lines: Lista med rader som representerar andringsforfattningar

Returns:
str: Sorterad YAML-representation av andringsforfattningar
"""
AMENDMENT_ORDER = ['beteckning', 'rubrik', 'ikraft_datum', 'anteckningar']
AMENDMENT_ORDER = ['beteckning', 'rubrik', 'ikraft_datum', 'celex', 'anteckningar']

# Hantera det felaktiga formatet där första raden börjar direkt efter kolon
processed_lines = []
Expand All @@ -35,33 +35,51 @@ def sort_amendments_list(amendment_lines: list) -> str:
# Parsa amendment items
amendments = []
current_amendment = {}

current_list_key = None # Track if we're parsing a nested list

for line in processed_lines:
stripped = line.strip()

# Ny amendment item (börjar med -)
if stripped.startswith('-'):
# Spara föregående amendment om den finns
if current_amendment:
amendments.append(current_amendment)

# Starta ny amendment
current_amendment = {}

# Kolla om det finns data på samma rad som -
if ':' in stripped:
parts = stripped[1:].split(':', 1) # Ta bort - först
key = parts[0].strip()
value = parts[1].strip() if len(parts) > 1 else ''
current_amendment[key] = value

# Check if this is a list item within a nested list (indented with 6 spaces)
if line.startswith(' -'):
# This is a nested list item (e.g., for celex)
if current_list_key:
list_value = stripped[1:].strip() # Remove the '-' and trim
if current_list_key not in current_amendment:
current_amendment[current_list_key] = []
current_amendment[current_list_key].append(list_value)
else:
# This is a new amendment item
# Spara föregående amendment om den finns
if current_amendment:
amendments.append(current_amendment)

# Starta ny amendment
current_amendment = {}
current_list_key = None

# Kolla om det finns data på samma rad som -
if ':' in stripped:
parts = stripped[1:].split(':', 1) # Ta bort - först
key = parts[0].strip()
value = parts[1].strip() if len(parts) > 1 else ''
current_amendment[key] = value

# Property inom amendment item
elif ':' in line and (line.startswith(' ') or line.startswith(' ')):
parts = line.strip().split(':', 1)
key = parts[0].strip()
value = parts[1].strip() if len(parts) > 1 else ''
if key:
current_amendment[key] = value
if value:
# Simple key-value pair
current_amendment[key] = value
current_list_key = None
else:
# Empty value, might be start of a nested list
current_list_key = key

# Spara sista amendment
if current_amendment:
Expand All @@ -70,39 +88,70 @@ def sort_amendments_list(amendment_lines: list) -> str:
# Bygg sorterad YAML med korrekt indentation
if not amendments:
return ''

result_lines = []
for i, amendment in enumerate(amendments):
# Lägg till första property med - prefix
first_prop = True
for prop in AMENDMENT_ORDER:
if prop in amendment:
value = amendment[prop]
# Lägg till citattecken runt värden som innehåller kolon eller speciella tecken
if ':' in value or value.startswith('"') or '"' in value:
if not (value.startswith('"') and value.endswith('"')):
value = f'"{value}"'

if first_prop:
result_lines.append(f" - {prop}: {value}")
first_prop = False

# Handle lists (e.g., celex with multiple values)
if isinstance(value, list):
if first_prop:
result_lines.append(f" - {prop}:")
first_prop = False
else:
result_lines.append(f" {prop}:")

for item in value:
# Add quotes if needed
if ':' in str(item) or (isinstance(item, str) and (item.startswith('"') or '"' in item)):
if not (str(item).startswith('"') and str(item).endswith('"')):
item = f'"{item}"'
result_lines.append(f" - {item}")
else:
result_lines.append(f" {prop}: {value}")
# Lägg till citattecken runt värden som innehåller kolon eller speciella tecken
if ':' in str(value) or (isinstance(value, str) and (value.startswith('"') or '"' in value)):
if not (str(value).startswith('"') and str(value).endswith('"')):
value = f'"{value}"'

if first_prop:
result_lines.append(f" - {prop}: {value}")
first_prop = False
else:
result_lines.append(f" {prop}: {value}")

# Lägg till okända properties sist
unknown_props = [k for k in amendment.keys() if k not in AMENDMENT_ORDER]
for prop in unknown_props:
value = amendment[prop]
# Lägg till citattecken runt värden som innehåller kolon eller speciella tecken
if ':' in value or value.startswith('"') or '"' in value:
if not (value.startswith('"') and value.endswith('"')):
value = f'"{value}"'

if first_prop:
result_lines.append(f" - {prop}: {value}")
first_prop = False

# Handle lists
if isinstance(value, list):
if first_prop:
result_lines.append(f" - {prop}:")
first_prop = False
else:
result_lines.append(f" {prop}:")

for item in value:
if ':' in str(item) or (isinstance(item, str) and (item.startswith('"') or '"' in item)):
if not (str(item).startswith('"') and str(item).endswith('"')):
item = f'"{item}"'
result_lines.append(f" - {item}")
else:
result_lines.append(f" {prop}: {value}")
# Lägg till citattecken runt värden som innehåller kolon eller speciella tecken
if ':' in str(value) or (isinstance(value, str) and (value.startswith('"') or '"' in value)):
if not (str(value).startswith('"') and str(value).endswith('"')):
value = f'"{value}"'

if first_prop:
result_lines.append(f" - {prop}: {value}")
first_prop = False
else:
result_lines.append(f" {prop}: {value}")

return '\n' + '\n'.join(result_lines)

Expand Down
43 changes: 42 additions & 1 deletion sfs_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,27 @@
from formatters.predocs_parser import parse_predocs_string


def parse_celex_numbers(celex_string: str) -> List[str]:
    """
    Split a raw CELEX field into its individual CELEX numbers.

    Numbers may be separated by commas, whitespace, or any mix of the two;
    consecutive separators are treated as a single delimiter.

    Args:
        celex_string (str): One or more CELEX numbers
            (e.g., "32001L0083, 32004L0027"). Falsy or whitespace-only
            input is treated as "no numbers".

    Returns:
        List[str]: The CELEX numbers in their original order, without
            surrounding whitespace and without empty entries.
    """
    # Guard clause: nothing to parse for None, "" or whitespace-only input.
    if not (celex_string and celex_string.strip()):
        return []

    tokens = re.split(r'[,\s]+', celex_string)

    # Leading/trailing separators make re.split emit empty strings; drop them.
    return list(filter(None, map(str.strip, tokens)))


def create_safe_filename(beteckning: str, preserve_selex_tags: bool = False) -> str:
"""
Create a safe filename from beteckning.
Expand Down Expand Up @@ -399,7 +420,17 @@ def convert_to_markdown(data: Dict[str, Any], fetch_predocs_from_api: bool = Fal
# Fallback to original string
yaml_front_matter += f"forarbeten: {format_yaml_value(predocs)}\n"
if celex_nummer:
yaml_front_matter += f"celex: {format_yaml_value(celex_nummer)}\n"
# Parse CELEX numbers (can be comma-separated or space-separated)
celex_list = parse_celex_numbers(celex_nummer)

if len(celex_list) == 1:
# Single CELEX number - write as string
yaml_front_matter += f"celex: {format_yaml_value(celex_list[0])}\n"
elif len(celex_list) > 1:
# Multiple CELEX numbers - write as list
yaml_front_matter += "celex:\n"
for celex in celex_list:
yaml_front_matter += f" - {format_yaml_value(celex)}\n"

# Add eu_direktiv only if it's true
if eu_direktiv:
Expand All @@ -415,6 +446,16 @@ def convert_to_markdown(data: Dict[str, Any], fetch_predocs_from_api: bool = Fal
yaml_front_matter += f" rubrik: {format_yaml_value(amendment['rubrik'])}\n"
if amendment['ikraft_datum']:
yaml_front_matter += f" ikraft_datum: {format_yaml_value(amendment['ikraft_datum'])}\n"
if amendment.get('celex'):
celex = amendment['celex']
if isinstance(celex, list):
# Multiple CELEX numbers
yaml_front_matter += " celex:\n"
for celex_num in celex:
yaml_front_matter += f" - {format_yaml_value(celex_num)}\n"
else:
# Single CELEX number
yaml_front_matter += f" celex: {format_yaml_value(celex)}\n"
if amendment['anteckningar']:
yaml_front_matter += f" anteckningar: {format_yaml_value(amendment['anteckningar'])}\n"

Expand Down
16 changes: 14 additions & 2 deletions temporal/amendments.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
def extract_amendments(andringsforfattningar: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Extract and format amendment information, sorted chronologically by ikraft_datum."""
from util.datetime_utils import format_datetime # Import to avoid circular imports

import re

amendments = []

for amendment in andringsforfattningar:
Expand All @@ -21,10 +22,21 @@ def extract_amendments(andringsforfattningar: List[Dict[str, Any]]) -> List[Dict
'anteckningar': clean_text(amendment.get('anteckningar'))
}

# Handle CELEX numbers (can be comma-separated or space-separated)
celex_nummer = amendment.get('celexnummer')
if celex_nummer:
# Parse CELEX numbers - split by comma and/or whitespace
celex_list = [celex.strip() for celex in re.split(r'[,\s]+', celex_nummer) if celex.strip()]

if len(celex_list) == 1:
amendment_data['celex'] = celex_list[0]
elif len(celex_list) > 1:
amendment_data['celex'] = celex_list

# Only include non-empty amendments
if amendment_data['beteckning']:
amendments.append(amendment_data)

# Sort amendments chronologically by ikraft_datum
# Amendments without ikraft_datum will be sorted to the end
amendments.sort(key=lambda x: x['ikraft_datum'] or '9999-12-31')
Expand Down