From f9170fe424c5fa281f24c98e7cf0bd51c07aed5e Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Thu, 20 Nov 2025 14:02:06 -0700 Subject: [PATCH 1/4] Docstring updates --- compass/_cli/main.py | 2 +- compass/common/base.py | 70 +++++++++- compass/common/tree.py | 24 ++-- compass/exceptions.py | 1 - compass/extraction/apply.py | 45 ++++++- compass/pb.py | 90 +++++++------ compass/scripts/download.py | 74 ++++++++--- compass/scripts/process.py | 67 +++++----- compass/services/base.py | 13 +- compass/services/cpu.py | 24 ++-- compass/services/threaded.py | 24 +++- compass/services/usage.py | 9 +- compass/utilities/base.py | 48 +++++-- compass/utilities/enums.py | 32 ++++- compass/utilities/finalize.py | 130 +++++++++++-------- compass/utilities/io.py | 34 +++-- compass/utilities/jurisdictions.py | 42 ++++-- compass/utilities/location.py | 26 +++- compass/utilities/nt.py | 51 ++++++++ compass/utilities/parsing.py | 79 ++++++++---- compass/validation/content.py | 54 ++++---- compass/validation/graphs.py | 60 ++++++--- compass/validation/location.py | 201 ++++++++++++++++++++--------- compass/warn.py | 1 - 24 files changed, 828 insertions(+), 373 deletions(-) diff --git a/compass/_cli/main.py b/compass/_cli/main.py index 1bd608e58..c9e72023d 100644 --- a/compass/_cli/main.py +++ b/compass/_cli/main.py @@ -11,7 +11,7 @@ @click.version_option(version=__version__) @click.pass_context def main(ctx): - """Ordinance command line interface""" + """COMPASS command line interface""" ctx.ensure_object(dict) diff --git a/compass/common/base.py b/compass/common/base.py index 75954004b..e7df1a974 100644 --- a/compass/common/base.py +++ b/compass/common/base.py @@ -128,7 +128,29 @@ def llm_response_does_not_start_with_no(response): def setup_async_decision_tree( graph_setup_func, usage_sub_label=None, **kwargs ): - """Setup Async Decision tree for ordinance extraction""" + """Setup an ``AsyncDecisionTree`` for ordinance extraction + + Parameters + ---------- + graph_setup_func : callable + Factory that returns a fully configured + :class:`networkx.DiGraph`. + usage_sub_label : str, optional + Optional usage label reported to the LLM usage tracker. + **kwargs + Keyword arguments forwarded to ``graph_setup_func``. + + Returns + ------- + AsyncDecisionTree + Decision tree wrapping the graph produced by + ``graph_setup_func``. + + Notes + ----- + The function asserts that the tree has recorded at least the system + prompt before returning the constructed wrapper. + """ G = graph_setup_func(**kwargs) # noqa: N806 tree = AsyncDecisionTree(G, usage_sub_label=usage_sub_label) assert len(tree.chat_llm_caller.messages) == 1 @@ -136,7 +158,22 @@ def setup_async_decision_tree( async def run_async_tree(tree, response_as_json=True): - """Run Async Decision Tree and return output as dict""" + """Run an async decision tree and optionally parse JSON output + + Parameters + ---------- + tree : AsyncDecisionTree + Decision tree to execute. + response_as_json : bool, optional + When ``True`` (default), attempts to parse the LLM response as + JSON using :func:`compass.utilities.llm_response_as_json`. + + Returns + ------- + dict or str or None + Parsed dictionary when ``response_as_json`` is ``True``, raw + response otherwise. Returns ``None`` if execution fails. 
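+
+    Examples
+    --------
+    A minimal usage sketch, assuming ``tree`` is an already-configured
+    :class:`AsyncDecisionTree` (for instance, one returned by
+    :func:`setup_async_decision_tree`)::
+
+        import asyncio
+
+        # May be an empty dict if the final response is not valid JSON
+        result = asyncio.run(run_async_tree(tree, response_as_json=True))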
+ """ try: response = await tree.async_run() except COMPASSRuntimeError: @@ -149,14 +186,39 @@ async def run_async_tree(tree, response_as_json=True): async def run_async_tree_with_bm(tree, base_messages): - """Run Async Decision Tree from base messages; return dict output""" + """Run an async decision tree using seed "base" messages + + Parameters + ---------- + tree : AsyncDecisionTree + Decision tree to execute. + base_messages : list of dict + Messages to preload into the tree's chat caller before running. + + Returns + ------- + dict or str or None + Output from :func:`run_async_tree`, filtered by the + ``response_as_json`` default. + """ tree.chat_llm_caller.messages = base_messages assert len(tree.chat_llm_caller.messages) == len(base_messages) return await run_async_tree(tree) def empty_output(feature): - """Empty output for a feature (not found in text)""" + """Return the default empty result for a missing feature + + Parameters + ---------- + feature : str + Name of the feature to seed in the empty output structure. + + Returns + ------- + list of dict + Empty result placeholders used by downstream extraction logic. + """ if feature in {"structures", "property line"}: return [ {"feature": f"{feature} (participating)"}, diff --git a/compass/common/tree.py b/compass/common/tree.py index 2d948cb45..05bf036a9 100644 --- a/compass/common/tree.py +++ b/compass/common/tree.py @@ -66,32 +66,22 @@ def __init__(self, graph, usage_sub_label=None): @property def chat_llm_caller(self): - """ChatLLMCaller: ChatLLMCaller instance for this tree""" + """ChatLLMCaller: LLM caller bound to the decision tree""" return self.graph.graph["chat_llm_caller"] @cached_property def tree_name(self): - """str: Name of the decision tree""" + """str: Configured decision tree name""" return self._g.graph.get("_d_tree_name", "Unknown decision tree") @property def messages(self): - """Get a list of the conversation messages with the LLM - - Returns - ------- - list - """ + """list: Conversation messages exchanged with the LLM""" return self.chat_llm_caller.messages @property def all_messages_txt(self): - """Get a printout of the full conversation with the LLM - - Returns - ------- - str - """ + """str: Formatted conversation transcript""" messages = [ f"{msg['role'].upper()}: {msg['content']}" for msg in self.messages ] @@ -140,6 +130,12 @@ async def async_run(self, node0="init"): out : str or None Final response from LLM at the leaf node or ``None`` if an ``AttributeError`` was raised during execution. + + Raises + ------ + compass.exceptions.COMPASSRuntimeError + Raised when the traversal encounters an unexpected + exception that is not an ``AttributeError``. """ self._history = [] diff --git a/compass/exceptions.py b/compass/exceptions.py index 2042b1dd5..53de495e3 100644 --- a/compass/exceptions.py +++ b/compass/exceptions.py @@ -10,7 +10,6 @@ class COMPASSError(Exception): """Generic COMPASS Error""" def __init__(self, *args, **kwargs): - """Init exception and broadcast message to logger""" super().__init__(*args, **kwargs) if args: logger.error(str(args[0]), stacklevel=2) diff --git a/compass/extraction/apply.py b/compass/extraction/apply.py index c36bed35e..590a371b1 100644 --- a/compass/extraction/apply.py +++ b/compass/extraction/apply.py @@ -37,13 +37,20 @@ async def check_for_ordinance_info( has the ``"contains_ord_info"`` key, it will not be processed. To force a document to be processed by this function, remove that key from the documents ``attrs``. 
+ model_config : compass.llm.config.LLMConfig + Configuration describing which LLM service, splitter, and call + parameters should be used for extraction. + heuristic : object + Domain-specific heuristic implementing a ``check`` method to + qualify text chunks for further processing. tech : str Technology of interest (e.g. "solar", "wind", etc). This is used to set up some document validation decision trees. - text_splitter : LCTextSplitter, optional - Optional Langchain text splitter (or subclass instance), or any - object that implements a `split_text` method. The method should - take text as input (str) and return a list of text chunks. + ordinance_text_collector_class : type + Collector class invoked to capture ordinance text chunks. + permitted_use_text_collector_class : type, optional + Collector class used to capture permitted-use districts text. + When ``None``, the permitted-use workflow is skipped. usage_tracker : UsageTracker, optional Optional tracker instance to monitor token usage during LLM calls. By default, ``None``. @@ -61,6 +68,12 @@ async def check_for_ordinance_info( and an ``"ordinance_text"`` key containing the ordinance text snippet. Note that the snippet may contain other info as well, but should encapsulate all of the ordinance text. + + Notes + ----- + The function updates progress bar logging as chunks are processed + and sets ``contains_district_info`` when + ``permitted_use_text_collector_class`` is provided. """ if "contains_ord_info" in doc.attrs: return doc @@ -134,6 +147,9 @@ async def extract_date(doc, model_config, usage_tracker=None): ---------- doc : elm.web.document.BaseDocument A document potentially containing date information. + model_config : compass.llm.config.LLMConfig + Configuration describing which LLM service, splitter, and call + parameters should be used for date extraction. usage_tracker : UsageTracker, optional Optional tracker instance to monitor token usage during LLM calls. By default, ``None``. @@ -145,6 +161,11 @@ async def extract_date(doc, model_config, usage_tracker=None): the parsing are stored in the documents attrs. In particular, the attrs will contain a ``"date"`` key that will contain the parsed date information. + + Notes + ----- + Documents already containing a ``"date"`` attribute are returned + without reprocessing. """ if "date" in doc.attrs: logger.debug( @@ -186,8 +207,9 @@ async def extract_ordinance_text_with_llm( Optional Langchain text splitter (or subclass instance), or any object that implements a `split_text` method. The method should take text as input (str) and return a list of text chunks. - extractor : WindOrdinanceTextExtractor - Object used for ordinance text extraction. + extractor : compass.extraction.base.OrdinanceTextExtractor + Extractor instance exposing ``parsers`` that consume text + chunks and update ``doc.attrs``. original_text_key : str String corresponding to the `doc.attrs` key containing the original text (before extraction). @@ -250,6 +272,9 @@ async def extract_ordinance_text_with_ngram_validation( Optional Langchain text splitter (or subclass instance), or any object that implements a `split_text` method. The method should take text as input (str) and return a list of text chunks. + extractor : compass.extraction.base.OrdinanceTextExtractor + Extractor instance exposing ``parsers`` that consume text + chunks and update ``doc.attrs``. original_text_key : str String corresponding to the `doc.attrs` key containing the original text (before extraction). 
@@ -406,6 +431,9 @@ async def extract_ordinance_values(doc, parser, text_key, out_key): that are found to contain ordinance data. Note that if the document's attrs does not contain the `text_key` key, it will not be processed. + parser : compass.extraction.base.StructuredParser + Parser instance with an async ``parse`` method that converts + cleaned ordinance text into structured values. text_key : str Name of the key under which cleaned text is stored in `doc.attrs`. This text should be ready for extraction. @@ -418,6 +446,11 @@ async def extract_ordinance_values(doc, parser, text_key, out_key): elm.web.document.BaseDocument Document that has been parsed for ordinance values. The results of the extraction are stored in the document's attrs. + + Notes + ----- + When the cleaned text is missing or empty the function emits a + :class:`compass.warn.COMPASSWarning` and leaves ``doc`` unchanged. """ if not doc.attrs.get(text_key): msg = ( diff --git a/compass/pb.py b/compass/pb.py index 740f6697b..146314a33 100644 --- a/compass/pb.py +++ b/compass/pb.py @@ -74,7 +74,19 @@ def render(self, task): # noqa: PLR6301 class _COMPASSProgressBars: - """COMPASS progress bar configurations""" + """Manage the suite of rich progress bars used by COMPASS runs + + The class maintains a primary progress bar plus a set of + jurisdiction-scoped progress bars for downloads, crawling, and + parsing subtasks. It centralizes creation, teardown, and cost + tracking so CLI runs can display consistent status updates. + + Notes + ----- + Instances are typically accessed via the module-level singleton + :data:`COMPASS_PB`. Use the context managers for scoped tasks to + ensure progress bars are removed even when exceptions occur. + """ def __init__(self, console=None): """ @@ -114,7 +126,7 @@ def __init__(self, console=None): @property def group(self): - """rich.console.Group: Group of renderable progress bars.""" + """rich.console.Group: Group of renderable progress bars""" return self._group def create_main_task(self, num_jurisdictions): @@ -148,15 +160,13 @@ def create_main_task(self, num_jurisdictions): ) def progress_main_task(self): - """Advance the main task one step - - In other words, mark one jurisdiction as complete. + """Advance the main jurisdiction task by one unit Raises ------ COMPASSNotInitializedError - If the main task has not been set up (i.e. - `create_main_task` has not been called). + If the main task has not been set up via + :meth:`create_main_task`. """ if self._main_task is None: msg = ( @@ -168,18 +178,16 @@ def progress_main_task(self): self._main.update(self._main_task, advance=1) def update_total_cost(self, cost, replace=False): - """Update the total cost of the run + """Update the aggregate LLM cost displayed in the main bar Parameters ---------- - cost : int or float - Cost value used for update. + cost : float + Cost increment or replacement value in US dollars. replace : bool, optional - If ``True``, the `cost` input will completely replace the - total cost, but only if the `cost` value is equal to or - larger than the existing total cost (i.e. we never want the - cost to decrease). If ``False``, the `cost` input is just - added to the running total. By default, ``False``. + When ``True`` the total cost is replaced by ``cost`` , + provided it does not move backwards. When ``False`` + the cost is added cumulatively. By default, ``False``. 
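+
+        Examples
+        --------
+        A minimal usage sketch, assuming the module-level singleton
+        ``COMPASS_PB`` has been set up for the current run::
+
+            COMPASS_PB.update_total_cost(0.42)  # add $0.42 to the total
+            # Replace the running total (ignored if it would decrease it)
+            COMPASS_PB.update_total_cost(1.50, replace=True)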
""" if replace: if cost + 0.01 >= self._total_cost: @@ -192,15 +200,15 @@ def update_total_cost(self, cost, replace=False): @contextmanager def jurisdiction_prog_bar(self, location, progress_main=True): - """Set a progress bar for the processing of one jurisdiction + """Context manager for jurisdiction-wide processing progress Parameters ---------- location : str Name of jurisdiction being processed. - progress_main : bool, default=True - Option to progress the main task when exiting this context - manager. + progress_main : bool, optional + If ``True``, the main jurisdiction task advances when the + context exits successfully. By default, ``True``. Yields ------ @@ -259,10 +267,10 @@ def update_jurisdiction_task(self, location, *args, **kwargs): @contextmanager def jurisdiction_sub_prog(self, location): - """Start a sub-progress update area for location + """Context manager for text-only jurisdiction sub-progress - This type of sub-progress does not have a bar, so it's useful - for tasks with an unknown length/duration. + This variant omits a progress bar and is intended for steps with + unknown durations, such as intermediate parsing tasks. Parameters ---------- @@ -296,7 +304,7 @@ def jurisdiction_sub_prog(self, location): @contextmanager def jurisdiction_sub_prog_bar(self, location): - """Start a sub-progress bar for location + """Context manager for jurisdiction sub-progress with a bar Parameters ---------- @@ -337,7 +345,7 @@ def jurisdiction_sub_prog_bar(self, location): @asynccontextmanager async def file_download_prog_bar(self, location, num_downloads): - """Display a progress bar for file downloads for a jurisdiction + """Async context manager for jurisdiction download progress Parameters ---------- @@ -368,7 +376,7 @@ async def file_download_prog_bar(self, location, num_downloads): ) def start_file_download_prog_bar(self, location, num_downloads): - """Setup a progress bar for download of files for a jurisdiction + """Create and register a download progress bar for a location Parameters ---------- @@ -377,10 +385,10 @@ def start_file_download_prog_bar(self, location, num_downloads): num_downloads : int Total number of downloads being processed. - Yields - ------ - rich.progress.Progress - `rich` progress bar initialized for this jurisdiction. + Returns + ------- + tuple + Two-item tuple of the progress instance and created task ID. Raises ------ @@ -417,7 +425,7 @@ def start_file_download_prog_bar(self, location, num_downloads): async def tear_down_file_download_prog_bar( self, location, num_downloads, pb, task ): - """Tear down the progress bar showing file downloads + """Complete and remove a file download progress bar Parameters ---------- @@ -437,7 +445,7 @@ async def tear_down_file_download_prog_bar( self._group.renderables.remove(pb) def update_download_task(self, location, *args, **kwargs): - """Update the task corresponding to the jurisdiction download + """Update a jurisdiction download progress entry Parameters ---------- @@ -452,14 +460,14 @@ def update_download_task(self, location, *args, **kwargs): @asynccontextmanager async def website_crawl_prog_bar(self, location, num_pages): - """Set a progress bar for crawling jurisdiction websites + """Async context manager for website crawling progress Parameters ---------- location : str Name of jurisdiction being processed. - num_downloads : int - Total number of downloads being processed. + num_pages : int + Total number of pages expected for the crawl. 
Yields ------ @@ -516,7 +524,7 @@ def _remove_website_crawl_prog_bar(self, location): self._group.renderables.remove(pb) def update_website_crawl_task(self, location, *args, **kwargs): - """Update task corresponding to the jurisdiction website crawl + """Update the website crawl progress for a jurisdiction Parameters ---------- @@ -530,7 +538,7 @@ def update_website_crawl_task(self, location, *args, **kwargs): self._wc_pbs[location].update(task_id, *args, **kwargs) def update_website_crawl_doc_found(self, location): - """Update task to say that one more document has been found + """Increment the count of documents discovered during crawling Parameters ---------- @@ -548,14 +556,14 @@ def update_website_crawl_doc_found(self, location): @asynccontextmanager async def compass_website_crawl_prog_bar(self, location, num_pages): - """Set a progress bar for compass-style crawling of websites + """Async context manager for COMPASS-style website crawling Parameters ---------- location : str Name of jurisdiction being processed. - num_downloads : int - Total number of downloads being processed. + num_pages : int + Total number of pages expected for the crawl. Yields ------ @@ -612,7 +620,7 @@ def _remove_compass_website_crawl_prog_bar(self, location): self._group.renderables.remove(pb) def update_compass_website_crawl_task(self, location, *args, **kwargs): - """Update task corresponding to the jurisdiction website crawl + """Update COMPASS-style crawl progress for a jurisdiction Parameters ---------- @@ -626,7 +634,7 @@ def update_compass_website_crawl_task(self, location, *args, **kwargs): self._cwc_pbs[location].update(task_id, *args, **kwargs) def update_compass_website_crawl_doc_found(self, location): - """Update task to say that one more document has been found + """Increment COMPASS-style crawl document discovery count Parameters ---------- diff --git a/compass/scripts/download.py b/compass/scripts/download.py index 5d1bcbfd6..c2910a07b 100644 --- a/compass/scripts/download.py +++ b/compass/scripts/download.py @@ -161,6 +161,11 @@ async def find_jurisdiction_website( ): """Search for the main landing page of a given jurisdiction + This function submits two pre-determined queries based on the + jurisdiction name, prioritizing official landing pages. Additional + ``kwargs`` (for example, alternate search engines) can be supplied + to fine-tune behavior. + Parameters ---------- jurisdiction : Jurisdiction @@ -188,6 +193,12 @@ async def find_jurisdiction_website( usage_tracker : UsageTracker, optional Optional tracker instance to monitor token usage during LLM calls. By default, ``None``. + url_ignore_substrings : list of str, optional + URL substrings that should be excluded from search results. + Substrings are applied case-insensitively. By default, ``None``. + **kwargs + Additional arguments forwarded to + :func:`elm.web.search.run.search_with_fallback`. Returns ------- @@ -251,6 +262,9 @@ async def download_jurisdiction_ordinances_from_website( ---------- website : str URL of the jurisdiction website to search. + heuristic : callable + Callable taking an :class:`elm.web.document.BaseDocument` and + returning ``True`` when the document should be kept. keyword_points : dict Dictionary of keyword points to use for scoring links. Keys are keywords, values are points to assign to links @@ -294,7 +308,7 @@ async def download_jurisdiction_ordinances_from_website( no ordinance document was found. results : list, optional List of crawl4ai results containing metadata about the crawled - pages. 
This is only returned if `return_c4ai_results` is + pages. Only returned when ``return_c4ai_results`` evaluates to ``True``. Notes @@ -376,6 +390,9 @@ async def download_jurisdiction_ordinances_from_website_compass_crawl( ---------- website : str URL of the jurisdiction website to search. + heuristic : callable + Callable taking an :class:`elm.web.document.BaseDocument` and + returning ``True`` when the document should be kept. keyword_points : dict Dictionary of keyword points to use for scoring links. Keys are keywords, values are points to assign to links @@ -387,7 +404,13 @@ async def download_jurisdiction_ordinances_from_website_compass_crawl( "pw_launch_kwargs" key in these will also be used to initialize the :class:`elm.web.search.google.PlaywrightGoogleLinkSearch` used for the Google URL search. By default, ``None``. - max_urls : int, optional + already_visited : set of str, optional + URLs that have already been crawled and should be skipped. + By default, ``None``. + num_link_scores_to_check_per_page : int, default=4 + Number of top-scoring links to visit per page. + By default, ``4``. + max_urls : int, default=100 Max number of URLs to check from the website before terminating the search. By default, ``100``. crawl_semaphore : :class:`asyncio.Semaphore`, optional @@ -471,12 +494,11 @@ async def download_jurisdiction_ordinance_using_search_engine( Parameters ---------- + question_templates : sequence of str + Query templates that will be formatted with the jurisdiction + name before submission to the search engine. jurisdiction : Jurisdiction Location objects representing the jurisdiction. - model_configs : dict - Dictionary of :class:`~compass.llm.config.LLMConfig` instances. - Should have at minium a "default" key that is used as a fallback - for all tasks. num_urls : int, optional Number of unique Google search result URL's to check for ordinance document. By default, ``5``. @@ -499,9 +521,14 @@ async def download_jurisdiction_ordinance_using_search_engine( playwright browsers used to download content from the web open concurrently. If ``None``, no limits are applied. By default, ``None``. - usage_tracker : UsageTracker, optional - Optional tracker instance to monitor token usage during - LLM calls. By default, ``None``. + url_ignore_substrings : list of str, optional + URL substrings that should be excluded from search results. + Substrings are applied case-insensitively. By default, ``None``. + **kwargs + Additional keyword arguments forwarded to + :func:`elm.web.search.run.web_search_links_as_docs`. Common + entries include ``usage_tracker`` for logging LLM usage and + extra Playwright configuration. Returns ------- @@ -571,17 +598,29 @@ async def filter_ordinance_docs( Parameters ---------- + docs : sequence of elm.web.document.BaseDocument + Documents to screen for ordinance content. jurisdiction : Jurisdiction Location objects representing the jurisdiction. model_configs : dict Dictionary of LLMConfig instances. Should have at minium a "default" key that is used as a fallback for all tasks. + heuristic : object + Domain-specific heuristic implementing a ``check`` method to + qualify ordinance content. tech : str Technology of interest (e.g. "solar", "wind", etc). This is used to set up some document validation decision trees. + ordinance_text_collector_class : type + Collector class used to extract ordinance text sections. + permitted_use_text_collector_class : type + Collector class used to extract permitted-use text sections. 
usage_tracker : UsageTracker, optional Optional tracker instance to monitor token usage during LLM calls. By default, ``None``. + check_for_correct_jurisdiction : bool, default=True + If ``True`` run jurisdiction validation before, content checks. + By default, ``True``. Returns ------- @@ -589,6 +628,11 @@ async def filter_ordinance_docs( List of :obj:`~elm.web.document.BaseDocument` instances possibly containing ordinance information, or ``None`` if no ordinance document was found. + + Notes + ----- + The function updates CLI progress bars to reflect each filtering + phase and returns documents sorted by quality heuristics. """ if check_for_correct_jurisdiction: COMPASS_PB.update_jurisdiction_task( @@ -656,7 +700,7 @@ async def _docs_from_web_search( on_search_complete_hook, **kwargs, ): - """Download docs from web using jurisdiction queries""" + """Download documents from the web using jurisdiction queries""" queries = [ question.format(jurisdiction=jurisdiction.full_name) for question in question_templates @@ -690,7 +734,7 @@ async def _docs_from_web_search( async def _down_select_docs_correct_jurisdiction( docs, jurisdiction, usage_tracker, model_config ): - """Remove all documents not pertaining to the jurisdiction""" + """Remove documents that do not match the target jurisdiction""" jurisdiction_validator = JurisdictionValidator( text_splitter=model_config.text_splitter, llm_service=model_config.llm_service, @@ -716,7 +760,7 @@ async def _down_select_docs_correct_content( permitted_use_text_collector_class, usage_tracker, ): - """Remove all documents that don't contain ordinance info""" + """Remove documents that do not contain ordinance information""" return await filter_documents( docs, validation_coroutine=_contains_ordinances, @@ -733,7 +777,7 @@ async def _down_select_docs_correct_content( async def _contains_ordinances( doc, model_configs, usage_tracker=None, **kwargs ): - """Helper coroutine that checks for ordinance and date info""" + """Determine whether a document contains ordinance information""" model_config = model_configs.get( LLMTasks.DOCUMENT_CONTENT_VALIDATION, model_configs[LLMTasks.DEFAULT], @@ -757,7 +801,7 @@ async def _contains_ordinances( def _sort_final_ord_docs(all_ord_docs): - """Sort the list of documents by year, type, and text length""" + """Sort ordinance documents by desirability heuristics""" if not all_ord_docs: return None @@ -765,7 +809,7 @@ def _sort_final_ord_docs(all_ord_docs): def _ord_doc_sorting_key(doc): - """Sorting key for documents. 
The higher this value, the better""" + """Compute a composite sorting score for ordinance documents""" latest_year, latest_month, latest_day = doc.attrs.get("date", (-1, -1, -1)) best_docs_from_website = doc.attrs.get(_SCORE_KEY, 0) prefer_pdf_files = isinstance(doc, PDFDocument) diff --git a/compass/scripts/process.py b/compass/scripts/process.py index c6dd6eba1..175bd684a 100644 --- a/compass/scripts/process.py +++ b/compass/scripts/process.py @@ -518,7 +518,7 @@ def __init__( @cached_property def browser_semaphore(self): - """asyncio.Semaphore or None: Sem to limit # of browsers""" + """asyncio.Semaphore or None: Browser concurrency limiter""" return ( asyncio.Semaphore( self.web_search_params.max_num_concurrent_browsers @@ -529,7 +529,7 @@ def browser_semaphore(self): @cached_property def crawl_semaphore(self): - """asyncio.Semaphore or None: Sem to limit # of crawls""" + """asyncio.Semaphore or None: Concurrency limiter for crawls""" return ( asyncio.Semaphore( self.web_search_params.max_num_concurrent_website_searches @@ -540,7 +540,7 @@ def crawl_semaphore(self): @cached_property def search_engine_semaphore(self): - """asyncio.Semaphore or None: Sem to limit # of SE queries""" + """asyncio.Semaphore: Concurrency limiter for search queries""" return asyncio.Semaphore(MAX_CONCURRENT_SEARCH_ENGINE_QUERIES) @cached_property @@ -556,14 +556,14 @@ def _jurisdiction_semaphore(self): @property def jurisdiction_semaphore(self): - """asyncio.Semaphore or AsyncExitStack: Jurisdictions limit""" + """asyncio.Semaphore or AsyncExitStack: Jurisdiction context""" if self._jurisdiction_semaphore is None: return AsyncExitStack() return self._jurisdiction_semaphore @cached_property def file_loader_kwargs(self): - """dict: Keyword arguments for `AsyncWebFileLoader`""" + """dict: Keyword arguments for ``AsyncWebFileLoader``""" file_loader_kwargs = _configure_file_loader_kwargs( self.process_kwargs.file_loader_kwargs ) @@ -576,7 +576,7 @@ def file_loader_kwargs(self): @cached_property def local_file_loader_kwargs(self): - """dict: Keyword arguments for `AsyncLocalFileLoader`""" + """dict: Keyword arguments for ``AsyncLocalFileLoader``""" file_loader_kwargs = { "pdf_read_coroutine": read_pdf_file, "pdf_read_kwargs": ( @@ -596,7 +596,7 @@ def local_file_loader_kwargs(self): @cached_property def known_local_docs(self): - """dict: Known filepaths by jurisdiction code""" + """dict: Known filepaths keyed by jurisdiction code""" known_local_docs = self.process_kwargs.known_local_docs or {} if isinstance(known_local_docs, str): known_local_docs = load_config(known_local_docs) @@ -604,7 +604,7 @@ def known_local_docs(self): @cached_property def known_doc_urls(self): - """dict: Known URL's keyed by jurisdiction code""" + """dict: Known URLs keyed by jurisdiction code""" known_doc_urls = self.process_kwargs.known_doc_urls or {} if isinstance(known_doc_urls, str): known_doc_urls = load_config(known_doc_urls) @@ -612,12 +612,12 @@ def known_doc_urls(self): @cached_property def tpe_kwargs(self): - """dict: Keyword arguments for `ThreadPoolExecutor`""" + """dict: Keyword arguments for ``ThreadPoolExecutor``""" return _configure_thread_pool_kwargs(self.process_kwargs.tpe_kwargs) @cached_property def _base_services(self): - """list: List of required services to run for processing""" + """list: Services required to support jurisdiction processing""" base_services = [ TempFileCachePB( td_kwargs=self.process_kwargs.td_kwargs, @@ -704,7 +704,7 @@ async def run(self, jurisdiction_fp): return run_msg async def _run_all(self, 
jurisdictions): - """Process all jurisdictions with running services""" + """Process all jurisdictions while required services run""" services = [model.llm_service for model in set(self.models.values())] services += self._base_services _ = self.file_loader_kwargs # init loader kwargs once @@ -743,7 +743,7 @@ async def _run_all(self, jurisdictions): async def _processed_jurisdiction_info_with_pb( self, jurisdiction, *args, **kwargs ): - """Process jurisdiction and update progress bar""" + """Process a jurisdiction while updating the progress bar""" async with self.jurisdiction_semaphore: with COMPASS_PB.jurisdiction_prog_bar(jurisdiction.full_name): return await self._processed_jurisdiction_info( @@ -751,7 +751,7 @@ async def _processed_jurisdiction_info_with_pb( ) async def _processed_jurisdiction_info(self, *args, **kwargs): - """Drop `doc` from RAM and only keep enough info to re-build""" + """Convert processed document to minimal metadata""" doc = await self._process_jurisdiction_with_logging(*args, **kwargs) @@ -771,7 +771,7 @@ async def _process_jurisdiction_with_logging( known_doc_urls=None, usage_tracker=None, ): - """Retrieve ordinance document with async logs""" + """Retrieve ordinance document with location-scoped logging""" async with LocationFileLog( self.log_listener, self.dirs.logs, @@ -868,7 +868,14 @@ def _tracked_progress(self): self._jsp = None async def run(self): - """Download and parse document for a single jurisdiction""" + """Download and parse ordinances for a single jurisdiction + + Returns + ------- + elm.web.document.BaseDocument or None + Document containing ordinance information, or ``None`` when + no valid ordinance content was identified. + """ start_time = time.monotonic() doc = None try: @@ -882,7 +889,7 @@ async def run(self): return doc async def _run(self): - """Search for docs and parse them for ordinances""" + """Search for documents and parse them for ordinances""" if self.known_local_docs: doc = await self._try_find_ordinances( method=self._load_known_local_documents, @@ -914,7 +921,7 @@ async def _run(self): return None async def _try_find_ordinances(self, method, *args, **kwargs): - """Try to find ordinances using specified method""" + """Execute a retrieval method and parse resulting documents""" docs = await method(*args, **kwargs) if docs is None: return None @@ -926,7 +933,7 @@ async def _try_find_ordinances(self, method, *args, **kwargs): return await self._parse_docs_for_ordinances(docs) async def _load_known_local_documents(self): - """Load local ordinance documents""" + """Load ordinance documents from known local file paths""" docs = await load_known_docs( self.jurisdiction, @@ -968,7 +975,7 @@ async def _load_known_local_documents(self): return docs async def _download_known_url_documents(self): - """Download ordinance documents from known URLs""" + """Download ordinance documents from pre-specified URLs""" docs = await download_known_urls( self.jurisdiction, @@ -1011,7 +1018,7 @@ async def _download_known_url_documents(self): return docs async def _find_documents_using_search_engine(self): - """Search the web for an ordinance document and construct it""" + """Search the web for ordinance docs using search engines""" docs = await download_jurisdiction_ordinance_using_search_engine( self.tech_specs.questions, self.jurisdiction, @@ -1050,7 +1057,7 @@ async def _find_documents_using_search_engine(self): return docs async def _find_documents_from_website(self): - """Search the website for ordinance documents""" + """Search the jurisdiction 
website for ordinance documents""" if self.jurisdiction_website and self.validate_user_website_input: await self._validate_jurisdiction_website() @@ -1080,7 +1087,7 @@ async def _find_documents_from_website(self): return docs async def _validate_jurisdiction_website(self): - """Validate user input for jurisdiction website""" + """Validate a user-supplied jurisdiction website URL""" if self.jurisdiction_website is None: return @@ -1111,7 +1118,7 @@ async def _validate_jurisdiction_website(self): self.jurisdiction_website = None async def _try_find_jurisdiction_website(self): - """Use web to try to find the main jurisdiction website""" + """Locate the primary jurisdiction website via search""" COMPASS_PB.update_jurisdiction_task( self.jurisdiction.full_name, description="Searching for jurisdiction website...", @@ -1130,7 +1137,7 @@ async def _try_find_jurisdiction_website(self): ) async def _try_elm_crawl(self): - """Try crawling website using ELM crawler""" + """Crawl the jurisdiction website using the ELM crawler""" self.jurisdiction_website = await get_redirected_url( self.jurisdiction_website, timeout=30 ) @@ -1162,7 +1169,7 @@ async def _try_elm_crawl(self): return docs, scrape_results async def _try_compass_crawl(self, scrape_results): - """Try to crawl the website with compass-style crawling""" + """Crawl the jurisdiction website using the COMPASS crawler""" checked_urls = set() for scrape_result in scrape_results: checked_urls.update({sub_res.url for sub_res in scrape_result}) @@ -1194,7 +1201,7 @@ async def _try_compass_crawl(self, scrape_results): ) async def _parse_docs_for_ordinances(self, docs): - """Parse docs (in order) for ordinances""" + """Parse candidate documents in order until ordinances found""" for possible_ord_doc in docs: doc = await self._try_extract_all_ordinances(possible_ord_doc) ord_count = num_ordinances_in_doc( @@ -1211,7 +1218,7 @@ async def _parse_docs_for_ordinances(self, docs): return None async def _try_extract_all_ordinances(self, possible_ord_doc): - """Try to extract ordinance values and permitted districts""" + """Extract both ordinance values and permitted-use districts""" with self._tracked_progress(): tasks = [ asyncio.create_task( @@ -1227,7 +1234,7 @@ async def _try_extract_all_ordinances(self, possible_ord_doc): @property def _extraction_task_kwargs(self): - """Keyword-argument pairs to pass to _try_extract_ordinances""" + """list: Dictionaries describing extraction task config""" return [ { "extractor_class": self.tech_specs.ordinance_text_extractor, @@ -1276,7 +1283,7 @@ async def _try_extract_ordinances( text_model, value_model, ): - """Try applying a single extractor to the relevant legal text""" + """Apply a single extractor and parser to legal text""" logger.debug( "Checking for ordinances in doc from %s", possible_ord_doc.attrs.get("source", "unknown source"), @@ -1304,7 +1311,7 @@ async def _try_extract_ordinances( return out async def _record_usage(self): - """Dump usage to file if tracker given""" + """Persist usage tracking data when a tracker is available""" if self.usage_tracker is None: return diff --git a/compass/services/base.py b/compass/services/base.py index b6c9e4cb7..5dda897a6 100644 --- a/compass/services/base.py +++ b/compass/services/base.py @@ -45,7 +45,7 @@ class Service(ABC): @classmethod def _queue(cls): - """Get queue for class.""" + """Return the service queue for the class""" service_name = cls.__name__ queue = get_service_queue(service_name) if queue is None: @@ -81,6 +81,8 @@ def name(self): async def 
process_using_futures(self, fut, *args, **kwargs): """Process a call to the service + The result is communicated by updating ``fut``. + Parameters ---------- fut : asyncio.Future @@ -109,12 +111,7 @@ def release_resources(self): # noqa: B027 @property @abstractmethod def can_process(self): - """Check if process function can be called. - - This should be a fast-running method that returns a boolean - indicating whether or not the service can accept more - processing calls. - """ + """bool: Flag indicating whether the service can accept work""" @abstractmethod async def process(self, *args, **kwargs): @@ -177,7 +174,7 @@ def name(self): return f"{self.__class__.__name__}-{self.model_name}{self.service_tag}" def _queue(self): - """Get queue for class""" + """Return the service queue for this instance""" queue = get_service_queue(self.name) if queue is None: msg = MISSING_SERVICE_MESSAGE.format(service_name=self.name) diff --git a/compass/services/cpu.py b/compass/services/cpu.py index 480be34b0..e23110cd3 100644 --- a/compass/services/cpu.py +++ b/compass/services/cpu.py @@ -47,26 +47,22 @@ def can_process(self): return True async def process(self, fn, pdf_bytes, **kwargs): - """Write URL doc to file asynchronously + """Execute a PDF parsing function in the process pool Parameters ---------- - doc : elm.web.document.BaseDocument - Document containing meta information about the file. Must - have a "source" key in the ``attrs`` dict containing the - URL, which will be converted to a file name using - :func:`elm.web.utilities.compute_fn_from_url`. - file_content : str or bytes - File content, typically string text for HTML files and bytes - for PDF file. - make_name_unique : bool, optional - Option to make file name unique by adding a UUID at the end - of the file name. By default, ``False``. + fn : callable + Callable executed inside the process pool. Receives + ``pdf_bytes`` as the first argument. + pdf_bytes : bytes + Raw PDF payload forwarded to ``fn``. + **kwargs + Additional keyword arguments passed to ``fn``. Returns ------- - Path - Path to output file. + Any + Result returned by ``fn`` after execution. """ loop = asyncio.get_running_loop() return await loop.run_in_executor( diff --git a/compass/services/threaded.py b/compass/services/threaded.py index c91951444..f80d8ff23 100644 --- a/compass/services/threaded.py +++ b/compass/services/threaded.py @@ -386,6 +386,11 @@ async def process(self, tracker): tracker : UsageTracker A usage tracker instance that contains usage info to be added to output file. + + Returns + ------- + dict + Updated usage dictionary persisted to ``usage_fp``. """ self._is_processing = True try: @@ -425,11 +430,9 @@ def can_process(self): async def process( self, jurisdiction, doc, seconds_elapsed, usage_tracker=None ): - """Add usage from tracker to file + """Record jurisdiction metadata in the tracking file - Any existing usage info in the file will remain unchanged - EXCEPT for anything under the label of the input `tracker`, - all of which will be replaced with info from the tracker itself. + The file on disk is updated in-place. Parameters ---------- @@ -482,6 +485,13 @@ async def process(self, html_fp, **kwargs): **kwargs Additional keyword-value argument pairs to pass to :class:`elm.web.document.HTMLDocument`. + + Returns + ------- + tuple + Two-item tuple of the loaded + :class:`~elm.web.document.HTMLDocument` + and the raw HTML string content. 
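+
+        Examples
+        --------
+        A minimal sketch using the module-level convenience wrapper,
+        assuming the HTML file-loader service is running and
+        ``page.html`` is a hypothetical local file::
+
+            doc, raw_html = await read_html_file("page.html")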
""" loop = asyncio.get_running_loop() return await loop.run_in_executor( @@ -612,7 +622,9 @@ async def read_html_file(html_fp, **kwargs): Returns ------- - elm.web.document.HTMLDocument - HTMLDocument instance with text loaded into page. + tuple + Two-item tuple of the loaded + :class:`~elm.web.document.HTMLDocument` + and the raw HTML string content. """ return await HTMLFileLoader.call(html_fp, **kwargs) diff --git a/compass/services/usage.py b/compass/services/usage.py index 0df00c332..3613fabb5 100644 --- a/compass/services/usage.py +++ b/compass/services/usage.py @@ -135,14 +135,7 @@ def add_to(self, other): @property def totals(self): - """Compute total usage across all sub-labels - - Returns - ------- - dict - Dictionary containing usage information totaled across all - sub-labels. - """ + """dict: Aggregated usage totals across all sub-labels""" totals = {} for model, model_usage in self.items(): total_model_usage = totals[model] = {} diff --git a/compass/utilities/base.py b/compass/utilities/base.py index 949c18a4f..4580b062b 100644 --- a/compass/utilities/base.py +++ b/compass/utilities/base.py @@ -8,24 +8,41 @@ def title_preserving_caps(string): - """Convert string to title case, preserving existing capitalization + """Convert text to title case while keeping intentional capitals Parameters ---------- string : str - Input string potentially containing capitalized words. + Input text that may already contain capitalized acronyms or + proper nouns. Returns ------- str - String converted to title case, preserving existing - capitalization. + Title-cased string in which words containing existing uppercase + characters retain their capitalization. + + Examples + -------- + >>> title_preserving_caps("NREL solar ordinance") + 'NREL Solar Ordinance' """ return " ".join(map(_cap, string.split(" "))) class WebSearchParams: - """Helper class to store web search params""" + """Capture configuration for jurisdiction web searches + + The class normalizes and stores search-related settings that are + reused across multiple search operations, including browser + concurrency, engine preferences, and filtering rules. + + Notes + ----- + Instances lazily translate the provided search engine definitions + into ELM-compatible keyword arguments via :attr:`se_kwargs`, + enabling straightforward reuse when issuing queries. + """ def __init__( self, @@ -124,7 +141,19 @@ def se_kwargs(self): class Directories: - """Helper class to store directories used in COMPASS run""" + """Encapsulate filesystem locations used by a COMPASS run + + The helper centralizes directory computations so downstream code + can rely on fully resolved :class:`pathlib.Path` instances for + logging, cleaned text, downloaded ordinances, and intermediate + databases. + + Notes + ----- + All provided paths are expanded to absolute form when the class is + instantiated, guaranteeing consistent behavior across relative and + user-expanded paths. 
+ """ def __init__( self, @@ -175,7 +204,6 @@ def __init__( ) def __iter__(self): - """Iterate over all directories""" yield self.out yield self.logs yield self.clean_files @@ -183,16 +211,16 @@ def __iter__(self): yield self.jurisdiction_dbs def make_dirs(self): - """Create all directories if they do not exist""" + """Create the managed directories if they do not exist""" for folder in self: folder.mkdir(exist_ok=True, parents=True) def _cap(word): - """Capitalize first letter of the word""" + """Capitalize the first character of ``word``; preserve the rest""" return "".join([word[0].upper(), word[1:]]) def _full_path(in_path): - """Expand and resolve input path""" + """Resolve an input path to an absolute :class:`pathlib.Path`""" return Path(in_path).expanduser().resolve() diff --git a/compass/utilities/enums.py b/compass/utilities/enums.py index cf0864089..fa2e092b4 100644 --- a/compass/utilities/enums.py +++ b/compass/utilities/enums.py @@ -4,7 +4,22 @@ class LLMUsageCategory(StrEnum): - """COMPASS LLM usage categories""" + """Enumerate semantic buckets for tracking LLM usage + + The values in this enumeration provide consistent labels when + recording usage metrics, billing data, and telemetry associated + with LLM calls originating from COMPASS pipelines. Each category + maps to a specific functional concern (e.g., ordinance value + extraction, jurisdiction validation) allowing downstream analytics + to aggregate usage meaningfully. + + Notes + ----- + Values intentionally mirror the task names used when instantiating + :class:`~compass.llm.calling.BaseLLMCaller` implementations so that + the enumerations can be converted to strings without additional + mapping logic. + """ CHAT = auto() """Usage related to general LLM chat calls""" @@ -35,7 +50,20 @@ class LLMUsageCategory(StrEnum): class LLMTasks(StrEnum): - """LLM-based COMPASS tasks""" + """Human-friendly task identifiers for LLM workflows + + This enumeration exposes the set of user-facing task names that map + onto :class:`LLMUsageCategory` entries. Pipeline components use + these values for configuration (e.g., selecting prompt templates) + while the paired usage categories ensure consistent metrics + tracking. + + Notes + ----- + When a task is defined as a direct alias of an + :class:`LLMUsageCategory`, it inherits the corresponding usage label + so downstream monitoring does not require additional translation. + """ DATE_EXTRACTION = LLMUsageCategory.DATE_EXTRACTION """Date extraction task""" diff --git a/compass/utilities/finalize.py b/compass/utilities/finalize.py index 58f6eaac3..be1a457b8 100644 --- a/compass/utilities/finalize.py +++ b/compass/utilities/finalize.py @@ -52,32 +52,43 @@ def save_run_meta( total_cost, models, ): - """Write out meta information about ordinance collection run + """Persist metadata describing an ordinance collection run Parameters ---------- - dirs : :class:`~compass.utilities.base.Directories` - Directories instance containing information about the output - directories used for the run. + dirs : compass.utilities.base.Directories + Directory container describing where outputs, logs, and working + files should be written during the run. tech : {"wind", "solar", "small wind"} - Technology that was the target of the run. - start_date, end_date : datetime.datetime - Instances representing the start and end dates, respectively. - num_jurisdictions_searched, num_jurisdictions_found : int - Total number of jurisdictions that were searched and actually - found, respectively. 
+ Technology targeted by the collection run. The value is stored + verbatim in the metadata file for downstream reporting. + start_date : datetime.datetime + Timestamp marking when the run began. + end_date : datetime.datetime + Timestamp marking when the run finished. + num_jurisdictions_searched : int + Number of jurisdictions evaluated during the run. + num_jurisdictions_found : int + Number of jurisdictions that produced at least one ordinance. total_cost : float - Total cost of the processing, in $. + Aggregate cost incurred by LLM usage for the run. ``None`` or + zero values are recorded as ``null`` in the metadata. models : dict - Dictionary mapping task names (from - :class:`~compass.utilities.enums.LLMTasks`) to - :class:`~compass.llm.config.OpenAIConfig` instances used for the - run. + Mapping from LLM task identifiers (as str) to configuration + objects (:class:`~compass.llm.config.OpenAIConfig`) used + throughout the run. The function records a condensed summary of + each configuration. Returns ------- - run_time : float - Total processing run-time, in seconds. + float + Total runtime of the collection, expressed in seconds. + + Notes + ----- + The function writes ``meta.json`` into ``dirs.out`` alongside + references to other artifacts generated during the run. The return + value mirrors the ``total_time`` entry stored in the metadata. """ try: @@ -124,32 +135,32 @@ def save_run_meta( def doc_infos_to_db(doc_infos): - """Convert list of docs to output database + """Aggregate parsed ordinance CSV files into a normalized database Parameters ---------- - doc_infos : iterable of dict - Iterable of dictionaries, where each dictionary has at least the - following keys: - - - "ord_db_fp": Path to parsed ordinance CSV file - - "source": URL of the file from which ordinances were - extracted - - "date": Tuple of (year, month, day). Any of the values can - be ``None``. - - "jurisdiction": Instance of Jurisdiction representing the - jurisdiction associated with these ordinance values. - - If this iterable is empty, and empty DataFrame (with the correct - columns) is returned. + doc_infos : Iterable + Iterable of dictionaries describing ordinance extraction + results. Each dictionary must contain ``"ord_db_fp"`` (path to a + parsed CSV), ``"source"`` (document URL), ``"date"`` (tuple of + year, month, day, with ``None`` allowed), and ``"jurisdiction"`` + (a :class:`~compass.utilities.location.Jurisdiction` instance). Returns ------- - ordinances : pandas.DataFrame - DataFrame containing ordinances collected from all individual - CSV's. - count : int - Total number jurisdictions for which ordinances were found. + pandas.DataFrame + Consolidated ordinance dataset containing the columns enumerated + in :data:`_PARSED_COLS`. + int + Number of jurisdictions contributing at least one ordinance to + the consolidated dataset. + + Notes + ----- + Empty or ``None`` entries in ``doc_infos`` are skipped. Ordinance + CSVs that lack parsed values (``num_ordinances_dataframe`` equals + zero) are ignored. The returned DataFrame enforces an ordered column + layout and casts the ``quantitative`` flag to nullable boolean. """ db = [] for doc_info in doc_infos: @@ -180,19 +191,24 @@ def doc_infos_to_db(doc_infos): def save_db(db, out_dir): - """Split DB into qualitative vs quantitative and save to disk + """Write qualitative and quantitative ordinance outputs to disk Parameters ---------- db : pandas.DataFrame - Pandas DataFrame containing ordinance data to save. 
Must have - all columns in :obj:`QUANT_OUT_COLS` and :obj:`QUAL_OUT_COLS` - as well as a ``"quantitative"`` column that contains a boolean - determining whether the rwo belongs in the quantitative output - file (``True``) or the qualitative output file (``False``). + Ordinance dataset containing the full set of columns listed in + :data:`QUANT_OUT_COLS` and :data:`QUAL_OUT_COLS`, plus the + ``quantitative`` boolean flag that dictates output routing. out_dir : path-like - Path to output directory where ordinance database csv files - should be written. + Directory where ``qualitative_ordinances.csv`` and + ``quantitative_ordinances.csv`` should be written. The directory + is created by :class:`pathlib.Path` if necessary. + + Notes + ----- + Empty DataFrames short-circuit without creating output files. The + function respects the boolean ``quantitative`` column and assumes it + has already been sanitized by :func:`doc_infos_to_db`. """ if db.empty: return @@ -272,23 +288,31 @@ def _extract_model_info_from_all_models(models): def compile_run_summary_message( total_seconds, total_cost, out_dir, document_count ): - """Summarize the run results into a formatted string + """Create a human-readable summary of a completed run Parameters ---------- - total_seconds : int or float - Total number of seconds the run took to complete. - total_cost : int or float - Total cost of the run, in $. + total_seconds : float or int + Duration of the run in seconds. + total_cost : float or int or None + Monetary cost incurred by the run. ``None`` or zero suppresses + the cost line in the summary. out_dir : path-like - Path to output directory where the run results are saved. + Location of the run output directory. The value is embedded in + the summary text. document_count : int - Number of documents found during the run. + Number of documents discovered across all jurisdictions. Returns ------- str - Formatted string summarizing the run results. + Summary string formatted for CLI presentation with ``rich`` + markup. + + Notes + ----- + The function does not perform I/O; callers may log or display the + returned string as needed. """ runtime = _elapsed_time_as_str(total_seconds) total_cost = ( diff --git a/compass/utilities/io.py b/compass/utilities/io.py index c099e7956..02f3e9c50 100644 --- a/compass/utilities/io.py +++ b/compass/utilities/io.py @@ -10,23 +10,35 @@ async def load_local_docs(fps, **kwargs): - """Load a document for each input filepath + """Load local documents into `elm` document instances Parameters ---------- - fps : iterable of path-like - Iterable of paths representing documents to load. - kwargs - Keyword-argument pairs to initialize - :class:`elm.web.file_loader.AsyncLocalFileLoader`. + fps : Iterable + Iterable of paths referencing local files to load. + **kwargs + Additional keyword arguments forwarded to + :class:`elm.web.file_loader.AsyncLocalFileLoader` for + configuration such as ``loader``, caching, or parsing options. Returns ------- - list - List of non-empty document instances containing information from - the local documents. If a file could not be loaded (i.e. - document instance is empty), it will not be included in the - output list. + list of elm.web.document.BaseDocument + Non-empty loaded documents corresponding to the supplied + filepaths. Empty results (e.g., unreadable files) are filtered + out of the returned list. 
+ + Raises + ------ + elm.exceptions.ELMError + Propagated when the underlying loader fails to read one of the + provided files and is configured to raise on errors. + + Notes + ----- + Detailed debug information about loaded page counts is emitted via + the ``compass.utilities.io`` logger at ``TRACE`` level to assist + with troubleshooting ingestion runs. """ logger.trace("Loading docs for the following paths:\n%r", fps) logger.trace( diff --git a/compass/utilities/jurisdictions.py b/compass/utilities/jurisdictions.py index 0dfbe0682..774edcd9d 100644 --- a/compass/utilities/jurisdictions.py +++ b/compass/utilities/jurisdictions.py @@ -18,19 +18,24 @@ def load_all_jurisdiction_info(): - """Load DataFrame containing info for all jurisdictions + """Load canonical jurisdiction metadata for the continental US Returns ------- pandas.DataFrame - DataFrame containing info like names, FIPS, websites, etc. for - all jurisdictions. + Table containing jurisdiction names, FIPS codes, official + websites, and related attributes. + + Notes + ----- + Missing values are normalized to ``None`` to simplify downstream + serialization. """ return pd.read_csv(_COUNTY_DATA_FP).replace({np.nan: None}) def jurisdiction_websites(jurisdiction_info=None): - """Load mapping of jurisdiction name and state to website + """Build a mapping of jurisdiction identifiers to website URLs Parameters ---------- @@ -43,8 +48,13 @@ def jurisdiction_websites(jurisdiction_info=None): Returns ------- dict - Dictionary where keys are FIPS codes and values are the relevant - website URL. + Mapping from jurisdiction FIPS codes to their primary website + URLs. + + Notes + ----- + The helper uses FIPS codes rather than string names to avoid + collisions between same-named jurisdictions in different states. """ if jurisdiction_info is None: jurisdiction_info = load_all_jurisdiction_info() @@ -55,7 +65,10 @@ def jurisdiction_websites(jurisdiction_info=None): def load_jurisdictions_from_fp(jurisdiction_fp): - """Load jurisdiction info based on jurisdictions in the input fp + """Load jurisdiction metadata for entries listed in a CSV file + + This loader trims whitespace, deduplicates request rows, and filters + out jurisdictions not present in the canonical data set. Parameters ---------- @@ -66,9 +79,18 @@ def load_jurisdictions_from_fp(jurisdiction_fp): Returns ------- pandas.DataFrame - DataFrame containing jurisdiction info like names, FIPS, - websites, etc. for all requested jurisdictions (that were - found). + Jurisdiction information, including FIPS codes and websites, + for every matching entry in the lookup table. + + Raises + ------ + COMPASSValueError + If the input file is missing required columns (``State`` or + ``Jurisdiction Type`` when subdivisions are provided). + + Notes + ----- + Missing jurisdictions trigger warnings with a tabular summary. """ jurisdictions = pd.read_csv(jurisdiction_fp).replace({np.nan: None}) jurisdictions = _validate_jurisdiction_input(jurisdictions) diff --git a/compass/utilities/location.py b/compass/utilities/location.py index b6bc3da94..6ef8ece35 100644 --- a/compass/utilities/location.py +++ b/compass/utilities/location.py @@ -14,7 +14,21 @@ class Jurisdiction: - """Class representing a jurisdiction""" + """Model a geographic jurisdiction used throughout COMPASS + + The class normalizes casing for location components and provides + convenience properties for rendering jurisdiction names with + correct prefixes. 
It is designed to align with ordinance validation + logic that expects consistent casing and phrasing across states, + counties, and municipal subdivisions. + + Notes + ----- + Instances compare case-insensitively for type and state, while the + county and subdivision name comparisons preserve their stored + casing. Hashing and ``str`` conversions defer to the full display + name generated by :attr:`full_name`. + """ def __init__( self, @@ -65,7 +79,7 @@ def __init__( @cached_property def full_name(self): - """str: Full jurisdiction name""" + """str: Comma-separated jurisdiction display name""" name_parts = [ self.full_subdivision_phrase, self.full_county_phrase, @@ -76,7 +90,7 @@ def full_name(self): @cached_property def full_name_the_prefixed(self): - """str: Full jurisdiction name with `the` prefix if needed""" + """str: Full location name prefixed with ``the`` as needed""" if self.type.casefold() == "state": return f"the state of {self.state}" @@ -87,7 +101,7 @@ def full_name_the_prefixed(self): @cached_property def full_subdivision_phrase(self): - """str: Full jurisdiction subdivision phrase, or empty str""" + """str: Subdivision phrase for the jurisdiction or empty str""" if not self.subdivision_name: return "" @@ -98,7 +112,7 @@ def full_subdivision_phrase(self): @cached_property def full_subdivision_phrase_the_prefixed(self): - """str: Full jurisdiction subdivision phrase, or empty str""" + """str: Subdivision phrase prefixed with ``the`` as needed""" if self.type.casefold() in JURISDICTION_TYPES_AS_PREFIXES: return f"the {self.full_subdivision_phrase}" @@ -106,7 +120,7 @@ def full_subdivision_phrase_the_prefixed(self): @cached_property def full_county_phrase(self): - """str: Full jurisdiction county phrase, or empty str""" + """str: County phrase for the jurisdiction or empty str""" if not self.county: return "" diff --git a/compass/utilities/nt.py b/compass/utilities/nt.py index d937aad32..d27e8f7c0 100644 --- a/compass/utilities/nt.py +++ b/compass/utilities/nt.py @@ -15,6 +15,32 @@ ], defaults=[None, None, None, None, 25], ) +ProcessKwargs.__doc__ = """Execution options passed to `compass process` + +Parameters +---------- +known_local_docs : list of path-like, optional + Local ordinance files to seed the run. ``None`` disables the seed. + By default, ``None``. +known_doc_urls : list of str, optional + Known ordinance URLs to prioritize during retrieval. + By default, ``None``. +file_loader_kwargs : dict, optional + Keyword arguments forwarded to the document loader implementation. + By default, ``None``. +td_kwargs : dict, optional + Additional configuration for top-level document discovery logic. + By default, ``None``. +tpe_kwargs : dict, optional + Parameters controlling text parsing and extraction. + By default, ``None``. +ppe_kwargs : dict, optional + Parameters controlling permitted-use parsing and extraction. + By default, ``None``. +max_num_concurrent_jurisdictions : int, default=25 + Maximum number of jurisdictions processed simultaneously. + By default, ``25``. +""" TechSpec = namedtuple( "TechSpec", @@ -31,3 +57,28 @@ "website_url_keyword_points", ], ) +TechSpec.__doc__ = """Bundle extraction configuration for a technology + +Parameters +---------- +name : str + Display name for the technology (e.g., ``"solar"``). +questions : dict + Prompt templates or question sets used during extraction. +heuristic : callable + Function implementing heuristic filters prior to LLM invocation. 
+ordinance_text_collector : callable + Callable that gathers candidate ordinance text spans. +ordinance_text_extractor : callable + Callable that extracts relevant ordinance snippets. +permitted_use_text_collector : callable + Callable that gathers candidate permitted-use text spans. +permitted_use_text_extractor : callable + Callable that extracts permitted-use content. +structured_ordinance_parser : callable + Callable that transforms ordinance text into structured values. +structured_permitted_use_parser : callable + Callable that transforms permitted-use text into structured values. +website_url_keyword_points : dict or None + Weightings for scoring website URLs during search. +""" diff --git a/compass/utilities/parsing.py b/compass/utilities/parsing.py index 1bb4284a0..aac176754 100644 --- a/compass/utilities/parsing.py +++ b/compass/utilities/parsing.py @@ -1,4 +1,4 @@ -"""COMPASS Ordinances parsing utilities.""" +"""COMPASS ordinance parsing utilities""" import json import logging @@ -19,32 +19,38 @@ def clean_backticks_from_llm_response(content): Parameters ---------- content : str - LLM response that may or may not contain markdown-style triple - backticks. + LLM response that may contain markdown triple backticks. Returns ------- str - LLM response stripped of the markdown-style backticks + Response stripped of all leading and trailing backtick markers. """ content = content.lstrip().rstrip() return content.removeprefix("```").lstrip("\n").removesuffix("```") def llm_response_as_json(content): - """LLM response to JSON + """Parse a raw LLM response into JSON-compatible data Parameters ---------- content : str - LLM response that contains a string representation of - a JSON file. + Response text expected to contain a JSON object, possibly with + Markdown fences or Python boolean literals. Returns ------- dict - Response parsed into dictionary. This dictionary will be empty - if the response cannot be parsed by JSON. + Parsed JSON structure. When parsing fails, the function returns + an empty dictionary. + + Notes + ----- + The parser strips Markdown code fences, coerces Python-style + booleans to lowercase JSON literals, and logs the raw response on + decode failure. The logging includes guidance for increasing token + limits or updating prompts. """ content = clean_backticks_from_llm_response(content) content = content.removeprefix("json").lstrip("\n") @@ -65,7 +71,12 @@ def llm_response_as_json(content): def merge_overlapping_texts(text_chunks, n=300): - """Merge chunks of text by removing any overlap. + """Merge text chunks while trimming overlapping boundaries + + Overlap detection compares at most ``n`` characters at each + boundary but never more than half the length of the accumulated + output. Chunks that do not overlap are concatenated with a newline + separator. Parameters ---------- @@ -81,7 +92,7 @@ def merge_overlapping_texts(text_chunks, n=300): Returns ------- str - Merged text. + Merged text assembled from the non-overlapping portions. """ text_chunks = list(filter(None, text_chunks)) if not text_chunks: @@ -103,7 +114,7 @@ def merge_overlapping_texts(text_chunks, n=300): def extract_ord_year_from_doc_attrs(doc_attrs): - """Extract year corresponding to the ordinance from doc instance + """Extract the ordinance year stored in document attributes Parameters ---------- @@ -117,15 +128,21 @@ def extract_ord_year_from_doc_attrs(doc_attrs): Returns ------- int or None - Parsed year for ordinance (int) or ``None`` if it wasn't found - in the document's attrs. 
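# --- Editor's illustrative sketch (not part of the patch) -------------------
# Demonstrates the fence stripping and boolean coercion described in the
# ``llm_response_as_json`` docstring above; the sample responses are made up.
from compass.utilities.parsing import llm_response_as_json

fenced = '```json\n{"contains_ord_info": True, "year": 2023}\n```'
print(llm_response_as_json(fenced))
# -> {'contains_ord_info': True, 'year': 2023} (fences and booleans handled)
print(llm_response_as_json("not JSON at all"))
# -> {} (parse failures return an empty dict)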
+        Parsed ordinance year or ``None`` when unavailable or invalid.
+
+    Examples
+    --------
+    >>> extract_ord_year_from_doc_attrs({"date": (2024, 5, 17)})
+    2024
+    >>> extract_ord_year_from_doc_attrs({"date": (None, None, None)}) is None
+    True
     """
     year = doc_attrs.get("date", (None, None, None))[0]
     return year if year is not None and year > 0 else None
 
 
 def num_ordinances_in_doc(doc, exclude_features=None):
-    """Count number of ordinances found in document
+    """Count the number of ordinance entries on a document
 
     Parameters
     ----------
@@ -139,7 +156,7 @@ def num_ordinances_in_doc(doc, exclude_features=None):
     Returns
     -------
     int
-        Number of unique ordinance values extracted from this document.
+        Number of ordinance rows represented in ``doc``.
     """
     if doc is None or doc.attrs.get("ordinance_values") is None:
         return 0
@@ -150,7 +167,7 @@ def num_ordinances_in_doc(doc, exclude_features=None):
 
 
 def num_ordinances_dataframe(data, exclude_features=None):
-    """Count number of ordinances found in DataFrame
+    """Count ordinance rows contained in a DataFrame
 
     Parameters
     ----------
@@ -164,7 +181,13 @@ def num_ordinances_dataframe(data, exclude_features=None):
     Returns
     -------
     int
-        Number of unique ordinance values extracted from this DataFrame.
+        Count of rows meeting the ordinance criteria.
+
+    Raises
+    ------
+    KeyError
+        If the input DataFrame lacks the ``feature`` column when
+        ``exclude_features`` is provided.
     """
     if exclude_features:
         mask = ~data["feature"].str.casefold().isin(exclude_features)
@@ -174,7 +197,7 @@ def num_ordinances_dataframe(data, exclude_features=None):
 
 
 def ordinances_bool_index(data):
-    """Array of bools indicating rows containing ordinances in DataFrame
+    """Compute a boolean mask indicating ordinance rows
 
     Parameters
     ----------
@@ -184,9 +207,8 @@ def ordinances_bool_index(data):
 
     Returns
     -------
-    array-like
-        Array of bools indicating rows containing ordinances in
-        DataFrame.
+    numpy.ndarray
+        Boolean mask identifying rows that contain ordinance values.
     """
     if data is None or data.empty:
         return np.array([], dtype=bool)
@@ -200,7 +222,7 @@ def ordinances_bool_index(data):
 
 
 def load_config(config_fp):
-    """Load a JSON or JSON5 config file
+    """Load configuration data from JSON or JSON5 sources
 
     Parameters
     ----------
@@ -210,12 +232,19 @@ def load_config(config_fp):
     Returns
     -------
     dict
-        Dictionary containing the config file contents.
+        Parsed configuration object.
 
     Raises
     ------
     COMPASSValueError
-        If the config file does not end with `.json` or `.json5`.
+        If the config file extension is not ``.json`` or
+        ``.json5``.
+
+    Notes
+    -----
+    JSON5 loading is handled via :mod:`pyjson5`, enabling comments and
+    trailing commas, among other quality-of-life improvements over
+    standard JSON, which uses the built-in :func:`json.load`.
     """
     config_fp = Path(config_fp)
 
diff --git a/compass/validation/content.py b/compass/validation/content.py
index 8e41cbe33..1b084cd9d 100644
--- a/compass/validation/content.py
+++ b/compass/validation/content.py
@@ -18,13 +18,12 @@
 
 
 class ParseChunksWithMemory:
-    """Check text chunks by sometimes looking at previous chunks
+    """Iterate through text chunks while caching prior LLM decisions
 
-    The idea behind this approach is that sometimes the context for a
-    setback or other ordinances is found in a previous chunk, so it may
-    be worthwhile (especially for validation purposes) to check a few
-    text chunks back for some validation pieces. In order to do this
-    semi-efficiently, we make use of a cache that's labeled "memory".
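# --- Editor's illustrative sketch (not part of the patch) -------------------
# Loading a run configuration as described in the ``load_config`` docstring
# above; "config.json5" is a hypothetical path, and JSON5 files may contain
# comments and trailing commas.
from compass.utilities.parsing import load_config

config = load_config("config.json5")
print(sorted(config))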
+ This helper stores an in-memory cache of prior validation results so + each chunk can optionally reuse outcomes from earlier LLM calls. The + design supports revisiting a configurable number of preceding text + chunks when newer chunks lack sufficient context. """ def __init__(self, text_chunks, num_to_recall=2): @@ -61,38 +60,30 @@ def _inverted_text(self, starting_ind): yield from inverted_text[:self.num_to_recall] async def parse_from_ind(self, ind, key, llm_call_callback): - """Validate a chunk of text + """Validate a chunk by consulting current and prior context - Validation occurs by querying the LLM using the input prompt and - parsing the `key` from the response JSON. The prompt should - request that the key be a boolean output. If the key retrieved - from the LLM response is False, a number of previous text chunks - are checked as well, using the same prompt. This can be helpful - in cases where the answer to the validation prompt (e.g. does - this text pertain to a large WECS?) is only found in a previous - text chunk. + Cached verdicts are reused to avoid redundant LLM calls when + neighboring chunks have already been assessed. If the cache + lacks a verdict, the callback is executed and the result stored. Parameters ---------- ind : int - Positive integer corresponding to the chunk index. - Must be less than `len(text_chunks)`. + Index of the chunk to inspect. Must be less than the number + of available chunks. key : str - A key expected in the JSON output of the LLM containing the - response for the validation question. This string will also - be used to format the system prompt before it is passed to - the LLM. + JSON key expected in the LLM response. The same key is used + to populate the decision cache. llm_call_callback : callable - Callable that takes a `key` and `text_chunk` as inputs and - returns a boolean indicating whether or not the text chunk - passes the validation check. + Awaitable invoked with ``(key, text_chunk)`` that returns a + boolean indicating whether the chunk satisfies the LLM + validation check. Returns ------- bool - ``True`` if the LLM returned ``True`` for this text chunk or - `num_to_recall-1` text chunks before it. - ``False`` otherwise. + ``True`` if the selected or recalled chunk satisfies the + check, ``False`` otherwise. """ logger.debug("Checking %r for ind %d", key, ind) mem_text = zip( @@ -316,7 +307,7 @@ async def parse_by_chunks( callbacks=None, min_chunks_to_process=3, ): - """Parse text by chunks, passing to callbacks if it's legal text + """Stream text chunks through heuristic and legal validators This method goes through the chunks one by one, and passes them to the callback parsers if the `legal_text_validator` check passes. If @@ -346,6 +337,13 @@ async def parse_by_chunks( min_chunks_to_process : int, optional Minimum number of chunks to process before aborting due to text not being legal. By default, ``3``. + + Notes + ----- + This coroutine only orchestrates validation. Callbacks are + responsible for persisting any extracted results. Callback futures + are awaited concurrently and share the same task name as the caller + to simplify tracing within structured logging. 
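# --- Editor's illustrative sketch (not part of the patch) -------------------
# Exercises the chunk-memory workflow documented above with a stub in place
# of a real LLM callback; the chunk text is made up, and the expectation that
# the stub's verdict is returned unchanged is an assumption.
import asyncio

from compass.validation.content import ParseChunksWithMemory


async def _stub_llm_check(key, text_chunk):
    # Pretend the LLM answers "yes" whenever the word "setback" appears
    return "setback" in text_chunk.casefold()


async def _demo():
    chunks = [
        "Article 1: purpose and definitions.",
        "Wind energy systems: setback of 500 ft from dwellings.",
    ]
    parser = ParseChunksWithMemory(chunks, num_to_recall=2)
    verdict = await parser.parse_from_ind(1, "wind_setback", _stub_llm_check)
    print(verdict)  # expected to print True for this stub callback


asyncio.run(_demo())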
""" passed_heuristic_mem = [] callbacks = callbacks or [] diff --git a/compass/validation/graphs.py b/compass/validation/graphs.py index 31c2287d9..1bb9a1984 100644 --- a/compass/validation/graphs.py +++ b/compass/validation/graphs.py @@ -8,18 +8,28 @@ def setup_graph_correct_document_type(**kwargs): - """Setup graph to check for correct document type in legal text + """Build a decision tree for validating ordinance document types Parameters ---------- **kwargs - Keyword-value pairs to add to graph. + Additional keyword arguments forwarded to + :func:`compass.common.setup_graph_no_nodes`. The helper consumes + ``doc_is_from_ocr`` (default ``False``) to alter draft-detection + prompts for scanned documents. Returns ------- networkx.DiGraph - Graph instance that can be used to initialize an - `elm.tree.DecisionTree`. + Graph suitable for constructing an ``elm.tree.DecisionTree`` + that distinguishes legally binding ordinances from draft, + planning, meeting, and similar documents. + + Notes + ----- + The resulting graph encodes a structured sequence of Yes/No prompts + that culminate in a JSON response containing summary metadata and a + legal-text boolean keyed by ``{key}``. """ doc_is_from_ocr = kwargs.pop("doc_is_from_ocr", False) @@ -277,20 +287,29 @@ def setup_graph_correct_document_type(**kwargs): def setup_graph_correct_jurisdiction_type(jurisdiction, **kwargs): - """Setup graph to check for correct jurisdiction type in legal text + """Build a decision tree for jurisdiction-type validation Parameters ---------- - jurisdiction : Jurisdiction - Jurisdiction for which validation is being performed. + jurisdiction : compass.utilities.location.Jurisdiction + Target jurisdiction descriptor that guides prompt wording. **kwargs - Keyword-value pairs to add to graph. + Additional keyword arguments forwarded to + :func:`compass.common.setup_graph_no_nodes` (for example, + ``usage_tracker`` or ``llm_service`` identifiers). Returns ------- networkx.DiGraph - Graph instance that can be used to initialize an - `elm.tree.DecisionTree`. + Graph capturing the sequence of questions needed to verify + whether ordinance text names the expected jurisdiction type and + geography. + + Notes + ----- + The prompts collected through this graph expect the LLM to return a + JSON payload keyed by ``correct_jurisdiction`` plus a human-readable + explanation summarizing the reasoning. """ G = setup_graph_no_nodes( # noqa: N806 d_tree_name="Correct jurisdiction type", **kwargs @@ -497,20 +516,29 @@ def setup_graph_correct_jurisdiction_type(jurisdiction, **kwargs): def setup_graph_correct_jurisdiction_from_url(jurisdiction, **kwargs): - """Setup graph to check for correct jurisdiction in URL + """Build a decision tree for validating jurisdictions from URLs Parameters ---------- - jurisdiction : Jurisdiction - Jurisdiction for which validation is being performed. + jurisdiction : compass.utilities.location.Jurisdiction + Jurisdiction descriptor supplying state, county, and subdivision + phrases used in prompts. **kwargs - Keyword-value pairs to add to graph. + Additional keyword arguments forwarded to + :func:`compass.common.setup_graph_no_nodes`. Returns ------- networkx.DiGraph - Graph instance that can be used to initialize an - `elm.tree.DecisionTree`. + Graph that queries whether a URL explicitly references the + jurisdiction's state, county, and subdivision names and returns + a JSON verdict. + + Notes + ----- + The graph aggregates boolean keys such as ``correct_state`` and + ``correct_county``. 
The final prompt instructs the LLM to emit a + JSON document describing each match plus an explanatory string. """ G = setup_graph_no_nodes( # noqa: N806 d_tree_name="Correct jurisdiction type from URL", **kwargs diff --git a/compass/validation/location.py b/compass/validation/location.py index 0a54682dd..6d07d7a7a 100644 --- a/compass/validation/location.py +++ b/compass/validation/location.py @@ -22,7 +22,7 @@ class DTreeURLJurisdictionValidator(BaseLLMCaller): - """Validator that checks whether a URL matches a jurisdiction""" + """Validate whether a URL appears to target a jurisdiction""" SYSTEM_MESSAGE = ( "You are an expert data analyst that examines URLs to determine if " @@ -36,28 +36,49 @@ def __init__(self, jurisdiction, **kwargs): Parameters ---------- - structured_llm_caller : StructuredLLMCaller - Instance used for structured validation queries. + jurisdiction : compass.utilities.location.Jurisdiction + Jurisdiction descriptor with the target location attributes. **kwargs - Additional keyword arguments to pass to the - :class:`~compass.llm.calling.BaseLLMCaller` instance. + Additional keyword arguments forwarded to + :class:`~compass.llm.calling.BaseLLMCaller` for model + selection, temperature, or tracing control. + + Notes + ----- + The validator stores the input jurisdiction for subsequent URL + checks; it does not perform any validation work during + instantiation. """ super().__init__(**kwargs) self.jurisdiction = jurisdiction async def check(self, url): - """Check if the content passes the validation + """Determine whether the supplied URL targets the jurisdiction Parameters ---------- - content : str - Document content to validate. + url : str + URL string to evaluate. Empty values short-circuit to + ``False``. Returns ------- bool - ``True`` if the content passes the validation check, - ``False`` otherwise. + ``True`` when the decision-tree evaluation finds all + jurisdiction criteria satisfied, ``False`` otherwise. + + Raises + ------ + compass.exceptions.COMPASSError + Propagated if underlying LLM interactions fail while the + caller has configured :class:`BaseLLMCaller` to raise. + + Notes + ----- + The method delegates to an internal asynchronous decision tree + backed by :class:`ChatLLMCaller`. The validator aggregates + structured responses and only approves when each required + attribute matches the target jurisdiction. """ if not url: return False @@ -87,7 +108,7 @@ def _parse_output(self, props): # noqa: PLR6301 class DTreeJurisdictionValidator(BaseLLMCaller): - """Jurisdiction Validation using a decision tree""" + """Validate ordinance text against a target jurisdiction""" META_SCORE_KEY = "Jurisdiction Validation Score" """Key in doc.attrs where score is stored""" @@ -103,28 +124,42 @@ def __init__(self, jurisdiction, **kwargs): Parameters ---------- - structured_llm_caller : StructuredLLMCaller - Instance used for structured validation queries. + jurisdiction : compass.utilities.location.Jurisdiction + Jurisdiction descriptor identifying expected applicability. **kwargs - Additional keyword arguments to pass to the - :class:`~compass.llm.calling.BaseLLMCaller` instance. + Additional keyword arguments forwarded to + :class:`~compass.llm.calling.BaseLLMCaller` for configuring + LLM temperature, timeout, or similar options. 
""" super().__init__(**kwargs) self.jurisdiction = jurisdiction async def check(self, content): - """Check if the content passes the validation + """Determine whether ordinance text matches the jurisdiction + + The decision tree checks jurisdiction type, state, and + subdivision alignment. Parameters ---------- content : str - Document content to validate. + Plain-text ordinance content extracted from a document. Returns ------- bool - ``True`` if the content passes the validation check, - ``False`` otherwise. + ``True`` when the decision tree concludes the ordinance is + scoped to the configured jurisdiction, ``False`` otherwise. + + Raises + ------ + compass.exceptions.COMPASSError + Raised if the underlying LLM caller propagates an execution + failure. + + Notes + ----- + Empty content returns ``False`` without invoking the LLM. """ if not content: return False @@ -154,20 +189,13 @@ def _parse_output(self, props): # noqa: PLR6301 class JurisdictionValidator: - """COMPASS Ordinance Jurisdiction validator - - Combines the logic of several validators into a single class. - - Purpose: - Determine whether a document pertains to a specific county. - Responsibilities: - 1. Use a combination of heuristics and LLM queries to determine - whether or not a document pertains to a particular county. - Key Relationships: - Uses a StructuredLLMCaller for LLM queries and delegates - sub-validation to - :class:`DTreeJurisdictionValidator`, - and :class:`DTreeURLJurisdictionValidator`. + """Coordinate URL and text jurisdiction validation for documents + + Notes + ----- + The validator stores the score threshold, optional text splitter, + and keyword arguments so they can be reused across many documents + without reconfiguration. """ def __init__(self, score_thresh=0.8, text_splitter=None, **kwargs): @@ -176,36 +204,58 @@ def __init__(self, score_thresh=0.8, text_splitter=None, **kwargs): Parameters ---------- score_thresh : float, optional - Score threshold to exceed when voting on content from raw - pages. By default, ``0.8``. - text_splitter : LCTextSplitter, optional - Optional text splitter instance to attach to doc (used for - splitting out pages in an HTML document). - By default, ``None``. + Threshold applied to the weighted page vote. Documents at or + above the threshold are considered jurisdiction matches. + Default is ``0.8``. + text_splitter : elm.web.text_splitter.LCTextSplitter, optional + Optional splitter attached to documents lacking a + ``text_splitter`` attribute so validators can iterate page + content consistently. Default is ``None``. **kwargs - Additional keyword arguments to pass to the - :class:`~compass.llm.calling.BaseLLMCaller` instance. + Additional keyword arguments forwarded to + :class:`~compass.llm.calling.BaseLLMCaller` and reused when + instantiating subordinate validators. """ self.score_thresh = score_thresh self.text_splitter = text_splitter self.kwargs = kwargs async def check(self, doc, jurisdiction): - """Check if the document belongs to the county + """Assess whether a document applies to the jurisdiction Parameters ---------- doc : elm.web.document.BaseDocument - Document instance. Should contain a "source" key in the - ``attrs`` that contains a URL (used for the URL validation - check). Raw content will be parsed for county name and - correct jurisdiction. + Document to evaluate. The validator expects + ``doc.raw_pages`` and, when available, a + ``doc.attrs['source']`` URL for supplemental URL validation. 
+ jurisdiction : compass.utilities.location.Jurisdiction + Target jurisdiction descriptor capturing the required + location attributes. Returns ------- bool - `True` if the doc contents pertain to the input county. - `False` otherwise. + ``True`` when either the URL or document text validation + confirms jurisdiction alignment, ``False`` otherwise. + + Raises + ------ + compass.exceptions.COMPASSError + Propagated if subordinate validators encounter LLM caller + errors. + + Notes + ----- + The method temporarily overrides ``doc.text_splitter`` when a + custom splitter is provided, ensuring the original splitter is + restored after validation completes. + + Examples + -------- + >>> validator = JurisdictionValidator() + >>> await validator.check(document, jurisdiction) + True """ if hasattr(doc, "text_splitter") and self.text_splitter is not None: old_splitter = doc.text_splitter @@ -244,7 +294,13 @@ async def _check(self, doc, jurisdiction): class JurisdictionWebsiteValidator: - """COMPASS Ordinance Jurisdiction Website validator""" + """Validate whether a website is the primary jurisdiction portal + + Notes + ----- + The validator stores the initialization arguments so they can be + reused across many documents without reconfiguration. + """ WEB_PAGE_CHECK_SYSTEM_MESSAGE = ( "You are an expert data analyst that examines website text to " @@ -261,36 +317,55 @@ def __init__( Parameters ---------- - browser_semaphore : :class:`asyncio.Semaphore`, optional - Semaphore instance that can be used to limit the number of - playwright browsers open concurrently. If ``None``, no - limits are applied. By default, ``None``. + browser_semaphore : asyncio.Semaphore, optional + Semaphore constraining concurrent Playwright usage. + ``None`` applies no concurrency limit. Default is ``None``. file_loader_kwargs : dict, optional - Dictionary of keyword arguments pairs to initialize - :class:`elm.web.file_loader.AsyncWebFileLoader`. - By default, ``None``. + Keyword arguments passed to + :class:`elm.web.file_loader.AsyncWebFileLoader`. Default is + ``None``. **kwargs - Additional keyword arguments to pass to the - :class:`~compass.llm.calling.BaseLLMCaller` instance. - + Additional keyword arguments cached for downstream LLM + calls triggered during validation. """ self.browser_semaphore = browser_semaphore self.file_loader_kwargs = file_loader_kwargs or {} self.kwargs = kwargs async def check(self, url, jurisdiction): - """Check if the website is the main website for a jurisdiction + """Determine whether a website serves as a jurisdiction's portal + + The validator first performs an inexpensive URL classification + before downloading page content. Only when the URL fails the + initial check does it fetch and inspect the page text using a + generic LLM caller. Parameters ---------- url : str - URL of the website to validate. + URL to inspect. Empty values return ``False`` immediately. + jurisdiction : compass.utilities.location.Jurisdiction + Target jurisdiction descriptor used to frame the validation + prompts. Returns ------- bool - ``True`` if the website is the main website for the given - jurisdiction; ``False`` otherwise. + ``True`` when either the URL quick check or the full page + evaluation indicates the site is the official main website + for the jurisdiction. + + Raises + ------ + compass.exceptions.COMPASSError + Propagated from :class:`BaseLLMCaller` if configured to + raise on LLM failures. 
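# --- Editor's illustrative sketch (not part of the patch) -------------------
# One way the validators documented above could be combined. The ``doc`` and
# ``jurisdiction`` arguments are assumed to be an elm document instance and a
# compass Jurisdiction instance supplied by the caller, and a configured LLM
# service is required for the calls to succeed.
from compass.validation.location import (
    JurisdictionValidator,
    JurisdictionWebsiteValidator,
)


async def doc_and_site_match(doc, jurisdiction, website_url):
    doc_ok = await JurisdictionValidator(score_thresh=0.8).check(
        doc, jurisdiction
    )
    site_ok = await JurisdictionWebsiteValidator().check(
        website_url, jurisdiction
    )
    return doc_ok and site_ok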
+ + Examples + -------- + >>> validator = JurisdictionWebsiteValidator() + >>> await validator.check("https://county.gov", jurisdiction) + True """ url_validator = DTreeURLJurisdictionValidator( diff --git a/compass/warn.py b/compass/warn.py index 89976aab7..677688dc4 100644 --- a/compass/warn.py +++ b/compass/warn.py @@ -10,7 +10,6 @@ class COMPASSWarning(UserWarning): """Generic COMPASS Warning""" def __init__(self, *args, **kwargs): - """Init exception and broadcast message to logger.""" super().__init__(*args, **kwargs) if args: logger.warning(str(args[0]), stacklevel=2) From f63e0dd7e134b9f14120f1fb9c0e67924c0b971c Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Thu, 20 Nov 2025 14:02:16 -0700 Subject: [PATCH 2/4] Update copilot instructions --- .github/copilot-instructions.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index b3777f3ee..e54f15e37 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -90,8 +90,11 @@ Use `pixi add --feature python-dev ` to add a dependency that is only u - Never include a period (".") at the end of the first line of docstrings. - Do not add a short summary to __init__ methods. Instead, keep the line blank and start the "Parameters" section after a second newline. - Do not document parameters in the class docstring - do that in the __init__ docstring instead. -- All @property and @cached_property method documentation should be one line long and should start with the return type. -- "Protected" functions and methods should always be documented using only one-line summary docstrings. +- Do not add docstring to dunder methods (e.g., __str__, __repr__, etc.) unless absolutely necessary. +- All @property and @cached_property method documentation should be one line long and should start with the return type followed by a colon (e.g. `"""str: My string property"""`). +- If a parameter has a default value, always end the description with the sentence `"By default, ."` +- If the default value for a parameter is **not** `None`, document it using the format: `param_name : type, default=`. If the default value for a parameter **is** `None`, use the format : `param_name : type, optional`. +- "Protected" functions and methods (i.e. starting with an underscore) should always be documented using **only** one-line summary docstrings. ## 7. Coding Guidelines (Rust) - Workspace-managed deps; update root `Cargo.toml` if adding shared dependency. From 8509ccbb3b56a0a8d5fb24065be9ed284b998d35 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Thu, 20 Nov 2025 14:18:29 -0700 Subject: [PATCH 3/4] Fix crosslinks --- compass/common/base.py | 7 ++++--- compass/extraction/apply.py | 6 +++--- compass/utilities/finalize.py | 3 +-- compass/utilities/parsing.py | 5 ++--- compass/validation/graphs.py | 10 +++++----- compass/validation/location.py | 15 ++++++++------- 6 files changed, 23 insertions(+), 23 deletions(-) diff --git a/compass/common/base.py b/compass/common/base.py index e7df1a974..5447032a4 100644 --- a/compass/common/base.py +++ b/compass/common/base.py @@ -164,9 +164,10 @@ async def run_async_tree(tree, response_as_json=True): ---------- tree : AsyncDecisionTree Decision tree to execute. - response_as_json : bool, optional - When ``True`` (default), attempts to parse the LLM response as - JSON using :func:`compass.utilities.llm_response_as_json`. 
+ response_as_json : bool, default=True + If ``True``, attempts to parse the LLM response as JSON using + :func:`compass.utilities.parsing.llm_response_as_json`. + By default, ``True``. Returns ------- diff --git a/compass/extraction/apply.py b/compass/extraction/apply.py index 590a371b1..e43008902 100644 --- a/compass/extraction/apply.py +++ b/compass/extraction/apply.py @@ -207,7 +207,7 @@ async def extract_ordinance_text_with_llm( Optional Langchain text splitter (or subclass instance), or any object that implements a `split_text` method. The method should take text as input (str) and return a list of text chunks. - extractor : compass.extraction.base.OrdinanceTextExtractor + extractor : object Extractor instance exposing ``parsers`` that consume text chunks and update ``doc.attrs``. original_text_key : str @@ -272,7 +272,7 @@ async def extract_ordinance_text_with_ngram_validation( Optional Langchain text splitter (or subclass instance), or any object that implements a `split_text` method. The method should take text as input (str) and return a list of text chunks. - extractor : compass.extraction.base.OrdinanceTextExtractor + extractor : object Extractor instance exposing ``parsers`` that consume text chunks and update ``doc.attrs``. original_text_key : str @@ -431,7 +431,7 @@ async def extract_ordinance_values(doc, parser, text_key, out_key): that are found to contain ordinance data. Note that if the document's attrs does not contain the `text_key` key, it will not be processed. - parser : compass.extraction.base.StructuredParser + parser : object Parser instance with an async ``parse`` method that converts cleaned ordinance text into structured values. text_key : str diff --git a/compass/utilities/finalize.py b/compass/utilities/finalize.py index be1a457b8..c3b9424b1 100644 --- a/compass/utilities/finalize.py +++ b/compass/utilities/finalize.py @@ -149,8 +149,7 @@ def doc_infos_to_db(doc_infos): Returns ------- pandas.DataFrame - Consolidated ordinance dataset containing the columns enumerated - in :data:`_PARSED_COLS`. + Consolidated ordinance dataset. int Number of jurisdictions contributing at least one ordinance to the consolidated dataset. diff --git a/compass/utilities/parsing.py b/compass/utilities/parsing.py index aac176754..f7def5d5c 100644 --- a/compass/utilities/parsing.py +++ b/compass/utilities/parsing.py @@ -242,9 +242,8 @@ def load_config(config_fp): Notes ----- - JSON5 loading is handled via :mod:`pyjson5`, enabling comments and - trailing commas, among other quality-of-life improvements over - standard JSON, which uses the built-in :func:`json.load`. + JSON5 enables comments and trailing commas, among other + quality-of-life improvements over vanilla JSON. """ config_fp = Path(config_fp) diff --git a/compass/validation/graphs.py b/compass/validation/graphs.py index 1bb9a1984..126bca8ba 100644 --- a/compass/validation/graphs.py +++ b/compass/validation/graphs.py @@ -14,9 +14,9 @@ def setup_graph_correct_document_type(**kwargs): ---------- **kwargs Additional keyword arguments forwarded to - :func:`compass.common.setup_graph_no_nodes`. The helper consumes - ``doc_is_from_ocr`` (default ``False``) to alter draft-detection - prompts for scanned documents. + :func:`compass.common.base.setup_graph_no_nodes`. The helper + consumes ``doc_is_from_ocr`` (default ``False``) to alter + draft-detection prompts for scanned documents. 
Returns ------- @@ -295,7 +295,7 @@ def setup_graph_correct_jurisdiction_type(jurisdiction, **kwargs): Target jurisdiction descriptor that guides prompt wording. **kwargs Additional keyword arguments forwarded to - :func:`compass.common.setup_graph_no_nodes` (for example, + :func:`compass.common.base.setup_graph_no_nodes` (for example, ``usage_tracker`` or ``llm_service`` identifiers). Returns @@ -525,7 +525,7 @@ def setup_graph_correct_jurisdiction_from_url(jurisdiction, **kwargs): phrases used in prompts. **kwargs Additional keyword arguments forwarded to - :func:`compass.common.setup_graph_no_nodes`. + :func:`compass.common.base.setup_graph_no_nodes`. Returns ------- diff --git a/compass/validation/location.py b/compass/validation/location.py index 6d07d7a7a..25d994ae6 100644 --- a/compass/validation/location.py +++ b/compass/validation/location.py @@ -71,14 +71,15 @@ async def check(self, url): ------ compass.exceptions.COMPASSError Propagated if underlying LLM interactions fail while the - caller has configured :class:`BaseLLMCaller` to raise. + caller has configured + :class:`~compass.llm.calling.BaseLLMCaller` to raise. Notes ----- The method delegates to an internal asynchronous decision tree - backed by :class:`ChatLLMCaller`. The validator aggregates - structured responses and only approves when each required - attribute matches the target jurisdiction. + backed by :class:`~compass.llm.calling.ChatLLMCaller`. The + validator aggregates structured responses and only approves when + each required attribute matches the target jurisdiction. """ if not url: return False @@ -207,7 +208,7 @@ def __init__(self, score_thresh=0.8, text_splitter=None, **kwargs): Threshold applied to the weighted page vote. Documents at or above the threshold are considered jurisdiction matches. Default is ``0.8``. - text_splitter : elm.web.text_splitter.LCTextSplitter, optional + text_splitter : LCTextSplitter, optional Optional splitter attached to documents lacking a ``text_splitter`` attribute so validators can iterate page content consistently. Default is ``None``. @@ -358,8 +359,8 @@ async def check(self, url, jurisdiction): Raises ------ compass.exceptions.COMPASSError - Propagated from :class:`BaseLLMCaller` if configured to - raise on LLM failures. + Propagated from :class:`~compass.llm.calling.BaseLLMCaller` + if configured to raise on LLM failures. Examples -------- From c77899049992c9d21c9502e343b7b65f99d6d5d5 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Thu, 20 Nov 2025 14:20:35 -0700 Subject: [PATCH 4/4] Implement PR review comments --- compass/pb.py | 4 ++-- compass/utilities/base.py | 8 ++++++++ compass/validation/graphs.py | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/compass/pb.py b/compass/pb.py index 146314a33..93a4e4704 100644 --- a/compass/pb.py +++ b/compass/pb.py @@ -182,10 +182,10 @@ def update_total_cost(self, cost, replace=False): Parameters ---------- - cost : float + cost : float or int Cost increment or replacement value in US dollars. replace : bool, optional - When ``True`` the total cost is replaced by ``cost`` , + When ``True`` the total cost is replaced by ``cost``, provided it does not move backwards. When ``False`` the cost is added cumulatively. By default, ``False``. 
""" diff --git a/compass/utilities/base.py b/compass/utilities/base.py index 4580b062b..d40393645 100644 --- a/compass/utilities/base.py +++ b/compass/utilities/base.py @@ -204,6 +204,14 @@ def __init__( ) def __iter__(self): + """Yield managed directory paths in canonical order + + Yields + ------ + pathlib.Path + Each of the managed directories in the following order: + out, logs, clean_files, ordinance_files, jurisdiction_dbs. + """ yield self.out yield self.logs yield self.clean_files diff --git a/compass/validation/graphs.py b/compass/validation/graphs.py index 126bca8ba..92d4ee05a 100644 --- a/compass/validation/graphs.py +++ b/compass/validation/graphs.py @@ -29,7 +29,7 @@ def setup_graph_correct_document_type(**kwargs): ----- The resulting graph encodes a structured sequence of Yes/No prompts that culminate in a JSON response containing summary metadata and a - legal-text boolean keyed by ``{key}``. + legal-text boolean. """ doc_is_from_ocr = kwargs.pop("doc_is_from_ocr", False)