From 80e66a0ba8fecd345e9fe077bca8f2ef2d8f229e Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Thu, 9 Oct 2025 17:56:55 -0400 Subject: [PATCH 01/24] plan Signed-off-by: Joe Karow <58997957+JoeKarow@users.noreply.github.com> --- GUIDE.md | 1751 +++++++++++++++++++++++++++++++++++++++++++ reorg-plan.md | 1975 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 3726 insertions(+) create mode 100644 GUIDE.md create mode 100644 reorg-plan.md diff --git a/GUIDE.md b/GUIDE.md new file mode 100644 index 00000000..a6d44bd2 --- /dev/null +++ b/GUIDE.md @@ -0,0 +1,1751 @@ +# Mango Tango CLI - Developer Onboarding Guide + +**Analysis Date:** October 7, 2025 + +--- + +## TL;DR - Quick Start + +**What is this?** A Python CLI tool for detecting coordinated inauthentic behavior in social media data using pluggable analyzers. + +**Get running in 3 steps:** +```bash +python -m venv venv && ./bootstrap.sh # Setup +python cibmangotree.py # Run +# Import sample data → Select analyzer → View results +``` + +**Want to create an analyzer?** +1. Copy `analyzers/example/` structure +2. Define interface in `interface.py` (inputs, outputs, params) +3. Implement analysis in `main.py` (Polars-based) +4. Register in `analyzers/__init__.py` + +**Key concepts:** +- **Primary analyzers**: Process raw data → output files +- **Secondary analyzers**: Post-process primary outputs +- **Web presenters**: Visualize results in Dash/Shiny +- **Everything is Parquet**: CSV/Excel imports → Parquet → Analysis → Parquet → Web viz + +**Critical gotcha:** Always call `input_reader.preprocess()` before using input data - it maps user columns to your schema. 
+ +--- + +## Table of Contents + +- [Overview](#overview) +- [Getting Started](#getting-started) + - [Prerequisites](#prerequisites) + - [Initial Setup](#initial-setup) + - [First Run Experience](#first-run-experience) + - [Quick Validation](#quick-validation) +- [Architecture](#architecture) + - [High-Level Design](#high-level-design) + - [Directory Structure](#directory-structure) + - [Core Abstractions](#core-abstractions) + - [Data Flow](#data-flow) + - [Extension Points](#extension-points) +- [How To](#how-to) + - [Run Tests](#run-tests) + - [Add a Web Presenter](#add-a-web-presenter) + - [Add a Secondary Analyzer](#add-a-secondary-analyzer) + - [Debug an Analyzer](#debug-an-analyzer) + - [Export Analysis Results](#export-analysis-results) + - [Change Storage Location](#change-storage-location) + - [Work with Large Datasets](#work-with-large-datasets) +- [Key Insights](#key-insights) + - [Design Decisions](#design-decisions) + - [Conventions and Patterns](#conventions-and-patterns) + - [Gotchas and Non-Obvious Behavior](#gotchas-and-non-obvious-behavior) + - [Common Pitfalls](#common-pitfalls) + - [Modern vs Legacy Patterns](#modern-vs-legacy-patterns) + - [Notable Implementation Details](#notable-implementation-details) +- [Dependencies & Integrations](#dependencies--integrations) +- [Open Questions & Uncertainties](#open-questions--uncertainties) +- [Team & Contribution Workflow](#team--contribution-workflow) +- [Additional Resources](#additional-resources) +- [Quick Reference](#quick-reference) + +--- + +## Overview + +**Mango Tango CLI** is a Python command-line tool for detecting **Coordinated Inauthentic Behavior (CIB)** in social media data. The application provides a plugin-based architecture where analysts can create modular "analyzers" that process social media datasets to detect patterns of coordination, manipulation, or suspicious behavior. 
+ +**What makes this unique:** +- **Plugin architecture**: Analyzers are self-contained modules that declare their inputs, outputs, and parameters +- **Terminal-based UI**: Rich interactive menus built with `inquirer` and `rich` libraries +- **Data pipeline**: Import CSV/Excel → Run analysis → View results in web dashboards +- **Storage abstraction**: Projects and analysis results persist in user data directories using TinyDB + +**Tech Stack:** +- Python 3.12 (required) +- Polars (primary data processing) +- Pydantic (data modeling and validation) +- TinyDB (lightweight JSON database) +- Shiny/Dash (web-based result visualization) +- Rich (terminal UI) + +--- + +## Getting Started + +### Prerequisites +- **Python 3.12** (strict requirement) +- Virtual environment tools + +### Initial Setup + +1. **Create virtual environment:** + ```bash + python -m venv venv + ``` + +2. **Run bootstrap script:** + - PowerShell: `./bootstrap.ps1` + - Bash: `./bootstrap.sh` + + This installs dependencies and sets up pre-commit hooks for `isort` and `black` formatting. + +3. **Start the application:** + ```bash + python -m cibmangotree + # OR + python cibmangotree.py + ``` + +### First Run Experience + +The entry point is `cibmangotree.py` which: +1. Shows a loading message ("🥭 CIB Mango Tree is starting...") +2. Lazy-loads heavy imports (analyzers, components) +3. Initializes storage in platform-specific user data directory +4. Sets up logging +5. Displays splash screen +6. 
Launches the main menu + +**Main Menu Flow:** +- Import dataset for new project → Select analysis → View results +- Review existing project → Select analysis → View results + +### Quick Validation + +After setup, run the application and you should see: +- The mango emoji splash screen +- An interactive menu asking what you'd like to do +- No errors in the console + +--- + +## Architecture + +### High-Level Design + +```mermaid +graph TB + Entry[cibmangotree.py] --> MainMenu[components/main_menu] + MainMenu --> Import[Import Dataset] + MainMenu --> Review[Review Project] + + Import --> NewProject[Create Project] + NewProject --> Storage[(Storage Layer)] + + Review --> SelectProject[Select Project] + SelectProject --> AnalysisMenu[Analysis Menu] + + AnalysisMenu --> RunAnalyzer[Run Analyzer] + RunAnalyzer --> PrimaryAnalyzer[Primary Analyzer] + PrimaryAnalyzer --> SecondaryAnalyzers[Secondary Analyzers] + SecondaryAnalyzers --> WebPresenter[Web Presenter] + + Storage --> TinyDB[(TinyDB)] + Storage --> ParquetFiles[(Parquet Files)] +``` + +### Directory Structure + +**Key directories explained:** + +``` +mango-tango-cli/ +├── cibmangotree.py # Entry point - starts the app +├── app/ # Core application models and context +│ ├── app.py # App class - orchestrates projects +│ ├── app_context.py # Application-level context +│ ├── project_context.py # Project-level context +│ └── logger.py # Logging setup +├── components/ # UI components (terminal screens) +│ ├── main_menu.py # Top-level menu +│ ├── project_main.py # Project management screen +│ ├── analysis_main.py # Analysis execution screen +│ └── select_analysis.py # Analyzer selection UI +├── analyzers/ # ⭐ Plugin analyzers (add new ones here) +│ ├── __init__.py # Registers all analyzers in suite +│ ├── example/ # Example analyzer (reference this!) 
+│ │ ├── example_base/ # Primary analyzer +│ │ ├── example_report/ # Secondary analyzer +│ │ └── example_web/ # Web presenter +│ ├── hashtags/ # Hashtag analysis +│ ├── ngrams/ # N-gram analysis +│ └── temporal/ # Time-based analysis +├── analyzer_interface/ # ⭐ Core analyzer framework +│ ├── declaration.py # Analyzer registration classes +│ ├── suite.py # Analyzer suite management +│ ├── context.py # Runtime context for analyzers +│ └── params.py # Parameter type definitions +├── storage/ # Data persistence layer +│ └── __init__.py # Storage class - file/DB operations +├── importing/ # CSV/Excel import logic +├── terminal_tools/ # Terminal UI utilities +└── meta/ # Version info +``` + +**What goes where:** +- **New analyzers** → `analyzers/{analyzer_name}/` (copy structure from `example/`) +- **UI screens** → `components/` +- **Business logic** → `app/` +- **Data models** → Use Pydantic models inline or in relevant modules + +### Core Abstractions + +#### 1. **Analyzer Plugin System** + +The architecture centers around three types of analyzers: + +**Primary Analyzer** (`AnalyzerDeclaration`): +- Entry point for analysis +- Declares input columns, parameters, outputs +- Processes raw data → generates output files +- Example: `analyzers/example/example_base/` + +**Secondary Analyzer** (`SecondaryAnalyzerDeclaration`): +- Consumes output from primary analyzer +- Can depend on other secondary analyzers (topologically sorted) +- Example: Generate statistics from primary results + +**Web Presenter** (`WebPresenterDeclaration`): +- Visualizes analyzer results +- Creates Dash or Shiny apps +- Example: `analyzers/example/example_web/` + +#### 2. 
**Storage Layer** + +`storage/Storage` class manages: +- **Projects**: Imported datasets (stored as Parquet) +- **Analyses**: Analysis runs with parameters and results +- **TinyDB**: Metadata (JSON file in user data dir) +- **File System**: Organized directory structure per project + +**Directory layout** (in user data dir): +``` +projects/ +└── {project_id}/ + ├── input.parquet # Imported data + └── analysis/ + └── {analysis_id}/ + ├── primary_outputs/ # Primary analyzer results + ├── secondary_outputs/ # Secondary analyzer results + ├── exports/ # User exports + └── web_presenters/ # Web presenter state +``` + +#### 3. **Context Pattern** + +The app uses context objects to pass state through the UI layers: + +- `ViewContext` → Terminal + App instance +- `AppContext` → Storage + Analyzer Suite +- `ProjectContext` → Project model + App context +- `AnalysisContext` → Analysis model + Project context +- `PrimaryAnalyzerContext` → Input/output paths, params for analyzer + +**Why contexts?** They provide type-safe, structured access to dependencies without global state. + +### Data Flow + +**Import → Analyze → Visualize:** + +```mermaid +sequenceDiagram + participant User + participant UI as Terminal UI + participant App + participant Storage + participant Analyzer + participant WebServer + + User->>UI: Import CSV/Excel + UI->>App: create_project() + App->>Storage: Save as Parquet + + User->>UI: Select Analyzer + UI->>User: Map columns to analyzer inputs + UI->>User: Configure parameters + + User->>UI: Run Analysis + UI->>Analyzer: Execute with context + Analyzer->>Storage: Write outputs (Parquet) + + User->>UI: View Results + UI->>WebServer: Launch web presenter + WebServer->>Storage: Read output files + WebServer->>User: Display interactive dashboard +``` + +**Key insight:** Everything is Parquet-based. CSV/Excel → Parquet → Analysis → Parquet → Web visualization. + +### Extension Points + +#### Adding a New Analyzer + +1. 
**Create directory structure:** + ``` + analyzers/my_analyzer/ + ├── __init__.py + ├── interface.py # Declare inputs/outputs/params + └── main.py # Analysis logic + ``` + +2. **Define interface** (`interface.py`): + ```python + from analyzer_interface import AnalyzerInterface, AnalyzerInput, InputColumn, ... + + interface = AnalyzerInterface( + id="my_analyzer", + name="My Analyzer", + input=AnalyzerInput(columns=[...]), + outputs=[...], + params=[...], + ) + ``` + +3. **Implement analysis** (`main.py`): + ```python + def main(context: PrimaryAnalyzerContext): + input_reader = context.input() + df = input_reader.preprocess(pl.read_parquet(input_reader.parquet_path)) + + # Your analysis logic here + result = df.select(...) + + result.write_parquet(context.output("my_output").parquet_path) + ``` + +4. **Register** (`__init__.py`): + ```python + from analyzer_interface import AnalyzerDeclaration + from .interface import interface + from .main import main + + my_analyzer = AnalyzerDeclaration( + interface=interface, + main=main, + is_distributed=False, # Set True when ready for production + ) + ``` + +5. **Add to suite** (`analyzers/__init__.py`): + ```python + from .my_analyzer import my_analyzer + + suite = AnalyzerSuite(all_analyzers=[..., my_analyzer]) + ``` + +--- + +## How To + +### Run Tests + +```bash +# Run all tests +pytest + +# Run with verbose output +pytest -v + +# Run specific test file +pytest analyzers/example/test_example_base.py +``` + +Tests use the pattern: `test_{analyzer_name}.py` files alongside analyzer code. + +### Add a Web Presenter + +See `analyzers/example/example_web/` for the pattern: + +```python +from analyzer_interface import WebPresenterDeclaration, WebPresenterInterface + +def factory(context: WebPresenterContext): + # Create Dash app or Shiny app + # Access analyzer outputs via context.base.table(output_id) + df = pl.read_parquet(context.base.table("output_id").parquet_path) + + # Build your visualization + ... 
+ +web_presenter = WebPresenterDeclaration( + interface=WebPresenterInterface(...), + factory=factory, + name=__name__, # Important for asset path resolution + shiny=False, # True for Shiny, False for Dash +) +``` + +**Assets:** Place CSS/JS/images in `assets/` folder next to your web presenter module. Dash will serve them automatically. + +### Add a Secondary Analyzer + +Secondary analyzers process primary analyzer outputs: + +```python +from analyzer_interface import SecondaryAnalyzerDeclaration, SecondaryAnalyzerInterface + +def main(context: SecondaryAnalyzerContext): + # Access primary analyzer outputs + primary_output = pl.read_parquet( + context.base.table("primary_output_id").parquet_path + ) + + # Access parameters from primary analyzer + param_value = context.base_params.get("param_id") + + # Process and write output + result = process(primary_output) + result.write_parquet(context.output("my_output").parquet_path) + +secondary = SecondaryAnalyzerDeclaration( + interface=SecondaryAnalyzerInterface( + id="my_secondary", + base_analyzer=primary_interface, # Reference to primary + depends_on=[], # Other secondary analyzers this depends on + ... + ), + main=main, +) +``` + +### Debug an Analyzer + +1. **Check logs:** Logs are in `{user_data_dir}/logs/mangotango.log` + ```bash + # Run with debug logging + python cibmangotree.py --log-level DEBUG + + # Tail the log file + tail -f ~/Library/Application\ Support/MangoTango/logs/mangotango.log + ``` + +2. **Print debugging:** Use `print()` or `rich.print()` - they'll show in terminal + +3. **Inspect data:** Load Parquet files directly: + ```python + import polars as pl + + # Find your data in user data directory + df = pl.read_parquet("path/to/output.parquet") + print(df.head()) + print(df.schema) + ``` + +4. 
**Test without UI:** Write unit tests to run your analyzer directly: + ```python + # See analyzers/example/test_example_base.py for pattern + ``` + +### Export Analysis Results + +**Via UI:** Project → Analysis → Export Outputs + +**Supported formats:** CSV, Excel (XLSX), JSON, Parquet + +**Chunking:** For large datasets, configure chunk size in settings to split exports into multiple files (e.g., for Excel's row limit). + +### Change Storage Location + +Storage uses `platformdirs` to find user data directory. To override, modify `Storage.__init__()` in `storage/__init__.py`: + +```python +# Default (platform-specific) +self.user_data_dir = platformdirs.user_data_dir( + appname=app_name, appauthor=app_author, ensure_exists=True +) + +# Custom location +self.user_data_dir = "/custom/path/to/data" +``` + +### Work with Large Datasets + +**Best practices:** + +1. **Use lazy evaluation:** + ```python + # Good - lazy evaluation + df = pl.scan_parquet(path) + result = df.filter(...).select(...).collect() + + # Avoid - loads everything into memory + df = pl.read_parquet(path) + ``` + +2. **Stream outputs:** + ```python + # Use sink_parquet for streaming writes + df.lazy().sink_parquet(output_path) + ``` + +3. **Batch processing:** If needed, use `iter_batches()` on PyArrow ParquetFile + +4. **Set sorted hints:** + ```python + df = df.sort(COL_TIMESTAMP) + df = df.lazy().set_sorted(COL_TIMESTAMP) + # Now group_by_dynamic and other operations can optimize + ``` + +5. **Use filters early in lazy chain:** + ```python + # Good - filter before expensive operations + df = pl.scan_parquet(path).filter(...).group_by(...).collect() + + # Bad - filter after expensive operations + df = pl.scan_parquet(path).group_by(...).collect() + df = df.filter(...) + ``` + +### Create a Custom Parameter Type + +While the framework provides `IntegerParam` and `TimeBinningParam`, you can extend it: + +1. 
**Define param model** in `analyzer_interface/params.py`: + ```python + class MyCustomParam(BaseModel): + type: Literal["my_custom"] = "my_custom" + # Your config fields here + + class MyCustomValue(BaseModel): + # Runtime value structure + pass + + # Update unions + ParamType = Union[TimeBinningParam, IntegerParam, MyCustomParam] + ParamValue = Union[TimeBinningValue, int, MyCustomValue] + ``` + +2. **Add UI handler** in `components/analysis_params.py` to prompt user for value + +3. **Use in analyzer:** + ```python + AnalyzerParam( + id="my_param", + type=MyCustomParam(...), + default=MyCustomValue(...) + ) + ``` + +### Profile Analyzer Performance + +To find bottlenecks in your analyzer: + +1. **Use time tracking:** + ```python + import time + + start = time.time() + # ... operation ... + print(f"Operation took {time.time() - start:.2f}s") + ``` + +2. **Check query plan:** + ```python + lazy_df = pl.scan_parquet(path).filter(...).select(...) + print(lazy_df.explain()) # Shows optimization plan + ``` + +3. **Monitor memory:** + ```python + import psutil + process = psutil.Process() + print(f"Memory: {process.memory_info().rss / 1024**2:.2f} MB") + ``` + +4. 
**Profile with cProfile:** + ```bash + python -m cProfile -o output.prof cibmangotree.py + python -m pstats output.prof + ``` + +### Handle Missing/Invalid Data + +**Strategy 1: Filter out invalids** +```python +df = df.filter( + pl.col(COL_TEXT).is_not_null() & + (pl.col(COL_TEXT).str.len_chars() > 0) +) +``` + +**Strategy 2: Fill with defaults** +```python +df = df.with_columns( + pl.col(COL_COUNT).fill_null(0) +) +``` + +**Strategy 3: Validate and warn** +```python +invalid_count = df.filter(pl.col(COL_ID).is_null()).height +if invalid_count > 0: + warnings.warn(f"{invalid_count} rows have null IDs and will be excluded") +df = df.filter(pl.col(COL_ID).is_not_null()) +``` + +### Use Default Parameters Dynamically + +The `default_params` function in `AnalyzerDeclaration` can inspect input data: + +```python +def compute_defaults(context: PrimaryAnalyzerContext) -> dict[str, ParamValue]: + input_reader = context.input() + df = input_reader.preprocess(pl.read_parquet(input_reader.parquet_path)) + + # Example: set default window based on data time span + time_span = df.select( + (pl.col("timestamp").max() - pl.col("timestamp").min()) + ).item() + + if time_span > timedelta(days=365): + window = TimeBinningValue(unit="month", amount=1) + else: + window = TimeBinningValue(unit="day", amount=1) + + return {"time_window": window} + +analyzer = AnalyzerDeclaration( + interface=interface, + main=main, + default_params=compute_defaults, # Called when creating new analysis +) +``` + +### Debug Import Issues + +If your CSV/Excel import isn't working: + +1. **Check file encoding:** + ```bash + file -I yourfile.csv + # If not UTF-8, convert: + iconv -f ISO-8859-1 -t UTF-8 yourfile.csv > yourfile_utf8.csv + ``` + +2. **Inspect with sample:** + ```python + import polars as pl + df = pl.read_csv("yourfile.csv", n_rows=10) + print(df) + print(df.schema) + ``` + +3. **Check delimiter:** + The importer auto-detects, but you can override in the UI's manual config mode. + +4. 
**Look at logs:** + Logs at `{user_data_dir}/logs/mangotango.log` show detailed import errors. + +--- + +## Key Insights + +### Design Decisions + +**Why Polars instead of Pandas?** +- Performance: Much faster for large datasets +- Arrow-native: Better memory efficiency +- API: More consistent and explicit + +**Why TinyDB instead of SQLite?** +- Simplicity: JSON file, no schema migrations +- Portability: Easy to inspect/debug +- Sufficient for metadata storage (not for analysis data) + +**Why plugin architecture?** +- Extensibility: Analysts can add analyzers without modifying core +- Isolation: Each analyzer is self-contained +- Discoverability: All analyzers auto-registered in suite + +**Why separate primary/secondary analyzers?** +- Reusability: Secondary analyzers can work across different primaries +- Dependency management: Topological sort ensures correct execution order +- Clarity: Separates data generation from post-processing + +### Conventions and Patterns + +#### Naming Conventions + +- **Analyzer IDs:** Use `snake_case` (e.g., `time_coordination`) +- **File names:** Match Python conventions (`my_module.py`) +- **Output IDs:** Descriptive snake_case (e.g., `character_count`) + +#### Code Style + +- **Formatting:** `black` (enforced by pre-commit) +- **Import sorting:** `isort` (enforced by pre-commit) +- **Type hints:** Strongly encouraged, especially for public APIs +- **Pydantic models:** Use for all data structures crossing boundaries + +#### Analyzer Patterns + +**Always do this:** +1. Call `input_reader.preprocess()` before using input data +2. Write outputs to paths from `context.output(id).parquet_path` +3. Match declared columns in interface exactly + +**Never do this:** +- Don't read files outside of provided context paths +- Don't modify the input data files +- Don't use global state + +### Gotchas and Non-Obvious Behavior + +1. 
**Column mapping is user-driven:** Your analyzer declares what columns it needs, but users map their CSV columns to your schema. Don't assume column names! + +2. **Preprocessing is mandatory:** The `input_reader.preprocess()` call transforms user data to match your interface. Skip it and you'll get wrong column names or types. + +3. **`is_distributed` flag:** Analyzers with `is_distributed=False` only show in development mode. Set to `True` when ready for end users. + +4. **Parquet everywhere:** All analyzer I/O uses Parquet. Don't try to write CSV/JSON in analyzer main logic (that's for exports only). + +5. **Context paths are temporary during execution:** The context provides paths - use them, don't construct your own. + +6. **TinyDB is single-file:** All metadata in one JSON file (`db.json`). Database locks prevent concurrent access. + +7. **Bootstrap scripts required:** Dependencies include compiled packages (like Polars). The bootstrap script ensures proper installation. + +8. **Module registration:** After creating an analyzer, you MUST add it to `analyzers/__init__.py` suite, or it won't be discovered. + +9. **Output column order matters:** Columns in output DataFrames should match the order declared in the interface for consistency. + +10. **Web presenter state persistence:** Web presenters can store state in `context.state_dir`, which persists between runs. Useful for caching expensive computations. + +11. **Sample data location:** Sample datasets are in `sample_data/` directory: + - `fake_data.csv`: Small synthetic dataset for testing + - `reddit_vm.csv`: Larger real-world Reddit data (618KB) + +12. **Timezone handling in datetime columns:** The preprocessing system detects timezones, warns if multiple are found, then strips them. All datetimes become timezone-naive in analysis. If you need timezone-aware analysis, you must handle it in your analyzer logic. + +13. 
**ProgressReporter uses multiprocessing:** This means you can't pass non-picklable objects through it. Keep progress updates simple (floats only). + +14. **AnalyzerSuite has a typo:** `primary_anlyzers` (not `primary_analyzers`). This is internal so doesn't affect your code, but you'll see it in tracebacks. + +15. **Test fixtures must call test functions:** The `@pytest.mark.skip()` on helper functions means you MUST call them from your own test functions. They won't auto-run. + +16. **Column `internal` flag:** Outputs with `internal=True` don't show in export UI. Use for intermediate results that users don't need. + +17. **Parameter backfill vs default:** + - `default`: Value for NEW analyses + - `backfill_value`: Value for OLD analyses created before param existed + - Don't confuse them! + +18. **Polars lazy evaluation pitfalls:** + - Calling `.describe()`, `.head()`, etc. on lazy frames triggers collection + - Some operations force eager evaluation (check docs) + - Use `.collect()` explicitly when you want materialization + +19. **File selector state persists:** The app remembers the last directory you browsed. This is stored in TinyDB under `file_selector_state`. + +20. **temp_dir is NOT persistent:** The `context.temp_dir` is cleaned up after analysis. Only use it for temporary files during execution. For persistent state, use `context.state_dir` (web presenters only). + +### Common Pitfalls + +**❌ Don't do this:** + +```python +# Forgetting to preprocess +df = pl.read_parquet(input_reader.parquet_path) +result = df.select("message_text") # May not exist! 
+ +# Hardcoding column names from CSV +df = df.filter(pl.col("Tweet") == "...") # User's column name + +# Using .collect() on huge datasets +df = pl.read_parquet(huge_file) # Loads all into RAM +``` + +**✅ Do this instead:** + +```python +# Always preprocess +df = input_reader.preprocess(pl.read_parquet(input_reader.parquet_path)) +result = df.select("message_text") # Guaranteed to exist + +# Use interface column names +df = df.filter(pl.col("message_text") == "...") + +# Use lazy evaluation +df = pl.scan_parquet(huge_file).filter(...).collect() +``` + +### Modern vs Legacy Patterns + +**Modern (emulate this):** +- ✅ Pydantic models for data validation +- ✅ Type hints everywhere +- ✅ Polars for data processing +- ✅ Context objects for dependency injection +- ✅ Declarative analyzer interfaces + +**Legacy (avoid in new code):** +- ❌ V1 analyzers (in `_bootstrap_analyses_v1()`) - only for backward compatibility +- ❌ Direct file path manipulation +- ❌ Global state or singletons + +### Notable Implementation Details + +#### Entry Point & Startup Optimization + +**Lazy loading in entry point:** +Heavy imports (analyzers, components) are deferred until after the loading message displays. This makes startup feel faster. The entry point (`cibmangotree.py`) follows this pattern: +1. Show loading message early (uses `rich.Console`) +2. Import heavy modules (analyzers suite, components) +3. Initialize storage & logging +4. Display splash screen +5. Launch main menu + +**Multiprocessing freeze support:** +`freeze_support()` call at the top of `cibmangotree.py` enables PyInstaller packaging for distributable executables. Required for Windows packaging. + +**Windows ANSI support:** +`enable_windows_ansi_support()` ensures colored terminal output works on Windows terminals that don't natively support ANSI escape codes. 
+ +#### Analyzer Lifecycle & Dependency Management + +**Topological sorting for secondary analyzers:** +`find_toposorted_secondary_analyzers()` in `AnalyzerSuite` performs depth-first traversal to resolve dependencies. The algorithm: +1. Visits each secondary analyzer +2. Recursively visits its dependencies first +3. Appends to result list only after all dependencies are visited +4. Uses `visited_ids` set to prevent duplicate visits + +This ensures secondary analyzers always run after their dependencies. + +**Cached properties in AnalyzerSuite:** +The suite uses `@cached_property` extensively to avoid re-computing lookups: +- `primary_anlyzers` (note the typo - used internally) +- `_primary_analyzers_lookup` +- `_secondary_analyzers` +- `_secondary_analyzers_by_base` +- `web_presenters_by_primary` + +These caches persist for the lifetime of the app, improving performance. + +**Development vs. Distributed mode:** +The `is_development()` function checks for a `VERSION` file. If absent, the app is in development mode and shows ALL analyzers. In distributed mode (VERSION file exists), only analyzers with `is_distributed=True` are visible. This lets developers test analyzers before releasing them to end users. + +#### Column Mapping & Data Preprocessing + +**Column mapping and preprocessing:** +The `input_reader.preprocess()` call is the most critical and complex part of the analyzer interface. It: +1. **Renames columns** from user's schema to analyzer's expected names (via column mapping dict) +2. **Converts data types** using semantic inference (see `preprocessing/series_semantic.py`) +3. 
**Applies transformations** specified in the column interface + +The preprocessing logic handles: +- Native datetime/date columns (already correct type) +- Datetime strings with timezone info (extracts and warns about multiple timezones) +- Unix timestamps (seconds or milliseconds, auto-detected) +- URLs (strips whitespace, validates http/https) +- Identifiers (validates alphanumeric with allowed chars) +- Generic text/integer/float/boolean (catch-all fallbacks) + +**Column name hints & fuzzy matching:** +The `column_automap()` function in `analyzer_interface/column_automap.py` implements intelligent column matching: +1. Scores each (user column, expected column) pair based on data type compatibility +2. Boosts score by +10 if `name_hints` match (all words in hint must be in column name) +3. Selects best match for each expected column + +This means better hints → better auto-mapping → less manual work for users. + +**Data type compatibility scoring:** +The system scores data type conversions (see `analyzer_interface/data_type_compatibility.py`): +- Exact matches get highest score +- Compatible types get lower scores (e.g., integer → float) +- Incompatible types return `None` (excluded from matching) + +**Semantic data type inference:** +`preprocessing/series_semantic.py` defines `SeriesSemantic` classes that: +1. Check column type matches expected structural type +2. Sample data (default 100 rows) for validation +3. Attempt conversion with `try_convert` function +4. Validate results meet threshold (default 80% valid) + +For example, `datetime_string` tries to parse with timezone handling, warns if multiple timezones detected, strips TZ info, and validates result is non-null. 
+ +#### Storage & File Management + +**Storage paths are platform-aware:** +Uses `platformdirs` to respect OS conventions: +- macOS: `~/Library/Application Support/MangoTango/` +- Windows: `%APPDATA%/MangoTango/` +- Linux: `~/.local/share/MangoTango/` + +**File locking for database:** +TinyDB is protected by `FileLock` (from `filelock` package) to prevent concurrent access from multiple app instances. Lock file is in temp directory (`platformdirs.user_cache_dir`). + +**V1 analyzer migration:** +The `_bootstrap_analyses_v1()` method in Storage handles permanent backward compatibility. Old analyses stored in `analyzers/` directory (legacy) are auto-migrated to `analysis/` with `__v1__` prefix in database IDs. This runs on every app startup within the database lock. + +**Parquet everywhere:** +All analyzer I/O uses Parquet format because: +- Columnar storage (efficient for analytics) +- Built-in compression +- Schema preservation +- Fast with Polars +- Supports streaming writes (`sink_parquet`) + +**Export chunking for large datasets:** +The `_export_output()` method supports chunking via `export_chunk_size` setting: +1. If chunk size set, calculates number of chunks needed +2. Uses PyArrow's `iter_batches()` for memory-efficient iteration +3. Collects batches into chunks of specified size +4. Writes separate files: `output_0.csv`, `output_1.csv`, etc. +5. Yields progress fraction after each chunk + +This is critical for Excel exports (1M row limit) and memory-constrained environments. + +#### Terminal UI & Progress Reporting + +**Progress reporting with multiprocessing:** +`ProgressReporter` uses multiprocessing to show animated progress: +- Spawns separate process for UI updates +- Shares progress value via `multiprocessing.Value` (double precision) +- Uses `multiprocessing.Event` for done signal +- Displays bouncing bar animation (see `_spinner_frames`) +- Updates every 0.1 seconds + +The context manager pattern (`with ProgressReporter(...) 
as progress:`) ensures cleanup even on errors. + +**Terminal context nesting:** +The `TerminalContext` in `terminal_tools.inception` allows nested UI contexts. Each level can add prefixes/decorations without components knowing about parent contexts. This enables the hierarchical menu structure. + +#### Web Presenter Architecture + +**Dash vs. Shiny support:** +Web presenters can use either framework: +- **Dash** (legacy): Plotly-based, React under the hood +- **Shiny** (modern): Python port of R Shiny, more Pythonic + +The `WebPresenterDeclaration` has a `shiny` boolean flag. The factory function returns different types: +- Dash: Modifies `context.dash_app` directly +- Shiny: Returns `FactoryOutputContext` with `ShinyContext` containing panel + server handler + +**Asset serving:** +For Dash presenters, the `server_name` parameter (typically `__name__`) determines asset path resolution. Assets in `assets/` folder adjacent to module are auto-served at `/assets/` URL. + +**State persistence:** +Web presenters can store persistent state in `context.state_dir`. This directory is unique per project/analyzer/presenter combo and survives between runs. Useful for: +- Cached computations +- User preferences +- Session state + +#### Testing Infrastructure + +**Test helpers in `testing/` module:** +- `test_primary_analyzer()`: Runs analyzer with test data, compares outputs +- `test_secondary_analyzer()`: Tests with primary outputs + dependencies +- `CsvTestData`, `JsonTestData`, `ExcelTestData`: Load test fixtures +- `PolarsTestData`: Programmatically created test data +- `compare_dfs()`: DataFrame comparison with helpful diff output + +**Pytest skip decorators:** +Test helper functions have `@pytest.mark.skip()` to prevent pytest from running them directly (they're meant to be called from actual test functions). 
+ +**Test data semantics:** +The `semantics` parameter in test data lets you specify column type conversions (e.g., `{"message_id": identifier}`) to match how real data would be preprocessed. + +#### Services & Utilities + +**Tokenizer service (new addition):** +The `services/tokenizer/` module provides a pluggable tokenization framework: +- `AbstractTokenizer`: Base class for all tokenizers +- `TokenizerConfig`: Configuration for case handling, emoji inclusion, length filtering +- `_preprocess_text()`: Unicode normalization, case handling +- `_postprocess_tokens()`: Whitespace stripping, emoji filtering, length filtering +- `_is_emoji()`: Sophisticated emoji detection covering multiple Unicode ranges + +This service appears designed for future n-gram or text analysis features. + +**Polars performance patterns:** +Real-world analyzers (hashtags, time_coordination) demonstrate best practices: +1. Use `.lazy()` for query building +2. Call `.collect()` only once at the end +3. Use `group_by_dynamic()` for time-windowed aggregations +4. Use `sink_parquet()` for streaming writes +5. Set `.set_sorted()` hint when data is sorted (enables optimizations) + +#### Error Handling & Validation + +**Draft analysis flag:** +When an analysis fails, `is_draft=True` is set in the database. This: +- Prevents export and web presenter options +- Shows warning in UI +- Allows user to delete or re-run +- Persists partial outputs for debugging + +**Parameter validation:** +The `ParamType` system (in `analyzer_interface/params.py`) supports: +- `IntegerParam`: min/max bounds validation +- `TimeBinningParam`: structured time window config +- Future: extensible for more param types + +Parameters are validated before being passed to analyzers, so analyzer code can trust the types. 
+ +**Column schema validation:** +The system validates that: +- Analyzer outputs match declared schema (column names and order) +- Test outputs match interface specs +- Column mappings cover all required columns + +Missing any of these causes clear error messages. + +--- + +## Real-World Analyzer Examples & Patterns + +### Hashtag Analyzer Deep Dive + +The hashtag analyzer (`analyzers/hashtags/`) demonstrates several advanced techniques: + +**Gini Coefficient for Coordination Detection:** +- Uses inequality measure to detect trending/coordinated hashtag usage +- Formula: `(n + 1 - 2 * sum(cumulative_counts) / total) / n` +- High Gini → few hashtags dominate → potential coordination +- Low Gini → even distribution → organic activity + +**Dynamic Time Windows:** +- Uses `group_by_dynamic()` with configurable windows (via `TimeBinningParam`) +- Sliding windows via `every` and `period` parameters +- Sorts data first and calls `.set_sorted()` for optimization + +**Hashtag Extraction Strategy:** +1. Check if `#` symbols exist in data +2. If yes: Extract with regex `r"(#\S+)"` +3. If no: Raise error (assumes pre-extracted format) +4. Explode list of hashtags for per-tag analysis + +**Smoothing Results:** +Applies rolling mean with window size 3 to reduce noise in Gini time series. Creates both raw and smoothed versions. + +**DateTime Conversion Handling:** +Explicitly checks if timestamp column is already `pl.Datetime`, converts if needed. This makes the analyzer more robust to preprocessing variations. + +### Time Coordination Analyzer Deep Dive + +The time_coordination analyzer (`analyzers/time_coordination/`) shows co-occurrence detection: + +**Sliding Window Approach:** +- Window size: 15 minutes +- Step size: 5 minutes (windows overlap) +- Users posting in same window are "co-occurring" + +**Self-Join Pattern:** +```python +df = df.join(df, on=COL_TIMESTAMP, how="inner") +``` +This creates all pairs of users within each time window. 
Clever use of Polars join to generate combinations. + +**Frequency Aggregation:** +Groups by user pairs, counts co-occurrences, sorts by frequency descending. High-frequency pairs are most suspicious. + +**Data Cleaning:** +Filters out null user IDs and timestamps before analysis. Essential for real-world data with missing values. + +### N-Grams Analyzer (Multi-Module) + +The ngrams analyzer suite demonstrates the multi-module pattern: + +**Primary Analyzer (`ngrams_base/`):** +- Extracts n-grams from text +- Tokenizes using pluggable tokenizer service +- Configurable n-gram size (unigrams, bigrams, trigrams, etc.) + +**Secondary Analyzer (`ngram_stats/`):** +- Computes statistics on n-gram outputs +- Depends on primary analyzer outputs +- Generates frequency distributions, top terms, etc. + +**Web Presenter (`ngram_web/`):** +- Visualizes n-gram distributions +- Interactive filtering and exploration +- Uses Shiny framework (modern approach) + +This pattern shows how to build complex analyses from composable pieces. + +### Temporal Analyzer + +The temporal analyzer shows time-series aggregation: + +**Message Volume Over Time:** +Groups messages by time bins, counts per bin. Simple but effective for identifying activity patterns. + +**Temporal Bar Plot Web Presenter:** +Separate web presenter for temporal visualization. Shows how web presenters can be shared across analyzers or analyzer-specific. + +### Common Patterns Across Analyzers + +**1. Defensive Data Loading:** +```python +df = df.filter(pl.col(COL_ID).is_not_null() & pl.col(COL_TIME).is_not_null()) +``` +Always filter out nulls in critical columns. + +**2. Lazy Then Collect:** +```python +df = df.lazy() +# ... transformations ... +df = df.collect() +df.write_parquet(output_path) +``` +Build query lazily, execute once, materialize for output. + +**3. Progress Reporting for Long Operations:** +```python +with ProgressReporter("Processing data") as progress: + # ... work ... 
    progress.update(0.5)  # 50% done
    # ... more work ...
    progress.update(1.0)  # Done
```

**4. Column Constant Usage:**
Define column names as constants in `interface.py`, import in `main.py`:
```python
# interface.py
COL_USER_ID = "user_id"

# main.py
from .interface import COL_USER_ID
df.select(pl.col(COL_USER_ID))  # Type-safe, refactorable
```

**5. Explicit DateTime Handling:**
Check type, convert if needed:
```python
if not isinstance(df.schema[COL_TIME], pl.Datetime):
    df = df.with_columns(pl.col(COL_TIME).str.to_datetime().alias(COL_TIME))
```

## Dependencies & Integrations

### Key Dependencies

| Dependency | Purpose | Why It Matters |
|------------|---------|----------------|
| **polars** | Data processing | Core analysis engine, replacing Pandas |
| **pydantic** | Data validation | Type-safe models, runtime validation |
| **inquirer** | Terminal UI | Interactive prompts and menus |
| **rich** | Terminal formatting | Beautiful console output, progress bars |
| **platformdirs** | Cross-platform paths | User data directory location |
| **tinydb** | JSON database | Lightweight metadata storage |
| **dash** | Web dashboards | Interactive visualizations (legacy) |
| **shiny** | Web dashboards | Modern Python web framework |
| **pyarrow** | Parquet support | Columnar file format backend |
| **xlsxwriter** | Excel export | Writing analysis results to .xlsx |
| **fastexcel** | Excel import | Fast Excel file reading |

### Development Dependencies

- **pytest**: Testing framework
- **black**: Code formatter (enforced)
- **isort**: Import sorter (enforced)
- **pyinstaller**: Packaging for executables

### External Integrations

**None currently.** The tool is self-contained and processes local files. 
+ +**Potential integrations** (based on codebase hints): +- Social media APIs (for data import) +- Cloud storage (for large datasets) + +### Configuration + +**Command-line arguments:** +```bash +python cibmangotree.py --log-level DEBUG # Set logging verbosity +python cibmangotree.py --noop # Test mode, exits immediately +``` + +**No configuration files** - settings stored in TinyDB. + +--- + +## Performance Considerations + +### Memory Management + +**Analyzer memory profile:** +- Input data loaded once via `input_reader.preprocess()` +- Polars uses lazy evaluation → minimal memory until `.collect()` +- Output writes use `sink_parquet()` for streaming (no full materialization) + +**Typical memory footprint:** +- Small datasets (<10k rows): <100 MB +- Medium datasets (10k-1M rows): 100 MB - 1 GB +- Large datasets (>1M rows): 1 GB+ (use lazy evaluation!) + +**Memory optimization techniques:** +1. **Never read entire input eagerly:** Use `pl.scan_parquet()` not `pl.read_parquet()` +2. **Filter early:** Apply filters before expensive operations +3. **Use streaming writes:** `sink_parquet()` and `sink_csv()` don't materialize +4. **Batch processing:** For huge datasets, use `iter_batches()` from PyArrow +5. **Clear intermediate results:** Delete large DataFrames when done + +### Execution Time + +**Benchmark reference (on M1 Mac, 10k rows):** +- CSV import: ~0.1s +- Column mapping + preprocessing: ~0.05s +- Simple aggregation (hashtag count): ~0.2s +- Complex aggregation (Gini coefficient): ~0.5s +- Parquet write: ~0.05s +- **Total typical analysis: <1s for 10k rows** + +**Scaling factors:** +- CSV import: O(n) +- Polars operations: typically O(n) to O(n log n) +- Self-joins (like time_coordination): O(n²) in worst case (use carefully!) +- Group-by operations: O(n log n) + +**Optimization tips:** +1. **Use Polars native operations:** Much faster than Python loops +2. **Avoid row-by-row processing:** Vectorize with Polars expressions +3. 
**Use `.explain()` on lazy frames:** Check if query plan is efficient +4. **Profile with ProgressReporter:** Identify slow sections +5. **Consider parallelization:** For independent operations, Polars uses all cores + +### Storage & I/O + +**File sizes:** +- Parquet compression ratio: typically 3-5x smaller than CSV +- TinyDB size: negligible (<1 MB even with 100s of analyses) +- Web presenter state: varies by analyzer + +**I/O optimization:** +- Parquet read/write is fast (columnar format, optimized for analytics) +- TinyDB uses file locks (slight overhead on concurrent access) +- Export operations can be slow for large datasets (use chunking) + +### Web Presenter Performance + +**Dash/Shiny considerations:** +- Loading 1M+ rows in browser: too slow, pre-aggregate first +- Recommended max rows for interactive tables: ~10k +- Use server-side filtering for large datasets +- Cache expensive computations in `context.state_dir` + +**Pattern for large datasets:** +```python +# Don't load full dataset in web presenter +# Instead, pre-aggregate in secondary analyzer +# and load small aggregated result +``` + +## Open Questions & Uncertainties + +### Areas Needing Clarification + +1. **What determines a "good" CIB detection?** The example analyzers show the pattern, but domain knowledge of what patterns indicate coordinated behavior isn't obvious from code. The hashtag analyzer provides a good example: it uses Gini coefficient to measure inequality in hashtag distribution as a proxy for coordination events. + +2. **Web presenter choice (Dash vs Shiny)?** Both frameworks are supported. Shiny appears to be the more modern choice (note the `shiny` flag in `WebPresenterDeclaration`). Dash is legacy but still functional. + +3. **Tokenizer service future:** The `services/tokenizer/` module exists but isn't heavily used yet. It appears designed for future n-gram or text analysis features. The architecture supports pluggable tokenizers with configurable preprocessing. + +4. 
**React dashboard API:** The `FactoryOutputContext` has an `api` field for REST API output (presumably for React dashboards). This feature doesn't appear to be actively used yet. + +5. **UI component testing:** Tests are co-located with analyzers (e.g., `test_example_base.py`). UI components likely require manual testing or end-to-end tests (not heavily present in current codebase). + +6. **Production deployment strategy:** The codebase supports PyInstaller packaging (`freeze_support()`) but there's no documented deployment process. Likely distributed as standalone executable. + +### Clarifications from Code Exploration + +✅ **`is_development()` function:** Defined in `meta/get_version.py`. Returns `True` if no `VERSION` file exists, indicating development mode. + +✅ **Windows ANSI support:** Imported from `terminal_tools.utils` - enables color output on Windows terminals. + +✅ **Splash function:** Defined in `components/splash.py`. Shows ASCII art logo (three sizes based on terminal width) and a mango tree. Adaptive to terminal size. + +✅ **Typo in `AnalyzerSuite.primary_anlyzers`:** This is indeed a typo (`anlyzers` vs `analyzers`). It's used internally so doesn't affect external API. + +### Questions a New Developer Might Have + +**Q: How do I test my analyzer without going through the full UI?** +- A: Write pytest unit tests like `test_example_base.py`. You can construct a mock `PrimaryAnalyzerContext` and call your `main()` function directly. The testing framework provides `test_primary_analyzer()` helper that handles setup/teardown. Import test data with `CsvTestData`, `PolarsTestData`, etc. + +**Q: What's the performance profile for large datasets (millions of rows)?** +- A: Polars is optimized for large datasets. Typical analysis on 10k rows: <1s. For millions of rows, use lazy evaluation (`scan_parquet`, `lazy()`) religiously. The storage layer uses `sink_parquet()` for streaming writes. Avoid `.collect()` on full datasets. 
Self-joins (like time_coordination) can be O(n²) - use carefully on large data. + +**Q: How do I contribute an analyzer back to the project?** +- A: See `CONTRIBUTING.md`. Fork → feature branch from `develop` → PR targeting `develop`. Set `is_distributed=False` initially, then `True` when ready for production. Follow pre-commit hooks (black, isort). Include tests and sample data. + +**Q: Can analyzers call external services or are they sandboxed?** +- A: No sandboxing - analyzers are regular Python code and can call external services, APIs, etc. Use appropriate error handling. Be mindful of rate limits and network failures. Consider adding parameters for API keys (though storing secrets safely is currently not built into the framework). + +**Q: What happens if an analyzer crashes mid-execution?** +- A: No automatic rollback. Partial outputs may exist on disk (in `primary_outputs/` or `secondary_outputs/`). The analysis is marked as draft (`is_draft=True`) in the database. The UI shows a warning and prevents export/web presenter access. Users can delete and re-run. For debugging, check logs at `{user_data_dir}/logs/mangotango.log`. + +**Q: How do I localize/internationalize the UI?** +- A: Currently not supported - all strings are hardcoded in English. This would be a good contribution opportunity! You'd need to: + 1. Extract strings to translation files (e.g., using `gettext`) + 2. Add language selection in settings + 3. Update all UI components to use translated strings + +**Q: Can I run multiple instances of the app simultaneously?** +- A: Yes, with caveats. File locks on TinyDB prevent corruption. However, you might see lock contention (slowness) when both instances access the database. Each instance can work on different projects concurrently without issues. + +**Q: How do I access data from one analyzer in another?** +- A: Use secondary analyzers with the `depends_on` field. 
Secondary analyzers can access: + - Primary analyzer outputs via `context.base.table(output_id)` + - Other secondary analyzer outputs via `context.dependency(interface).table(output_id)` + - Primary analyzer parameters via `context.base_params` + +**Q: What if my analyzer needs gigabytes of temporary storage?** +- A: Use `context.temp_dir` for temporary files during analysis. This directory is cleaned up after execution. For large intermediate results, consider: + 1. Writing to temp_dir as Parquet (compressed) + 2. Using lazy frames to avoid materialization + 3. Streaming processing with batches + 4. Breaking into multiple secondary analyzers (each gets its own temp_dir) + +**Q: Can I use Pandas instead of Polars?** +- A: Technically yes (Polars DataFrames can convert to/from Pandas), but strongly discouraged. The entire framework is optimized for Polars. You'd lose performance benefits and might hit memory issues. If you must, use `.to_pandas()` and `.from_pandas()` sparingly. + +**Q: How do I debug issues with column mapping?** +- A: Check the column mapping dict stored in `AnalysisModel.column_mapping`. You can inspect it via: + ```python + analysis = context.storage.list_project_analyses(project_id)[0] + print(analysis.column_mapping) + ``` + This shows which user columns map to which analyzer columns. + +**Q: What's the difference between `temp_dir` and `state_dir`?** +- A: + - `temp_dir`: Available in all analyzer contexts, temporary (cleaned up after run), unique per execution + - `state_dir`: Only in web presenter contexts, persistent (survives reruns), unique per project/analyzer/presenter combo + - Use temp_dir for intermediate processing, state_dir for caching in web presenters + +**Q: Can I create a web presenter that works with multiple analyzers?** +- A: Not directly - each web presenter is tied to one primary analyzer via `base_analyzer` field. However, you could: + 1. Create a secondary analyzer that combines outputs from multiple primaries + 2. 
Create a web presenter for that secondary analyzer + 3. Or create separate presenter instances for each analyzer (more common pattern) + +--- + +## Troubleshooting Common Issues + +### Analyzer Won't Show Up + +**Problem:** You created an analyzer but it doesn't appear in the UI. + +**Solutions:** +1. **Check registration:** Did you add it to `analyzers/__init__.py` suite? + ```python + from .my_analyzer import my_analyzer + suite = AnalyzerSuite(all_analyzers=[..., my_analyzer]) + ``` + +2. **Check `is_distributed` flag:** In development mode, all analyzers show. In distributed mode (VERSION file exists), only analyzers with `is_distributed=True` appear. + +3. **Restart the app:** The suite is loaded once at startup. Restart after adding new analyzers. + +4. **Check for Python syntax errors:** Run `python -m py_compile analyzers/my_analyzer/__init__.py` to check for errors. + +### Column Mapping Fails + +**Problem:** User's columns don't map to your analyzer's expected columns. + +**Solutions:** +1. **Improve name hints:** Add more variations users might use + ```python + name_hints=["user", "author", "username", "screen_name", "screen name", "poster"] + ``` + +2. **Check data type compatibility:** Ensure your column's `data_type` can convert from user's data type (see `data_type_compatibility.py`) + +3. **Test with sample data:** Import one of the sample datasets and see which columns auto-match + +4. **Manual mapping:** Users can always override auto-mapping manually in the UI + +### Preprocessing Errors + +**Problem:** `input_reader.preprocess()` raises errors or returns wrong types. + +**Solutions:** +1. **Check for null values:** The preprocessing may fail on columns with many nulls + ```python + # Before preprocess + df = df.filter(pl.col("column").is_not_null()) + ``` + +2. **Verify data types:** Use `df.schema` to check structural types before preprocessing + +3. 
**Look at semantic inference logs:** Check logs for warnings about timezone handling, type conversion failures + +4. **Test semantic inference directly:** + ```python + from preprocessing.series_semantic import infer_series_semantic + semantic = infer_series_semantic(df["column"]) + print(semantic.semantic_name if semantic else "No match") + ``` + +### Polars Performance Issues + +**Problem:** Analysis is very slow or runs out of memory. + +**Solutions:** +1. **Use lazy evaluation:** + ```python + # Before + df = pl.read_parquet(path) # Loads everything + result = df.filter(...).select(...) + + # After + df = pl.scan_parquet(path) # Lazy + result = df.filter(...).select(...).collect() # Execute once + ``` + +2. **Check query plan:** + ```python + print(df.lazy().filter(...).select(...).explain()) + ``` + Look for expensive operations like full scans that could be optimized. + +3. **Filter early, select late:** + ```python + # Good + df.filter(...).select(["col1", "col2"]) + + # Bad + df.select(["col1", "col2", "col3", ...]).filter(...) + ``` + +4. **Avoid unnecessary collects:** + ```python + # Bad - multiple collects + df1 = df.lazy().filter(...).collect() + df2 = df1.lazy().select(...).collect() + + # Good - single collect + df2 = df.lazy().filter(...).select(...).collect() + ``` + +### Web Presenter Not Loading + +**Problem:** Web presenter fails to start or shows blank page. + +**Solutions:** +1. **Check for exceptions in terminal:** Dash/Shiny errors appear in console + +2. **Verify output files exist:** + ```python + import os + print(os.path.exists(context.base.table("output_id").parquet_path)) + ``` + +3. **Test data loading separately:** + ```python + df = pl.read_parquet(context.base.table("output_id").parquet_path) + print(df.head()) + ``` + +4. **Check port conflicts:** Default port 8050 might be in use. Kill other processes or change port. + +5. 
**Clear state directory:** Corrupted cache might cause issues + ```bash + rm -rf {state_dir}/* + ``` + +### TinyDB Lock Timeouts + +**Problem:** "Lock timeout" errors when accessing the database. + +**Solutions:** +1. **Close other app instances:** Only one instance should write at a time + +2. **Check for stale locks:** + ```bash + rm {user_cache_dir}/db.lock + ``` + +3. **Increase lock timeout:** Modify `FileLock` timeout in `storage/__init__.py` (default is usually sufficient) + +### Import Failures + +**Problem:** CSV/Excel import fails or produces wrong results. + +**Solutions:** +1. **Check file encoding:** + ```bash + file -I yourfile.csv + ``` + Non-UTF-8 files need conversion. + +2. **Inspect with Polars directly:** + ```python + import polars as pl + df = pl.read_csv("file.csv", n_rows=10) + print(df) + ``` + +3. **Try manual import configuration:** Use the UI's manual config to specify separator, encoding, etc. + +4. **Check for malformed rows:** Some CSV files have inconsistent column counts + +5. **Use Excel if CSV parsing fails:** Excel import is often more robust + +### Test Failures + +**Problem:** Your analyzer tests fail with cryptic errors. + +**Solutions:** +1. **Ensure you're calling the test function:** + ```python + def test_my_analyzer(): + test_primary_analyzer(...) # Must call the helper + ``` + +2. **Check test data schema matches interface:** + ```python + # Input CSV must have columns matching interface column names + # Output CSV must have columns matching declared output columns + ``` + +3. **Use semantics for non-string types:** + ```python + from preprocessing.series_semantic import identifier, datetime_string + + input=CsvTestData( + path, + semantics={"user_id": identifier, "timestamp": datetime_string} + ) + ``` + +4. **Check for floating point precision issues:** Use approximate comparisons for floats + +5. 
**Run with verbose mode:** + ```bash + pytest -v -s analyzers/my_analyzer/test_my_analyzer.py + ``` + +## Team & Contribution Workflow + +### Git Workflow + +This project uses **Git Flow** with `develop` as the integration branch: + +``` +main (production releases) + ↑ +develop (integration branch) ← TARGET YOUR PRs HERE + ↑ +feature/* or bugfix/* (your work) +``` + +**Important:** Always branch from and PR into `develop`, NOT `main`. + +### Pre-commit Hooks + +Automatically run on commit: +- **isort**: Sorts imports +- **black**: Formats code + +Manual run: +```bash +isort . +black . +``` + +### Code Review Expectations + +From `CONTRIBUTING.md`: +1. Automated CI/CD checks (tests, quality) +2. Manual review by maintainers +3. Approval required before merge +4. PRs must target `develop` branch + +### Commit Message Format + +Use conventional commits: +``` +feat(analyzer): add temporal correlation analyzer + +- Implement sliding window correlation +- Add configurable time windows +- Include statistical significance tests + +Fixes #42 +``` + +Types: `feat`, `fix`, `docs`, `style`, `refactor`, `test`, `chore` + +--- + +## Additional Resources + +- **Technical Documentation:** [https://civictechdc.github.io/mango-tango-cli](https://civictechdc.github.io/mango-tango-cli) +- **Development Guide:** `docs/dev-guide.md` (referenced in README) +- **Contributing Guide:** `CONTRIBUTING.md` (detailed contribution workflow) +- **AI Assistant Context:** + - Claude Code users: See `CLAUDE.md` + Serena MCP integration + - Cursor users: See `.cursorrules` + `.ai-context/` + - Other tools: See `.ai-context/README.md` +- **License:** PolyForm Noncommercial License 1.0.0 (non-commercial use only) +- **Community:** [Civic Tech DC Slack](https://civictechdc.slack.com) + +--- + +## Quick Reference + +### Useful Commands + +```bash +# Run application +python cibmangotree.py + +# Run with debug logging +python cibmangotree.py --log-level DEBUG + +# Run tests +pytest + +# Run specific test 
+pytest analyzers/example/test_example_base.py + +# Format code +black . +isort . + +# Check version +python cibmangotree.py --noop +``` + +### Key Files for New Developers + +| File | Purpose | +|------|---------| +| `cibmangotree.py` | Application entry point | +| `analyzers/example/` | Reference implementation of all analyzer types | +| `analyzer_interface/` | Core framework for building analyzers | +| `CONTRIBUTING.md` | Detailed contribution guide | +| `storage/__init__.py` | Data persistence layer | +| `components/main_menu.py` | UI flow starting point | + +### Common Directories + +| Directory | Contents | +|-----------|----------| +| `~/.local/share/MangoTango/` (Linux) | User data, projects, databases | +| `~/Library/Application Support/MangoTango/` (macOS) | User data, projects, databases | +| `%APPDATA%/MangoTango/` (Windows) | User data, projects, databases | +| `{user_data_dir}/logs/` | Application logs | +| `{user_data_dir}/projects/` | Project data and analysis results | + +--- + +**Next Steps:** +1. Run the application and import sample data from `sample_data/` +2. Explore the example analyzer in `analyzers/example/` +3. Read through the hashtag analyzer for a real-world example +4. Try creating a simple analyzer following the pattern +5. Check `CONTRIBUTING.md` for contribution guidelines +6. Join the Civic Tech DC Slack for community support + +**Questions?** Open an issue or reach out via the Civic Tech DC Slack workspace. + +--- + +## Summary: What Makes This Codebase Unique + +After deep analysis, here are the standout characteristics that define Mango Tango CLI: + +### Architectural Strengths + +1. **Declarative Analyzer Interface**: The separation of interface declaration (`AnalyzerInterface`) from implementation (`main()`) is brilliant. Analyzers declare what they need (inputs, outputs, params) separately from how they compute. 
This enables: + - Automatic UI generation for column mapping + - Type validation before execution + - Self-documenting analyzer capabilities + - Easy composition (secondary analyzers, web presenters) + +2. **Sophisticated Column Mapping**: The three-layer column mapping system is more advanced than typical data tools: + - **Layer 1**: Name hint matching (fuzzy, word-based) + - **Layer 2**: Data type compatibility scoring + - **Layer 3**: Semantic type inference with sampling + + This handles diverse social media data formats elegantly. + +3. **Polars-First Architecture**: Unlike most Python data tools stuck on Pandas, this aggressively optimizes for Polars: + - Lazy evaluation throughout + - Streaming writes via `sink_parquet()` + - Platform-native performance + - Handles datasets larger than RAM + +4. **Context-Based Dependency Injection**: The nested context pattern (`ViewContext` → `AppContext` → `ProjectContext` → `AnalysisContext`) provides type-safe dependency passing without global state or singletons. Clean and testable. + +### Design Patterns Worth Studying + +1. **Topological Sort for Dependencies**: Secondary analyzers form a DAG, resolved at runtime with depth-first search. Elegant solution to dependency ordering. + +2. **Multiprocessing Progress Reporting**: The `ProgressReporter` spawns a separate process for UI updates, keeping the main thread focused on computation. Shows careful attention to UX. + +3. **Storage Abstraction**: The `Storage` class abstracts TinyDB + Parquet + filesystem into a clean interface. File locking prevents corruption, platform-aware paths respect OS conventions. + +4. **Semantic Type System**: The `SeriesSemantic` framework in `preprocessing/` is a mini type system for data validation and conversion. Extensible and well-designed. + +### Development Philosophy + +This codebase demonstrates several strong principles: + +- **Pragmatism over Purity**: Uses Pydantic for validation, but doesn't force everything into models. 
Balances type safety with flexibility. +- **Progressive Enhancement**: V1 analyzer migration shows commitment to backward compatibility. Development mode (`is_distributed=False`) lets you test before releasing. +- **Performance as Default**: Lazy evaluation, streaming I/O, cached properties everywhere. Performance is built-in, not bolted-on. +- **Testability**: Context objects, test helpers, co-located tests. Easy to test analyzers without UI. +- **Incremental Complexity**: Simple analyzers (example) → moderate (hashtags) → complex (ngrams multi-module). Good learning progression. + +### Gotchas to Remember + +The three most important things to internalize: + +1. **Always preprocess**: `input_reader.preprocess()` is mandatory. Skip it and everything breaks. +2. **Lazy then collect**: Build Polars queries lazily, execute once. Multiple collects = performance death. +3. **Register in suite**: New analyzers won't appear until added to `analyzers/__init__.py`. + +### Where to Start Contributing + +Best entry points for new contributors: + +1. **Easy**: Add name hints to existing analyzers (improves column mapping) +2. **Moderate**: Create a new simple analyzer (template pattern, n-gram, sentiment) +3. **Advanced**: Improve semantic type inference (add new `SeriesSemantic` types) +4. **Expert**: Add new parameter types (extends framework capabilities) + +### Final Thoughts + +This is a well-architected codebase that successfully balances: +- Academic rigor (CIB detection) with practical usability (terminal UI) +- Performance (Polars, lazy evaluation) with developer experience (rich contexts, testing) +- Flexibility (plugin analyzers) with structure (declarative interfaces) +- Innovation (semantic type inference) with pragmatism (backward compatibility) + +The codebase shows signs of thoughtful refactoring over time (note the V1 migration code, the typo in `primary_anlyzers` that persists, the dual Dash/Shiny support). It's actively evolving but maintains stability. 
+ +**For developers**: Study the example analyzer thoroughly. The patterns there (constants in interface, ProgressReporter, lazy evaluation) are the "house style" you should emulate. + +**For analysts**: The tool abstracts complexity well. Focus on the domain logic (what is CIB?) and let the framework handle the infrastructure. + +**For contributors**: Read `CONTRIBUTING.md`, join the Slack, and start with a small PR. The community is welcoming to newcomers. + +--- + +**This guide was generated through deep codebase analysis on October 7, 2025. It reflects the state of the codebase at that time. For the latest updates, always check the official documentation and CONTRIBUTING.md.** diff --git a/reorg-plan.md b/reorg-plan.md new file mode 100644 index 00000000..f7352b4b --- /dev/null +++ b/reorg-plan.md @@ -0,0 +1,1975 @@ +# CIB Mango Tree CLI - Monorepo Reorganization Plan + +**Status**: Ready for Implementation +**Date**: 2025-10-09 +**Goal**: Transform current flat structure into modern Python monorepo with plugin architecture + +--- + +## Table of Contents + +- [Overview](#overview) +- [Proposed Structure](#proposed-structure) +- [Package Organization](#package-organization) +- [Plugin Architecture](#plugin-architecture) +- [Configuration Strategy](#configuration-strategy) +- [Import Path Migration](#import-path-migration) +- [PyInstaller Compatibility](#pyinstaller-compatibility) +- [Migration Steps](#migration-steps) +- [Testing Strategy](#testing-strategy) +- [Risk Mitigation](#risk-mitigation) +- [Success Criteria](#success-criteria) + +--- + +## Overview + +### Goals + +1. **Modularization**: Organize code into logical packages with clear boundaries +2. **Plugin System**: Enable external analyzer/tokenizer contributions without core changes +3. **Modern Tooling**: Adopt `uv` for fast, reliable dependency management +4. **Clean Architecture**: Separate concerns (core, ui, services, plugins) +5. 
**Maintainability**: Improve contributor experience and code navigation +6. **PyInstaller Compatible**: Maintain binary build support for releases + +### Key Changes + +- **Directory Structure**: Move to `packages/` with plugin architecture +- **Build System**: Modern `pyproject.toml` with workspace configuration +- **Package Manager**: Migrate from `pip` + `requirements.txt` to `uv` workspace +- **Plugin Discovery**: Hybrid system (entry points + registry for frozen builds) +- **UI Organization**: Consolidate terminal UI under `tui/`, prepare for `gui/` (NiceGUI) +- **Simplified Naming**: Analyzer subdirectories use `base/`, `stats/`, `web/` + +### Design Constraints + +- **PyInstaller**: Must work in frozen executable builds +- **Volunteer-Friendly**: Clear structure for contributors of all skill levels +- **Backward Compatible**: Existing data and workflows must continue working + +--- + +## Proposed Structure + +```bash +cibmangotree/ +├── pyproject.toml # Root workspace config (centralized) +├── uv.lock # Unified dependency lock +├── README.md +├── CONTRIBUTING.md +├── LICENSE +├── bootstrap.sh # Updated to use `uv sync` +├── cibmangotree.py # Backward-compat stub for PyInstaller +├── pyinstaller.spec # Updated build spec +├── .gitignore +├── .github/workflows/ +├── docs/ +├── sample_data/ +└── packages/ + │ + ├── core/ # Core application & framework + │ ├── pyproject.toml + │ ├── tests/ + │ └── src/ + │ └── cibmangotree/ # Package name defines import path + │ ├── __init__.py + │ ├── __main__.py # Entry point + │ ├── _frozen_plugins.py # Auto-generated (pyinstaller.spec) + │ │ + │ ├── app/ # Main application + │ │ ├── __init__.py + │ │ ├── app.py + │ │ ├── logger.py + │ │ ├── app_context.py + │ │ ├── project_context.py + │ │ ├── analysis_context.py + │ │ ├── analysis_output_context.py + │ │ ├── analysis_webserver_context.py + │ │ ├── settings_context.py + │ │ ├── shiny.py + │ │ └── utils.py + │ │ + │ ├── analyzer_interface/ # Analyzer framework + │ │ ├── 
__init__.py + │ │ ├── column_automap.py + │ │ ├── context.py + │ │ ├── data_type_compatibility.py + │ │ ├── declaration.py + │ │ ├── interface.py + │ │ ├── params.py + │ │ └── suite.py + │ │ + │ ├── tui/ # Terminal User Interface + │ │ ├── __init__.py + │ │ │ + │ │ ├── components/ # Was: components/ + │ │ │ ├── __init__.py + │ │ │ ├── main_menu.py + │ │ │ ├── analysis_main.py + │ │ │ ├── analysis_params.py + │ │ │ ├── analysis_web_server.py + │ │ │ ├── context.py + │ │ │ ├── export_outputs.py + │ │ │ ├── new_analysis.py + │ │ │ ├── new_project.py + │ │ │ ├── project_main.py + │ │ │ ├── select_analysis.py + │ │ │ ├── select_project.py + │ │ │ └── splash.py + │ │ │ + │ │ └── tools/ # Was: terminal_tools/ + │ │ ├── __init__.py + │ │ ├── inception.py + │ │ ├── progress.py + │ │ ├── prompts.py + │ │ └── utils.py + │ │ + │ ├── gui/ # Future: NiceGUI interface + │ │ └── __init__.py # Placeholder + │ │ + │ ├── services/ # Core services + │ │ ├── __init__.py + │ │ │ + │ │ ├── storage/ # Was: storage/ + │ │ │ ├── __init__.py + │ │ │ └── file_selector.py + │ │ │ + │ │ ├── importing/ # Was: importing/ + │ │ │ ├── __init__.py + │ │ │ ├── importer.py + │ │ │ ├── csv.py + │ │ │ └── excel.py + │ │ │ + │ │ ├── preprocessing/ # Was: preprocessing/ + │ │ │ ├── __init__.py + │ │ │ └── series_semantic.py + │ │ │ + │ │ └── tokenizer/ # Abstract interfaces only + │ │ ├── __init__.py + │ │ ├── types.py + │ │ └── base.py + │ │ + │ ├── context/ # Context objects + │ │ └── __init__.py + │ │ + │ ├── meta/ # Version & metadata + │ │ ├── __init__.py + │ │ └── get_version.py + │ │ + │ └── plugin_system/ # Plugin discovery + │ ├── __init__.py + │ └── discovery.py + │ + ├── tokenizers/ + │ └── basic/ # Plugin: basic tokenizer + │ ├── pyproject.toml + │ ├── tests/ + │ │ └── test_basic_tokenizer.py + │ └── src/ + │ └── cibmangotree_tokenizer_basic/ + │ ├── __init__.py + │ ├── tokenizer.py + │ └── patterns.py + │ + ├── analyzers/ + │ ├── example/ # Plugin: example analyzer + │ │ ├── pyproject.toml + 
│ │ ├── tests/ + │ │ │ ├── test_data/ + │ │ │ ├── test_example_base.py + │ │ │ └── test_example_report.py + │ │ └── src/ + │ │ └── cibmangotree_analyzer_example/ + │ │ ├── __init__.py + │ │ │ + │ │ ├── base/ # Was: example_base/ + │ │ │ ├── __init__.py + │ │ │ ├── interface.py + │ │ │ ├── main.py + │ │ │ └── default_params.py + │ │ │ + │ │ ├── report/ # Was: example_report/ + │ │ │ ├── __init__.py + │ │ │ ├── interface.py + │ │ │ └── main.py + │ │ │ + │ │ └── web/ # Was: example_web/ + │ │ ├── __init__.py + │ │ ├── interface.py + │ │ └── factory.py + │ │ + │ ├── hashtags/ # Plugin: hashtags analyzer + │ │ ├── pyproject.toml + │ │ ├── tests/ + │ │ │ ├── test_data/ + │ │ │ └── test_hashtags_base.py + │ │ └── src/ + │ │ └── cibmangotree_analyzer_hashtags/ + │ │ ├── __init__.py + │ │ │ + │ │ ├── base/ # Was: hashtags_base/ + │ │ │ ├── __init__.py + │ │ │ ├── interface.py + │ │ │ └── main.py + │ │ │ + │ │ └── web/ # Was: hashtags_web/ + │ │ ├── __init__.py + │ │ ├── interface.py + │ │ ├── factory.py + │ │ ├── app.py + │ │ ├── analysis.py + │ │ └── plots.py + │ │ + │ ├── ngrams/ # Plugin: n-grams analyzer + │ │ ├── pyproject.toml + │ │ ├── tests/ + │ │ │ ├── test_data/ + │ │ │ ├── test_ngrams_base.py + │ │ │ └── test_ngram_stats.py + │ │ └── src/ + │ │ └── cibmangotree_analyzer_ngrams/ + │ │ ├── __init__.py + │ │ │ + │ │ ├── base/ # Was: ngrams_base/ + │ │ │ ├── __init__.py + │ │ │ ├── interface.py + │ │ │ └── main.py + │ │ │ + │ │ ├── stats/ # Was: ngram_stats/ + │ │ │ ├── __init__.py + │ │ │ ├── interface.py + │ │ │ └── main.py + │ │ │ + │ │ └── web/ # Was: ngram_web/ + │ │ ├── __init__.py + │ │ ├── interface.py + │ │ ├── factory.py + │ │ └── app.py + │ │ + │ ├── temporal/ # Plugin: temporal analyzer + │ │ ├── pyproject.toml + │ │ ├── tests/ + │ │ └── src/ + │ │ └── cibmangotree_analyzer_temporal/ + │ │ ├── __init__.py + │ │ │ + │ │ ├── base/ # Was: temporal_base/ + │ │ │ ├── __init__.py + │ │ │ ├── interface.py + │ │ │ └── main.py + │ │ │ + │ │ └── web/ # Was: 
temporal_web/ + │ │ ├── __init__.py + │ │ └── interface.py + │ │ + │ └── time_coordination/ # Plugin: time coordination + │ ├── pyproject.toml + │ ├── tests/ + │ └── src/ + │ └── cibmangotree_analyzer_time_coordination/ + │ ├── __init__.py + │ ├── interface.py + │ └── main.py + │ + └── testing/ # Test utilities + ├── pyproject.toml + ├── tests/ + └── src/ + └── cibmangotree_testing/ + ├── __init__.py + ├── comparers.py + ├── context.py + ├── testdata.py + └── testers.py +``` + +--- + +## Package Organization + +### Package Count: ~10 Packages + +1. **core** - Framework, app, UI, services +2. **tokenizers/basic** - Basic tokenizer implementation +3. **analyzers/example** - Example analyzer for contributors +4. **analyzers/hashtags** - Hashtag analysis +5. **analyzers/ngrams** - N-gram analysis +6. **analyzers/temporal** - Temporal pattern analysis +7. **analyzers/time_coordination** - Time coordination detection +8. **testing** - Test utilities + +### Package Dependency Graph + +```text +cibmangotree (core) + ↓ +├── cibmangotree_tokenizer_basic +├── cibmangotree_testing + ↓ +└── cibmangotree_analyzer_* (all analyzers) + ├── example + ├── hashtags + ├── ngrams (also depends on tokenizer_basic) + ├── temporal + └── time_coordination +``` + +--- + +## Plugin Architecture + +### Design: Hybrid Discovery System + +**Challenge**: Entry points don't work in PyInstaller frozen builds +**Solution**: Hybrid system that works in both development and frozen modes, with **dynamic generation** at build time + +### Implementation + +#### 1. 
Plugin Registry + +```python +# cibmangotree/plugin_system/discovery.py + +import sys +import importlib.metadata +from typing import List +from cibmangotree.analyzer_interface import AnalyzerDeclaration + +class AnalyzerRegistry: + """Central registry that works in both frozen and installed modes.""" + _analyzers: List[AnalyzerDeclaration] = [] + + @classmethod + def register(cls, analyzer: AnalyzerDeclaration) -> AnalyzerDeclaration: + """Register an analyzer (used in frozen builds).""" + cls._analyzers.append(analyzer) + return analyzer + + @classmethod + def discover(cls) -> List[AnalyzerDeclaration]: + """Discover analyzers - works in both modes.""" + if getattr(sys, 'frozen', False): + # Frozen (PyInstaller): use explicit registry + return cls._analyzers + else: + # Installed: auto-discover via entry points + from cibmangotree.app.logger import get_logger + logger = get_logger(__name__) + + analyzers = [] + for ep in importlib.metadata.entry_points(group='cibmangotree.analyzers'): + try: + analyzer = ep.load() + analyzers.append(analyzer) + except Exception as e: + logger.warning(f"Failed to load analyzer {ep.name}: {e}") + return analyzers +``` + +#### 2. Dynamic Frozen Plugin Generation (Build-Time) + +The `pyinstaller.spec` file automatically generates `_frozen_plugins.py` based on installed packages with entry points. **No manual maintenance required!** + +The spec file: + +1. Discovers all plugins via entry points at build time +2. Auto-generates `_frozen_plugins.py` with appropriate imports +3. Auto-generates `hiddenimports` list for PyInstaller +4. 
Prints build report showing what's being bundled + +```python +# Excerpt from pyinstaller.spec (see full version in PyInstaller Compatibility section) + +def discover_plugins(group): + """Discover all plugins for a given entry point group.""" + plugins = [] + for ep in importlib.metadata.entry_points(group=group): + module_path, attr_name = ep.value.split(':') + package_name = module_path.split('.')[0] + plugins.append({ + 'name': ep.name, + 'module': module_path, + 'attr': attr_name, + 'package': package_name, + }) + return plugins + +# Discover plugins at build time +analyzers = discover_plugins('cibmangotree.analyzers') +tokenizers = discover_plugins('cibmangotree.tokenizers') + +# Generate _frozen_plugins.py automatically +frozen_plugins_path = generate_frozen_plugins( + analyzers, tokenizers, + 'packages/core/src/cibmangotree/_frozen_plugins.py' +) + +# Generate hiddenimports automatically +plugin_hiddenimports = get_plugin_hiddenimports(analyzers + tokenizers) +``` + +**Auto-generated `_frozen_plugins.py` example:** + +```python +""" +Auto-generated frozen plugin loader for PyInstaller. +Generated during build - DO NOT EDIT MANUALLY. + +This file is automatically generated by pyinstaller.spec based on +installed packages with cibmangotree plugin entry points. +""" + +from cibmangotree.plugin_system.discovery import AnalyzerRegistry + +# Import all bundled analyzers +from cibmangotree_analyzer_hashtags.base import hashtags +from cibmangotree_analyzer_hashtags.web import hashtags_web +from cibmangotree_analyzer_ngrams.base import ngrams +from cibmangotree_analyzer_ngrams.stats import ngram_stats +from cibmangotree_analyzer_ngrams.web import ngrams_web +# ... etc + +# Register all analyzers +_analyzers = [ + hashtags, # hashtags + hashtags_web, # hashtags_web + ngrams, # ngrams + # ... etc +] + +for analyzer in _analyzers: + AnalyzerRegistry.register(analyzer) +``` + +#### 3. 
Application Startup + +```python +# cibmangotree/__main__.py + +import sys +from cibmangotree.plugin_system.discovery import AnalyzerRegistry +from cibmangotree.analyzer_interface import AnalyzerSuite + +def main(): + # Load frozen plugins if running as executable + if getattr(sys, 'frozen', False): + import cibmangotree._frozen_plugins + + # Discover analyzers (uses registry in frozen mode, entry points otherwise) + analyzers = AnalyzerRegistry.discover() + suite = AnalyzerSuite(all_analyzers=analyzers) + + # ... rest of application initialization +``` + +### Benefits + +✅ **Zero Maintenance** - Automatically discovers and bundles all installed plugins +✅ **No Hardcoding** - Entry points are single source of truth +✅ **Development Mode** - Auto-discovery via entry points, install only what you need +✅ **Frozen Mode** - Auto-generated imports, PyInstaller bundles correctly +✅ **External Plugins** - Contributors can create separate packages +✅ **Selective Bundling** - Only bundles analyzers installed during build +✅ **Build Reports** - Shows exactly what's being bundled + +### Adding New Plugins + +**Developer workflow:** + +```bash +# 1. Create analyzer package with entry points in pyproject.toml +# 2. Install it in workspace +uv sync + +# 3. Build - automatically discovered and bundled! +uv run pyinstaller pyinstaller.spec +``` + +**No changes to spec file or frozen plugins needed!** Everything is discovered and generated automatically at build time. + +--- + +## Configuration Strategy + +### Centralized Configuration (Root `pyproject.toml`) + +All tool configurations, version constraints, and dev dependencies defined once at root. 
+ +```toml +[project] +name = "cibmangotree-workspace" +version = "0.1.0" +requires-python = ">=3.12" +description = "CIB Mango Tree CLI - Social Media Data Analysis Tool" + +[tool.uv.workspace] +members = [ + "packages/core", + "packages/testing", + "packages/tokenizers/basic", + "packages/analyzers/example", + "packages/analyzers/hashtags", + "packages/analyzers/ngrams", + "packages/analyzers/temporal", + "packages/analyzers/time_coordination", +] + +# Centralized version constraints - all packages inherit these +[tool.uv.workspace.dependencies] +# Data processing +polars = ">=1.9.0" +pandas = ">=2.2.3" +pyarrow = ">=17.0.0" + +# Models & validation +pydantic = ">=2.9.1" + +# Storage +tinydb = ">=4.8.0" +platformdirs = ">=4.3.6" +filelock = ">=3.16.1" + +# Terminal UI +inquirer = ">=3.4.0" +rich = ">=14.0.0" +colorama = ">=0.4.6" + +# Web frameworks +dash = ">=2.18.1" +plotly = ">=5.24.1" +shiny = ">=1.4.0" +shinywidgets = ">=0.6.2" +starlette = ">=0.47.1" +uvicorn = ">=0.34.3" + +# Import/Export +xlsxwriter = ">=3.2.0" +fastexcel = ">=0.13.0" + +# Text processing +regex = ">=2025.9.1" + +# Utilities +python-json-logger = ">=2.0.7" +a2wsgi = ">=1.10.10" + +# Development tools +[tool.uv] +dev-dependencies = [ + "black>=24.10.0", + "isort>=5.13.2", + "pytest>=8.3.4", + "pytest-benchmark>=5.1.0", + "pyinstaller>=6.14.1", + "pyarrow-stubs>=17.13", +] + +# Tool configurations - inherited by all packages +[tool.black] +line-length = 88 +target-version = ["py312"] + +[tool.isort] +profile = "black" + +[tool.pytest.ini_options] +pythonpath = ["."] +testpaths = ["packages"] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" +``` + +### Package-Specific Configuration (Minimal) + +Each package only defines: name, version, description, dependencies, and entry points. 
+ +#### Core Package + +```toml +# packages/core/pyproject.toml + +[project] +name = "cibmangotree" +version = "0.1.0" +description = "CIB Mango Tree CLI - Social Media Analysis Tool" +requires-python = ">=3.12" +dependencies = [ + # Data + "polars", + "pandas", + "pyarrow", + + # Models + "pydantic", + "platformdirs", + + # Storage + "tinydb", + "filelock", + + # Terminal UI + "inquirer", + "rich", + "colorama", + + # Web frameworks + "dash", + "plotly", + "shiny", + "shinywidgets", + "starlette", + "uvicorn", + + # Import/Export + "xlsxwriter", + "fastexcel", + + # Utils + "python-json-logger", + "regex", + "a2wsgi", +] + +[project.scripts] +cibmangotree = "cibmangotree.__main__:main" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" +``` + +#### Tokenizer Plugin + +```toml +# packages/tokenizers/basic/pyproject.toml + +[project] +name = "cibmangotree-tokenizer-basic" +version = "0.1.0" +description = "Basic tokenizer implementation" +requires-python = ">=3.12" +dependencies = [ + "cibmangotree", + "regex", +] + +# Plugin entry points - auto-discovered by core in dev mode +[project.entry-points."cibmangotree.tokenizers"] +basic = "cibmangotree_tokenizer_basic:BasicTokenizer" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" +``` + +#### Analyzer Plugin (with Entry Points) + +```toml +# packages/analyzers/hashtags/pyproject.toml + +[project] +name = "cibmangotree-analyzer-hashtags" +version = "0.1.0" +description = "Hashtag analysis for CIB Mango Tree" +requires-python = ">=3.12" +dependencies = [ + "cibmangotree", + "cibmangotree-testing", + "polars", +] + +# Plugin entry points - auto-discovered by core in dev mode +[project.entry-points."cibmangotree.analyzers"] +hashtags = "cibmangotree_analyzer_hashtags.base:hashtags" +hashtags_web = "cibmangotree_analyzer_hashtags.web:hashtags_web" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" +``` + +#### Testing Utilities + +```toml 
+# packages/testing/pyproject.toml + +[project] +name = "cibmangotree-testing" +version = "0.1.0" +description = "Testing utilities for CIB Mango Tree" +requires-python = ">=3.12" +dependencies = [ + "cibmangotree", + "polars", + "pytest", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" +``` + +--- + +## Import Path Migration + +### Core Package Imports + +**Before:** + +```python +from app import App +from app.logger import get_logger +from analyzer_interface import AnalyzerInterface, AnalyzerSuite +from analyzer_interface.context import PrimaryAnalyzerContext +from context import AnalysisContext +from meta import get_version +``` + +**After:** + +```python +from cibmangotree.app import App +from cibmangotree.app.logger import get_logger +from cibmangotree.analyzer_interface import AnalyzerInterface, AnalyzerSuite +from cibmangotree.analyzer_interface.context import PrimaryAnalyzerContext +from cibmangotree.context import AnalysisContext +from cibmangotree.meta import get_version +``` + +### Service Imports + +**Before:** + +```python +from storage import Storage +from services.tokenizer.core import AbstractTokenizer +from services.tokenizer.basic import BasicTokenizer +from preprocessing import series_semantic +from preprocessing.series_semantic import infer_series_semantic +from importing import ImporterSession +``` + +**After:** + +```python +from cibmangotree.services.storage import Storage +from cibmangotree.services.tokenizer.core import AbstractTokenizer +from cibmangotree_tokenizer_basic import BasicTokenizer +from cibmangotree.services.preprocessing import series_semantic +from cibmangotree.services.preprocessing.series_semantic import infer_series_semantic +from cibmangotree.services.importing import ImporterSession +``` + +### UI Imports + +**Before:** + +```python +from components import main_menu, splash +from components.main_menu import main_menu +from terminal_tools import ProgressReporter +from 
terminal_tools.inception import TerminalContext +``` + +**After:** + +```python +from cibmangotree.tui.components import main_menu, splash +from cibmangotree.tui.components.main_menu import main_menu +from cibmangotree.tui.tools import ProgressReporter +from cibmangotree.tui.tools.inception import TerminalContext +``` + +### Testing Imports + +**Before:** + +```python +from testing import test_primary_analyzer, CsvTestData +from testing.testdata import PolarsTestData +from testing.comparers import compare_dfs +``` + +**After:** + +```python +from cibmangotree_testing import test_primary_analyzer, CsvTestData +from cibmangotree_testing.testdata import PolarsTestData +from cibmangotree_testing.comparers import compare_dfs +``` + +### Analyzer Internal Imports (Simplified Names) + +**Before (inside `analyzers/hashtags/`):** + +```python +from .hashtags_base import hashtags +from .hashtags_web import hashtags_web +from .hashtags_base.interface import COL_TEXT, COL_TIMESTAMP +``` + +**After (inside `packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/`):** + +```python +from .base import hashtags +from .web import hashtags_web +from .base.interface import COL_TEXT, COL_TIMESTAMP +``` + +**Before (inside `analyzers/ngrams/`):** + +```python +from .ngrams_base import ngrams +from .ngram_stats import ngram_stats +from .ngram_web import ngrams_web +``` + +**After (inside `packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/`):** + +```python +from .base import ngrams +from .stats import ngram_stats +from .web import ngrams_web +``` + +--- + +## PyInstaller Compatibility + +### Challenge + +PyInstaller bundles Python code into a single executable. 
Standard plugin discovery mechanisms (entry points via `importlib.metadata`) don't work because: + +- No package metadata available at runtime +- No `site-packages` directory +- Entry points aren't accessible + +### Solution: Hybrid Discovery + Dynamic Generation + +The spec file dynamically discovers all plugins and generates the frozen plugin loader at build time. + +#### 1. Dynamic PyInstaller Spec + +```python +# pyinstaller.spec + +from PyInstaller.utils.hooks import copy_metadata +from PyInstaller.building.api import EXE, PYZ +from PyInstaller.building.build_main import Analysis +import sys +import os +import site +import importlib.metadata +from pathlib import Path + +site_packages_path = None +block_cipher = None + +for site_path in site.getsitepackages(): + if 'site-packages' in site_path: + site_packages_path = site_path + break + +if site_packages_path is None: + raise RuntimeError("site-packages directory not found") + + +# ============================================================================ +# DYNAMIC PLUGIN DISCOVERY +# ============================================================================ + +def discover_plugins(group): + """ + Discover all plugins for a given entry point group. + Returns list of dicts with plugin metadata. + """ + plugins = [] + try: + for ep in importlib.metadata.entry_points(group=group): + module_path, attr_name = ep.value.split(':') + package_name = module_path.split('.')[0] + + plugins.append({ + 'name': ep.name, + 'module': module_path, + 'attr': attr_name, + 'package': package_name, + 'value': ep.value, + }) + except Exception as e: + print(f"Warning: Failed to discover plugins for {group}: {e}") + + return plugins + + +def generate_frozen_plugins(analyzers, tokenizers, output_path): + """ + Generate the frozen plugins loader file dynamically. + This file imports and registers all plugins. 
+ """ + lines = [ + '"""', + 'Auto-generated frozen plugin loader for PyInstaller.', + 'Generated during build - DO NOT EDIT MANUALLY.', + '', + 'This file is automatically generated by pyinstaller.spec based on', + 'installed packages with cibmangotree plugin entry points.', + '"""', + '', + 'from cibmangotree.plugin_system.discovery import AnalyzerRegistry', + '', + ] + + # Import analyzers + if analyzers: + lines.append('# Import all bundled analyzers') + for plugin in analyzers: + lines.append(f"from {plugin['module']} import {plugin['attr']}") + lines.append('') + + # Import tokenizers (if we add tokenizer registry later) + if tokenizers: + lines.append('# Import all bundled tokenizers') + for plugin in tokenizers: + lines.append(f"from {plugin['module']} import {plugin['attr']}") + lines.append('') + + # Register analyzers + if analyzers: + lines.append('# Register all analyzers') + lines.append('_analyzers = [') + for plugin in analyzers: + lines.append(f" {plugin['attr']}, # {plugin['name']}") + lines.append(']') + lines.append('') + lines.append('for analyzer in _analyzers:') + lines.append(' AnalyzerRegistry.register(analyzer)') + lines.append('') + + # Write file + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, 'w') as f: + f.write('\n'.join(lines)) + + print(f"Generated frozen plugins loader: {output_path}") + print(f" - {len(analyzers)} analyzers") + print(f" - {len(tokenizers)} tokenizers") + + return output_path + + +def get_plugin_hiddenimports(plugins): + """ + Generate hiddenimports list for PyInstaller from plugin metadata. 
+ """ + imports = [] + for plugin in plugins: + # Add main package + imports.append(plugin['package']) + # Add specific module + imports.append(plugin['module']) + + # Add common submodules for analyzers + if 'analyzer' in plugin['package']: + base_pkg = plugin['package'] + # Try to add common submodules + for submodule in ['base', 'stats', 'web', 'report']: + imports.append(f"{base_pkg}.{submodule}") + + return imports + + +# Discover all plugins from installed packages +print("Discovering plugins...") +analyzers = discover_plugins('cibmangotree.analyzers') +tokenizers = discover_plugins('cibmangotree.tokenizers') + +print(f"Found {len(analyzers)} analyzer(s):") +for a in analyzers: + print(f" - {a['name']}: {a['value']}") + +print(f"Found {len(tokenizers)} tokenizer(s):") +for t in tokenizers: + print(f" - {t['name']}: {t['value']}") + +# Generate frozen plugins file +frozen_plugins_path = generate_frozen_plugins( + analyzers, + tokenizers, + 'packages/core/src/cibmangotree/_frozen_plugins.py' +) + +# Generate hiddenimports +plugin_hiddenimports = [] +plugin_hiddenimports.extend(get_plugin_hiddenimports(analyzers)) +plugin_hiddenimports.extend(get_plugin_hiddenimports(tokenizers)) + +print(f"\nGenerated {len(plugin_hiddenimports)} hidden imports") + +# ============================================================================ +# PYINSTALLER CONFIGURATION +# ============================================================================ + +a = Analysis( + ['cibmangotree.py'], + pathex=['packages/core/src'], + binaries=[], + datas=[ + # Version file + *( + [('./VERSION', '.')] + if os.path.exists('VERSION') else [] + ), + + # Metadata + *copy_metadata('readchar'), + + # Static assets + (os.path.join(site_packages_path, 'shiny/www'), 'shiny/www'), + (os.path.join(site_packages_path, 'shinywidgets/static'), 'shinywidgets/static'), + + # App assets + ('packages/core/src/cibmangotree/app/web_static', 'cibmangotree/app/web_static'), + 
('packages/core/src/cibmangotree/app/web_templates', 'cibmangotree/app/web_templates'), + ], + hiddenimports=[ + # Standard hidden imports + 'readchar', + 'numpy', + 'numpy.core.multiarray', + 'shiny', + 'shiny.ui', + 'shiny.server', + 'htmltools', + 'starlette', + 'uvicorn', + 'uvicorn.logging', + 'uvicorn.loops', + 'uvicorn.loops.auto', + 'uvicorn.protocols', + 'uvicorn.protocols.http', + 'uvicorn.protocols.http.auto', + 'uvicorn.protocols.websockets', + 'uvicorn.protocols.websockets.auto', + 'uvicorn.lifespan', + 'uvicorn.lifespan.on', + 'asyncio', + 'websockets', + 'websockets.legacy', + 'websockets.legacy.server', + 'polars', + 'plotly', + 'linkify_it', + 'markdown_it', + 'mdit_py_plugins', + 'mdurl', + 'uc_micro', + 'pythonjsonlogger', + 'pythonjsonlogger.jsonlogger', + + # Core package + 'cibmangotree', + 'cibmangotree.app', + 'cibmangotree.analyzer_interface', + 'cibmangotree.tui.components', + 'cibmangotree.tui.tools', + 'cibmangotree.services.storage', + 'cibmangotree.services.importing', + 'cibmangotree.services.preprocessing', + 'cibmangotree.plugin_system', + + # Frozen plugin loader (auto-generated) + 'cibmangotree._frozen_plugins', + + # Testing utilities (if bundled) + 'cibmangotree_testing', + + # DYNAMICALLY DISCOVERED PLUGINS + *plugin_hiddenimports, + ], + hookspath=[], + runtime_hooks=[], + excludes=[], +) + +pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher) + +if sys.platform == "darwin": + exe = EXE( + pyz, + a.scripts, + a.binaries, + a.zipfiles, + a.datas, + name='cibmangotree', + debug=False, + strip=True, + upx=True, + console=True, + entitlements_file="./mango.entitlements", + codesign_identity=os.getenv('APPLE_APP_CERT_ID'), + ) +else: + exe = EXE( + pyz, + a.scripts, + a.binaries, + a.zipfiles, + a.datas, + name='cibmangotree', + debug=False, + strip=False, + upx=True, + console=True, + ) +``` + +#### 2. 
Backward Compatibility Stub + +Keep root-level `cibmangotree.py` for PyInstaller entry point: + +```python +# cibmangotree.py +""" +Entry point stub for backward compatibility with PyInstaller. +""" + +from cibmangotree.__main__ import main + +if __name__ == "__main__": + main() +``` + +--- + +## Migration Steps + +### Phase 1: Setup Monorepo Structure + +**Tasks:** + +1. Create `packages/` directory at root +2. Create root `pyproject.toml` with workspace configuration: + - Define workspace members + - Add centralized dependency versions + - Configure tools (black, isort, pytest) + - Add dev dependencies + +3. Test workspace setup: + + ```bash + uv sync --dry-run + ``` + +**Success Criteria:** + +- `packages/` directory exists +- Root `pyproject.toml` is valid +- `uv` command available and functional + +--- + +### Phase 2: Extract Core Package + +**Tasks:** + +1. Create directory structure: + + ```bash + mkdir -p packages/core/src/cibmangotree + mkdir -p packages/core/tests + ``` + +2. Move and reorganize core modules: + - `app/` → `packages/core/src/cibmangotree/app/` + - `analyzer_interface/` → `packages/core/src/cibmangotree/analyzer_interface/` + - `components/` → `packages/core/src/cibmangotree/tui/components/` + - `terminal_tools/` → `packages/core/src/cibmangotree/tui/tools/` + - `context/` → `packages/core/src/cibmangotree/context/` + - `meta/` → `packages/core/src/cibmangotree/meta/` + - `storage/` → `packages/core/src/cibmangotree/services/storage/` + - `importing/` → `packages/core/src/cibmangotree/services/importing/` + - `preprocessing/` → `packages/core/src/cibmangotree/services/preprocessing/` + - `services/tokenizer/core/` → `packages/core/src/cibmangotree/services/tokenizer/core/` + +3. Create placeholder: + + ```bash + mkdir -p packages/core/src/cibmangotree/gui + touch packages/core/src/cibmangotree/gui/__init__.py + ``` + +4. 
Create plugin system: + + ```bash + mkdir -p packages/core/src/cibmangotree/plugin_system + ``` + + Create `discovery.py` with `AnalyzerRegistry` class (see Plugin Architecture section) + +5. Note: `_frozen_plugins.py` will be auto-generated by `pyinstaller.spec` during builds + - Do not create this file manually + - Add to `.gitignore` (see Phase 6) + +6. Create `packages/core/pyproject.toml` (see Configuration Strategy section) + +7. Update `__main__.py` to use plugin discovery + +8. Update internal imports within core package + +**Success Criteria:** + +- Core package structure complete +- `uv sync` installs core package +- Can import `cibmangotree.*` modules + +--- + +### Phase 3: Extract Plugin Packages + +**For each plugin (tokenizer, analyzers):** + +#### 3.1 Basic Tokenizer + +```bash +mkdir -p packages/tokenizers/basic/src/cibmangotree_tokenizer_basic +mkdir -p packages/tokenizers/basic/tests +``` + +Move: + +- `services/tokenizer/basic/` → `packages/tokenizers/basic/src/cibmangotree_tokenizer_basic/` +- `services/tokenizer/basic/test_*.py` → `packages/tokenizers/basic/tests/` + +Create `packages/tokenizers/basic/pyproject.toml` + +#### 3.2 Example Analyzer + +```bash +mkdir -p packages/analyzers/example/src/cibmangotree_analyzer_example/{base,report,web} +mkdir -p packages/analyzers/example/tests/test_data +``` + +Move and rename: + +- `analyzers/example/example_base/` → `packages/analyzers/example/src/cibmangotree_analyzer_example/base/` +- `analyzers/example/example_report/` → `packages/analyzers/example/src/cibmangotree_analyzer_example/report/` +- `analyzers/example/example_web/` → `packages/analyzers/example/src/cibmangotree_analyzer_example/web/` +- `analyzers/example/test_*.py` → `packages/analyzers/example/tests/` +- `analyzers/example/test_data/` → `packages/analyzers/example/tests/test_data/` + +Update internal imports: + +```python +# Change from: +from .example_base import example_base + +# To: +from .base import example_base +``` + +Create 
`packages/analyzers/example/pyproject.toml` with entry points + +#### 3.3 Hashtags Analyzer + +Similar process: + +```bash +mkdir -p packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/{base,web} +mkdir -p packages/analyzers/hashtags/tests/test_data +``` + +Move and rename subdirectories, update imports, create pyproject.toml + +#### 3.4 Ngrams Analyzer + +```bash +mkdir -p packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/{base,stats,web} +mkdir -p packages/analyzers/ngrams/tests/test_data +``` + +Move and rename subdirectories, update imports, create pyproject.toml + +#### 3.5 Temporal Analyzer + +```bash +mkdir -p packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/{base,web} +mkdir -p packages/analyzers/temporal/tests +``` + +Move and rename subdirectories, update imports, create pyproject.toml + +#### 3.6 Time Coordination Analyzer + +```bash +mkdir -p packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination +mkdir -p packages/analyzers/time_coordination/tests +``` + +Move files, update imports, create pyproject.toml + +**Success Criteria:** + +- All plugin packages created +- Entry points defined +- `uv sync` completes successfully +- Can import plugins from new paths + +--- + +### Phase 4: Extract Testing Package + +**Tasks:** + +```bash +mkdir -p packages/testing/src/cibmangotree_testing +mkdir -p packages/testing/tests +``` + +Move: + +- `testing/` → `packages/testing/src/cibmangotree_testing/` + +Create `packages/testing/pyproject.toml` + +**Success Criteria:** + +- Testing package created +- Can import from `cibmangotree_testing` + +--- + +### Phase 5: Update All Imports + +**Systematic Import Updates:** + +1. Create search/replace mapping document +2. 
Use automated tools where possible: + + ```bash + # Example: update app imports + find packages -name "*.py" -type f -exec sed -i.bak \ + 's/from app import/from cibmangotree.app import/g' {} + + + # Example: update component imports + find packages -name "*.py" -type f -exec sed -i.bak \ + 's/from components import/from cibmangotree.tui.components import/g' {} + + ``` + +3. Manual review for complex cases: + - Relative imports + - Dynamic imports + - String-based imports + +4. Update imports in each package: + - Core package + - Each plugin package + - Testing package + +**Success Criteria:** + +- No import errors when running `uv run cibmangotree --help` +- All tests can import required modules + +--- + +### Phase 6: Update PyInstaller Spec for Dynamic Plugin Discovery + +**Tasks:** + +1. Update `pyinstaller.spec` with dynamic plugin discovery functions (see PyInstaller Compatibility section): + - Add `discover_plugins()` function + - Add `generate_frozen_plugins()` function + - Add `get_plugin_hiddenimports()` function + - Add plugin discovery calls at build time + - Update `hiddenimports` to include `*plugin_hiddenimports` + +2. Create/keep root `cibmangotree.py` stub for backward compatibility + +3. Add `.gitignore` entry for auto-generated file: + + ```gitignore + # Auto-generated by pyinstaller.spec + packages/core/src/cibmangotree/_frozen_plugins.py + ``` + +**Success Criteria:** + +- PyInstaller spec auto-discovers plugins at build time +- Generates `_frozen_plugins.py` automatically +- Generates `hiddenimports` list automatically +- Build outputs show discovered plugins +- Root stub exists + +**Estimated Time:** 1 hour + +--- + +### Phase 7: Update CI/CD & Development Tooling + +**Tasks:** + +1. 
Update GitHub Actions workflows (`.github/workflows/*.yml`): + + ```yaml + - name: Install uv + run: curl -LsSf https://astral.sh/uv/install.sh | sh + + - name: Install dependencies + run: uv sync + + - name: Run tests + run: uv run pytest + + - name: Format check + run: | + uv run black --check packages/ + uv run isort --check packages/ + + - name: Build executable + run: uv run pyinstaller pyinstaller.spec + ``` + +2. Update `bootstrap.sh`: + + ```bash + #!/bin/bash + + # Install uv if not present + if ! command -v uv &> /dev/null; then + echo "Installing uv..." + curl -LsSf https://astral.sh/uv/install.sh | sh + fi + + # Sync all workspace packages + echo "Syncing workspace..." + uv sync + + echo "Bootstrap complete. Run 'uv run cibmangotree' to start." + ``` + +3. Update `.gitignore`: + + ```gitignore + # uv + .venv/ + uv.lock + + # Python + __pycache__/ + *.py[cod] + *$py.class + *.so + .Python + build/ + develop-eggs/ + dist/ + downloads/ + eggs/ + .eggs/ + lib/ + lib64/ + parts/ + sdist/ + var/ + wheels/ + *.egg-info/ + .installed.cfg + *.egg + + # Project specific + venv/ + __private__ + /analysis_outputs + /site + VERSION + *.DS_Store + .env* + ``` + +4. Update documentation: + - `README.md` - Installation and setup + - `CLAUDE.md` - Code navigation examples + - `GUIDE.md` - Architecture references + - `.ai-context/README.md` - Structure overview + - `.ai-context/architecture-overview.md` - Package structure + - Create `CONTRIBUTING.md` - Contributor guide + +**Success Criteria:** + +- CI/CD pipeline passes +- Bootstrap script works +- Documentation accurate and complete + +--- + +### Phase 8: Testing & Validation + +**Tasks:** + +1. **Unit Testing:** + + ```bash + # Test entire workspace + uv run pytest + + # Test specific packages + uv run pytest packages/core/tests + uv run pytest packages/analyzers/hashtags/tests + ``` + +2. **Integration Testing:** + + ```bash + # Run application + uv run cibmangotree --help + uv run cibmangotree --noop + ``` + +3. 
**PyInstaller Build Testing:** + + ```bash + # Build executable + uv run pyinstaller pyinstaller.spec + + # Test executable + dist/cibmangotree --help + dist/cibmangotree --noop + ``` + +4. **Manual Testing:** + - Launch application + - Create new project + - Import sample data + - Run each analyzer + - Export results + - Launch web presenters + +5. **Cross-Platform Testing:** + - Test on Windows (via CI or local) + - Test on macOS (via CI or local) + - Test on Linux (via CI or local) + +6. **Fix Issues:** + - Import errors + - Path resolution issues + - Plugin discovery problems + - PyInstaller build failures + +**Success Criteria:** + +- All tests pass +- Application runs successfully +- All analyzers work +- Web presenters launch +- PyInstaller builds work on all platforms + +--- + +### Phase 9: Cleanup & Documentation + +**Tasks:** + +1. Remove old directory structure: + + ```bash + # Remove old directories (MUST BE EMPTY OR ONLY CONTAIN UNUSED FILES) + trash app/ analyzer_interface/ components/ terminal_tools/ + trash analyzers/ storage/ importing/ preprocessing/ services/ + trash testing/ context/ meta/ + ``` + +2. Update all documentation references + +3. Create migration guide for contributors + +4. Update `.ai-context/` files + +5. 
Final review of all changes + +**Success Criteria:** + +- Old structure removed +- Documentation complete +- No broken references + +--- + +**Recommended Approach:** Work in feature branch, commit after each phase + +--- + +## Testing Strategy + +### Unit Tests + +```bash +# Run all tests +uv run pytest + +# Run tests for specific package +uv run pytest packages/core/tests +uv run pytest packages/analyzers/hashtags/tests + +# Run specific test file +uv run pytest packages/analyzers/hashtags/tests/test_hashtags_base.py + +# Run with coverage +uv run pytest --cov=cibmangotree --cov-report=html +``` + +### Integration Tests + +```bash +# Test CLI entry point +uv run cibmangotree --help +uv run cibmangotree --noop + +# Test in development mode +uv run python -m cibmangotree + +# Test plugin discovery +uv run python -c "from cibmangotree.plugin_system import AnalyzerRegistry; print(len(AnalyzerRegistry.discover()))" +``` + +### Build Tests + +```bash +# Test PyInstaller build +uv run pyinstaller pyinstaller.spec + +# Test frozen executable +dist/cibmangotree --help +dist/cibmangotree --noop + +# Test on all platforms (via CI) +# - Windows 2022 +# - macOS 13 (x86) +# - macOS 15 (arm64) +``` + +### Manual Testing Checklist + +- [ ] Launch application +- [ ] Create new project +- [ ] Import CSV data +- [ ] Import Excel data +- [ ] Run hashtags analyzer +- [ ] Run ngrams analyzer +- [ ] Run temporal analyzer +- [ ] Run time coordination analyzer +- [ ] View analysis results +- [ ] Export results to XLSX +- [ ] Export results to CSV +- [ ] Launch hashtags web presenter +- [ ] Launch ngrams web presenter +- [ ] Launch temporal web presenter +- [ ] All web presenters display correctly + +--- + +## Risk Mitigation + +### Import Rewrites + +**Risk:** Breaking imports during migration + +**Mitigation:** + +- Work in feature branch +- Commit after each package migration +- Use automated search/replace tools +- Manual review of complex imports +- Test after each phase +- Keep 
import mapping document + +### PyInstaller Compatibility + +**Risk:** Frozen builds not working + +**Mitigation:** + +- Hybrid plugin discovery system +- Explicit imports in `_frozen_plugins.py` +- Comprehensive `hiddenimports` list +- Test builds frequently during migration +- Keep backward-compatible entry point + +### Dependency Conflicts + +**Risk:** Version conflicts between packages + +**Mitigation:** + +- Centralized version constraints +- Workspace-level dependency resolution +- Test `uv sync` frequently +- Document any version-specific requirements + +### Testing Gaps + +**Risk:** Missing test coverage during migration + +**Mitigation:** + +- Run full test suite after each phase +- Test at package and workspace level +- Manual testing of critical workflows +- Compare test coverage before/after + +### CI/CD Breaking + +**Risk:** GitHub Actions workflows fail + +**Mitigation:** + +- Update CI/CD in same commit as migration +- Test workflows in feature branch +- Have rollback plan ready +- Document new CI/CD setup + +### Contributor Confusion + +**Risk:** Contributors struggle with new structure + +**Mitigation:** + +- Update documentation immediately +- Create migration guide +- Update AI context files +- Clear package boundaries and naming +- Include example analyzer for reference + +### Data Compatibility + +**Risk:** Breaking existing user data + +**Mitigation:** + +- Keep storage format unchanged +- Test with existing projects +- Maintain backward compatibility +- Document any breaking changes + +--- + +## Success Criteria + +### Technical Metrics + +- ✅ All packages have valid `pyproject.toml` +- ✅ `uv sync` completes without errors +- ✅ Full test suite passes (maintain 100% coverage) +- ✅ `uv run cibmangotree` launches successfully +- ✅ All analyzers auto-discovered (dev mode) +- ✅ All analyzers bundled correctly (frozen mode) +- ✅ CI/CD pipeline passes on all platforms +- ✅ PyInstaller builds work on Windows/macOS/Linux + +### Code Quality Metrics + 
+- ✅ Black and isort pass on all code +- ✅ No circular dependencies +- ✅ Clear import paths +- ✅ Each package has minimal dependencies +- ✅ Plugin architecture works in both modes + +### Functional Metrics + +- ✅ Can import CSV/Excel data +- ✅ Can run all existing analyzers +- ✅ Can export results in all formats +- ✅ Web presenters launch correctly +- ✅ All existing features work as before +- ✅ No data loss or corruption + +### Developer Experience Metrics + +- ✅ Bootstrap time < 2 minutes +- ✅ Clear package boundaries +- ✅ Simple pyproject.toml files (< 30 lines) +- ✅ Documentation updated and accurate +- ✅ Easy to understand structure for new contributors + +--- + +## Development Workflow (Post-Migration) + +### Initial Setup + +```bash +# Clone repository +git clone https://github.com/civictech/cibmangotree.git +cd cibmangotree + +# Run bootstrap script +./bootstrap.sh + +# Or manually: +curl -LsSf https://astral.sh/uv/install.sh | sh +uv sync +``` + +### Daily Development + +```bash +# Sync workspace (after pulling changes) +uv sync + +# Run application +uv run cibmangotree + +# Run tests +uv run pytest + +# Run specific test +uv run pytest packages/analyzers/hashtags/tests/test_hashtags_base.py + +# Format code +uv run black packages/ +uv run isort packages/ + +# Build executable +uv run pyinstaller pyinstaller.spec +``` + +### Adding New Analyzer + +1. Create package structure: + + ```bash + mkdir -p packages/analyzers/my_analyzer/src/cibmangotree_analyzer_my_analyzer/{base,web} + mkdir -p packages/analyzers/my_analyzer/tests + ``` + +2. Create `packages/analyzers/my_analyzer/pyproject.toml`: + + ```toml + [project] + name = "cibmangotree-analyzer-my-analyzer" + version = "0.1.0" + description = "My analyzer" + dependencies = [ + "cibmangotree", + "cibmangotree-testing", + "polars", + ] + + [project.entry-points."cibmangotree.analyzers"] + my_analyzer = "cibmangotree_analyzer_my_analyzer.base:my_analyzer" + ``` + +3. 
Add to root workspace: + + ```toml + # Edit pyproject.toml + [tool.uv.workspace] + members = [ + # ... existing ... + "packages/analyzers/my_analyzer", + ] + ``` + +4. Sync workspace: + + ```bash + uv sync + ``` + +5. Implement analyzer following existing patterns + +6. Add to frozen plugins (for releases): + + ```python + # Edit cibmangotree/_frozen_plugins.py + from cibmangotree_analyzer_my_analyzer.base import my_analyzer + AnalyzerRegistry.register(my_analyzer) + ``` + +7. Add to PyInstaller spec: + + ```python + # Edit pyinstaller.spec hiddenimports + 'cibmangotree_analyzer_my_analyzer', + 'cibmangotree_analyzer_my_analyzer.base', + ``` + +--- + +## Appendix: Quick Reference + +### Package Structure + +| Package | Path | Purpose | +|---------|------|---------| +| core | `packages/core/` | Framework, app, UI, services | +| tokenizer-basic | `packages/tokenizers/basic/` | Basic tokenizer implementation | +| analyzer-example | `packages/analyzers/example/` | Example for contributors | +| analyzer-hashtags | `packages/analyzers/hashtags/` | Hashtag analysis | +| analyzer-ngrams | `packages/analyzers/ngrams/` | N-gram analysis | +| analyzer-temporal | `packages/analyzers/temporal/` | Temporal patterns | +| analyzer-time-coordination | `packages/analyzers/time_coordination/` | Coordination detection | +| testing | `packages/testing/` | Test utilities | + +### Import Cheat Sheet + +```python +# Core +from cibmangotree.app import App +from cibmangotree.app.logger import get_logger + +# Analyzer framework +from cibmangotree.analyzer_interface import AnalyzerInterface + +# UI +from cibmangotree.tui.components.main_menu import main_menu +from cibmangotree.tui.tools import ProgressReporter + +# Services +from cibmangotree.services.storage import Storage +from cibmangotree.services.importing import ImporterSession + +# Plugins +from cibmangotree_tokenizer_basic import BasicTokenizer +from cibmangotree_analyzer_hashtags.base import hashtags + +# Testing +from 
cibmangotree_testing import test_primary_analyzer +``` + +### Common Commands + +```bash +# Setup +uv sync + +# Run +uv run cibmangotree + +# Test +uv run pytest +uv run pytest packages/core/tests +uv run pytest -k test_hashtags + +# Format +uv run black packages/ +uv run isort packages/ + +# Build +uv run pyinstaller pyinstaller.spec + +# Build specific package +uv build -p packages/core +``` + +--- + +**Document Version**: 2.0 +**Last Updated**: 2025-10-09 +**Status**: Ready for Implementation From bc81ad0a604b71506976e2bbbf97bb363b5229a6 Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Thu, 9 Oct 2025 18:16:02 -0400 Subject: [PATCH 02/24] feat: add root UV workspace configuration - Define 8 workspace members (core, testing, tokenizers, analyzers) - Centralize dev dependencies using dependency groups - Configure black, isort, pytest at workspace root - Set Python 3.12+ requirement - Use hatchling build backend Phase 1 of monorepo reorganization complete. 
--- pyproject.toml | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index dc0c9ca7..d56ade98 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,84 @@ +[project] +name = "cibmangotree-workspace" +version = "0.1.0" +requires-python = ">=3.12" +description = "CIB Mango Tree CLI - Social Media Data Analysis Tool (Workspace Root)" +readme = "README.md" +license = {text = "PolyForm Noncommercial License 1.0.0"} + +# This is a virtual workspace root - not a package itself +# Individual packages are defined in packages/* subdirectories + +[tool.uv.workspace] +members = [ + "packages/core", + "packages/testing", + "packages/tokenizers/basic", + "packages/analyzers/example", + "packages/analyzers/hashtags", + "packages/analyzers/ngrams", + "packages/analyzers/temporal", + "packages/analyzers/time_coordination", +] + +# Development dependencies available across all workspace members +[dependency-groups] +dev = [ + "black>=24.10.0", + "isort>=5.13.2", + "pytest>=8.3.4", + "pytest-benchmark>=5.1.0", + "pyinstaller>=6.14.1", + "pyarrow-stubs>=17.13", +] + +# Optional documentation dependencies +docs = [ + "mkdocs", + "mkdocstrings[python]", + "mkdocs-material", + "markdown", + "pymdown-extensions", + "mkdocs-mermaid2-plugin", + "griffe-pydantic", +] + +# Centralized tool configurations - inherited by all workspace members + +[tool.black] +line-length = 88 +target-version = ["py312"] +extend-exclude = ''' +/( + # Directories + \.eggs + | \.git + | \.venv + | \.env + | build + | dist + | __pycache__ +)/ +''' + [tool.isort] profile = "black" +line_length = 88 +known_first_party = ["cibmangotree", "cibmangotree_analyzer_*", "cibmangotree_tokenizer_*", "cibmangotree_testing"] +skip_gitignore = true [tool.pytest.ini_options] pythonpath = ["."] +testpaths = ["packages"] +python_files = ["test_*.py", "*_test.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] +# Suppress warnings 
for cleaner output +filterwarnings = [ + "ignore::DeprecationWarning", + "ignore::PendingDeprecationWarning", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" From d8b864b4600b5e167b555e0f83a01530a3a15ab1 Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Thu, 9 Oct 2025 18:23:32 -0400 Subject: [PATCH 03/24] feat: create core package structure and move files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Create packages/core/pyproject.toml with all dependencies - Move 57 files using git mv (history preserved): - app/ → cibmangotree/app/ - analyzer_interface/ → cibmangotree/analyzer_interface/ - context/ → cibmangotree/context/ - meta/ → cibmangotree/meta/ - components/ → cibmangotree/tui/components/ - terminal_tools/ → cibmangotree/tui/tools/ - storage/ → cibmangotree/services/storage/ - importing/ → cibmangotree/services/importing/ - preprocessing/ → cibmangotree/services/preprocessing/ - Create package __init__.py with public API exports - Create __main__.py CLI entry point - Create plugin_system placeholder - Configure entry points for analyzers and tokenizers Phase 2 of monorepo reorganization complete. 
--- packages/core/pyproject.toml | 100 +++++++++++++++++ packages/core/src/cibmangotree/__init__.py | 59 ++++++++++ packages/core/src/cibmangotree/__main__.py | 101 ++++++++++++++++++ .../analyzer_interface}/__init__.py | 0 .../analyzer_interface}/column_automap.py | 0 .../analyzer_interface}/context.py | 0 .../data_type_compatibility.py | 0 .../analyzer_interface}/declaration.py | 0 .../analyzer_interface}/interface.py | 0 .../analyzer_interface}/params.py | 0 .../cibmangotree/analyzer_interface}/suite.py | 0 .../core/src/cibmangotree/app}/__init__.py | 0 .../src/cibmangotree/app}/analysis_context.py | 0 .../app}/analysis_output_context.py | 0 .../app}/analysis_webserver_context.py | 0 .../core/src/cibmangotree/app}/app.py | 0 .../core/src/cibmangotree/app}/app_context.py | 0 .../core/src/cibmangotree/app}/logger.py | 0 .../src/cibmangotree/app}/project_context.py | 0 .../src/cibmangotree/app}/settings_context.py | 0 .../core/src/cibmangotree/app}/shiny.py | 0 .../core/src/cibmangotree/app}/test_logger.py | 0 .../core/src/cibmangotree/app}/utils.py | 0 .../app}/web_static/dashboard_base.css | 0 .../app}/web_templates/index.html | 0 .../src/cibmangotree/context}/__init__.py | 0 .../core/src/cibmangotree/gui/__init__.py | 39 +++++++ .../core/src/cibmangotree/meta}/__init__.py | 0 .../src/cibmangotree/meta}/get_version.py | 0 .../cibmangotree/plugin_system/__init__.py | 85 +++++++++++++++ .../src/cibmangotree/services/__init__.py | 25 +++++ .../services/importing}/__init__.py | 0 .../cibmangotree/services/importing}/csv.py | 0 .../cibmangotree/services/importing}/excel.py | 0 .../services/importing}/importer.py | 0 .../services/importing}/test_csv.py | 0 .../test_data/notes_with_commas.csv | 0 .../importing}/test_data/simple_header.csv | 0 .../importing}/test_data/single_note.csv | 0 .../importing}/test_data/trailing_commas.csv | 0 .../services/preprocessing}/__init__.py | 0 .../preprocessing}/series_semantic.py | 0 .../preprocessing}/test_series_semantic.py | 0 
.../services/storage}/__init__.py | 0 .../services/storage}/file_selector.py | 0 .../core/src/cibmangotree/tui/__init__.py | 23 ++++ .../cibmangotree/tui/components}/__init__.py | 0 .../tui/components}/analysis_main.py | 0 .../tui/components}/analysis_params.py | 0 .../tui/components}/analysis_web_server.py | 0 .../cibmangotree/tui/components}/context.py | 0 .../tui/components}/export_outputs.py | 0 .../cibmangotree/tui/components}/main_menu.py | 0 .../tui/components}/new_analysis.py | 0 .../tui/components}/new_project.py | 0 .../tui/components}/project_main.py | 0 .../tui/components}/select_analysis.py | 0 .../tui/components}/select_project.py | 0 .../cibmangotree/tui/components}/splash.py | 0 .../src/cibmangotree/tui/tools}/__init__.py | 0 .../src/cibmangotree/tui/tools}/inception.py | 0 .../src/cibmangotree/tui/tools}/progress.py | 0 .../src/cibmangotree/tui/tools}/prompts.py | 0 .../core/src/cibmangotree/tui/tools}/utils.py | 0 64 files changed, 432 insertions(+) create mode 100644 packages/core/pyproject.toml create mode 100644 packages/core/src/cibmangotree/__init__.py create mode 100644 packages/core/src/cibmangotree/__main__.py rename {analyzer_interface => packages/core/src/cibmangotree/analyzer_interface}/__init__.py (100%) rename {analyzer_interface => packages/core/src/cibmangotree/analyzer_interface}/column_automap.py (100%) rename {analyzer_interface => packages/core/src/cibmangotree/analyzer_interface}/context.py (100%) rename {analyzer_interface => packages/core/src/cibmangotree/analyzer_interface}/data_type_compatibility.py (100%) rename {analyzer_interface => packages/core/src/cibmangotree/analyzer_interface}/declaration.py (100%) rename {analyzer_interface => packages/core/src/cibmangotree/analyzer_interface}/interface.py (100%) rename {analyzer_interface => packages/core/src/cibmangotree/analyzer_interface}/params.py (100%) rename {analyzer_interface => packages/core/src/cibmangotree/analyzer_interface}/suite.py (100%) rename {app => 
packages/core/src/cibmangotree/app}/__init__.py (100%) rename {app => packages/core/src/cibmangotree/app}/analysis_context.py (100%) rename {app => packages/core/src/cibmangotree/app}/analysis_output_context.py (100%) rename {app => packages/core/src/cibmangotree/app}/analysis_webserver_context.py (100%) rename {app => packages/core/src/cibmangotree/app}/app.py (100%) rename {app => packages/core/src/cibmangotree/app}/app_context.py (100%) rename {app => packages/core/src/cibmangotree/app}/logger.py (100%) rename {app => packages/core/src/cibmangotree/app}/project_context.py (100%) rename {app => packages/core/src/cibmangotree/app}/settings_context.py (100%) rename {app => packages/core/src/cibmangotree/app}/shiny.py (100%) rename {app => packages/core/src/cibmangotree/app}/test_logger.py (100%) rename {app => packages/core/src/cibmangotree/app}/utils.py (100%) rename {app => packages/core/src/cibmangotree/app}/web_static/dashboard_base.css (100%) rename {app => packages/core/src/cibmangotree/app}/web_templates/index.html (100%) rename {context => packages/core/src/cibmangotree/context}/__init__.py (100%) create mode 100644 packages/core/src/cibmangotree/gui/__init__.py rename {meta => packages/core/src/cibmangotree/meta}/__init__.py (100%) rename {meta => packages/core/src/cibmangotree/meta}/get_version.py (100%) create mode 100644 packages/core/src/cibmangotree/plugin_system/__init__.py create mode 100644 packages/core/src/cibmangotree/services/__init__.py rename {importing => packages/core/src/cibmangotree/services/importing}/__init__.py (100%) rename {importing => packages/core/src/cibmangotree/services/importing}/csv.py (100%) rename {importing => packages/core/src/cibmangotree/services/importing}/excel.py (100%) rename {importing => packages/core/src/cibmangotree/services/importing}/importer.py (100%) rename {importing => packages/core/src/cibmangotree/services/importing}/test_csv.py (100%) rename {importing => 
packages/core/src/cibmangotree/services/importing}/test_data/notes_with_commas.csv (100%) rename {importing => packages/core/src/cibmangotree/services/importing}/test_data/simple_header.csv (100%) rename {importing => packages/core/src/cibmangotree/services/importing}/test_data/single_note.csv (100%) rename {importing => packages/core/src/cibmangotree/services/importing}/test_data/trailing_commas.csv (100%) rename {preprocessing => packages/core/src/cibmangotree/services/preprocessing}/__init__.py (100%) rename {preprocessing => packages/core/src/cibmangotree/services/preprocessing}/series_semantic.py (100%) rename {preprocessing => packages/core/src/cibmangotree/services/preprocessing}/test_series_semantic.py (100%) rename {storage => packages/core/src/cibmangotree/services/storage}/__init__.py (100%) rename {storage => packages/core/src/cibmangotree/services/storage}/file_selector.py (100%) create mode 100644 packages/core/src/cibmangotree/tui/__init__.py rename {components => packages/core/src/cibmangotree/tui/components}/__init__.py (100%) rename {components => packages/core/src/cibmangotree/tui/components}/analysis_main.py (100%) rename {components => packages/core/src/cibmangotree/tui/components}/analysis_params.py (100%) rename {components => packages/core/src/cibmangotree/tui/components}/analysis_web_server.py (100%) rename {components => packages/core/src/cibmangotree/tui/components}/context.py (100%) rename {components => packages/core/src/cibmangotree/tui/components}/export_outputs.py (100%) rename {components => packages/core/src/cibmangotree/tui/components}/main_menu.py (100%) rename {components => packages/core/src/cibmangotree/tui/components}/new_analysis.py (100%) rename {components => packages/core/src/cibmangotree/tui/components}/new_project.py (100%) rename {components => packages/core/src/cibmangotree/tui/components}/project_main.py (100%) rename {components => packages/core/src/cibmangotree/tui/components}/select_analysis.py (100%) rename 
{components => packages/core/src/cibmangotree/tui/components}/select_project.py (100%)
 rename {components => packages/core/src/cibmangotree/tui/components}/splash.py (100%)
 rename {terminal_tools => packages/core/src/cibmangotree/tui/tools}/__init__.py (100%)
 rename {terminal_tools => packages/core/src/cibmangotree/tui/tools}/inception.py (100%)
 rename {terminal_tools => packages/core/src/cibmangotree/tui/tools}/progress.py (100%)
 rename {terminal_tools => packages/core/src/cibmangotree/tui/tools}/prompts.py (100%)
 rename {terminal_tools => packages/core/src/cibmangotree/tui/tools}/utils.py (100%)

diff --git a/packages/core/pyproject.toml b/packages/core/pyproject.toml
new file mode 100644
index 00000000..e0c2cc96
--- /dev/null
+++ b/packages/core/pyproject.toml
@@ -0,0 +1,100 @@
+[project]
+name = "cibmangotree"
+version = "0.1.0"
+requires-python = ">=3.12"
+description = "CIB Mango Tree CLI - Social Media Data Analysis Tool"
+readme = "../../README.md"
+license = {text = "PolyForm Noncommercial License 1.0.0"}
+authors = [
+    {name = "Civic Tech DC", email = "info@civictechdc.org"}
+]
+keywords = ["social-media", "data-analysis", "cli", "text-analysis"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Science/Research",
+    "License :: Other/Proprietary License",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Scientific/Engineering :: Information Analysis",
+]
+
+# Core runtime dependencies
+dependencies = [
+    # Data processing
+    "polars>=1.9.0",
+    "pandas>=2.2.3",  # needed by plotly
+    "pyarrow>=17.0.0",
+
+    # Storage & persistence
+    "tinydb>=4.8.0",
+    "platformdirs>=4.3.6",
+    "filelock>=3.16.1",
+
+    # Terminal UI
+    "inquirer>=3.4.0",
+    "rich>=14.0.0",
+    "colorama>=0.4.6",
+
+    # Web dashboards
+    "dash>=2.18.1",
+    "plotly>=5.24.1",
+    "shiny>=1.4.0",
+    "shinywidgets>=0.6.2",
+    "starlette>=0.47.1",
+    "uvicorn>=0.34.3",
+    "a2wsgi>=1.10.10",
+
+    # Import/Export
+    "xlsxwriter>=3.2.0",
+    "fastexcel>=0.13.0",
+
+    # Data models & validation
+    "pydantic>=2.9.1",
+
+    # Text processing
+    "regex>=2025.9.1",
+
+    # Logging
+    "python-json-logger>=2.0.7",
+]
+
+# Optional dependencies for different features
+[project.optional-dependencies]
+dev = [
+    "black>=24.10.0",
+    "isort>=5.13.2",
+    "pytest>=8.3.4",
+    "pytest-benchmark>=5.1.0",
+    "pyarrow-stubs>=17.13",
+]
+
+docs = [
+    "mkdocs",
+    "mkdocstrings[python]",
+    "mkdocs-material",
+    "markdown",
+    "pymdown-extensions",
+    "mkdocs-mermaid2-plugin",
+    "griffe-pydantic",
+]
+
+# CLI entry point
+[project.scripts]
+cibmangotree = "cibmangotree.__main__:main"
+
+# Plugin system entry points for analyzers
+[project.entry-points."cibmangotree.analyzers"]
+# Analyzer plugins will register here via their own pyproject.toml
+# Example: example = "cibmangotree_analyzer_example:analyzer"
+
+# Plugin system entry points for tokenizers
+[project.entry-points."cibmangotree.tokenizers"]
+# Tokenizer plugins will register here via their own pyproject.toml
+# Example: basic = "cibmangotree_tokenizer_basic:tokenizer"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+# Hatchling build configuration
+[tool.hatch.build.targets.wheel]
+packages = ["src/cibmangotree"]
diff --git a/packages/core/src/cibmangotree/__init__.py b/packages/core/src/cibmangotree/__init__.py
new file mode 100644
index 00000000..17bce114
--- /dev/null
+++ b/packages/core/src/cibmangotree/__init__.py
@@ -0,0 +1,59 @@
+"""
+CIB Mango Tree - Social Media Data Analysis Tool
+
+A modular CLI and library for analyzing social media data with support
+for extensible analyzers, tokenizers, and export formats.
+""" + +from .meta import get_version, is_development, is_distributed + +# Version information +__version__ = get_version() or "0.1.0-dev" +__all__ = [ + "__version__", + "get_version", + "is_development", + "is_distributed", + # Core application + "App", + "AppContext", + # Analysis contexts + "AnalysisContext", + "AnalysisOutputContext", + "AnalysisWebServerContext", + "ProjectContext", + "SettingsContext", + # Analyzer interface + "AnalyzerInterface", + "PrimaryAnalyzerContext", + "SecondaryAnalyzerContext", + "ParamValue", + "AnalyzerOutput", + # Storage + "Storage", +] + +# Import core application components +from .app import ( + App, + AppContext, + AnalysisContext, + AnalysisOutputContext, + AnalysisWebServerContext, + ProjectContext, + SettingsContext, +) + +# Import analyzer interface +from .analyzer_interface import ( + AnalyzerInterface, + AnalyzerOutput, + ParamValue, +) +from .analyzer_interface.context import ( + PrimaryAnalyzerContext, + SecondaryAnalyzerContext, +) + +# Import storage service +from .services.storage import Storage diff --git a/packages/core/src/cibmangotree/__main__.py b/packages/core/src/cibmangotree/__main__.py new file mode 100644 index 00000000..51f0df07 --- /dev/null +++ b/packages/core/src/cibmangotree/__main__.py @@ -0,0 +1,101 @@ +""" +CIB Mango Tree CLI Entry Point + +This module provides the main entry point for the CIB Mango Tree CLI application. +It handles command-line argument parsing, logging setup, and application initialization. 
+""" + +import argparse +import logging +import sys +from multiprocessing import freeze_support +from pathlib import Path + +from rich.console import Console +from rich.text import Text + +from .tui.tools import enable_windows_ansi_support + + +def main(): + """Main entry point for the CIB Mango Tree CLI.""" + freeze_support() + enable_windows_ansi_support() + + # Parse command line arguments + parser = argparse.ArgumentParser( + description="CIB Mango Tree CLI - Social Media Data Analysis Tool" + ) + parser.add_argument( + "--log-level", + choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], + default="INFO", + help="Set the logging level (default: INFO)", + ) + parser.add_argument( + "--noop", action="store_true", help="No-operation mode for testing" + ) + + args = parser.parse_args() + + # Handle no-op mode + if args.noop: + print("No-op flag detected. Exiting successfully.") + sys.exit(0) + + # Show loading message early + console = Console() + loading_msg = Text("🥭 CIB Mango Tree is starting", style="orange1 bold") + loading_msg.append("... 
This may take a moment.", style="dim") + console.print(loading_msg) + + # Import heavy modules after loading message + # NOTE: These imports will fail until Phase 5 (import path fixes) + # For now, we're just creating the structure + try: + from analyzers import suite + from .app import App, AppContext + from .app.logger import setup_logging + from .tui.components import ViewContext, main_menu, splash + from .meta import get_version + from .services.storage import Storage + from .tui.tools.inception import TerminalContext + + # Initialize storage + storage = Storage(app_name="MangoTango", app_author="Civic Tech DC") + + # Set up logging + log_level = getattr(logging, args.log_level) + log_file_path = Path(storage.user_data_dir) / "logs" / "mangotango.log" + app_version = get_version() or "development" + setup_logging(log_file_path, log_level, app_version) + + # Get logger for main module + logger = logging.getLogger(__name__) + logger.info( + "Starting CIB Mango Tree application", + extra={"log_level": args.log_level, "log_file": str(log_file_path)}, + ) + + # Start the application + splash() + main_menu( + ViewContext( + terminal=TerminalContext(), + app=App(context=AppContext(storage=storage, suite=suite)), + ) + ) + except ImportError as e: + # Expected during Phase 2-4 of reorganization + console.print( + f"[yellow]Note:[/yellow] Import paths not yet updated. 
Error: {e}", + style="dim", + ) + console.print( + "[yellow]This is expected during monorepo reorganization (Phase 2-4).[/yellow]" + ) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/analyzer_interface/__init__.py b/packages/core/src/cibmangotree/analyzer_interface/__init__.py similarity index 100% rename from analyzer_interface/__init__.py rename to packages/core/src/cibmangotree/analyzer_interface/__init__.py diff --git a/analyzer_interface/column_automap.py b/packages/core/src/cibmangotree/analyzer_interface/column_automap.py similarity index 100% rename from analyzer_interface/column_automap.py rename to packages/core/src/cibmangotree/analyzer_interface/column_automap.py diff --git a/analyzer_interface/context.py b/packages/core/src/cibmangotree/analyzer_interface/context.py similarity index 100% rename from analyzer_interface/context.py rename to packages/core/src/cibmangotree/analyzer_interface/context.py diff --git a/analyzer_interface/data_type_compatibility.py b/packages/core/src/cibmangotree/analyzer_interface/data_type_compatibility.py similarity index 100% rename from analyzer_interface/data_type_compatibility.py rename to packages/core/src/cibmangotree/analyzer_interface/data_type_compatibility.py diff --git a/analyzer_interface/declaration.py b/packages/core/src/cibmangotree/analyzer_interface/declaration.py similarity index 100% rename from analyzer_interface/declaration.py rename to packages/core/src/cibmangotree/analyzer_interface/declaration.py diff --git a/analyzer_interface/interface.py b/packages/core/src/cibmangotree/analyzer_interface/interface.py similarity index 100% rename from analyzer_interface/interface.py rename to packages/core/src/cibmangotree/analyzer_interface/interface.py diff --git a/analyzer_interface/params.py b/packages/core/src/cibmangotree/analyzer_interface/params.py similarity index 100% rename from analyzer_interface/params.py rename to packages/core/src/cibmangotree/analyzer_interface/params.py diff 
--git a/analyzer_interface/suite.py b/packages/core/src/cibmangotree/analyzer_interface/suite.py similarity index 100% rename from analyzer_interface/suite.py rename to packages/core/src/cibmangotree/analyzer_interface/suite.py diff --git a/app/__init__.py b/packages/core/src/cibmangotree/app/__init__.py similarity index 100% rename from app/__init__.py rename to packages/core/src/cibmangotree/app/__init__.py diff --git a/app/analysis_context.py b/packages/core/src/cibmangotree/app/analysis_context.py similarity index 100% rename from app/analysis_context.py rename to packages/core/src/cibmangotree/app/analysis_context.py diff --git a/app/analysis_output_context.py b/packages/core/src/cibmangotree/app/analysis_output_context.py similarity index 100% rename from app/analysis_output_context.py rename to packages/core/src/cibmangotree/app/analysis_output_context.py diff --git a/app/analysis_webserver_context.py b/packages/core/src/cibmangotree/app/analysis_webserver_context.py similarity index 100% rename from app/analysis_webserver_context.py rename to packages/core/src/cibmangotree/app/analysis_webserver_context.py diff --git a/app/app.py b/packages/core/src/cibmangotree/app/app.py similarity index 100% rename from app/app.py rename to packages/core/src/cibmangotree/app/app.py diff --git a/app/app_context.py b/packages/core/src/cibmangotree/app/app_context.py similarity index 100% rename from app/app_context.py rename to packages/core/src/cibmangotree/app/app_context.py diff --git a/app/logger.py b/packages/core/src/cibmangotree/app/logger.py similarity index 100% rename from app/logger.py rename to packages/core/src/cibmangotree/app/logger.py diff --git a/app/project_context.py b/packages/core/src/cibmangotree/app/project_context.py similarity index 100% rename from app/project_context.py rename to packages/core/src/cibmangotree/app/project_context.py diff --git a/app/settings_context.py b/packages/core/src/cibmangotree/app/settings_context.py similarity index 100% 
rename from app/settings_context.py rename to packages/core/src/cibmangotree/app/settings_context.py diff --git a/app/shiny.py b/packages/core/src/cibmangotree/app/shiny.py similarity index 100% rename from app/shiny.py rename to packages/core/src/cibmangotree/app/shiny.py diff --git a/app/test_logger.py b/packages/core/src/cibmangotree/app/test_logger.py similarity index 100% rename from app/test_logger.py rename to packages/core/src/cibmangotree/app/test_logger.py diff --git a/app/utils.py b/packages/core/src/cibmangotree/app/utils.py similarity index 100% rename from app/utils.py rename to packages/core/src/cibmangotree/app/utils.py diff --git a/app/web_static/dashboard_base.css b/packages/core/src/cibmangotree/app/web_static/dashboard_base.css similarity index 100% rename from app/web_static/dashboard_base.css rename to packages/core/src/cibmangotree/app/web_static/dashboard_base.css diff --git a/app/web_templates/index.html b/packages/core/src/cibmangotree/app/web_templates/index.html similarity index 100% rename from app/web_templates/index.html rename to packages/core/src/cibmangotree/app/web_templates/index.html diff --git a/context/__init__.py b/packages/core/src/cibmangotree/context/__init__.py similarity index 100% rename from context/__init__.py rename to packages/core/src/cibmangotree/context/__init__.py diff --git a/packages/core/src/cibmangotree/gui/__init__.py b/packages/core/src/cibmangotree/gui/__init__.py new file mode 100644 index 00000000..7767063f --- /dev/null +++ b/packages/core/src/cibmangotree/gui/__init__.py @@ -0,0 +1,39 @@ +""" +GUI Module for CIB Mango Tree + +This module will provide graphical user interface functionality for +CIB Mango Tree using a modern GUI framework. + +TODO: Future Feature - GUI Implementation +----------------------------------------- +This is a placeholder for future GUI functionality. The current +implementation focuses on the Terminal UI (TUI) in the `tui` module. + +Planned Features: +1. 
Desktop Application + - Cross-platform GUI (likely using Qt, Electron, or similar) + - Visual project management + - Interactive data preview + - Analysis configuration wizard + +2. Visual Analysis Builder + - Drag-and-drop analyzer configuration + - Real-time parameter adjustment + - Visual pipeline builder + +3. Enhanced Visualizations + - Interactive charts and graphs + - Data exploration tools + - Export to various formats + +4. Integration Points + - Share core logic with TUI + - Use same storage and analysis backends + - Plugin compatibility + +Note: This is future work and not currently prioritized. +The terminal UI (`tui` module) provides full functionality. +""" + +# Placeholder - future implementation +__all__ = [] diff --git a/meta/__init__.py b/packages/core/src/cibmangotree/meta/__init__.py similarity index 100% rename from meta/__init__.py rename to packages/core/src/cibmangotree/meta/__init__.py diff --git a/meta/get_version.py b/packages/core/src/cibmangotree/meta/get_version.py similarity index 100% rename from meta/get_version.py rename to packages/core/src/cibmangotree/meta/get_version.py diff --git a/packages/core/src/cibmangotree/plugin_system/__init__.py b/packages/core/src/cibmangotree/plugin_system/__init__.py new file mode 100644 index 00000000..c3a8ccf7 --- /dev/null +++ b/packages/core/src/cibmangotree/plugin_system/__init__.py @@ -0,0 +1,85 @@ +""" +Plugin System for CIB Mango Tree + +This module will provide plugin discovery and loading functionality for: +- Analyzer plugins (via cibmangotree.analyzers entry point) +- Tokenizer plugins (via cibmangotree.tokenizers entry point) + +TODO: Phase 6 - Implement Plugin System +-------------------------------------- +This is a placeholder for the plugin system that will be implemented +after the monorepo reorganization is complete. + +Planned Features: +1. 
Plugin Discovery + - Use importlib.metadata to discover installed plugins + - Scan entry points: cibmangotree.analyzers, cibmangotree.tokenizers + - Validate plugin interfaces + +2. Plugin Loading + - Lazy loading of plugins + - Error handling for malformed plugins + - Version compatibility checking + +3. Plugin Registry + - Central registry of available plugins + - Metadata: name, version, description, dependencies + - Conflict detection (duplicate names) + +4. Plugin Lifecycle + - Initialize plugins on demand + - Resource cleanup + - Hot reload support (future) + +Example Usage (planned): +```python +from cibmangotree.plugin_system import discover_plugins, load_plugin + +# Discover all analyzer plugins +analyzers = discover_plugins("cibmangotree.analyzers") + +# Load a specific plugin +hashtag_analyzer = load_plugin("cibmangotree.analyzers", "hashtags") +``` + +Entry Point Format: +```toml +# In analyzer plugin's pyproject.toml +[project.entry-points."cibmangotree.analyzers"] +hashtags = "cibmangotree_analyzer_hashtags:analyzer" +``` +""" + +# Placeholder - will be implemented in Phase 6 +__all__ = [] + + +def discover_plugins(entry_point_group: str) -> list: + """ + Discover plugins for a given entry point group. + + TODO: Implement using importlib.metadata.entry_points() + + Args: + entry_point_group: Entry point group name (e.g., "cibmangotree.analyzers") + + Returns: + List of discovered plugin metadata + """ + raise NotImplementedError("Plugin system not yet implemented (Phase 6)") + + +def load_plugin(entry_point_group: str, plugin_name: str): + """ + Load a specific plugin by name. 
+ + TODO: Implement plugin loading and validation + + Args: + entry_point_group: Entry point group name + plugin_name: Name of the plugin to load + + Returns: + Loaded plugin object + """ + raise NotImplementedError("Plugin system not yet implemented (Phase 6)") diff --git a/packages/core/src/cibmangotree/services/__init__.py b/packages/core/src/cibmangotree/services/__init__.py new file mode 100644 index 00000000..fe19c20c --- /dev/null +++ b/packages/core/src/cibmangotree/services/__init__.py @@ -0,0 +1,25 @@ +""" +Services Module + +This module provides core services for CIB Mango Tree: +- storage: Data persistence, project management, and file operations +- importing: Data import from CSV, Excel, and other formats +- preprocessing: Data preprocessing and semantic type detection +""" + +# Re-export key service classes +from .storage import Storage +from .importing import CSVImporter, ExcelImporter, Importer, ImporterSession +from .preprocessing import SeriesSemantic + +__all__ = [ + # Storage + "Storage", + # Importing + "CSVImporter", + "ExcelImporter", + "Importer", + "ImporterSession", + # Preprocessing + "SeriesSemantic", +] diff --git a/importing/__init__.py b/packages/core/src/cibmangotree/services/importing/__init__.py similarity index 100% rename from importing/__init__.py rename to packages/core/src/cibmangotree/services/importing/__init__.py diff --git a/importing/csv.py b/packages/core/src/cibmangotree/services/importing/csv.py similarity index 100% rename from importing/csv.py rename to packages/core/src/cibmangotree/services/importing/csv.py diff --git a/importing/excel.py b/packages/core/src/cibmangotree/services/importing/excel.py similarity index 100% rename from importing/excel.py rename to packages/core/src/cibmangotree/services/importing/excel.py diff --git a/importing/importer.py b/packages/core/src/cibmangotree/services/importing/importer.py similarity index 100% rename from importing/importer.py rename to 
packages/core/src/cibmangotree/services/importing/importer.py diff --git a/importing/test_csv.py b/packages/core/src/cibmangotree/services/importing/test_csv.py similarity index 100% rename from importing/test_csv.py rename to packages/core/src/cibmangotree/services/importing/test_csv.py diff --git a/importing/test_data/notes_with_commas.csv b/packages/core/src/cibmangotree/services/importing/test_data/notes_with_commas.csv similarity index 100% rename from importing/test_data/notes_with_commas.csv rename to packages/core/src/cibmangotree/services/importing/test_data/notes_with_commas.csv diff --git a/importing/test_data/simple_header.csv b/packages/core/src/cibmangotree/services/importing/test_data/simple_header.csv similarity index 100% rename from importing/test_data/simple_header.csv rename to packages/core/src/cibmangotree/services/importing/test_data/simple_header.csv diff --git a/importing/test_data/single_note.csv b/packages/core/src/cibmangotree/services/importing/test_data/single_note.csv similarity index 100% rename from importing/test_data/single_note.csv rename to packages/core/src/cibmangotree/services/importing/test_data/single_note.csv diff --git a/importing/test_data/trailing_commas.csv b/packages/core/src/cibmangotree/services/importing/test_data/trailing_commas.csv similarity index 100% rename from importing/test_data/trailing_commas.csv rename to packages/core/src/cibmangotree/services/importing/test_data/trailing_commas.csv diff --git a/preprocessing/__init__.py b/packages/core/src/cibmangotree/services/preprocessing/__init__.py similarity index 100% rename from preprocessing/__init__.py rename to packages/core/src/cibmangotree/services/preprocessing/__init__.py diff --git a/preprocessing/series_semantic.py b/packages/core/src/cibmangotree/services/preprocessing/series_semantic.py similarity index 100% rename from preprocessing/series_semantic.py rename to packages/core/src/cibmangotree/services/preprocessing/series_semantic.py diff --git 
a/preprocessing/test_series_semantic.py b/packages/core/src/cibmangotree/services/preprocessing/test_series_semantic.py similarity index 100% rename from preprocessing/test_series_semantic.py rename to packages/core/src/cibmangotree/services/preprocessing/test_series_semantic.py diff --git a/storage/__init__.py b/packages/core/src/cibmangotree/services/storage/__init__.py similarity index 100% rename from storage/__init__.py rename to packages/core/src/cibmangotree/services/storage/__init__.py diff --git a/storage/file_selector.py b/packages/core/src/cibmangotree/services/storage/file_selector.py similarity index 100% rename from storage/file_selector.py rename to packages/core/src/cibmangotree/services/storage/file_selector.py diff --git a/packages/core/src/cibmangotree/tui/__init__.py b/packages/core/src/cibmangotree/tui/__init__.py new file mode 100644 index 00000000..4bcde8ca --- /dev/null +++ b/packages/core/src/cibmangotree/tui/__init__.py @@ -0,0 +1,23 @@ +""" +Terminal User Interface (TUI) Module + +This module provides terminal-based UI components and utilities for +CIB Mango Tree CLI application. 
+ +Components: +- components: Interactive menu components, prompts, and views +- tools: Terminal utilities, progress bars, ANSI support +""" + +# Re-export commonly used TUI components +from .components import ( + ViewContext, + main_menu, + splash, +) + +__all__ = [ + "ViewContext", + "main_menu", + "splash", +] diff --git a/components/__init__.py b/packages/core/src/cibmangotree/tui/components/__init__.py similarity index 100% rename from components/__init__.py rename to packages/core/src/cibmangotree/tui/components/__init__.py diff --git a/components/analysis_main.py b/packages/core/src/cibmangotree/tui/components/analysis_main.py similarity index 100% rename from components/analysis_main.py rename to packages/core/src/cibmangotree/tui/components/analysis_main.py diff --git a/components/analysis_params.py b/packages/core/src/cibmangotree/tui/components/analysis_params.py similarity index 100% rename from components/analysis_params.py rename to packages/core/src/cibmangotree/tui/components/analysis_params.py diff --git a/components/analysis_web_server.py b/packages/core/src/cibmangotree/tui/components/analysis_web_server.py similarity index 100% rename from components/analysis_web_server.py rename to packages/core/src/cibmangotree/tui/components/analysis_web_server.py diff --git a/components/context.py b/packages/core/src/cibmangotree/tui/components/context.py similarity index 100% rename from components/context.py rename to packages/core/src/cibmangotree/tui/components/context.py diff --git a/components/export_outputs.py b/packages/core/src/cibmangotree/tui/components/export_outputs.py similarity index 100% rename from components/export_outputs.py rename to packages/core/src/cibmangotree/tui/components/export_outputs.py diff --git a/components/main_menu.py b/packages/core/src/cibmangotree/tui/components/main_menu.py similarity index 100% rename from components/main_menu.py rename to packages/core/src/cibmangotree/tui/components/main_menu.py diff --git 
a/components/new_analysis.py b/packages/core/src/cibmangotree/tui/components/new_analysis.py similarity index 100% rename from components/new_analysis.py rename to packages/core/src/cibmangotree/tui/components/new_analysis.py diff --git a/components/new_project.py b/packages/core/src/cibmangotree/tui/components/new_project.py similarity index 100% rename from components/new_project.py rename to packages/core/src/cibmangotree/tui/components/new_project.py diff --git a/components/project_main.py b/packages/core/src/cibmangotree/tui/components/project_main.py similarity index 100% rename from components/project_main.py rename to packages/core/src/cibmangotree/tui/components/project_main.py diff --git a/components/select_analysis.py b/packages/core/src/cibmangotree/tui/components/select_analysis.py similarity index 100% rename from components/select_analysis.py rename to packages/core/src/cibmangotree/tui/components/select_analysis.py diff --git a/components/select_project.py b/packages/core/src/cibmangotree/tui/components/select_project.py similarity index 100% rename from components/select_project.py rename to packages/core/src/cibmangotree/tui/components/select_project.py diff --git a/components/splash.py b/packages/core/src/cibmangotree/tui/components/splash.py similarity index 100% rename from components/splash.py rename to packages/core/src/cibmangotree/tui/components/splash.py diff --git a/terminal_tools/__init__.py b/packages/core/src/cibmangotree/tui/tools/__init__.py similarity index 100% rename from terminal_tools/__init__.py rename to packages/core/src/cibmangotree/tui/tools/__init__.py diff --git a/terminal_tools/inception.py b/packages/core/src/cibmangotree/tui/tools/inception.py similarity index 100% rename from terminal_tools/inception.py rename to packages/core/src/cibmangotree/tui/tools/inception.py diff --git a/terminal_tools/progress.py b/packages/core/src/cibmangotree/tui/tools/progress.py similarity index 100% rename from 
terminal_tools/progress.py rename to packages/core/src/cibmangotree/tui/tools/progress.py diff --git a/terminal_tools/prompts.py b/packages/core/src/cibmangotree/tui/tools/prompts.py similarity index 100% rename from terminal_tools/prompts.py rename to packages/core/src/cibmangotree/tui/tools/prompts.py diff --git a/terminal_tools/utils.py b/packages/core/src/cibmangotree/tui/tools/utils.py similarity index 100% rename from terminal_tools/utils.py rename to packages/core/src/cibmangotree/tui/tools/utils.py From 1f1b115ca830cbb524ca344d5e0974fa19fd63f6 Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Thu, 9 Oct 2025 18:31:17 -0400 Subject: [PATCH 04/24] feat: create plugin packages (analyzers, tokenizers, testing) Phase 3 & 4: Plugin and Testing Package Setup Testing Package: - Create cibmangotree-testing package - Move testing utilities with git mv - Update imports to new structure Tokenizers: - Create cibmangotree-tokenizer-basic package - Move basic tokenizer from services/tokenizer/basic/ - Configure plugin entry point Analyzers (5 packages): - cibmangotree-analyzer-example - cibmangotree-analyzer-hashtags - cibmangotree-analyzer-ngrams - cibmangotree-analyzer-temporal - cibmangotree-analyzer-time-coordination Each analyzer: - Complete pyproject.toml with dependencies - Proper entry points for plugin discovery - File structure: base/, stats/, web/ components - Tests and test data moved Total: 7 new packages, 83 file operations, history preserved. 
--- packages/analyzers/example/README.md | 23 ++++++++ packages/analyzers/example/pyproject.toml | 39 ++++++++++++++ .../cibmangotree_analyzer_example/__init__.py | 19 +++++++ .../base}/__init__.py | 0 .../base}/default_params.py | 0 .../base}/interface.py | 0 .../base}/main.py | 0 .../report}/__init__.py | 0 .../report}/interface.py | 0 .../report}/main.py | 0 .../web}/__init__.py | 0 .../web}/factory.py | 0 .../web}/interface.py | 0 .../example/tests}/test_data/__init__.py | 0 .../example/tests}/test_data/input.csv | 0 .../example/tests}/test_data/output.csv | 0 .../tests}/test_data/output_report.csv | 0 .../example/tests}/test_example_base.py | 0 .../example/tests}/test_example_report.py | 0 packages/analyzers/hashtags/README.md | 23 ++++++++ packages/analyzers/hashtags/pyproject.toml | 40 ++++++++++++++ .../__init__.py | 17 ++++++ .../base}/__init__.py | 0 .../base}/interface.py | 0 .../base}/main.py | 0 .../web}/__init__.py | 0 .../web}/analysis.py | 0 .../web}/app.py | 0 .../web}/factory.py | 0 .../web}/interface.py | 0 .../web}/plots.py | 0 .../hashtags/tests}/test_data/__init__.py | 0 .../tests}/test_data/hashtag_test_input.csv | 0 .../tests}/test_data/hashtag_test_output.json | 0 .../hashtags/tests}/test_hashtags_base.py | 0 packages/analyzers/ngrams/README.md | 25 +++++++++ packages/analyzers/ngrams/pyproject.toml | 40 ++++++++++++++ .../cibmangotree_analyzer_ngrams/__init__.py | 19 +++++++ .../base}/__init__.py | 0 .../base}/interface.py | 0 .../base}/main.py | 0 .../stats}/__init__.py | 0 .../stats}/interface.py | 0 .../stats}/main.py | 0 .../web}/__init__.py | 0 .../cibmangotree_analyzer_ngrams/web}/app.py | 0 .../web}/factory.py | 0 .../web}/interface.py | 0 .../ngrams/tests}/test_data/__init__.py | 0 .../tests}/test_data/message_authors.parquet | Bin .../tests}/test_data/message_ngrams.parquet | Bin .../tests}/test_data/ngram_full.parquet | Bin .../tests}/test_data/ngram_stats.parquet | Bin .../ngrams/tests}/test_data/ngrams.parquet | Bin 
.../tests}/test_data/ngrams_test_input.csv | 0 .../ngrams/tests}/test_ngram_stats.py | 0 .../ngrams/tests}/test_ngrams_base.py | 0 packages/analyzers/temporal/README.md | 24 +++++++++ packages/analyzers/temporal/pyproject.toml | 39 ++++++++++++++ .../__init__.py | 17 ++++++ .../base}/__init__.py | 0 .../base}/interface.py | 0 .../base}/main.py | 0 .../web}/__init__.py | 0 .../web}/factory.py | 0 .../web}/interface.py | 0 .../analyzers/time_coordination/README.md | 23 ++++++++ .../time_coordination/pyproject.toml | 39 ++++++++++++++ .../__init__.py | 13 +++++ .../interface.py | 0 .../main.py | 0 packages/testing/pyproject.toml | 42 +++++++++++++++ .../src/cibmangotree_testing/__init__.py | 49 ++++++++++++++++++ .../src/cibmangotree_testing}/comparers.py | 0 .../src/cibmangotree_testing}/context.py | 10 ++-- .../src/cibmangotree_testing}/testdata.py | 2 +- .../src/cibmangotree_testing}/testers.py | 7 ++- packages/tokenizers/basic/pyproject.toml | 39 ++++++++++++++ .../cibmangotree_tokenizer_basic}/__init__.py | 3 +- .../cibmangotree_tokenizer_basic}/patterns.py | 0 .../tokenizer.py | 9 +++- .../basic/tests}/test_basic_tokenizer.py | 4 +- testing/__init__.py | 9 ---- 83 files changed, 552 insertions(+), 22 deletions(-) create mode 100644 packages/analyzers/example/README.md create mode 100644 packages/analyzers/example/pyproject.toml create mode 100644 packages/analyzers/example/src/cibmangotree_analyzer_example/__init__.py rename {analyzers/example/example_base => packages/analyzers/example/src/cibmangotree_analyzer_example/base}/__init__.py (100%) rename {analyzers/example/example_base => packages/analyzers/example/src/cibmangotree_analyzer_example/base}/default_params.py (100%) rename {analyzers/example/example_base => packages/analyzers/example/src/cibmangotree_analyzer_example/base}/interface.py (100%) rename {analyzers/example/example_base => packages/analyzers/example/src/cibmangotree_analyzer_example/base}/main.py (100%) rename 
{analyzers/example/example_report => packages/analyzers/example/src/cibmangotree_analyzer_example/report}/__init__.py (100%) rename {analyzers/example/example_report => packages/analyzers/example/src/cibmangotree_analyzer_example/report}/interface.py (100%) rename {analyzers/example/example_report => packages/analyzers/example/src/cibmangotree_analyzer_example/report}/main.py (100%) rename {analyzers/example/example_web => packages/analyzers/example/src/cibmangotree_analyzer_example/web}/__init__.py (100%) rename {analyzers/example/example_web => packages/analyzers/example/src/cibmangotree_analyzer_example/web}/factory.py (100%) rename {analyzers/example/example_web => packages/analyzers/example/src/cibmangotree_analyzer_example/web}/interface.py (100%) rename {analyzers/example => packages/analyzers/example/tests}/test_data/__init__.py (100%) rename {analyzers/example => packages/analyzers/example/tests}/test_data/input.csv (100%) rename {analyzers/example => packages/analyzers/example/tests}/test_data/output.csv (100%) rename {analyzers/example => packages/analyzers/example/tests}/test_data/output_report.csv (100%) rename {analyzers/example => packages/analyzers/example/tests}/test_example_base.py (100%) rename {analyzers/example => packages/analyzers/example/tests}/test_example_report.py (100%) create mode 100644 packages/analyzers/hashtags/README.md create mode 100644 packages/analyzers/hashtags/pyproject.toml create mode 100644 packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/__init__.py rename {analyzers/hashtags/hashtags_base => packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base}/__init__.py (100%) rename {analyzers/hashtags/hashtags_base => packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base}/interface.py (100%) rename {analyzers/hashtags/hashtags_base => packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base}/main.py (100%) rename {analyzers/hashtags/hashtags_web => 
packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web}/__init__.py (100%) rename {analyzers/hashtags/hashtags_web => packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web}/analysis.py (100%) rename {analyzers/hashtags/hashtags_web => packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web}/app.py (100%) rename {analyzers/hashtags/hashtags_web => packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web}/factory.py (100%) rename {analyzers/hashtags/hashtags_web => packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web}/interface.py (100%) rename {analyzers/hashtags/hashtags_web => packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web}/plots.py (100%) rename {analyzers/hashtags => packages/analyzers/hashtags/tests}/test_data/__init__.py (100%) rename {analyzers/hashtags => packages/analyzers/hashtags/tests}/test_data/hashtag_test_input.csv (100%) rename {analyzers/hashtags => packages/analyzers/hashtags/tests}/test_data/hashtag_test_output.json (100%) rename {analyzers/hashtags => packages/analyzers/hashtags/tests}/test_hashtags_base.py (100%) create mode 100644 packages/analyzers/ngrams/README.md create mode 100644 packages/analyzers/ngrams/pyproject.toml create mode 100644 packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/__init__.py rename {analyzers/ngrams/ngrams_base => packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base}/__init__.py (100%) rename {analyzers/ngrams/ngrams_base => packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base}/interface.py (100%) rename {analyzers/ngrams/ngrams_base => packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base}/main.py (100%) rename {analyzers/ngrams/ngram_stats => packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats}/__init__.py (100%) rename {analyzers/ngrams/ngram_stats => packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats}/interface.py (100%) rename {analyzers/ngrams/ngram_stats => 
packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats}/main.py (100%) rename {analyzers/ngrams/ngram_web => packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web}/__init__.py (100%) rename {analyzers/ngrams/ngram_web => packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web}/app.py (100%) rename {analyzers/ngrams/ngram_web => packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web}/factory.py (100%) rename {analyzers/ngrams/ngram_web => packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web}/interface.py (100%) rename {analyzers/ngrams => packages/analyzers/ngrams/tests}/test_data/__init__.py (100%) rename {analyzers/ngrams => packages/analyzers/ngrams/tests}/test_data/message_authors.parquet (100%) rename {analyzers/ngrams => packages/analyzers/ngrams/tests}/test_data/message_ngrams.parquet (100%) rename {analyzers/ngrams => packages/analyzers/ngrams/tests}/test_data/ngram_full.parquet (100%) rename {analyzers/ngrams => packages/analyzers/ngrams/tests}/test_data/ngram_stats.parquet (100%) rename {analyzers/ngrams => packages/analyzers/ngrams/tests}/test_data/ngrams.parquet (100%) rename {analyzers/ngrams => packages/analyzers/ngrams/tests}/test_data/ngrams_test_input.csv (100%) rename {analyzers/ngrams => packages/analyzers/ngrams/tests}/test_ngram_stats.py (100%) rename {analyzers/ngrams => packages/analyzers/ngrams/tests}/test_ngrams_base.py (100%) create mode 100644 packages/analyzers/temporal/README.md create mode 100644 packages/analyzers/temporal/pyproject.toml create mode 100644 packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/__init__.py rename {analyzers/temporal/temporal_base => packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base}/__init__.py (100%) rename {analyzers/temporal/temporal_base => packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base}/interface.py (100%) rename {analyzers/temporal/temporal_base => 
packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base}/main.py (100%) rename {analyzers/temporal/temporal_web => packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web}/__init__.py (100%) rename {analyzers/temporal/temporal_web => packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web}/factory.py (100%) rename {analyzers/temporal/temporal_web => packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web}/interface.py (100%) create mode 100644 packages/analyzers/time_coordination/README.md create mode 100644 packages/analyzers/time_coordination/pyproject.toml create mode 100644 packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination/__init__.py rename {analyzers/time_coordination => packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination}/interface.py (100%) rename {analyzers/time_coordination => packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination}/main.py (100%) create mode 100644 packages/testing/pyproject.toml create mode 100644 packages/testing/src/cibmangotree_testing/__init__.py rename {testing => packages/testing/src/cibmangotree_testing}/comparers.py (100%) rename {testing => packages/testing/src/cibmangotree_testing}/context.py (90%) rename {testing => packages/testing/src/cibmangotree_testing}/testdata.py (98%) rename {testing => packages/testing/src/cibmangotree_testing}/testers.py (97%) create mode 100644 packages/tokenizers/basic/pyproject.toml rename {services/tokenizer/basic => packages/tokenizers/basic/src/cibmangotree_tokenizer_basic}/__init__.py (92%) rename {services/tokenizer/basic => packages/tokenizers/basic/src/cibmangotree_tokenizer_basic}/patterns.py (100%) rename {services/tokenizer/basic => packages/tokenizers/basic/src/cibmangotree_tokenizer_basic}/tokenizer.py (98%) rename {services/tokenizer/basic => packages/tokenizers/basic/tests}/test_basic_tokenizer.py (99%) delete mode 100644 testing/__init__.py diff --git 
a/packages/analyzers/example/README.md b/packages/analyzers/example/README.md new file mode 100644 index 00000000..f51a7120 --- /dev/null +++ b/packages/analyzers/example/README.md @@ -0,0 +1,23 @@ +# CIBMangoTree Example Analyzer + +Example analyzer demonstrating character count analysis. + +## Description + +This is an example analyzer that counts the number of characters in each message. It serves as a template for creating new analyzers. + +## Installation + +```bash +pip install cibmangotree-analyzer-example +``` + +## Features + +- Primary analysis: Character count per message +- Secondary analysis: Report generation +- Web presentation: Interactive dashboard + +## Usage + +This analyzer is automatically discovered by CIBMangoTree when installed. diff --git a/packages/analyzers/example/pyproject.toml b/packages/analyzers/example/pyproject.toml new file mode 100644 index 00000000..1877dae5 --- /dev/null +++ b/packages/analyzers/example/pyproject.toml @@ -0,0 +1,39 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "cibmangotree-analyzer-example" +version = "0.1.0" +description = "Example analyzer for character count analysis" +readme = "README.md" +requires-python = ">=3.11" +license = {text = "MIT"} +authors = [ + {name = "CivicTech", email = "info@civictech.org"} +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] + +dependencies = [ + "cibmangotree", + "polars>=0.20.0", +] + +[project.entry-points."cibmangotree.analyzers"] +example = "cibmangotree_analyzer_example:get_interface" + +[project.optional-dependencies] +dev = [ + "pytest>=7.0.0", + "pytest-cov>=4.0.0", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/cibmangotree_analyzer_example"] diff --git 
a/packages/analyzers/example/src/cibmangotree_analyzer_example/__init__.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/__init__.py new file mode 100644 index 00000000..8fa95cb9 --- /dev/null +++ b/packages/analyzers/example/src/cibmangotree_analyzer_example/__init__.py @@ -0,0 +1,19 @@ +"""Example analyzer package for CIBMangoTree.""" + +__version__ = "0.1.0" + +from .base.interface import interface as base_interface +from .report.interface import interface as report_interface +from .web.interface import interface as web_interface + + +def get_interface(): + """Return the analyzer interface for plugin discovery.""" + return { + "base": base_interface, + "report": report_interface, + "web": web_interface, + } + + +__all__ = ["get_interface", "__version__"] diff --git a/analyzers/example/example_base/__init__.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/base/__init__.py similarity index 100% rename from analyzers/example/example_base/__init__.py rename to packages/analyzers/example/src/cibmangotree_analyzer_example/base/__init__.py diff --git a/analyzers/example/example_base/default_params.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/base/default_params.py similarity index 100% rename from analyzers/example/example_base/default_params.py rename to packages/analyzers/example/src/cibmangotree_analyzer_example/base/default_params.py diff --git a/analyzers/example/example_base/interface.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/base/interface.py similarity index 100% rename from analyzers/example/example_base/interface.py rename to packages/analyzers/example/src/cibmangotree_analyzer_example/base/interface.py diff --git a/analyzers/example/example_base/main.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/base/main.py similarity index 100% rename from analyzers/example/example_base/main.py rename to 
packages/analyzers/example/src/cibmangotree_analyzer_example/base/main.py diff --git a/analyzers/example/example_report/__init__.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/report/__init__.py similarity index 100% rename from analyzers/example/example_report/__init__.py rename to packages/analyzers/example/src/cibmangotree_analyzer_example/report/__init__.py diff --git a/analyzers/example/example_report/interface.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/report/interface.py similarity index 100% rename from analyzers/example/example_report/interface.py rename to packages/analyzers/example/src/cibmangotree_analyzer_example/report/interface.py diff --git a/analyzers/example/example_report/main.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/report/main.py similarity index 100% rename from analyzers/example/example_report/main.py rename to packages/analyzers/example/src/cibmangotree_analyzer_example/report/main.py diff --git a/analyzers/example/example_web/__init__.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/web/__init__.py similarity index 100% rename from analyzers/example/example_web/__init__.py rename to packages/analyzers/example/src/cibmangotree_analyzer_example/web/__init__.py diff --git a/analyzers/example/example_web/factory.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/web/factory.py similarity index 100% rename from analyzers/example/example_web/factory.py rename to packages/analyzers/example/src/cibmangotree_analyzer_example/web/factory.py diff --git a/analyzers/example/example_web/interface.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/web/interface.py similarity index 100% rename from analyzers/example/example_web/interface.py rename to packages/analyzers/example/src/cibmangotree_analyzer_example/web/interface.py diff --git a/analyzers/example/test_data/__init__.py b/packages/analyzers/example/tests/test_data/__init__.py 
similarity index 100% rename from analyzers/example/test_data/__init__.py rename to packages/analyzers/example/tests/test_data/__init__.py diff --git a/analyzers/example/test_data/input.csv b/packages/analyzers/example/tests/test_data/input.csv similarity index 100% rename from analyzers/example/test_data/input.csv rename to packages/analyzers/example/tests/test_data/input.csv diff --git a/analyzers/example/test_data/output.csv b/packages/analyzers/example/tests/test_data/output.csv similarity index 100% rename from analyzers/example/test_data/output.csv rename to packages/analyzers/example/tests/test_data/output.csv diff --git a/analyzers/example/test_data/output_report.csv b/packages/analyzers/example/tests/test_data/output_report.csv similarity index 100% rename from analyzers/example/test_data/output_report.csv rename to packages/analyzers/example/tests/test_data/output_report.csv diff --git a/analyzers/example/test_example_base.py b/packages/analyzers/example/tests/test_example_base.py similarity index 100% rename from analyzers/example/test_example_base.py rename to packages/analyzers/example/tests/test_example_base.py diff --git a/analyzers/example/test_example_report.py b/packages/analyzers/example/tests/test_example_report.py similarity index 100% rename from analyzers/example/test_example_report.py rename to packages/analyzers/example/tests/test_example_report.py diff --git a/packages/analyzers/hashtags/README.md b/packages/analyzers/hashtags/README.md new file mode 100644 index 00000000..ba92977e --- /dev/null +++ b/packages/analyzers/hashtags/README.md @@ -0,0 +1,23 @@ +# CIBMangoTree Hashtags Analyzer + +Hashtag analysis for measuring coordination via hashtag usage concentration. + +## Description + +Analysis of hashtags measures the extent of online coordination among social media users by looking at how the usage of hashtags in messages changes over time. 
It measures whether certain hashtags are being used more frequently than others (i.e., trending). + +## Installation + +```bash +pip install cibmangotree-analyzer-hashtags +``` + +## Features + +- Gini coefficient calculation for hashtag distribution +- Time-windowed analysis of hashtag concentration +- Interactive visualizations showing coordination patterns + +## Usage + +This analyzer is automatically discovered by CIBMangoTree when installed. diff --git a/packages/analyzers/hashtags/pyproject.toml b/packages/analyzers/hashtags/pyproject.toml new file mode 100644 index 00000000..6c61117a --- /dev/null +++ b/packages/analyzers/hashtags/pyproject.toml @@ -0,0 +1,40 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "cibmangotree-analyzer-hashtags" +version = "0.1.0" +description = "Hashtag analysis for measuring coordination via hashtag usage concentration" +readme = "README.md" +requires-python = ">=3.11" +license = {text = "MIT"} +authors = [ + {name = "CivicTech", email = "info@civictech.org"} +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] + +dependencies = [ + "cibmangotree", + "polars>=0.20.0", + "plotly>=5.0.0", +] + +[project.entry-points."cibmangotree.analyzers"] +hashtags = "cibmangotree_analyzer_hashtags:get_interface" + +[project.optional-dependencies] +dev = [ + "pytest>=7.0.0", + "pytest-cov>=4.0.0", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/cibmangotree_analyzer_hashtags"] diff --git a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/__init__.py b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/__init__.py new file mode 100644 index 00000000..4d66cfa3 --- /dev/null +++ 
b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/__init__.py @@ -0,0 +1,17 @@ +"""Hashtag analyzer package for CIBMangoTree.""" + +__version__ = "0.1.0" + +from .base.interface import interface as base_interface +from .web.interface import interface as web_interface + + +def get_interface(): + """Return the analyzer interface for plugin discovery.""" + return { + "base": base_interface, + "web": web_interface, + } + + +__all__ = ["get_interface", "__version__"] diff --git a/analyzers/hashtags/hashtags_base/__init__.py b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base/__init__.py similarity index 100% rename from analyzers/hashtags/hashtags_base/__init__.py rename to packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base/__init__.py diff --git a/analyzers/hashtags/hashtags_base/interface.py b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base/interface.py similarity index 100% rename from analyzers/hashtags/hashtags_base/interface.py rename to packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base/interface.py diff --git a/analyzers/hashtags/hashtags_base/main.py b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base/main.py similarity index 100% rename from analyzers/hashtags/hashtags_base/main.py rename to packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base/main.py diff --git a/analyzers/hashtags/hashtags_web/__init__.py b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/__init__.py similarity index 100% rename from analyzers/hashtags/hashtags_web/__init__.py rename to packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/__init__.py diff --git a/analyzers/hashtags/hashtags_web/analysis.py b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/analysis.py similarity index 100% rename from analyzers/hashtags/hashtags_web/analysis.py rename to 
packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/analysis.py diff --git a/analyzers/hashtags/hashtags_web/app.py b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/app.py similarity index 100% rename from analyzers/hashtags/hashtags_web/app.py rename to packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/app.py diff --git a/analyzers/hashtags/hashtags_web/factory.py b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/factory.py similarity index 100% rename from analyzers/hashtags/hashtags_web/factory.py rename to packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/factory.py diff --git a/analyzers/hashtags/hashtags_web/interface.py b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/interface.py similarity index 100% rename from analyzers/hashtags/hashtags_web/interface.py rename to packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/interface.py diff --git a/analyzers/hashtags/hashtags_web/plots.py b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/plots.py similarity index 100% rename from analyzers/hashtags/hashtags_web/plots.py rename to packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/plots.py diff --git a/analyzers/hashtags/test_data/__init__.py b/packages/analyzers/hashtags/tests/test_data/__init__.py similarity index 100% rename from analyzers/hashtags/test_data/__init__.py rename to packages/analyzers/hashtags/tests/test_data/__init__.py diff --git a/analyzers/hashtags/test_data/hashtag_test_input.csv b/packages/analyzers/hashtags/tests/test_data/hashtag_test_input.csv similarity index 100% rename from analyzers/hashtags/test_data/hashtag_test_input.csv rename to packages/analyzers/hashtags/tests/test_data/hashtag_test_input.csv diff --git a/analyzers/hashtags/test_data/hashtag_test_output.json b/packages/analyzers/hashtags/tests/test_data/hashtag_test_output.json similarity index 100% rename from 
analyzers/hashtags/test_data/hashtag_test_output.json rename to packages/analyzers/hashtags/tests/test_data/hashtag_test_output.json diff --git a/analyzers/hashtags/test_hashtags_base.py b/packages/analyzers/hashtags/tests/test_hashtags_base.py similarity index 100% rename from analyzers/hashtags/test_hashtags_base.py rename to packages/analyzers/hashtags/tests/test_hashtags_base.py diff --git a/packages/analyzers/ngrams/README.md b/packages/analyzers/ngrams/README.md new file mode 100644 index 00000000..e8e6690f --- /dev/null +++ b/packages/analyzers/ngrams/README.md @@ -0,0 +1,25 @@ +# CIBMangoTree N-grams Analyzer + +N-gram analysis with multilingual tokenization support. + +## Description + +The n-gram analysis extracts n-grams (sequences of n words) from the text data and counts the occurrences of each n-gram in each message, linking the message author to the ngram frequency. + +## Installation + +```bash +pip install cibmangotree-analyzer-ngrams +``` + +## Features + +- Unicode-aware tokenization +- Multilingual support (Latin, CJK, Arabic scripts) +- Social media entity preservation (hashtags, mentions, URLs) +- Configurable n-gram length (min and max) +- Statistical analysis of n-gram distributions + +## Usage + +This analyzer is automatically discovered by CIBMangoTree when installed. 
diff --git a/packages/analyzers/ngrams/pyproject.toml b/packages/analyzers/ngrams/pyproject.toml new file mode 100644 index 00000000..f03838df --- /dev/null +++ b/packages/analyzers/ngrams/pyproject.toml @@ -0,0 +1,40 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "cibmangotree-analyzer-ngrams" +version = "0.1.0" +description = "N-gram analysis with multilingual tokenization support" +readme = "README.md" +requires-python = ">=3.11" +license = {text = "MIT"} +authors = [ + {name = "CivicTech", email = "info@civictech.org"} +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] + +dependencies = [ + "cibmangotree", + "polars>=0.20.0", + "plotly>=5.0.0", +] + +[project.entry-points."cibmangotree.analyzers"] +ngrams = "cibmangotree_analyzer_ngrams:get_interface" + +[project.optional-dependencies] +dev = [ + "pytest>=7.0.0", + "pytest-cov>=4.0.0", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/cibmangotree_analyzer_ngrams"] diff --git a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/__init__.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/__init__.py new file mode 100644 index 00000000..186e0a0f --- /dev/null +++ b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/__init__.py @@ -0,0 +1,19 @@ +"""N-gram analyzer package for CIBMangoTree.""" + +__version__ = "0.1.0" + +from .base.interface import interface as base_interface +from .stats.interface import interface as stats_interface +from .web.interface import interface as web_interface + + +def get_interface(): + """Return the analyzer interface for plugin discovery.""" + return { + "base": base_interface, + "stats": stats_interface, + "web": web_interface, + } + + +__all__ = ["get_interface", 
"__version__"] diff --git a/analyzers/ngrams/ngrams_base/__init__.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/__init__.py similarity index 100% rename from analyzers/ngrams/ngrams_base/__init__.py rename to packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/__init__.py diff --git a/analyzers/ngrams/ngrams_base/interface.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/interface.py similarity index 100% rename from analyzers/ngrams/ngrams_base/interface.py rename to packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/interface.py diff --git a/analyzers/ngrams/ngrams_base/main.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/main.py similarity index 100% rename from analyzers/ngrams/ngrams_base/main.py rename to packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/main.py diff --git a/analyzers/ngrams/ngram_stats/__init__.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/__init__.py similarity index 100% rename from analyzers/ngrams/ngram_stats/__init__.py rename to packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/__init__.py diff --git a/analyzers/ngrams/ngram_stats/interface.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/interface.py similarity index 100% rename from analyzers/ngrams/ngram_stats/interface.py rename to packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/interface.py diff --git a/analyzers/ngrams/ngram_stats/main.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/main.py similarity index 100% rename from analyzers/ngrams/ngram_stats/main.py rename to packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/main.py diff --git a/analyzers/ngrams/ngram_web/__init__.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/__init__.py similarity index 100% rename from analyzers/ngrams/ngram_web/__init__.py rename to 
packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/__init__.py diff --git a/analyzers/ngrams/ngram_web/app.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/app.py similarity index 100% rename from analyzers/ngrams/ngram_web/app.py rename to packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/app.py diff --git a/analyzers/ngrams/ngram_web/factory.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/factory.py similarity index 100% rename from analyzers/ngrams/ngram_web/factory.py rename to packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/factory.py diff --git a/analyzers/ngrams/ngram_web/interface.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/interface.py similarity index 100% rename from analyzers/ngrams/ngram_web/interface.py rename to packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/interface.py diff --git a/analyzers/ngrams/test_data/__init__.py b/packages/analyzers/ngrams/tests/test_data/__init__.py similarity index 100% rename from analyzers/ngrams/test_data/__init__.py rename to packages/analyzers/ngrams/tests/test_data/__init__.py diff --git a/analyzers/ngrams/test_data/message_authors.parquet b/packages/analyzers/ngrams/tests/test_data/message_authors.parquet similarity index 100% rename from analyzers/ngrams/test_data/message_authors.parquet rename to packages/analyzers/ngrams/tests/test_data/message_authors.parquet diff --git a/analyzers/ngrams/test_data/message_ngrams.parquet b/packages/analyzers/ngrams/tests/test_data/message_ngrams.parquet similarity index 100% rename from analyzers/ngrams/test_data/message_ngrams.parquet rename to packages/analyzers/ngrams/tests/test_data/message_ngrams.parquet diff --git a/analyzers/ngrams/test_data/ngram_full.parquet b/packages/analyzers/ngrams/tests/test_data/ngram_full.parquet similarity index 100% rename from analyzers/ngrams/test_data/ngram_full.parquet rename to 
packages/analyzers/ngrams/tests/test_data/ngram_full.parquet diff --git a/analyzers/ngrams/test_data/ngram_stats.parquet b/packages/analyzers/ngrams/tests/test_data/ngram_stats.parquet similarity index 100% rename from analyzers/ngrams/test_data/ngram_stats.parquet rename to packages/analyzers/ngrams/tests/test_data/ngram_stats.parquet diff --git a/analyzers/ngrams/test_data/ngrams.parquet b/packages/analyzers/ngrams/tests/test_data/ngrams.parquet similarity index 100% rename from analyzers/ngrams/test_data/ngrams.parquet rename to packages/analyzers/ngrams/tests/test_data/ngrams.parquet diff --git a/analyzers/ngrams/test_data/ngrams_test_input.csv b/packages/analyzers/ngrams/tests/test_data/ngrams_test_input.csv similarity index 100% rename from analyzers/ngrams/test_data/ngrams_test_input.csv rename to packages/analyzers/ngrams/tests/test_data/ngrams_test_input.csv diff --git a/analyzers/ngrams/test_ngram_stats.py b/packages/analyzers/ngrams/tests/test_ngram_stats.py similarity index 100% rename from analyzers/ngrams/test_ngram_stats.py rename to packages/analyzers/ngrams/tests/test_ngram_stats.py diff --git a/analyzers/ngrams/test_ngrams_base.py b/packages/analyzers/ngrams/tests/test_ngrams_base.py similarity index 100% rename from analyzers/ngrams/test_ngrams_base.py rename to packages/analyzers/ngrams/tests/test_ngrams_base.py diff --git a/packages/analyzers/temporal/README.md b/packages/analyzers/temporal/README.md new file mode 100644 index 00000000..b637dea7 --- /dev/null +++ b/packages/analyzers/temporal/README.md @@ -0,0 +1,24 @@ +# CIBMangoTree Temporal Analyzer + +Temporal analysis for identifying periodic activity patterns. + +## Description + +This analysis breaks down timestamped data into granular components and groups events into custom time intervals to analyze activity patterns. It helps pinpoint when events occur and reveals temporal patterns that may indicate coordinated activity. 
+ +## Installation + +```bash +pip install cibmangotree-analyzer-temporal +``` + +## Features + +- Time-based feature extraction (hour, minute, time of day) +- Custom interval grouping +- Activity pattern visualization +- Detection of irregularities and spikes + +## Usage + +This analyzer is automatically discovered by CIBMangoTree when installed. diff --git a/packages/analyzers/temporal/pyproject.toml b/packages/analyzers/temporal/pyproject.toml new file mode 100644 index 00000000..970e561e --- /dev/null +++ b/packages/analyzers/temporal/pyproject.toml @@ -0,0 +1,39 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "cibmangotree-analyzer-temporal" +version = "0.1.0" +description = "Temporal analysis for identifying periodic activity patterns" +readme = "README.md" +requires-python = ">=3.11" +license = {text = "MIT"} +authors = [ + {name = "CivicTech", email = "info@civictech.org"} +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] + +dependencies = [ + "cibmangotree", + "polars>=0.20.0", +] + +[project.entry-points."cibmangotree.analyzers"] +temporal = "cibmangotree_analyzer_temporal:get_interface" + +[project.optional-dependencies] +dev = [ + "pytest>=7.0.0", + "pytest-cov>=4.0.0", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/cibmangotree_analyzer_temporal"] diff --git a/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/__init__.py b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/__init__.py new file mode 100644 index 00000000..a02dc6c2 --- /dev/null +++ b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/__init__.py @@ -0,0 +1,17 @@ +"""Temporal analyzer package for CIBMangoTree.""" + +__version__ = "0.1.0" + +from .base.interface import 
interface as base_interface +from .web.interface import interface as web_interface + + +def get_interface(): + """Return the analyzer interface for plugin discovery.""" + return { + "base": base_interface, + "web": web_interface, + } + + +__all__ = ["get_interface", "__version__"] diff --git a/analyzers/temporal/temporal_base/__init__.py b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base/__init__.py similarity index 100% rename from analyzers/temporal/temporal_base/__init__.py rename to packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base/__init__.py diff --git a/analyzers/temporal/temporal_base/interface.py b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base/interface.py similarity index 100% rename from analyzers/temporal/temporal_base/interface.py rename to packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base/interface.py diff --git a/analyzers/temporal/temporal_base/main.py b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base/main.py similarity index 100% rename from analyzers/temporal/temporal_base/main.py rename to packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base/main.py diff --git a/analyzers/temporal/temporal_web/__init__.py b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/__init__.py similarity index 100% rename from analyzers/temporal/temporal_web/__init__.py rename to packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/__init__.py diff --git a/analyzers/temporal/temporal_web/factory.py b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/factory.py similarity index 100% rename from analyzers/temporal/temporal_web/factory.py rename to packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/factory.py diff --git a/analyzers/temporal/temporal_web/interface.py b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/interface.py similarity index 100% rename from 
analyzers/temporal/temporal_web/interface.py rename to packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/interface.py diff --git a/packages/analyzers/time_coordination/README.md b/packages/analyzers/time_coordination/README.md new file mode 100644 index 00000000..39956d45 --- /dev/null +++ b/packages/analyzers/time_coordination/README.md @@ -0,0 +1,23 @@ +# CIBMangoTree Time Coordination Analyzer + +Time coordination analysis for identifying synchronized posting patterns. + +## Description + +This analysis measures time coordination between users by examining correlated user pairings. It calculates how often two users post within the same 15-minute time window, with windows sliding every 5 minutes. A high frequency of co-occurrence suggests potential coordination between the users. + +## Installation + +```bash +pip install cibmangotree-analyzer-time-coordination +``` + +## Features + +- Sliding time window analysis (15-minute windows, 5-minute slides) +- User pair co-occurrence counting +- Coordination pattern detection + +## Usage + +This analyzer is automatically discovered by CIBMangoTree when installed. 
diff --git a/packages/analyzers/time_coordination/pyproject.toml b/packages/analyzers/time_coordination/pyproject.toml new file mode 100644 index 00000000..c9b2a103 --- /dev/null +++ b/packages/analyzers/time_coordination/pyproject.toml @@ -0,0 +1,39 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "cibmangotree-analyzer-time-coordination" +version = "0.1.0" +description = "Time coordination analysis for identifying synchronized posting patterns" +readme = "README.md" +requires-python = ">=3.11" +license = {text = "MIT"} +authors = [ + {name = "CivicTech", email = "info@civictech.org"} +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] + +dependencies = [ + "cibmangotree", + "polars>=0.20.0", +] + +[project.entry-points."cibmangotree.analyzers"] +time_coordination = "cibmangotree_analyzer_time_coordination:get_interface" + +[project.optional-dependencies] +dev = [ + "pytest>=7.0.0", + "pytest-cov>=4.0.0", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/cibmangotree_analyzer_time_coordination"] diff --git a/packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination/__init__.py b/packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination/__init__.py new file mode 100644 index 00000000..5c1de6b8 --- /dev/null +++ b/packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination/__init__.py @@ -0,0 +1,13 @@ +"""Time coordination analyzer package for CIBMangoTree.""" + +__version__ = "0.1.0" + +from .interface import interface + + +def get_interface(): + """Return the analyzer interface for plugin discovery.""" + return interface + + +__all__ = ["get_interface", "__version__"] diff --git a/analyzers/time_coordination/interface.py 
b/packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination/interface.py similarity index 100% rename from analyzers/time_coordination/interface.py rename to packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination/interface.py diff --git a/analyzers/time_coordination/main.py b/packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination/main.py similarity index 100% rename from analyzers/time_coordination/main.py rename to packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination/main.py diff --git a/packages/testing/pyproject.toml b/packages/testing/pyproject.toml new file mode 100644 index 00000000..d98f00ff --- /dev/null +++ b/packages/testing/pyproject.toml @@ -0,0 +1,42 @@ +[project] +name = "cibmangotree-testing" +version = "0.1.0" +requires-python = ">=3.12" +description = "Testing utilities for CIB Mango Tree" +readme = "../../README.md" +license = {text = "MIT"} +authors = [ + {name = "Civic Tech DC", email = "info@civictechdc.org"} +] +keywords = ["testing", "utilities", "cibmangotree"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3.12", + "Topic :: Software Development :: Testing", +] + +# Testing utilities dependencies +dependencies = [ + # Core package dependency + "cibmangotree>=0.1.0", + + # Testing frameworks + "pytest>=8.3.4", + "pytest-benchmark>=5.1.0", + + # Data processing (for test data utilities) + "polars>=1.9.0", + + # Data models & validation + "pydantic>=2.9.1", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +# Hatchling build configuration +[tool.hatchling.build.targets.wheel] +packages = ["src/cibmangotree_testing"] diff --git a/packages/testing/src/cibmangotree_testing/__init__.py b/packages/testing/src/cibmangotree_testing/__init__.py new file mode 100644 index 00000000..c5f9ac5e --- 
/dev/null +++ b/packages/testing/src/cibmangotree_testing/__init__.py @@ -0,0 +1,49 @@ +"""Testing utilities for CIB Mango Tree analyzers. + +This package provides utilities for testing analyzers including: +- Test data loaders (CSV, JSON, Excel, Parquet) +- Test context implementations for primary and secondary analyzers +- DataFrame comparison utilities +- Test runner functions +""" + +from .comparers import compare_dfs +from .context import ( + TestOutputReaderGroupContext, + TestOutputWriter, + TestPrimaryAnalyzerContext, + TestSecondaryAnalyzerContext, + TestTableReader, +) +from .testdata import ( + CsvConfig, + CsvTestData, + ExcelTestData, + JsonTestData, + ParquetTestData, + PolarsTestData, + TestData, +) +from .testers import test_primary_analyzer, test_secondary_analyzer + +__all__ = [ + # Test data loaders + "CsvConfig", + "CsvTestData", + "ExcelTestData", + "JsonTestData", + "ParquetTestData", + "PolarsTestData", + "TestData", + # Test contexts + "TestPrimaryAnalyzerContext", + "TestSecondaryAnalyzerContext", + "TestOutputReaderGroupContext", + "TestTableReader", + "TestOutputWriter", + # Comparison utilities + "compare_dfs", + # Test runners + "test_primary_analyzer", + "test_secondary_analyzer", +] diff --git a/testing/comparers.py b/packages/testing/src/cibmangotree_testing/comparers.py similarity index 100% rename from testing/comparers.py rename to packages/testing/src/cibmangotree_testing/comparers.py diff --git a/testing/context.py b/packages/testing/src/cibmangotree_testing/context.py similarity index 90% rename from testing/context.py rename to packages/testing/src/cibmangotree_testing/context.py index 5260ff9c..03cabecf 100644 --- a/testing/context.py +++ b/packages/testing/src/cibmangotree_testing/context.py @@ -5,15 +5,15 @@ import polars as pl from pydantic import BaseModel -from analyzer_interface import ParamValue, SecondaryAnalyzerInterface -from analyzer_interface.context import AssetsReader, InputTableReader -from 
analyzer_interface.context import ( +from cibmangotree.analyzer_interface import ParamValue, SecondaryAnalyzerInterface +from cibmangotree.analyzer_interface.context import AssetsReader, InputTableReader +from cibmangotree.analyzer_interface.context import ( PrimaryAnalyzerContext as BasePrimaryAnalyzerContext, ) -from analyzer_interface.context import ( +from cibmangotree.analyzer_interface.context import ( SecondaryAnalyzerContext as BaseSecondaryAnalyzerContext, ) -from analyzer_interface.context import TableReader, TableWriter +from cibmangotree.analyzer_interface.context import TableReader, TableWriter class TestPrimaryAnalyzerContext(BasePrimaryAnalyzerContext): diff --git a/testing/testdata.py b/packages/testing/src/cibmangotree_testing/testdata.py similarity index 98% rename from testing/testdata.py rename to packages/testing/src/cibmangotree_testing/testdata.py index 1cfd61d8..b6827a4b 100644 --- a/testing/testdata.py +++ b/packages/testing/src/cibmangotree_testing/testdata.py @@ -4,7 +4,7 @@ import polars as pl from pydantic import BaseModel -from preprocessing.series_semantic import SeriesSemantic +from cibmangotree.preprocessing.series_semantic import SeriesSemantic T = TypeVar("T", bound=pl.DataFrame | pl.LazyFrame) diff --git a/testing/testers.py b/packages/testing/src/cibmangotree_testing/testers.py similarity index 97% rename from testing/testers.py rename to packages/testing/src/cibmangotree_testing/testers.py index b3c83f24..7955439d 100644 --- a/testing/testers.py +++ b/packages/testing/src/cibmangotree_testing/testers.py @@ -6,8 +6,11 @@ import polars as pl import pytest -from analyzer_interface import AnalyzerInterface, ParamValue -from analyzer_interface.context import PrimaryAnalyzerContext, SecondaryAnalyzerContext +from cibmangotree.analyzer_interface import AnalyzerInterface, ParamValue +from cibmangotree.analyzer_interface.context import ( + PrimaryAnalyzerContext, + SecondaryAnalyzerContext, +) from .comparers import compare_dfs from 
.context import TestPrimaryAnalyzerContext, TestSecondaryAnalyzerContext diff --git a/packages/tokenizers/basic/pyproject.toml b/packages/tokenizers/basic/pyproject.toml new file mode 100644 index 00000000..1b153c15 --- /dev/null +++ b/packages/tokenizers/basic/pyproject.toml @@ -0,0 +1,39 @@ +[project] +name = "cibmangotree-tokenizer-basic" +version = "0.1.0" +requires-python = ">=3.12" +description = "Basic tokenizer implementation for CIB Mango Tree" +readme = "../../../README.md" +license = {text = "MIT"} +authors = [ + {name = "Civic Tech DC", email = "info@civictechdc.org"} +] +keywords = ["tokenizer", "nlp", "text-processing", "social-media"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3.12", + "Topic :: Text Processing :: Linguistic", +] + +# Basic tokenizer dependencies +dependencies = [ + # Core package dependency (provides AbstractTokenizer, TokenizerConfig) + "cibmangotree>=0.1.0", + + # Text processing + "regex>=2025.9.1", +] + +# Plugin entry points - auto-discovered by core in dev mode +[project.entry-points."cibmangotree.tokenizers"] +basic = "cibmangotree_tokenizer_basic:BasicTokenizer" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +# Hatchling build configuration +[tool.hatchling.build.targets.wheel] +packages = ["src/cibmangotree_tokenizer_basic"] diff --git a/services/tokenizer/basic/__init__.py b/packages/tokenizers/basic/src/cibmangotree_tokenizer_basic/__init__.py similarity index 92% rename from services/tokenizer/basic/__init__.py rename to packages/tokenizers/basic/src/cibmangotree_tokenizer_basic/__init__.py index 117385a0..7d34a20d 100644 --- a/services/tokenizer/basic/__init__.py +++ b/packages/tokenizers/basic/src/cibmangotree_tokenizer_basic/__init__.py @@ -5,7 +5,8 @@ fundamental Unicode-aware tokenization capabilities for social media text. 
""" -from ..core.types import TokenizerConfig +from cibmangotree.services.tokenizer.core.types import TokenizerConfig + from .patterns import get_patterns from .tokenizer import BasicTokenizer diff --git a/services/tokenizer/basic/patterns.py b/packages/tokenizers/basic/src/cibmangotree_tokenizer_basic/patterns.py similarity index 100% rename from services/tokenizer/basic/patterns.py rename to packages/tokenizers/basic/src/cibmangotree_tokenizer_basic/patterns.py diff --git a/services/tokenizer/basic/tokenizer.py b/packages/tokenizers/basic/src/cibmangotree_tokenizer_basic/tokenizer.py similarity index 98% rename from services/tokenizer/basic/tokenizer.py rename to packages/tokenizers/basic/src/cibmangotree_tokenizer_basic/tokenizer.py index 918ef65f..34ae0d19 100644 --- a/services/tokenizer/basic/tokenizer.py +++ b/packages/tokenizers/basic/src/cibmangotree_tokenizer_basic/tokenizer.py @@ -8,8 +8,13 @@ import re from typing import Optional -from ..core.base import AbstractTokenizer -from ..core.types import LanguageFamily, TokenizerConfig, TokenList +from cibmangotree.services.tokenizer.core.base import AbstractTokenizer +from cibmangotree.services.tokenizer.core.types import ( + LanguageFamily, + TokenizerConfig, + TokenList, +) + from .patterns import get_patterns diff --git a/services/tokenizer/basic/test_basic_tokenizer.py b/packages/tokenizers/basic/tests/test_basic_tokenizer.py similarity index 99% rename from services/tokenizer/basic/test_basic_tokenizer.py rename to packages/tokenizers/basic/tests/test_basic_tokenizer.py index 1d6923f0..3f2ee256 100644 --- a/services/tokenizer/basic/test_basic_tokenizer.py +++ b/packages/tokenizers/basic/tests/test_basic_tokenizer.py @@ -8,8 +8,8 @@ import pytest -from ..core.types import CaseHandling, TokenizerConfig -from .tokenizer import BasicTokenizer +from cibmangotree.services.tokenizer.core.types import CaseHandling, TokenizerConfig +from cibmangotree_tokenizer_basic import BasicTokenizer class 
TestBasicTokenizerMultilingual: diff --git a/testing/__init__.py b/testing/__init__.py deleted file mode 100644 index 962da5f2..00000000 --- a/testing/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from .testdata import ( - CsvConfig, - CsvTestData, - ExcelTestData, - JsonTestData, - ParquetTestData, - PolarsTestData, -) -from .testers import test_primary_analyzer, test_secondary_analyzer From 697cbf64755e05f04c0bcd2b65bf379a6d6b5bb3 Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Thu, 9 Oct 2025 18:41:49 -0400 Subject: [PATCH 05/24] refactor: update all imports to new package structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 5: Systematic import path updates across 64 files Core package imports updated: - from app → from cibmangotree.app - from analyzer_interface → from cibmangotree.analyzer_interface - from components → from cibmangotree.tui.components - from terminal_tools → from cibmangotree.tui.tools - from storage → from cibmangotree.services.storage - from importing → from cibmangotree.services.importing - from preprocessing → from cibmangotree.services.preprocessing Testing imports updated: - from testing → from cibmangotree_testing Analyzer internal imports updated for all 5 analyzers. Zero old-style imports remaining. Phase 5 of monorepo reorganization complete. 
--- .../base/__init__.py | 2 +- .../base/default_params.py | 4 ++-- .../base/interface.py | 2 +- .../cibmangotree_analyzer_example/base/main.py | 4 ++-- .../report/__init__.py | 2 +- .../report/interface.py | 2 +- .../cibmangotree_analyzer_example/report/main.py | 2 +- .../web/__init__.py | 2 +- .../cibmangotree_analyzer_example/web/factory.py | 2 +- .../web/interface.py | 2 +- .../analyzers/example/tests/test_example_base.py | 2 +- .../example/tests/test_example_report.py | 2 +- .../base/__init__.py | 2 +- .../base/interface.py | 4 ++-- .../cibmangotree_analyzer_hashtags/base/main.py | 4 ++-- .../web/__init__.py | 2 +- .../web/factory.py | 2 +- .../web/interface.py | 2 +- .../hashtags/tests/test_hashtags_base.py | 4 ++-- .../base/__init__.py | 2 +- .../base/interface.py | 4 ++-- .../cibmangotree_analyzer_ngrams/base/main.py | 8 ++++---- .../stats/__init__.py | 2 +- .../stats/interface.py | 2 +- .../cibmangotree_analyzer_ngrams/stats/main.py | 4 ++-- .../cibmangotree_analyzer_ngrams/web/__init__.py | 2 +- .../cibmangotree_analyzer_ngrams/web/factory.py | 2 +- .../web/interface.py | 2 +- .../analyzers/ngrams/tests/test_ngram_stats.py | 2 +- .../analyzers/ngrams/tests/test_ngrams_base.py | 2 +- .../base/__init__.py | 2 +- .../base/interface.py | 2 +- .../cibmangotree_analyzer_temporal/base/main.py | 2 +- .../web/__init__.py | 2 +- .../web/factory.py | 2 +- .../web/interface.py | 2 +- .../interface.py | 2 +- .../main.py | 2 +- .../src/cibmangotree/analyzer_interface/suite.py | 4 ++-- .../src/cibmangotree/app/analysis_context.py | 6 +++--- .../cibmangotree/app/analysis_output_context.py | 4 ++-- .../app/analysis_webserver_context.py | 2 +- packages/core/src/cibmangotree/app/app.py | 2 +- .../core/src/cibmangotree/app/app_context.py | 4 ++-- .../core/src/cibmangotree/app/project_context.py | 8 ++++---- .../src/cibmangotree/app/settings_context.py | 2 +- .../core/src/cibmangotree/app/test_logger.py | 2 +- .../core/src/cibmangotree/context/__init__.py | 16 
++++++++-------- .../services/preprocessing/series_semantic.py | 2 +- .../preprocessing/test_series_semantic.py | 2 +- .../cibmangotree/services/storage/__init__.py | 4 ++-- .../cibmangotree/tui/components/analysis_main.py | 4 ++-- .../tui/components/analysis_params.py | 8 ++++---- .../tui/components/analysis_web_server.py | 10 +++++----- .../src/cibmangotree/tui/components/context.py | 4 ++-- .../tui/components/export_outputs.py | 9 ++++----- .../src/cibmangotree/tui/components/main_menu.py | 2 +- .../cibmangotree/tui/components/new_analysis.py | 6 +++--- .../cibmangotree/tui/components/new_project.py | 8 ++++---- .../cibmangotree/tui/components/project_main.py | 4 ++-- .../tui/components/select_analysis.py | 4 ++-- .../tui/components/select_project.py | 4 ++-- .../src/cibmangotree/tui/components/splash.py | 4 ++-- .../core/src/cibmangotree/tui/tools/prompts.py | 2 +- 64 files changed, 111 insertions(+), 112 deletions(-) diff --git a/packages/analyzers/example/src/cibmangotree_analyzer_example/base/__init__.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/base/__init__.py index 4f770e80..10b51464 100644 --- a/packages/analyzers/example/src/cibmangotree_analyzer_example/base/__init__.py +++ b/packages/analyzers/example/src/cibmangotree_analyzer_example/base/__init__.py @@ -1,4 +1,4 @@ -from analyzer_interface import AnalyzerDeclaration +from cibmangotree.analyzer_interface import AnalyzerDeclaration from .default_params import default_params from .interface import interface diff --git a/packages/analyzers/example/src/cibmangotree_analyzer_example/base/default_params.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/base/default_params.py index 759b7584..f3c26f5f 100644 --- a/packages/analyzers/example/src/cibmangotree_analyzer_example/base/default_params.py +++ b/packages/analyzers/example/src/cibmangotree_analyzer_example/base/default_params.py @@ -1,5 +1,5 @@ -from analyzer_interface import ParamValue -from 
analyzer_interface.context import PrimaryAnalyzerContext +from cibmangotree.analyzer_interface import ParamValue +from cibmangotree.analyzer_interface.context import PrimaryAnalyzerContext def default_params(context: PrimaryAnalyzerContext) -> dict[str, ParamValue]: diff --git a/packages/analyzers/example/src/cibmangotree_analyzer_example/base/interface.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/base/interface.py index 857f9e17..69d58e5d 100644 --- a/packages/analyzers/example/src/cibmangotree_analyzer_example/base/interface.py +++ b/packages/analyzers/example/src/cibmangotree_analyzer_example/base/interface.py @@ -1,4 +1,4 @@ -from analyzer_interface import ( +from cibmangotree.analyzer_interface import ( AnalyzerInput, AnalyzerInterface, AnalyzerOutput, diff --git a/packages/analyzers/example/src/cibmangotree_analyzer_example/base/main.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/base/main.py index 7d4d14b3..b3eab984 100644 --- a/packages/analyzers/example/src/cibmangotree_analyzer_example/base/main.py +++ b/packages/analyzers/example/src/cibmangotree_analyzer_example/base/main.py @@ -1,7 +1,7 @@ import polars as pl -from analyzer_interface.context import PrimaryAnalyzerContext -from terminal_tools import ProgressReporter +from cibmangotree.analyzer_interface.context import PrimaryAnalyzerContext +from cibmangotree.tui.tools import ProgressReporter def main(context: PrimaryAnalyzerContext): diff --git a/packages/analyzers/example/src/cibmangotree_analyzer_example/report/__init__.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/report/__init__.py index 4ddfb984..6ac21e43 100644 --- a/packages/analyzers/example/src/cibmangotree_analyzer_example/report/__init__.py +++ b/packages/analyzers/example/src/cibmangotree_analyzer_example/report/__init__.py @@ -1,4 +1,4 @@ -from analyzer_interface import SecondaryAnalyzerDeclaration +from cibmangotree.analyzer_interface import SecondaryAnalyzerDeclaration from 
.interface import interface from .main import main diff --git a/packages/analyzers/example/src/cibmangotree_analyzer_example/report/interface.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/report/interface.py index cf27778e..7fd658b7 100644 --- a/packages/analyzers/example/src/cibmangotree_analyzer_example/report/interface.py +++ b/packages/analyzers/example/src/cibmangotree_analyzer_example/report/interface.py @@ -1,4 +1,4 @@ -from analyzer_interface import AnalyzerOutput, OutputColumn, SecondaryAnalyzerInterface +from cibmangotree.analyzer_interface import AnalyzerOutput, OutputColumn, SecondaryAnalyzerInterface from ..example_base.interface import interface as example_base diff --git a/packages/analyzers/example/src/cibmangotree_analyzer_example/report/main.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/report/main.py index 63210093..745a7240 100644 --- a/packages/analyzers/example/src/cibmangotree_analyzer_example/report/main.py +++ b/packages/analyzers/example/src/cibmangotree_analyzer_example/report/main.py @@ -1,6 +1,6 @@ import polars as pl -from analyzer_interface.context import SecondaryAnalyzerContext +from cibmangotree.analyzer_interface.context import SecondaryAnalyzerContext def main(context: SecondaryAnalyzerContext): diff --git a/packages/analyzers/example/src/cibmangotree_analyzer_example/web/__init__.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/web/__init__.py index be1825f0..ce7a3394 100644 --- a/packages/analyzers/example/src/cibmangotree_analyzer_example/web/__init__.py +++ b/packages/analyzers/example/src/cibmangotree_analyzer_example/web/__init__.py @@ -1,4 +1,4 @@ -from analyzer_interface import WebPresenterDeclaration +from cibmangotree.analyzer_interface import WebPresenterDeclaration from .factory import factory from .interface import interface diff --git a/packages/analyzers/example/src/cibmangotree_analyzer_example/web/factory.py 
b/packages/analyzers/example/src/cibmangotree_analyzer_example/web/factory.py index 8706d65d..b887452f 100644 --- a/packages/analyzers/example/src/cibmangotree_analyzer_example/web/factory.py +++ b/packages/analyzers/example/src/cibmangotree_analyzer_example/web/factory.py @@ -3,7 +3,7 @@ from dash.dcc import Graph from dash.html import Div -from analyzer_interface.context import WebPresenterContext +from cibmangotree.analyzer_interface.context import WebPresenterContext def factory(context: WebPresenterContext): diff --git a/packages/analyzers/example/src/cibmangotree_analyzer_example/web/interface.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/web/interface.py index ba8d355e..38eb56a0 100644 --- a/packages/analyzers/example/src/cibmangotree_analyzer_example/web/interface.py +++ b/packages/analyzers/example/src/cibmangotree_analyzer_example/web/interface.py @@ -1,4 +1,4 @@ -from analyzer_interface import WebPresenterInterface +from cibmangotree.analyzer_interface import WebPresenterInterface from ..example_base import interface as example_base from ..example_report import interface as example_report diff --git a/packages/analyzers/example/tests/test_example_base.py b/packages/analyzers/example/tests/test_example_base.py index 3d6d4d90..7aa29d97 100644 --- a/packages/analyzers/example/tests/test_example_base.py +++ b/packages/analyzers/example/tests/test_example_base.py @@ -1,7 +1,7 @@ import os from preprocessing.series_semantic import identifier -from testing import CsvConfig, CsvTestData, test_primary_analyzer +from cibmangotree_testing import CsvConfig, CsvTestData, test_primary_analyzer from .example_base.interface import interface from .example_base.main import main diff --git a/packages/analyzers/example/tests/test_example_report.py b/packages/analyzers/example/tests/test_example_report.py index f809adf2..71ed1c60 100644 --- a/packages/analyzers/example/tests/test_example_report.py +++ 
b/packages/analyzers/example/tests/test_example_report.py @@ -1,6 +1,6 @@ import os -from testing import CsvTestData, test_secondary_analyzer +from cibmangotree_testing import CsvTestData, test_secondary_analyzer from .example_report.interface import interface from .example_report.main import main diff --git a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base/__init__.py b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base/__init__.py index 0ff3d1f4..a96fc7da 100644 --- a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base/__init__.py +++ b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base/__init__.py @@ -1,4 +1,4 @@ -from analyzer_interface import AnalyzerDeclaration +from cibmangotree.analyzer_interface import AnalyzerDeclaration from .interface import interface from .main import main diff --git a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base/interface.py b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base/interface.py index fbe7d879..5cdbf9e1 100644 --- a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base/interface.py +++ b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base/interface.py @@ -1,4 +1,4 @@ -from analyzer_interface import ( +from cibmangotree.analyzer_interface import ( AnalyzerInput, AnalyzerInterface, AnalyzerOutput, @@ -6,7 +6,7 @@ InputColumn, OutputColumn, ) -from analyzer_interface.params import TimeBinningParam +from cibmangotree.analyzer_interface.params import TimeBinningParam COL_AUTHOR_ID = "Unique UserID" COL_TIME = "Timestamp" diff --git a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base/main.py b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base/main.py index 687fb522..f0c9c454 100644 --- a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base/main.py +++ b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base/main.py @@ -2,8 +2,8 
@@ import polars as pl -from analyzer_interface.context import PrimaryAnalyzerContext -from terminal_tools import ProgressReporter +from cibmangotree.analyzer_interface.context import PrimaryAnalyzerContext +from cibmangotree.tui.tools import ProgressReporter from .interface import ( COL_AUTHOR_ID, diff --git a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/__init__.py b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/__init__.py index ac337e2f..0c8cae94 100644 --- a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/__init__.py +++ b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/__init__.py @@ -1,4 +1,4 @@ -from analyzer_interface import WebPresenterDeclaration +from cibmangotree.analyzer_interface import WebPresenterDeclaration from .factory import factory from .interface import interface diff --git a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/factory.py b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/factory.py index 89ea1878..04b0546f 100644 --- a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/factory.py +++ b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/factory.py @@ -2,7 +2,7 @@ from dash import html from shiny.ui import layout_columns, nav_panel -from analyzer_interface.context import ( +from cibmangotree.analyzer_interface.context import ( FactoryOutputContext, ShinyContext, WebPresenterContext, diff --git a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/interface.py b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/interface.py index 3b2e1ec1..760e5019 100644 --- a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/interface.py +++ b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/interface.py @@ -1,4 +1,4 @@ -from analyzer_interface import WebPresenterInterface +from cibmangotree.analyzer_interface import 
WebPresenterInterface from ..hashtags_base import interface as hashtags_interface diff --git a/packages/analyzers/hashtags/tests/test_hashtags_base.py b/packages/analyzers/hashtags/tests/test_hashtags_base.py index cdc94a81..344f205b 100644 --- a/packages/analyzers/hashtags/tests/test_hashtags_base.py +++ b/packages/analyzers/hashtags/tests/test_hashtags_base.py @@ -3,9 +3,9 @@ import numpy as np import polars as pl -from analyzer_interface.params import TimeBinningValue +from cibmangotree.analyzer_interface.params import TimeBinningValue from preprocessing.series_semantic import datetime_string, identifier, text_catch_all -from testing import CsvTestData, JsonTestData, test_primary_analyzer +from cibmangotree_testing import CsvTestData, JsonTestData, test_primary_analyzer from .hashtags_base.interface import ( COL_AUTHOR_ID, diff --git a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/__init__.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/__init__.py index 3f2eb6dc..73622345 100644 --- a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/__init__.py +++ b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/__init__.py @@ -1,4 +1,4 @@ -from analyzer_interface import AnalyzerDeclaration +from cibmangotree.analyzer_interface import AnalyzerDeclaration from .interface import interface from .main import main diff --git a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/interface.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/interface.py index f5d99e44..31fafee8 100644 --- a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/interface.py +++ b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/interface.py @@ -1,4 +1,4 @@ -from analyzer_interface import ( +from cibmangotree.analyzer_interface import ( AnalyzerInput, AnalyzerInterface, AnalyzerOutput, @@ -6,7 +6,7 @@ InputColumn, OutputColumn, ) -from analyzer_interface.params import 
IntegerParam +from cibmangotree.analyzer_interface.params import IntegerParam COL_AUTHOR_ID = "user_id" COL_MESSAGE_ID = "message_id" diff --git a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/main.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/main.py index 47d58682..a1557807 100644 --- a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/main.py +++ b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/main.py @@ -1,9 +1,9 @@ import polars as pl -from analyzer_interface.context import PrimaryAnalyzerContext -from services.tokenizer.basic import TokenizerConfig, tokenize_text -from services.tokenizer.core.types import CaseHandling -from terminal_tools import ProgressReporter +from cibmangotree.analyzer_interface.context import PrimaryAnalyzerContext +from cibmangotree_tokenizer_basic import TokenizerConfig, tokenize_text +from cibmangotree_tokenizer_basic.core.types import CaseHandling +from cibmangotree.tui.tools import ProgressReporter from .interface import ( COL_AUTHOR_ID, diff --git a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/__init__.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/__init__.py index 652640f1..128d8a77 100644 --- a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/__init__.py +++ b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/__init__.py @@ -1,4 +1,4 @@ -from analyzer_interface import SecondaryAnalyzerDeclaration +from cibmangotree.analyzer_interface import SecondaryAnalyzerDeclaration from .interface import interface from .main import main diff --git a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/interface.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/interface.py index 5b904d08..873eef56 100644 --- a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/interface.py +++ 
b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/interface.py @@ -1,4 +1,4 @@ -from analyzer_interface import AnalyzerOutput, OutputColumn, SecondaryAnalyzerInterface +from cibmangotree.analyzer_interface import AnalyzerOutput, OutputColumn, SecondaryAnalyzerInterface from ..ngrams_base import interface as ngrams_interface from ..ngrams_base.interface import ( diff --git a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/main.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/main.py index 7aa4f961..12073580 100644 --- a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/main.py +++ b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/main.py @@ -2,8 +2,8 @@ import pyarrow as pa import pyarrow.parquet as pq -from analyzer_interface.context import SecondaryAnalyzerContext -from terminal_tools import ProgressReporter +from cibmangotree.analyzer_interface.context import SecondaryAnalyzerContext +from cibmangotree.tui.tools import ProgressReporter from ..ngrams_base.interface import ( COL_AUTHOR_ID, diff --git a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/__init__.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/__init__.py index 70eca761..f85dd24d 100644 --- a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/__init__.py +++ b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/__init__.py @@ -1,4 +1,4 @@ -from analyzer_interface import WebPresenterDeclaration +from cibmangotree.analyzer_interface import WebPresenterDeclaration from .factory import factory from .interface import interface diff --git a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/factory.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/factory.py index 849e8d36..7ec94330 100644 --- a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/factory.py +++ 
b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/factory.py @@ -2,7 +2,7 @@ from dash import html from shiny.ui import nav_panel -from analyzer_interface.context import ( +from cibmangotree.analyzer_interface.context import ( FactoryOutputContext, ShinyContext, WebPresenterContext, diff --git a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/interface.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/interface.py index 203514a4..feaa5c29 100644 --- a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/interface.py +++ b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/interface.py @@ -1,4 +1,4 @@ -from analyzer_interface import WebPresenterInterface +from cibmangotree.analyzer_interface import WebPresenterInterface from ..ngram_stats import interface as ngram_stats_interface from ..ngrams_base import interface as ngrams_interface diff --git a/packages/analyzers/ngrams/tests/test_ngram_stats.py b/packages/analyzers/ngrams/tests/test_ngram_stats.py index 77cfb11b..3a574555 100644 --- a/packages/analyzers/ngrams/tests/test_ngram_stats.py +++ b/packages/analyzers/ngrams/tests/test_ngram_stats.py @@ -1,6 +1,6 @@ from pathlib import Path -from testing import ParquetTestData, test_secondary_analyzer +from cibmangotree_testing import ParquetTestData, test_secondary_analyzer from .ngram_stats.interface import OUTPUT_NGRAM_FULL, OUTPUT_NGRAM_STATS, interface from .ngram_stats.main import main diff --git a/packages/analyzers/ngrams/tests/test_ngrams_base.py b/packages/analyzers/ngrams/tests/test_ngrams_base.py index 4c0a18e9..f61be69a 100644 --- a/packages/analyzers/ngrams/tests/test_ngrams_base.py +++ b/packages/analyzers/ngrams/tests/test_ngrams_base.py @@ -4,7 +4,7 @@ from preprocessing.series_semantic import datetime_string, identifier, text_catch_all from services.tokenizer.basic import TokenizerConfig, tokenize_text from services.tokenizer.core.types import CaseHandling -from testing import 
CsvTestData, ParquetTestData, test_primary_analyzer +from cibmangotree_testing import CsvTestData, ParquetTestData, test_primary_analyzer from .ngrams_base.interface import ( COL_AUTHOR_ID, diff --git a/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base/__init__.py b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base/__init__.py index 6bc8cf1a..710bf75b 100644 --- a/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base/__init__.py +++ b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base/__init__.py @@ -1,4 +1,4 @@ -from analyzer_interface import AnalyzerDeclaration +from cibmangotree.analyzer_interface import AnalyzerDeclaration from .interface import interface from .main import main diff --git a/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base/interface.py b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base/interface.py index 22254e2d..9c9298ce 100644 --- a/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base/interface.py +++ b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base/interface.py @@ -1,4 +1,4 @@ -from analyzer_interface import ( +from cibmangotree.analyzer_interface import ( AnalyzerInput, AnalyzerInterface, AnalyzerOutput, diff --git a/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base/main.py b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base/main.py index e828a49d..6bfbfcf0 100644 --- a/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base/main.py +++ b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base/main.py @@ -2,7 +2,7 @@ import polars as pl -from analyzer_interface.context import PrimaryAnalyzerContext +from cibmangotree.analyzer_interface.context import PrimaryAnalyzerContext from .interface import ( INPUT_COL_TIMESTAMP, diff --git a/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/__init__.py 
b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/__init__.py index c9b0159a..87f324b6 100644 --- a/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/__init__.py +++ b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/__init__.py @@ -1,4 +1,4 @@ -from analyzer_interface import WebPresenterDeclaration +from cibmangotree.analyzer_interface import WebPresenterDeclaration from .factory import factory from .interface import interface diff --git a/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/factory.py b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/factory.py index 40c948ba..5a98d501 100644 --- a/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/factory.py +++ b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/factory.py @@ -3,7 +3,7 @@ from dash.dcc import Graph from dash.html import H2, Div, P -from analyzer_interface.context import WebPresenterContext +from cibmangotree.analyzer_interface.context import WebPresenterContext from ..temporal_base.interface import ( OUTPUT_COL_POST_COUNT, diff --git a/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/interface.py b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/interface.py index d9171092..17757065 100644 --- a/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/interface.py +++ b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/interface.py @@ -1,4 +1,4 @@ -from analyzer_interface import WebPresenterInterface +from cibmangotree.analyzer_interface import WebPresenterInterface from ..temporal_base import interface as temporal_interface diff --git a/packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination/interface.py b/packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination/interface.py index b719dec6..cc6a70b4 100644 --- 
a/packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination/interface.py +++ b/packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination/interface.py @@ -1,4 +1,4 @@ -from analyzer_interface import ( +from cibmangotree.analyzer_interface import ( AnalyzerInput, AnalyzerInterface, AnalyzerOutput, diff --git a/packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination/main.py b/packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination/main.py index 8148b656..374a5b45 100644 --- a/packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination/main.py +++ b/packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination/main.py @@ -2,7 +2,7 @@ import polars as pl -from analyzer_interface.context import PrimaryAnalyzerContext +from cibmangotree.analyzer_interface.context import PrimaryAnalyzerContext from .interface import ( COL_TIMESTAMP, diff --git a/packages/core/src/cibmangotree/analyzer_interface/suite.py b/packages/core/src/cibmangotree/analyzer_interface/suite.py index 4d0a0bca..26f89b70 100644 --- a/packages/core/src/cibmangotree/analyzer_interface/suite.py +++ b/packages/core/src/cibmangotree/analyzer_interface/suite.py @@ -3,14 +3,14 @@ from pydantic import BaseModel -from analyzer_interface import ( +from cibmangotree.analyzer_interface import ( AnalyzerDeclaration, AnalyzerInterface, SecondaryAnalyzerDeclaration, SecondaryAnalyzerInterface, WebPresenterDeclaration, ) -from meta import is_development +from cibmangotree.meta import is_development class AnalyzerSuite(BaseModel): diff --git a/packages/core/src/cibmangotree/app/analysis_context.py b/packages/core/src/cibmangotree/app/analysis_context.py index c316ddfb..93923ad2 100644 --- a/packages/core/src/cibmangotree/app/analysis_context.py +++ b/packages/core/src/cibmangotree/app/analysis_context.py @@ -5,17 +5,17 @@ from pydantic import BaseModel -from analyzer_interface import ( 
+from cibmangotree.analyzer_interface import ( AnalyzerDeclaration, SecondaryAnalyzerDeclaration, backfill_param_values, ) -from context import ( +from cibmangotree.context import ( InputColumnProvider, PrimaryAnalyzerContext, SecondaryAnalyzerContext, ) -from storage import AnalysisModel +from cibmangotree.services.storage import AnalysisModel from .app_context import AppContext from .project_context import ProjectContext diff --git a/packages/core/src/cibmangotree/app/analysis_output_context.py b/packages/core/src/cibmangotree/app/analysis_output_context.py index e6ed0ae8..34572655 100644 --- a/packages/core/src/cibmangotree/app/analysis_output_context.py +++ b/packages/core/src/cibmangotree/app/analysis_output_context.py @@ -3,8 +3,8 @@ from pydantic import BaseModel -from analyzer_interface import AnalyzerOutput, SecondaryAnalyzerInterface -from storage import SupportedOutputExtension +from cibmangotree.analyzer_interface import AnalyzerOutput, SecondaryAnalyzerInterface +from cibmangotree.services.storage import SupportedOutputExtension from .analysis_context import AnalysisContext from .app_context import AppContext diff --git a/packages/core/src/cibmangotree/app/analysis_webserver_context.py b/packages/core/src/cibmangotree/app/analysis_webserver_context.py index 3d958f36..d1fca4da 100644 --- a/packages/core/src/cibmangotree/app/analysis_webserver_context.py +++ b/packages/core/src/cibmangotree/app/analysis_webserver_context.py @@ -14,7 +14,7 @@ from starlette.routing import Mount, Route from uvicorn import Config, Server -from context import WebPresenterContext +from cibmangotree.context import WebPresenterContext from .analysis_context import AnalysisContext from .app_context import AppContext diff --git a/packages/core/src/cibmangotree/app/app.py b/packages/core/src/cibmangotree/app/app.py index 52e588fc..f992821f 100644 --- a/packages/core/src/cibmangotree/app/app.py +++ b/packages/core/src/cibmangotree/app/app.py @@ -2,7 +2,7 @@ from pydantic import 
BaseModel -from importing import ImporterSession +from cibmangotree.services.importing import ImporterSession from .app_context import AppContext from .project_context import ProjectContext diff --git a/packages/core/src/cibmangotree/app/app_context.py b/packages/core/src/cibmangotree/app/app_context.py index 39dd4d67..bb416e96 100644 --- a/packages/core/src/cibmangotree/app/app_context.py +++ b/packages/core/src/cibmangotree/app/app_context.py @@ -2,8 +2,8 @@ from pydantic import BaseModel, ConfigDict -from analyzer_interface.suite import AnalyzerSuite -from storage import Storage +from cibmangotree.analyzer_interface.suite import AnalyzerSuite +from cibmangotree.services.storage import Storage class AppContext(BaseModel): diff --git a/packages/core/src/cibmangotree/app/project_context.py b/packages/core/src/cibmangotree/app/project_context.py index 8f94b249..08e6fc2d 100644 --- a/packages/core/src/cibmangotree/app/project_context.py +++ b/packages/core/src/cibmangotree/app/project_context.py @@ -3,10 +3,10 @@ import polars as pl from pydantic import BaseModel -from analyzer_interface import ParamValue -from analyzer_interface import UserInputColumn as BaseUserInputColumn -from preprocessing.series_semantic import SeriesSemantic, infer_series_semantic -from storage import AnalysisModel, ProjectModel +from cibmangotree.analyzer_interface import ParamValue +from cibmangotree.analyzer_interface import UserInputColumn as BaseUserInputColumn +from cibmangotree.services.preprocessing.series_semantic import SeriesSemantic, infer_series_semantic +from cibmangotree.services.storage import AnalysisModel, ProjectModel from .app_context import AppContext diff --git a/packages/core/src/cibmangotree/app/settings_context.py b/packages/core/src/cibmangotree/app/settings_context.py index d7529e12..690a2ab0 100644 --- a/packages/core/src/cibmangotree/app/settings_context.py +++ b/packages/core/src/cibmangotree/app/settings_context.py @@ -2,7 +2,7 @@ from pydantic import BaseModel 
-from storage import SettingsModel +from cibmangotree.services.storage import SettingsModel from .app_context import AppContext diff --git a/packages/core/src/cibmangotree/app/test_logger.py b/packages/core/src/cibmangotree/app/test_logger.py index 4e2cb0fd..ae99be70 100644 --- a/packages/core/src/cibmangotree/app/test_logger.py +++ b/packages/core/src/cibmangotree/app/test_logger.py @@ -11,7 +11,7 @@ import pytest -from app.logger import get_logger, setup_logging +from cibmangotree.app.logger import get_logger, setup_logging class TestSetupLogging: diff --git a/packages/core/src/cibmangotree/context/__init__.py b/packages/core/src/cibmangotree/context/__init__.py index 7ee502bd..0cf35b93 100644 --- a/packages/core/src/cibmangotree/context/__init__.py +++ b/packages/core/src/cibmangotree/context/__init__.py @@ -5,24 +5,24 @@ from dash import Dash from pydantic import BaseModel, ConfigDict -from analyzer_interface import ( +from cibmangotree.analyzer_interface import ( AnalyzerInterface, ParamValue, SecondaryAnalyzerInterface, WebPresenterInterface, backfill_param_values, ) -from analyzer_interface.context import AssetsReader, InputTableReader -from analyzer_interface.context import ( +from cibmangotree.analyzer_interface.context import AssetsReader, InputTableReader +from cibmangotree.analyzer_interface.context import ( PrimaryAnalyzerContext as BasePrimaryAnalyzerContext, ) -from analyzer_interface.context import ( +from cibmangotree.analyzer_interface.context import ( SecondaryAnalyzerContext as BaseSecondaryAnalyzerContext, ) -from analyzer_interface.context import TableReader, TableWriter -from analyzer_interface.context import WebPresenterContext as BaseWebPresenterContext -from preprocessing.series_semantic import SeriesSemantic -from storage import AnalysisModel, Storage +from cibmangotree.analyzer_interface.context import TableReader, TableWriter +from cibmangotree.analyzer_interface.context import WebPresenterContext as BaseWebPresenterContext +from 
cibmangotree.services.preprocessing.series_semantic import SeriesSemantic +from cibmangotree.services.storage import AnalysisModel, Storage class PrimaryAnalyzerDefaultParametersContext(BasePrimaryAnalyzerContext): diff --git a/packages/core/src/cibmangotree/services/preprocessing/series_semantic.py b/packages/core/src/cibmangotree/services/preprocessing/series_semantic.py index e6ccd139..537e9fc6 100644 --- a/packages/core/src/cibmangotree/services/preprocessing/series_semantic.py +++ b/packages/core/src/cibmangotree/services/preprocessing/series_semantic.py @@ -4,7 +4,7 @@ import polars as pl from pydantic import BaseModel -from analyzer_interface import DataType +from cibmangotree.analyzer_interface import DataType class SeriesSemantic(BaseModel): diff --git a/packages/core/src/cibmangotree/services/preprocessing/test_series_semantic.py b/packages/core/src/cibmangotree/services/preprocessing/test_series_semantic.py index 074ebf22..376d9d3e 100644 --- a/packages/core/src/cibmangotree/services/preprocessing/test_series_semantic.py +++ b/packages/core/src/cibmangotree/services/preprocessing/test_series_semantic.py @@ -2,7 +2,7 @@ import polars as pl -from preprocessing.series_semantic import ( +from cibmangotree.services.preprocessing.series_semantic import ( date_string, datetime_string, infer_series_semantic, diff --git a/packages/core/src/cibmangotree/services/storage/__init__.py b/packages/core/src/cibmangotree/services/storage/__init__.py index a30a4b6e..b773f6cd 100644 --- a/packages/core/src/cibmangotree/services/storage/__init__.py +++ b/packages/core/src/cibmangotree/services/storage/__init__.py @@ -13,8 +13,8 @@ from tinydb import Query, TinyDB from xlsxwriter import Workbook -from analyzer_interface.interface import AnalyzerOutput -from analyzer_interface.params import ParamValue +from cibmangotree.analyzer_interface.interface import AnalyzerOutput +from cibmangotree.analyzer_interface.params import ParamValue from .file_selector import 
FileSelectorStateManager diff --git a/packages/core/src/cibmangotree/tui/components/analysis_main.py b/packages/core/src/cibmangotree/tui/components/analysis_main.py index e2084e38..b2e867fa 100644 --- a/packages/core/src/cibmangotree/tui/components/analysis_main.py +++ b/packages/core/src/cibmangotree/tui/components/analysis_main.py @@ -1,7 +1,7 @@ from colorama import Fore -from app import AnalysisContext -from terminal_tools import ( +from cibmangotree.app import AnalysisContext +from cibmangotree.tui.tools import ( draw_box, open_directory_explorer, print_ascii_table, diff --git a/packages/core/src/cibmangotree/tui/components/analysis_params.py b/packages/core/src/cibmangotree/tui/components/analysis_params.py index e824bff4..af79cd3c 100644 --- a/packages/core/src/cibmangotree/tui/components/analysis_params.py +++ b/packages/core/src/cibmangotree/tui/components/analysis_params.py @@ -3,16 +3,16 @@ import polars as pl from pydantic import BaseModel -from analyzer_interface import ( +from cibmangotree.analyzer_interface import ( AnalyzerInterface, AnalyzerParam, IntegerParam, ParamValue, TimeBinningValue, ) -from app import ProjectContext -from context import InputColumnProvider, PrimaryAnalyzerDefaultParametersContext -from terminal_tools import prompts, smart_print_data_frame +from cibmangotree.app import ProjectContext +from cibmangotree.context import InputColumnProvider, PrimaryAnalyzerDefaultParametersContext +from cibmangotree.tui.tools import prompts, smart_print_data_frame from .context import ViewContext diff --git a/packages/core/src/cibmangotree/tui/components/analysis_web_server.py b/packages/core/src/cibmangotree/tui/components/analysis_web_server.py index b3183198..0fef6fcb 100644 --- a/packages/core/src/cibmangotree/tui/components/analysis_web_server.py +++ b/packages/core/src/cibmangotree/tui/components/analysis_web_server.py @@ -7,11 +7,11 @@ from flask import Flask, render_template from waitress import serve -from analyzer_interface.suite 
import AnalyzerSuite -from context import WebPresenterContext -from storage import AnalysisModel, Storage -from terminal_tools import wait_for_key -from terminal_tools.inception import TerminalContext +from cibmangotree.analyzer_interface.suite import AnalyzerSuite +from cibmangotree.context import WebPresenterContext +from cibmangotree.services.storage import AnalysisModel, Storage +from cibmangotree.tui.tools import wait_for_key +from cibmangotree.tui.tools.inception import TerminalContext def analysis_web_server( diff --git a/packages/core/src/cibmangotree/tui/components/context.py b/packages/core/src/cibmangotree/tui/components/context.py index 0365d80e..bbe09ed9 100644 --- a/packages/core/src/cibmangotree/tui/components/context.py +++ b/packages/core/src/cibmangotree/tui/components/context.py @@ -1,7 +1,7 @@ from pydantic import BaseModel, ConfigDict -from app import App -from terminal_tools.inception import TerminalContext +from cibmangotree.app import App +from cibmangotree.tui.tools.inception import TerminalContext class ViewContext(BaseModel): diff --git a/packages/core/src/cibmangotree/tui/components/export_outputs.py b/packages/core/src/cibmangotree/tui/components/export_outputs.py index 8cbd4094..d16328ce 100644 --- a/packages/core/src/cibmangotree/tui/components/export_outputs.py +++ b/packages/core/src/cibmangotree/tui/components/export_outputs.py @@ -1,14 +1,13 @@ import os -from app import AnalysisContext, AnalysisOutputContext -from storage import SupportedOutputExtension -from terminal_tools import ( - ProgressReporter, +from cibmangotree.app import AnalysisContext, AnalysisOutputContext +from cibmangotree.services.storage import SupportedOutputExtension +from cibmangotree.tui.tools import ( open_directory_explorer, prompts, wait_for_key, ) -from terminal_tools.progress import ProgressReporter +from cibmangotree.tui.tools.progress import ProgressReporter from .context import ViewContext diff --git 
a/packages/core/src/cibmangotree/tui/components/main_menu.py b/packages/core/src/cibmangotree/tui/components/main_menu.py index 397f7e55..4ee3132a 100644 --- a/packages/core/src/cibmangotree/tui/components/main_menu.py +++ b/packages/core/src/cibmangotree/tui/components/main_menu.py @@ -1,6 +1,6 @@ from sys import exit -from terminal_tools import draw_box, prompts +from cibmangotree.tui.tools import draw_box, prompts from .analysis_main import analysis_main from .context import ViewContext diff --git a/packages/core/src/cibmangotree/tui/components/new_analysis.py b/packages/core/src/cibmangotree/tui/components/new_analysis.py index 8b6877d7..81f3bf06 100644 --- a/packages/core/src/cibmangotree/tui/components/new_analysis.py +++ b/packages/core/src/cibmangotree/tui/components/new_analysis.py @@ -3,15 +3,15 @@ import polars as pl -from analyzer_interface import ( +from cibmangotree.analyzer_interface import ( AnalyzerInterface, InputColumn, UserInputColumn, column_automap, get_data_type_compatibility_score, ) -from app import ProjectContext -from terminal_tools import draw_box, prompts, smart_print_data_frame, wait_for_key +from cibmangotree.app import ProjectContext +from cibmangotree.tui.tools import draw_box, prompts, smart_print_data_frame, wait_for_key from .analysis_params import customize_analysis from .context import ViewContext diff --git a/packages/core/src/cibmangotree/tui/components/new_project.py b/packages/core/src/cibmangotree/tui/components/new_project.py index 49682c71..0fe17414 100644 --- a/packages/core/src/cibmangotree/tui/components/new_project.py +++ b/packages/core/src/cibmangotree/tui/components/new_project.py @@ -4,10 +4,10 @@ import polars as pl -from importing import Importer, ImporterSession, importers -from terminal_tools import draw_box, prompts, wait_for_key -from terminal_tools.inception import Scope -from terminal_tools.utils import print_message, smart_print_data_frame +from cibmangotree.services.importing import Importer, 
ImporterSession, importers +from cibmangotree.tui.tools import draw_box, prompts, wait_for_key +from cibmangotree.tui.tools.inception import Scope +from cibmangotree.tui.tools.utils import print_message, smart_print_data_frame from .context import ViewContext diff --git a/packages/core/src/cibmangotree/tui/components/project_main.py b/packages/core/src/cibmangotree/tui/components/project_main.py index 32999e76..4788b939 100644 --- a/packages/core/src/cibmangotree/tui/components/project_main.py +++ b/packages/core/src/cibmangotree/tui/components/project_main.py @@ -1,7 +1,7 @@ from colorama import Fore -from app import ProjectContext -from terminal_tools import draw_box, prompts, wait_for_key +from cibmangotree.app import ProjectContext +from cibmangotree.tui.tools import draw_box, prompts, wait_for_key from .analysis_main import analysis_main from .context import ViewContext diff --git a/packages/core/src/cibmangotree/tui/components/select_analysis.py b/packages/core/src/cibmangotree/tui/components/select_analysis.py index 28c5b329..8569feb6 100644 --- a/packages/core/src/cibmangotree/tui/components/select_analysis.py +++ b/packages/core/src/cibmangotree/tui/components/select_analysis.py @@ -1,8 +1,8 @@ from datetime import datetime from typing import Optional -from app import AnalysisContext, ProjectContext -from terminal_tools import prompts, wait_for_key +from cibmangotree.app import AnalysisContext, ProjectContext +from cibmangotree.tui.tools import prompts, wait_for_key def select_analysis(proj: ProjectContext) -> Optional[AnalysisContext]: diff --git a/packages/core/src/cibmangotree/tui/components/select_project.py b/packages/core/src/cibmangotree/tui/components/select_project.py index 95111639..48b556fe 100644 --- a/packages/core/src/cibmangotree/tui/components/select_project.py +++ b/packages/core/src/cibmangotree/tui/components/select_project.py @@ -1,7 +1,7 @@ from typing import Optional -from app import ProjectContext -from terminal_tools import 
draw_box, prompts, smart_print_data_frame, wait_for_key +from cibmangotree.app import ProjectContext +from cibmangotree.tui.tools import draw_box, prompts, smart_print_data_frame, wait_for_key from .context import ViewContext diff --git a/packages/core/src/cibmangotree/tui/components/splash.py b/packages/core/src/cibmangotree/tui/components/splash.py index bd5b073e..f44eaf37 100644 --- a/packages/core/src/cibmangotree/tui/components/splash.py +++ b/packages/core/src/cibmangotree/tui/components/splash.py @@ -2,8 +2,8 @@ from rich.console import Console from rich.panel import Panel -from meta import get_version -from terminal_tools import clear_terminal, wait_for_key +from cibmangotree.meta import get_version +from cibmangotree.tui.tools import clear_terminal, wait_for_key def splash(): diff --git a/packages/core/src/cibmangotree/tui/tools/prompts.py b/packages/core/src/cibmangotree/tui/tools/prompts.py index 4746a9a4..43dd55f0 100644 --- a/packages/core/src/cibmangotree/tui/tools/prompts.py +++ b/packages/core/src/cibmangotree/tui/tools/prompts.py @@ -7,7 +7,7 @@ from inquirer import text as inquirer_text from inquirer.errors import ValidationError -from storage.file_selector import FileSelectorStateManager +from cibmangotree.services.storage.file_selector import FileSelectorStateManager from .utils import clear_printed_lines From df0dc1881e51529b5b61135966737b0f9bbae629 Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Thu, 9 Oct 2025 18:50:45 -0400 Subject: [PATCH 06/24] feat: update PyInstaller spec for monorepo with dynamic plugin discovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 6: PyInstaller Spec & Workspace Configuration PyInstaller Updates: - Add dynamic plugin discovery via importlib.metadata - Generate frozen plugins file for builds (_frozen_plugins.py) - Collect plugin hiddenimports automatically - Update paths to packages/core/src - Add comprehensive 
hiddenimports for all modules - Include web static assets and templates - Support both development and frozen modes Root Entry Point: - Simplify cibmangotree.py to import from package - Remove duplicate startup code Workspace Configuration: - Convert root to virtual workspace (remove [project]) - Add [tool.uv.sources] to all plugin packages - Enable workspace dependency resolution Import Fixes: - Fix terminal_tools imports in csv.py and excel.py Testing: - ✅ uv sync completes successfully (148 packages) - ✅ All 8 packages build correctly - ✅ Plugin discovery working (5 analyzers, 1 tokenizer) - ✅ Frozen plugin generation tested Phase 6 of monorepo reorganization complete. --- .gitignore | 3 + cibmangotree.py | 76 +---- packages/analyzers/example/pyproject.toml | 4 + packages/analyzers/hashtags/pyproject.toml | 4 + packages/analyzers/ngrams/pyproject.toml | 4 + packages/analyzers/temporal/pyproject.toml | 4 + .../time_coordination/pyproject.toml | 4 + .../cibmangotree/services/importing/csv.py | 4 +- .../cibmangotree/services/importing/excel.py | 4 +- packages/testing/pyproject.toml | 4 + packages/tokenizers/basic/pyproject.toml | 4 + pyinstaller.spec | 305 ++++++++++++++++-- pyproject.toml | 18 +- 13 files changed, 326 insertions(+), 112 deletions(-) diff --git a/.gitignore b/.gitignore index 2afa91a7..40023246 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,6 @@ VERSION in-memoria.db .in-memoria/cache/ .in-memoria/.env + +# PyInstaller - auto-generated frozen plugins +packages/core/src/cibmangotree/_frozen_plugins.py diff --git a/cibmangotree.py b/cibmangotree.py index be92b7be..a90731d3 100644 --- a/cibmangotree.py +++ b/cibmangotree.py @@ -1,74 +1,6 @@ -import argparse -import logging -import sys -from multiprocessing import freeze_support -from pathlib import Path - -from rich.console import Console -from rich.text import Text - -from terminal_tools import enable_windows_ansi_support +#!/usr/bin/env python3 +"""CIB Mango Tree CLI entry point for 
PyInstaller builds.""" +from cibmangotree.__main__ import main if __name__ == "__main__": - freeze_support() - enable_windows_ansi_support() - - # Parse command line arguments - parser = argparse.ArgumentParser( - description="Mango Tango CLI - Social Media Data Analysis Tool" - ) - parser.add_argument( - "--log-level", - choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], - default="INFO", - help="Set the logging level (default: INFO)", - ) - parser.add_argument( - "--noop", action="store_true", help="No-operation mode for testing" - ) - - args = parser.parse_args() - - # Handle no-op mode - if args.noop: - print("No-op flag detected. Exiting successfully.") - sys.exit(0) - - # Show loading message early - console = Console() - loading_msg = Text("🥭 CIB Mango Tree is starting", style="orange1 bold") - loading_msg.append("... This may take a moment.", style="dim") - console.print(loading_msg) - - # Import heavy modules after loading message - from analyzers import suite - from app import App, AppContext - from app.logger import setup_logging - from components import ViewContext, main_menu, splash - from meta import get_version - from storage import Storage - from terminal_tools.inception import TerminalContext - - # Initialize storage - storage = Storage(app_name="MangoTango", app_author="Civic Tech DC") - - # Set up logging - log_level = getattr(logging, args.log_level) - log_file_path = Path(storage.user_data_dir) / "logs" / "mangotango.log" - app_version = get_version() or "development" - setup_logging(log_file_path, log_level, app_version) - - # Get logger for main module - logger = logging.getLogger(__name__) - logger.info( - "Starting CIB Mango Tree application", - extra={"log_level": args.log_level, "log_file": str(log_file_path)}, - ) - - splash() - main_menu( - ViewContext( - terminal=TerminalContext(), - app=App(context=AppContext(storage=storage, suite=suite)), - ) - ) + main() diff --git a/packages/analyzers/example/pyproject.toml 
b/packages/analyzers/example/pyproject.toml index 1877dae5..09ab0427 100644 --- a/packages/analyzers/example/pyproject.toml +++ b/packages/analyzers/example/pyproject.toml @@ -26,6 +26,10 @@ dependencies = [ "polars>=0.20.0", ] +# Workspace dependency resolution +[tool.uv.sources] +cibmangotree = { workspace = true } + [project.entry-points."cibmangotree.analyzers"] example = "cibmangotree_analyzer_example:get_interface" diff --git a/packages/analyzers/hashtags/pyproject.toml b/packages/analyzers/hashtags/pyproject.toml index 6c61117a..cbf4a309 100644 --- a/packages/analyzers/hashtags/pyproject.toml +++ b/packages/analyzers/hashtags/pyproject.toml @@ -27,6 +27,10 @@ dependencies = [ "plotly>=5.0.0", ] +# Workspace dependency resolution +[tool.uv.sources] +cibmangotree = { workspace = true } + [project.entry-points."cibmangotree.analyzers"] hashtags = "cibmangotree_analyzer_hashtags:get_interface" diff --git a/packages/analyzers/ngrams/pyproject.toml b/packages/analyzers/ngrams/pyproject.toml index f03838df..8486e330 100644 --- a/packages/analyzers/ngrams/pyproject.toml +++ b/packages/analyzers/ngrams/pyproject.toml @@ -27,6 +27,10 @@ dependencies = [ "plotly>=5.0.0", ] +# Workspace dependency resolution +[tool.uv.sources] +cibmangotree = { workspace = true } + [project.entry-points."cibmangotree.analyzers"] ngrams = "cibmangotree_analyzer_ngrams:get_interface" diff --git a/packages/analyzers/temporal/pyproject.toml b/packages/analyzers/temporal/pyproject.toml index 970e561e..cd63115a 100644 --- a/packages/analyzers/temporal/pyproject.toml +++ b/packages/analyzers/temporal/pyproject.toml @@ -26,6 +26,10 @@ dependencies = [ "polars>=0.20.0", ] +# Workspace dependency resolution +[tool.uv.sources] +cibmangotree = { workspace = true } + [project.entry-points."cibmangotree.analyzers"] temporal = "cibmangotree_analyzer_temporal:get_interface" diff --git a/packages/analyzers/time_coordination/pyproject.toml b/packages/analyzers/time_coordination/pyproject.toml index 
c9b2a103..ccd954cd 100644 --- a/packages/analyzers/time_coordination/pyproject.toml +++ b/packages/analyzers/time_coordination/pyproject.toml @@ -26,6 +26,10 @@ dependencies = [ "polars>=0.20.0", ] +# Workspace dependency resolution +[tool.uv.sources] +cibmangotree = { workspace = true } + [project.entry-points."cibmangotree.analyzers"] time_coordination = "cibmangotree_analyzer_time_coordination:get_interface" diff --git a/packages/core/src/cibmangotree/services/importing/csv.py b/packages/core/src/cibmangotree/services/importing/csv.py index 7a5096db..a7e7ad03 100644 --- a/packages/core/src/cibmangotree/services/importing/csv.py +++ b/packages/core/src/cibmangotree/services/importing/csv.py @@ -5,8 +5,8 @@ import polars as pl from pydantic import BaseModel -import terminal_tools.prompts as prompts -from terminal_tools.utils import print_message, smart_print_data_frame +from cibmangotree.tui.tools import prompts +from cibmangotree.tui.tools.utils import print_message, smart_print_data_frame from .importer import Importer, ImporterSession diff --git a/packages/core/src/cibmangotree/services/importing/excel.py b/packages/core/src/cibmangotree/services/importing/excel.py index 4c63fae0..1b952918 100644 --- a/packages/core/src/cibmangotree/services/importing/excel.py +++ b/packages/core/src/cibmangotree/services/importing/excel.py @@ -4,8 +4,8 @@ from fastexcel import read_excel from pydantic import BaseModel -import terminal_tools.prompts as prompts -from terminal_tools.utils import wait_for_key +from cibmangotree.tui.tools import prompts +from cibmangotree.tui.tools.utils import wait_for_key from .importer import Importer, ImporterSession diff --git a/packages/testing/pyproject.toml b/packages/testing/pyproject.toml index d98f00ff..a7ea1cab 100644 --- a/packages/testing/pyproject.toml +++ b/packages/testing/pyproject.toml @@ -33,6 +33,10 @@ dependencies = [ "pydantic>=2.9.1", ] +# Workspace dependency resolution +[tool.uv.sources] +cibmangotree = { workspace = true 
} + [build-system] requires = ["hatchling"] build-backend = "hatchling.build" diff --git a/packages/tokenizers/basic/pyproject.toml b/packages/tokenizers/basic/pyproject.toml index 1b153c15..9686e696 100644 --- a/packages/tokenizers/basic/pyproject.toml +++ b/packages/tokenizers/basic/pyproject.toml @@ -26,6 +26,10 @@ dependencies = [ "regex>=2025.9.1", ] +# Workspace dependency resolution +[tool.uv.sources] +cibmangotree = { workspace = true } + # Plugin entry points - auto-discovered by core in dev mode [project.entry-points."cibmangotree.tokenizers"] basic = "cibmangotree_tokenizer_basic:BasicTokenizer" diff --git a/pyinstaller.spec b/pyinstaller.spec index 3adb1fa3..57d09df1 100644 --- a/pyinstaller.spec +++ b/pyinstaller.spec @@ -1,54 +1,295 @@ # code: language=python -# main.spec -# This file tells PyInstaller how to bundle your application +# pyinstaller.spec +# This file tells PyInstaller how to bundle the monorepo application with dynamic plugin discovery from PyInstaller.utils.hooks import copy_metadata from PyInstaller.building.api import EXE, PYZ from PyInstaller.building.build_main import Analysis import sys import os import site +from pathlib import Path +# Import plugin discovery system +try: + import importlib.metadata as importlib_metadata +except ImportError: + import importlib_metadata + + +# ============================================================================ +# Plugin Discovery System +# ============================================================================ + +def discover_plugins(): + """ + Discover all installed plugins via entry points. + + Returns: + dict: Dictionary with 'analyzers' and 'tokenizers' keys containing + lists of plugin metadata (name, module, attr). 
+ """ + plugins = { + 'analyzers': [], + 'tokenizers': [] + } + + print("=" * 70) + print("DISCOVERING PLUGINS FOR PYINSTALLER BUILD") + print("=" * 70) + + # Discover analyzer plugins + try: + eps = importlib_metadata.entry_points() + # Handle both old (dict) and new (SelectableGroups) API + if hasattr(eps, 'select'): + analyzer_eps = eps.select(group='cibmangotree.analyzers') + else: + analyzer_eps = eps.get('cibmangotree.analyzers', []) + + for ep in analyzer_eps: + module_path = ep.value.split(':')[0] + attr_name = ep.value.split(':')[1] if ':' in ep.value else None + plugins['analyzers'].append({ + 'name': ep.name, + 'module': module_path, + 'attr': attr_name + }) + print(f" [Analyzer] {ep.name:20s} -> {module_path}") + except Exception as e: + print(f" Warning: Could not discover analyzer plugins: {e}") + + # Discover tokenizer plugins + try: + eps = importlib_metadata.entry_points() + if hasattr(eps, 'select'): + tokenizer_eps = eps.select(group='cibmangotree.tokenizers') + else: + tokenizer_eps = eps.get('cibmangotree.tokenizers', []) + + for ep in tokenizer_eps: + module_path = ep.value.split(':')[0] + attr_name = ep.value.split(':')[1] if ':' in ep.value else None + plugins['tokenizers'].append({ + 'name': ep.name, + 'module': module_path, + 'attr': attr_name + }) + print(f" [Tokenizer] {ep.name:20s} -> {module_path}") + except Exception as e: + print(f" Warning: Could not discover tokenizer plugins: {e}") + + print(f"\nTotal discovered: {len(plugins['analyzers'])} analyzers, " + f"{len(plugins['tokenizers'])} tokenizers") + print("=" * 70) + + return plugins + + +def generate_frozen_plugins(plugins): + """ + Generate frozen plugins file for PyInstaller. + + This file provides explicit imports for all plugins that would normally + be discovered via entry points (which don't work in frozen executables). 
+ + Args: + plugins: Dictionary from discover_plugins() + + Returns: + str: Path to the generated frozen plugins file + """ + frozen_file = Path('packages/core/src/cibmangotree/_frozen_plugins.py') + + content = '''""" +Auto-generated frozen plugins for PyInstaller builds. +DO NOT EDIT - Generated by pyinstaller.spec + +This file is used when the application is frozen (packaged with PyInstaller) +to provide explicit plugin imports, since entry points don't work in frozen apps. +""" + +# Analyzer plugins - mapping from plugin name to module path +ANALYZER_PLUGINS = { +''' + + for plugin in plugins['analyzers']: + content += f" '{plugin['name']}': '{plugin['module']}:{plugin['attr']}',\n" + + content += '''}\n +# Tokenizer plugins - mapping from plugin name to module path +TOKENIZER_PLUGINS = { +''' + + for plugin in plugins['tokenizers']: + content += f" '{plugin['name']}': '{plugin['module']}:{plugin['attr']}',\n" + + content += '''}\n +def is_frozen(): + """Check if running in a frozen (PyInstaller) environment.""" + return getattr(sys, 'frozen', False) and hasattr(sys, '_MEIPASS') +''' + + # Ensure directory exists + frozen_file.parent.mkdir(parents=True, exist_ok=True) + frozen_file.write_text(content) + + print(f"\nGenerated frozen plugins file: {frozen_file}") + return str(frozen_file) + + +def get_plugin_hiddenimports(plugins): + """ + Get all plugin modules for PyInstaller's hiddenimports. 
+ + Args: + plugins: Dictionary from discover_plugins() + + Returns: + list: List of module paths to include as hidden imports + """ + imports = [] + + # Add analyzer modules + for plugin in plugins['analyzers']: + base_module = plugin['module'].split('.')[0] + imports.append(plugin['module']) + imports.append(base_module) + + # Add tokenizer modules + for plugin in plugins['tokenizers']: + base_module = plugin['module'].split('.')[0] + imports.append(plugin['module']) + imports.append(base_module) + + # Remove duplicates while preserving order + seen = set() + unique_imports = [] + for imp in imports: + if imp not in seen: + seen.add(imp) + unique_imports.append(imp) + + return unique_imports + + +# ============================================================================ +# Site Packages Detection +# ============================================================================ site_packages_path = None block_cipher = None for site_path in site.getsitepackages(): - if 'site-packages' in site_path: - site_packages_path = site_path - break + if 'site-packages' in site_path: + site_packages_path = site_path + break if site_packages_path is None: - raise RuntimeError("The site-packages directory could not be found. Please setup the python envrionment correctly and try again...") + raise RuntimeError( + "The site-packages directory could not be found. " + "Please setup the python environment correctly and try again..." 
+ ) + +print(f"\nUsing site-packages: {site_packages_path}\n") + + +# ============================================================================ +# Plugin Discovery and Frozen Plugin Generation +# ============================================================================ + +# Discover all plugins via entry points +discovered_plugins = discover_plugins() + +# Generate frozen plugins file for runtime +frozen_plugins_file = generate_frozen_plugins(discovered_plugins) + +# Get all plugin modules for hiddenimports +plugin_imports = get_plugin_hiddenimports(discovered_plugins) + + +# ============================================================================ +# PyInstaller Analysis Configuration +# ============================================================================ a = Analysis( - ['cibmangotree.py'], # Entry point - pathex=['.'], # Ensure all paths are correctly included + ['cibmangotree.py'], # Entry point (imports from cibmangotree package) + pathex=[ + 'packages/core/src', # Core package source + ], binaries=[], datas=[ - # version file, if defined + # Version file, if defined *( [('./VERSION', '.')] if os.path.exists('VERSION') else [] ), - # inquirer depends on readchar as a hidden dependency that requires package metadata + # Inquirer depends on readchar as a hidden dependency that requires package metadata *copy_metadata('readchar'), - # static assets for web servers + # Static assets for web servers (from site-packages) (os.path.join(site_packages_path, 'shiny/www'), 'shiny/www'), (os.path.join(site_packages_path, 'shinywidgets/static'), 'shinywidgets/static'), - ('./app/web_static', 'app/web_static'), - ('./app/web_templates', 'app/web_templates') + + # Application static assets (from monorepo) + ('packages/core/src/cibmangotree/app/web_static', 'cibmangotree/app/web_static'), + ('packages/core/src/cibmangotree/app/web_templates', 'cibmangotree/app/web_templates'), + + # Include the frozen plugins file + (frozen_plugins_file, 'cibmangotree'), ], 
hiddenimports=[ + # Core package modules + 'cibmangotree', + 'cibmangotree.__main__', + 'cibmangotree.app', + 'cibmangotree.analyzer_interface', + 'cibmangotree.tui', + 'cibmangotree.tui.components', + 'cibmangotree.tui.tools', + 'cibmangotree.services', + 'cibmangotree.services.storage', + 'cibmangotree.services.data_import', + 'cibmangotree.services.tokenizers', + 'cibmangotree.context', + 'cibmangotree.meta', + 'cibmangotree.plugin_system', + 'cibmangotree.plugin_system.analyzer_loader', + 'cibmangotree.plugin_system.tokenizer_loader', + + # Dynamically discovered plugin modules + *plugin_imports, + + # Terminal UI dependencies 'readchar', + 'inquirer', + 'rich', + 'colorama', + + # Data processing 'numpy', 'numpy.core.multiarray', + 'polars', + 'pandas', + 'pyarrow', + + # Visualization + 'plotly', + 'plotly.graph_objs', + + # Web frameworks + 'dash', + 'dash.dependencies', 'shiny', 'shiny.ui', 'shiny.server', + 'shinywidgets', 'htmltools', 'starlette', + 'starlette.middleware', + 'starlette.routing', + + # Web server 'uvicorn', 'uvicorn.logging', 'uvicorn.loops', @@ -64,21 +305,43 @@ a = Analysis( 'websockets', 'websockets.legacy', 'websockets.legacy.server', - 'polars', - 'plotly', + + # Markdown rendering (for Shiny) 'linkify_it', 'markdown_it', 'mdit_py_plugins', 'mdurl', 'uc_micro', + + # Logging 'pythonjsonlogger', 'pythonjsonlogger.jsonlogger', - ], # Include any imports that PyInstaller might miss + + # Storage + 'tinydb', + 'platformdirs', + 'filelock', + + # Text processing + 'regex', + + # Data validation + 'pydantic', + 'pydantic.v1', + + # Import/Export + 'xlsxwriter', + 'fastexcel', + ], hookspath=[], runtime_hooks=[], excludes=[], ) +# ============================================================================ +# Build Configuration +# ============================================================================ + pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher) if sys.platform == "darwin": @@ -91,8 +354,8 @@ if sys.platform == "darwin": 
name='cibmangotree', # The name of the executable debug=False, strip=True, - upx=True, # You can set this to False if you don't want UPX compression - console=True, # Set to False if you don't want a console window + upx=True, + console=True, entitlements_file="./mango.entitlements", codesign_identity=os.getenv('APPLE_APP_CERT_ID'), ) @@ -103,9 +366,9 @@ else: a.binaries, a.zipfiles, a.datas, - name='cibmangotree', # The name of the executable + name='cibmangotree', debug=False, strip=False, - upx=True, # You can set this to False if you don't want UPX compression - console=True, # Set to False if you don't want a console window + upx=True, + console=True, ) diff --git a/pyproject.toml b/pyproject.toml index d56ade98..426845c3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,14 +1,3 @@ -[project] -name = "cibmangotree-workspace" -version = "0.1.0" -requires-python = ">=3.12" -description = "CIB Mango Tree CLI - Social Media Data Analysis Tool (Workspace Root)" -readme = "README.md" -license = {text = "PolyForm Noncommercial License 1.0.0"} - -# This is a virtual workspace root - not a package itself -# Individual packages are defined in packages/* subdirectories - [tool.uv.workspace] members = [ "packages/core", @@ -21,6 +10,9 @@ members = [ "packages/analyzers/time_coordination", ] +# Virtual workspace root - does not define a project itself +# Individual packages are defined in packages/* subdirectories + # Development dependencies available across all workspace members [dependency-groups] dev = [ @@ -78,7 +70,3 @@ filterwarnings = [ "ignore::DeprecationWarning", "ignore::PendingDeprecationWarning", ] - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" From e8b72977c510e1badd3112c1bc334324bb160d64 Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Thu, 9 Oct 2025 18:59:01 -0400 Subject: [PATCH 07/24] feat: update CI/CD and development tooling for UV monorepo Phase 7: CI/CD, Documentation, 
and Developer Experience Updates GitHub Workflows: - Migrate test.yml, code_quality.yml, build_exe.yml to UV - Add astral-sh/setup-uv@v5 with caching - Use .python-version for consistent Python management - Update all commands to use 'uv run' prefix Bootstrap Scripts: - Complete rewrite for UV workflow - Auto-install UV if not present - One-command setup with verification - Platform-specific (bash + PowerShell) Documentation: - README.md: Add UV workflow, update structure diagram - CONTRIBUTING.md: Update all dev commands to UV - CLAUDE.md: Update paths to packages/ structure - .ai-context/README.md: Update package layout - DEV_COMMANDS.md: NEW comprehensive quick reference Configuration: - .gitignore: Add UV-specific files, reorganize sections Impact: - Faster CI with UV caching - Simplified developer onboarding - Consistent command patterns - Comprehensive documentation Phase 7 of monorepo reorganization complete. --- .ai-context/README.md | 34 ++- .github/workflows/build_exe.yml | 22 +- .github/workflows/code_quality.yml | 20 +- .github/workflows/test.yml | 17 +- .gitignore | 25 +- CLAUDE.md | 85 +++++-- CONTRIBUTING.md | 57 +++-- DEV_COMMANDS.md | 374 +++++++++++++++++++++++++++++ README.md | 94 ++++++-- bootstrap.ps1 | 46 ++-- bootstrap.sh | 47 ++-- 11 files changed, 688 insertions(+), 133 deletions(-) create mode 100644 DEV_COMMANDS.md diff --git a/.ai-context/README.md b/.ai-context/README.md index f3ebea68..f5cb5edc 100644 --- a/.ai-context/README.md +++ b/.ai-context/README.md @@ -22,26 +22,44 @@ consistent UX while allowing easy contribution of new analyzers. 
- **Data**: Polars/Pandas, PyArrow, Parquet files - **Text Processing**: Unicode tokenizer service with scriptio continua support (character-level for CJK/Thai/Southeast Asian scripts, word-level for Latin/Arabic scripts) - **Web**: Dash, Shiny for Python, Plotly +- **Package Manager**: UV (modern Python package manager) - **Dev Tools**: Black, isort, pytest, PyInstaller +### Project Structure + +UV workspace monorepo with 8 packages: + +``` +packages/ +├── core/ # cibmangotree - Main application +├── importing/ # cibmangotree-importing - Data I/O +├── services/ # cibmangotree-services - Shared services +├── testing/ # cibmangotree-testing - Testing utilities +└── analyzers/ # Analysis modules (plugins) + ├── hashtags/ # Hashtag analysis + ├── ngrams/ # N-gram analysis + ├── temporal/ # Temporal patterns + └── example/ # Example analyzer template +``` + ## Semantic Code Structure ### Entry Points -- `mangotango.py` - Main application bootstrap -- `python -m mangotango` - Standard execution command +- `packages/core/src/cibmangotree/__main__.py` - Main application bootstrap +- `uv run cibmangotree` - Standard execution command ### Core Architecture (MVC-like) -- **Application Layer** (`app/`): Workspace logic, analysis orchestration -- **View Layer** (`components/`): Terminal UI components using inquirer -- **Model Layer** (`storage/`): Data persistence, project/analysis models +- **Application Layer** (`packages/core/src/cibmangotree/app/`): Workspace logic, analysis orchestration +- **View Layer** (`packages/core/src/cibmangotree/components/`): Terminal UI components using inquirer +- **Model Layer** (`packages/core/src/cibmangotree/storage/`): Data persistence, project/analysis models ### Domain Separation -1. **Core Domain**: Application, Terminal Components, Storage IO -2. **Edge Domain**: Data import/export (`importing/`), preprocessing -3. **Content Domain**: Analyzers (`analyzers/`), web presenters +1. 
**Core Domain**: Application, Terminal Components, Storage IO (`packages/core/`) +2. **Edge Domain**: Data import/export (`packages/importing/`), preprocessing +3. **Content Domain**: Analyzers (`packages/analyzers/`), web presenters ### Key Data Flow diff --git a/.github/workflows/build_exe.yml b/.github/workflows/build_exe.yml index e9799859..a5db5a91 100644 --- a/.github/workflows/build_exe.yml +++ b/.github/workflows/build_exe.yml @@ -57,21 +57,17 @@ jobs: uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: - python-version: 3.12 - cache: 'pip' - cache-dependency-path: '**/requirements*.txt' + python-version-file: '.python-version' - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements.txt + - name: Install UV + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true - - name: Install PyInstaller - run: | - pip install pyinstaller - echo "PYINST_BIN=\"$(which pyinstaller)\"" >> "$GITHUB_ENV" + - name: Install dependencies + run: uv sync - name: Create macOS keychain id: keychain if: runner.os == 'macOS' && inputs.is_release @@ -116,7 +112,7 @@ jobs: - name: Build the executable env: APPLE_APP_CERT_ID: "${{ inputs.is_release && secrets.APPLE_APP_CERT_ID || '' }}" - run: pyinstaller pyinstaller.spec + run: uv run pyinstaller pyinstaller.spec - name: Rename the executable to include platform suffix run: ${{ matrix.move_command }} diff --git a/.github/workflows/code_quality.yml b/.github/workflows/code_quality.yml index 41cf2269..d1c68090 100644 --- a/.github/workflows/code_quality.yml +++ b/.github/workflows/code_quality.yml @@ -8,27 +8,23 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: - python-version: 3.12 + python-version-file: '.python-version' - - name: Cache dependencies - 
uses: actions/cache@v3 + - name: Install UV + uses: astral-sh/setup-uv@v5 with: - path: | - ~/.cache/pip - key: linux-pip-dev-${{ hashFiles('requirements-dev.txt') }} + enable-cache: true - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-dev.txt + run: uv sync - name: Run code formatter - run: isort . && black . + run: uv run isort . && uv run black . - name: Assert that the codebase has no dif shell: bash diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 79fed2e8..9fbe2e73 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -11,19 +11,20 @@ jobs: uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: - python-version: 3.12 - cache: 'pip' - cache-dependency-path: '**/requirements*.txt' + python-version-file: '.python-version' + + - name: Install UV + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-dev.txt + run: uv sync - name: Run tests - run: pytest + run: uv run pytest test_build: uses: ./.github/workflows/build_exe.yml secrets: inherit diff --git a/.gitignore b/.gitignore index 40023246..e59804a0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,13 +1,33 @@ +# Virtual environments venv +.venv/ + +# Python __pycache__ -__private__ +*.pyc +*.pyo +*.pyd +.Python + +# UV +uv.lock +.python-version + +# Build outputs /build /dist +*.egg-info + +# Application outputs /analysis_outputs /site VERSION *.DS_Store + +# Environment files .env* + +# AI tooling .serena .specify @@ -16,5 +36,8 @@ in-memoria.db .in-memoria/cache/ .in-memoria/.env +# Private/sensitive files +__private__ + # PyInstaller - auto-generated frozen plugins packages/core/src/cibmangotree/_frozen_plugins.py diff --git a/CLAUDE.md b/CLAUDE.md index 48d8b909..47cebb98 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -89,7 +89,7 @@ 
```markdown # Find files by pattern Glob: "**/*analyzer*.py" -Glob: "app/**/*.py" +Glob: "packages/core/src/cibmangotree/app/**/*.py" # Find class definitions Grep: "^class AnalyzerInterface" --type py @@ -98,12 +98,12 @@ Grep: "^class AnalyzerInterface" --type py Grep: "^def main\(" --type py # Find usage/references -Grep: "from app.logger import" --type py +Grep: "from cibmangotree.app.logger import" --type py Grep: "AnalysisContext" --type py # Read specific files -Read: app/app.py -Read: analyzers/hashtags/main.py +Read: packages/core/src/cibmangotree/app/app.py +Read: packages/analyzers/hashtags/src/cibmangotree_analyzers_hashtags/main.py ``` ### Code Exploration Workflow @@ -129,22 +129,42 @@ Read: analyzers/hashtags/main.py **Logging Integration:** ```python -from app.logger import get_logger +from cibmangotree.app.logger import get_logger logger = get_logger(__name__) logger.info("Operation started", extra={"context": "value"}) ``` Use structured logging throughout development. See `docs/dev-guide.md#logging` for complete patterns. +**UV Workflow:** + +```bash +# Install dependencies +uv sync + +# Run the application +uv run cibmangotree + +# Run tests +uv run pytest + +# Format code +uv run black . +uv run isort . + +# Build executable +uv run pyinstaller pyinstaller.spec +``` + ### Task-Specific Patterns **New Analyzer Development**: ```markdown -1. Glob: "analyzers/example/**/*.py" # Find example analyzer -2. Read: analyzers/example/interface.py +1. Glob: "packages/analyzers/example/**/*.py" # Find example analyzer +2. Read: packages/analyzers/example/src/cibmangotree_analyzers_example/interface.py 3. search_nodes("analyzer architecture") # Understand patterns -4. Read: analyzers/example/main.py +4. Read: packages/analyzers/example/src/cibmangotree_analyzers_example/main.py 5. Use knowledge graph insights to implement ``` @@ -179,7 +199,7 @@ Use structured logging throughout development. 
See `docs/dev-guide.md#logging` f entityType: "Analyzer", observations: [ "Primary analyzer for hashtag extraction and analysis", - "Located in analyzers/hashtags/main.py", + "Located in packages/analyzers/hashtags/src/cibmangotree_analyzers_hashtags/main.py", "Uses regex patterns to extract hashtags from text columns", "Outputs: hashtag frequency, co-occurrence, temporal patterns", "Gotcha: Handles Unicode hashtags correctly via preprocessing" @@ -210,7 +230,7 @@ Use structured logging throughout development. See `docs/dev-guide.md#logging` f name: "TokenizerService", entityType: "Service", observations: [ - "Located in services/tokenizer/", + "Located in packages/services/src/cibmangotree_services/tokenizer/", "AbstractTokenizer base, BasicTokenizer implementation", "Handles scriptio continua (CJK, Thai, Lao, Myanmar, Khmer)", "Space-separated tokenization (Latin, Arabic)", @@ -295,18 +315,18 @@ search_nodes("tokenizer patterns") # For text processing ```markdown # Find app entry point -Grep: "^def main" --path mangotango.py +Grep: "^def main" --path packages/core/src/cibmangotree/__main__.py # Explore analyzer system -Glob: "analyzers/**/__init__.py" -Read: analyzers/__init__.py +Glob: "packages/analyzers/**/__init__.py" +Read: packages/core/src/cibmangotree/analyzers.py # Understand storage layer Grep: "^class Storage" --type py -Read: storage/__init__.py +Read: packages/core/src/cibmangotree/storage/__init__.py # Trace UI components -Glob: "components/**/*.py" +Glob: "packages/core/src/cibmangotree/components/**/*.py" Grep: "^def main_menu" --type py ``` @@ -329,18 +349,41 @@ Grep: "^def main_menu" --type py ### Key Architecture References -- **Entry Point**: `mangotango.py` - Application bootstrap -- **Core App**: `app/app.py:App` - Main application controller -- **Storage**: `storage/__init__.py:Storage` - Data persistence -- **UI Components**: `components/main_menu.py:main_menu()` - Terminal interface -- **Analyzer Suite**: `analyzers/__init__.py:suite` - 
Analysis registry +- **Entry Point**: `packages/core/src/cibmangotree/__main__.py` - Application bootstrap +- **Core App**: `packages/core/src/cibmangotree/app/app.py:App` - Main application controller +- **Storage**: `packages/core/src/cibmangotree/storage/__init__.py:Storage` - Data persistence +- **UI Components**: `packages/core/src/cibmangotree/components/main_menu.py:main_menu()` - Terminal interface +- **Analyzer Discovery**: `packages/core/src/cibmangotree/analyzers.py:discover_analyzers()` - Plugin discovery + +### Package Structure + +``` +packages/ +├── core/ # cibmangotree - Main application +│ └── src/cibmangotree/ +│ ├── app/ # Application logic & terminal UI +│ ├── storage/ # Data persistence layer +│ ├── components/ # Terminal UI components +│ └── analyzers.py # Analyzer discovery & registry +├── importing/ # cibmangotree-importing - Data import/export +├── services/ # cibmangotree-services - Shared services +│ └── src/cibmangotree_services/ +│ └── tokenizer/ # Text tokenization service +├── testing/ # cibmangotree-testing - Testing utilities +└── analyzers/ # Analysis modules (plugins) + ├── hashtags/ # cibmangotree-analyzers-hashtags + ├── ngrams/ # cibmangotree-analyzers-ngrams + ├── temporal/ # cibmangotree-analyzers-temporal + └── example/ # cibmangotree-analyzers-example +``` ### Integration Points -- **Data Import**: `importing/` - CSV/Excel to Parquet conversion +- **Data Import**: `packages/importing/` - CSV/Excel to Parquet conversion - **Analysis Pipeline**: Primary → Secondary → Web presentation - **Web Dashboards**: Dash and Shiny framework integration - **Export System**: Multi-format output generation +- **Analyzer Plugins**: Auto-discovered from installed packages ## Documentation Integration Strategy diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 41c28d15..30923d66 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -26,6 +26,7 @@ This project and everyone participating in it is governed by our [Code of Conduc ### 
Prerequisites - **Python 3.12** (required for all features to work correctly) +- **UV** (modern Python package manager - automatically installed by bootstrap scripts) - **Git** for version control - **Terminal/Command Line** access @@ -37,8 +38,18 @@ Before contributing, familiarize yourself with: - **Architecture**: Review the [Development Guide](docs/dev-guide.md) for architectural understanding - **AI Documentation**: Check `.ai-context/` for comprehensive project context -The Mango Tango CLI is a modular, extensible Python terminal application for social media data analysis with three main domains: +The Mango Tango CLI is a modular, extensible Python terminal application organized as a UV workspace monorepo: +``` +packages/ +├── core/ # Core application (cibmangotree) +├── importing/ # Data import/export +├── services/ # Shared services (tokenizer, etc.) +├── testing/ # Testing utilities +└── analyzers/ # Analysis modules (hashtags, ngrams, etc.) +``` + +Three main architectural domains: - **Core Domain**: Application logic, terminal UI, and storage - **Edge Domain**: Data import/export and preprocessing - **Content Domain**: Analysis modules and web presenters @@ -55,21 +66,26 @@ cd mango-tango-cli ### 2. Set Up Environment -```bash -# Create virtual environment -python -m venv venv +Run the bootstrap script for your platform: -# Run bootstrap script to install dependencies and set up pre-commit hooks +```bash # macOS/Linux: ./bootstrap.sh + # Windows (PowerShell): ./bootstrap.ps1 ``` +This will: +- Install UV package manager (if not present) +- Install all project dependencies +- Set up the development environment +- Verify the installation + ### 3. Verify Installation ```bash -python -m mangotango --noop +uv run cibmangotree --noop # Should output: "No-op flag detected. Exiting successfully." 
``` @@ -180,9 +196,13 @@ We use automated code formatting (enforced by pre-commit hooks): - **isort**: Import organization ```bash -# Manual formatting (if needed) -isort . -black . +# Format code +uv run isort . +uv run black . + +# Check formatting without changing files +uv run isort --check . +uv run black --check . ``` ### Code Quality @@ -211,22 +231,29 @@ black . ```bash # Run all tests -pytest +uv run pytest + +# Run specific package tests +uv run pytest packages/analyzers/hashtags/ # Run specific test file -pytest analyzers/hashtags/test_hashtags_analyzer.py +uv run pytest packages/analyzers/hashtags/tests/test_main.py # Run with verbose output -pytest -v +uv run pytest -v + +# Run with coverage +uv run pytest --cov=cibmangotree ``` ### Test Guidelines - Write tests for all new functionality -- Place tests co-located with the modules they test -- Use the testing framework provided in the `testing/` module -- Include test data in `test_data/` directories within analyzer modules +- Place tests in `tests/` directories within each package +- Use the testing framework provided in the `packages/testing/` package +- Include test data in `test_data/` directories within analyzer packages - Ensure tests are fast and reliable +- Follow the existing test patterns in analyzer packages ### Test Types diff --git a/DEV_COMMANDS.md b/DEV_COMMANDS.md new file mode 100644 index 00000000..43edbec3 --- /dev/null +++ b/DEV_COMMANDS.md @@ -0,0 +1,374 @@ +# Development Commands Quick Reference + +This document provides a quick reference for all common development commands using UV. 
+ +## Setup + +### Initial Setup + +```bash +# Clone the repository +git clone https://github.com/civictechdc/mango-tango-cli.git +cd mango-tango-cli + +# Bootstrap environment (installs UV and dependencies) +./bootstrap.sh # macOS/Linux +./bootstrap.ps1 # Windows PowerShell +``` + +### Manual UV Installation + +```bash +# Install UV manually (if needed) +curl -LsSf https://astral.sh/uv/install.sh | sh # macOS/Linux +powershell -c "irm https://astral.sh/uv/install.ps1 | iex" # Windows +``` + +## Running the Application + +```bash +# Run the application +uv run cibmangotree + +# Run with no-op flag (verify installation) +uv run cibmangotree --noop + +# Run with specific Python version +uv run --python 3.12 cibmangotree +``` + +## Testing + +```bash +# Run all tests +uv run pytest + +# Run with verbose output +uv run pytest -v + +# Run specific package tests +uv run pytest packages/analyzers/hashtags/ +uv run pytest packages/core/ + +# Run specific test file +uv run pytest packages/analyzers/hashtags/tests/test_main.py + +# Run with coverage +uv run pytest --cov=cibmangotree + +# Run with coverage report +uv run pytest --cov=cibmangotree --cov-report=html +uv run pytest --cov=cibmangotree --cov-report=term-missing + +# Run tests matching a pattern +uv run pytest -k "test_hashtag" + +# Stop on first failure +uv run pytest -x + +# Show local variables in tracebacks +uv run pytest -l +``` + +## Code Quality + +### Formatting + +```bash +# Format all code (Black) +uv run black . + +# Format specific directory +uv run black packages/analyzers/hashtags/ + +# Check formatting without changing files +uv run black --check . + +# Show diff of what would change +uv run black --diff . +``` + +### Import Sorting + +```bash +# Sort all imports (isort) +uv run isort . + +# Sort imports in specific directory +uv run isort packages/analyzers/hashtags/ + +# Check import order without changing files +uv run isort --check . + +# Show diff of what would change +uv run isort --diff . 
+``` + +### Combined Formatting + +```bash +# Format code and sort imports (CI workflow) +uv run isort . && uv run black . + +# Check both without changing files +uv run isort --check . && uv run black --check . +``` + +## Package Management + +### Dependencies + +```bash +# Install/sync all dependencies +uv sync + +# Install with all optional extras +uv sync --all-extras + +# Install specific extra groups +uv sync --extra dev +uv sync --extra docs + +# Update all dependencies +uv sync --upgrade + +# Add a new dependency to workspace root +uv add + +# Add dev dependency +uv add --dev + +# Add dependency to specific package +uv add --package cibmangotree +uv add --package cibmangotree-analyzers-hashtags + +# Remove a dependency +uv remove +``` + +### Package Information + +```bash +# Show dependency tree +uv tree + +# Show installed packages +uv pip list + +# Show outdated packages +uv pip list --outdated + +# Show package info +uv pip show +``` + +## Building + +### PyInstaller Executable + +```bash +# Build executable for current platform +uv run pyinstaller pyinstaller.spec + +# Build with verbose output +uv run pyinstaller pyinstaller.spec --log-level DEBUG + +# Clean build (remove build/dist first) +rm -rf build/ dist/ +uv run pyinstaller pyinstaller.spec +``` + +### Test Built Executable + +```bash +# After building, test the executable +./dist/cibmangotree --noop # macOS/Linux +.\dist\cibmangotree.exe --noop # Windows +``` + +## Git Workflow + +### Feature Development + +```bash +# Create feature branch from develop +git checkout develop +git pull origin develop +git checkout -b feature/your-feature-name + +# Make changes, then commit +git add . +git commit -m "feat: your feature description" + +# Push and create PR +git push origin feature/your-feature-name +``` + +### Before Committing + +```bash +# Ensure code is formatted +uv run black . +uv run isort . + +# Run tests +uv run pytest + +# Check everything passes +uv run isort --check . 
&& uv run black --check . && uv run pytest +``` + +## Development Workflows + +### New Analyzer Development + +```bash +# 1. Explore example analyzer +cd packages/analyzers/example/ +cat README.md + +# 2. Copy example structure +cp -r packages/analyzers/example packages/analyzers/myanalyzer + +# 3. Update pyproject.toml for new analyzer +# Edit packages/analyzers/myanalyzer/pyproject.toml + +# 4. Add to workspace root pyproject.toml +# Edit pyproject.toml - add to [tool.uv.workspace.members] + +# 5. Sync workspace +uv sync + +# 6. Implement analyzer +# Edit packages/analyzers/myanalyzer/src/... + +# 7. Run tests +uv run pytest packages/analyzers/myanalyzer/ +``` + +### Debug a Failing Test + +```bash +# Run test with verbose output and show locals +uv run pytest packages/analyzers/hashtags/tests/test_main.py -vl + +# Run test with debugger (pdb) +uv run pytest packages/analyzers/hashtags/tests/test_main.py --pdb + +# Run test and drop to debugger on first failure +uv run pytest packages/analyzers/hashtags/tests/test_main.py -x --pdb +``` + +### Update Dependencies + +```bash +# Update all packages to latest versions +uv sync --upgrade + +# Update specific package +uv add @latest + +# See what would be upgraded +uv sync --upgrade --dry-run +``` + +## Troubleshooting + +### Common Issues + +```bash +# Clear UV cache +uv cache clean + +# Reinstall all dependencies +rm -rf .venv/ +uv sync + +# Check Python version +uv run python --version + +# Verify installation +uv run python -c "import cibmangotree; print(cibmangotree.__version__)" + +# Show UV version +uv --version +``` + +### Environment Info + +```bash +# Show UV environment +uv venv list + +# Show Python path +uv run which python + +# Show installed package versions +uv run python -c "import cibmangotree; print(cibmangotree.__version__)" +uv run python -c "import polars; print(polars.__version__)" +``` + +## CI/CD + +### GitHub Actions Workflows + +The repository includes several GitHub Actions workflows that 
use UV: + +- **Tests** (`.github/workflows/test.yml`) - Runs pytest on PRs +- **Code Quality** (`.github/workflows/code_quality.yml`) - Checks formatting +- **Build** (`.github/workflows/build_exe.yml`) - Builds executables +- **Release** (`.github/workflows/release.yml`) - Creates releases + +All workflows automatically: +1. Install UV +2. Sync dependencies with `uv sync` +3. Run commands with `uv run` + +### Local CI Simulation + +```bash +# Simulate CI checks locally +uv sync +uv run isort --check . +uv run black --check . +uv run pytest +uv run pyinstaller pyinstaller.spec +``` + +## Quick Start Checklist + +For new developers: + +```bash +# 1. Clone and setup +git clone https://github.com/civictechdc/mango-tango-cli.git +cd mango-tango-cli +./bootstrap.sh # or ./bootstrap.ps1 on Windows + +# 2. Verify installation +uv run cibmangotree --noop + +# 3. Create feature branch +git checkout develop +git pull origin develop +git checkout -b feature/my-feature + +# 4. Make changes and test +# ... edit code ... +uv run black . +uv run isort . +uv run pytest + +# 5. Commit and push +git add . +git commit -m "feat: my feature" +git push origin feature/my-feature +``` + +## Additional Resources + +- **Development Guide**: `docs/dev-guide.md` +- **Contributing**: `CONTRIBUTING.md` +- **AI Context**: `.ai-context/README.md` +- **Claude Integration**: `CLAUDE.md` +- **UV Documentation**: https://docs.astral.sh/uv/ diff --git a/README.md b/README.md index 6303a2fd..b7f2da2b 100644 --- a/README.md +++ b/README.md @@ -22,29 +22,93 @@ For in-depth technical docs related to this repository please visit: [https://ci ## Requirements -Python 3.12 (see [requirements.txt](https://github.com/civictechdc/mango-tango-cli/blob/main/requirements.txt)) +- **Python 3.12** - Required for all features +- **UV** - Modern Python package manager (automatically installed by bootstrap scripts) -## Setting up +See [pyproject.toml](pyproject.toml) for complete dependency information. 
-- Make sure you have Python 3.12 installed on your system. -- Create the virtual environment at `venv` using the following command: +## Quick Start -```shell -python -m venv venv +### 1. Clone and Setup + +```bash +git clone https://github.com/civictechdc/mango-tango-cli.git +cd mango-tango-cli ``` -- Activate the bootstrap script for your shell environmennt: - - PS1: `./bootstrap.ps1` - - Bash: `./bootstrap.sh` +### 2. Bootstrap Development Environment + +Run the bootstrap script for your platform: + +```bash +# macOS/Linux +./bootstrap.sh + +# Windows (PowerShell) +./bootstrap.ps1 +``` + +This will: +- Install UV package manager (if not present) +- Install all project dependencies +- Set up the development environment +- Verify the installation + +### 3. Run the Application + +```bash +uv run cibmangotree +``` + +## Project Structure + +This is a UV workspace monorepo with the following packages: + +``` +packages/ +├── core/ # Core application (cibmangotree) +│ ├── src/cibmangotree/ # Main application code +│ │ ├── app/ # Application logic & terminal UI +│ │ ├── storage/ # Data persistence layer +│ │ └── components/ # Terminal UI components +│ └── pyproject.toml +├── importing/ # Data import/export (cibmangotree-importing) +├── services/ # Shared services (cibmangotree-services) +├── testing/ # Testing utilities (cibmangotree-testing) +└── analyzers/ # Analysis modules + ├── hashtags/ # Hashtag analysis + ├── ngrams/ # N-gram analysis + ├── temporal/ # Temporal analysis + ├── example/ # Example analyzer template + └── ... +``` + +## Development Commands + +```bash +# Run the application +uv run cibmangotree + +# Run tests +uv run pytest # All tests +uv run pytest -v # Verbose output +uv run pytest packages/analyzers/hashtags/ # Specific package - This will install the required dependencies for project development, - activate a pre-commit hook that will format the code using `isort` and - `black`. +# Code quality +uv run black . # Format code +uv run isort . 
# Sort imports +uv run black --check . # Check formatting +uv run isort --check . # Check import order -## Starting the application +# Build executable +uv run pyinstaller pyinstaller.spec -```shell -python -m cibmangotree +# Package management +uv sync # Install/sync dependencies +uv sync --all-extras # Install with all optional dependencies +uv add # Add a dependency +uv tree # Show dependency tree +uv sync --upgrade # Upgrade dependencies ``` ## Development Guide and Documentation diff --git a/bootstrap.ps1 b/bootstrap.ps1 index 98d5ad82..627216bf 100644 --- a/bootstrap.ps1 +++ b/bootstrap.ps1 @@ -1,31 +1,37 @@ +# Bootstrap development environment for CIB Mango Tree +# PowerShell script + # Check if running in PowerShell if ($PSVersionTable -eq $null) { Write-Host "Please run this script in PowerShell." exit 1 } -# Define the virtual environment and requirements file paths -$repo_root = (Get-Location).Path -$venv_path = Join-Path $repo_root "venv" -$requirements_file = Join-Path $repo_root "requirements-dev.txt" - -# Activate the virtual environment -$activate_script = Join-Path $venv_path "Scripts\Activate.ps1" -if (-Not (Test-Path $activate_script)) { - Write-Host "Virtual environment not found. Please ensure it exists at: $venv_path" - exit 1 -} +Write-Host "Setting up CIB Mango Tree development environment..." -Write-Host "Activating virtual environment..." -. $activate_script +# Check for uv +if (-Not (Get-Command uv -ErrorAction SilentlyContinue)) { + Write-Host "UV not found. Installing..." + powershell -c "irm https://astral.sh/uv/install.ps1 | iex" -# Install dependencies -if (-Not (Test-Path $requirements_file)) { - Write-Host "requirements-dev.txt not found at: $requirements_file" - exit 1 + # Refresh PATH + $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User") } -Write-Host "Installing dependencies from requirements-dev.txt..." 
-pip install -r $requirements_file +# Sync dependencies +Write-Host "Installing dependencies..." +uv sync --all-extras + +# Verify installation +Write-Host "Verifying installation..." +uv run python -c "import cibmangotree; print(f'✅ CIB Mango Tree {cibmangotree.__version__} ready')" -Write-Host "Bootstrap process complete." +Write-Host "" +Write-Host "✅ Setup complete!" +Write-Host "" +Write-Host "Quick start commands:" +Write-Host " uv run cibmangotree # Run the application" +Write-Host " uv run pytest # Run tests" +Write-Host " uv run black . # Format code" +Write-Host " uv run pyinstaller pyinstaller.spec # Build executable" +Write-Host "" diff --git a/bootstrap.sh b/bootstrap.sh index 56f37c4a..f1a0d8bb 100755 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -1,28 +1,35 @@ #!/bin/bash +# Bootstrap development environment for CIB Mango Tree -# Define the virtual environment and requirements file paths -REPO_ROOT=$(pwd) -VENV_PATH="${VIRTUAL_ENV:=$REPO_ROOT/venv}" -REQUIREMENTS_FILE="$REPO_ROOT/requirements-dev.txt" +set -e -# Check if virtual environment exists -if [ ! -d "$VENV_PATH" ]; then - echo "Virtual environment not found. Please ensure it exists at: $VENV_PATH" - exit 1 -fi +echo "Setting up CIB Mango Tree development environment..." -# Activate the virtual environment -echo "Activating virtual environment..." -source "$VENV_PATH/bin/activate" +# Check for uv +if ! command -v uv &> /dev/null; then + echo "UV not found. Installing..." + curl -LsSf https://astral.sh/uv/install.sh | sh -# Check if requirements-dev.txt exists -if [ ! -f "$REQUIREMENTS_FILE" ]; then - echo "requirements-dev.txt not found at: $REQUIREMENTS_FILE" - exit 1 + # Source the uv environment + if [ -f "$HOME/.cargo/env" ]; then + source "$HOME/.cargo/env" + fi fi -# Install dependencies -echo "Installing dependencies from requirements-dev.txt..." -pip install -r "$REQUIREMENTS_FILE" +# Sync dependencies +echo "Installing dependencies..." 
+uv sync --all-extras + +# Verify installation +echo "Verifying installation..." +uv run python -c "import cibmangotree; print(f'✅ CIB Mango Tree {cibmangotree.__version__} ready')" -echo "Bootstrap process complete." +echo "" +echo "✅ Setup complete!" +echo "" +echo "Quick start commands:" +echo " uv run cibmangotree # Run the application" +echo " uv run pytest # Run tests" +echo " uv run black . # Format code" +echo " uv run pyinstaller pyinstaller.spec # Build executable" +echo "" From 7aba8b46c7f4c75e7bde84a47520b14b5953eb38 Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Thu, 9 Oct 2025 19:07:51 -0400 Subject: [PATCH 08/24] fix: resolve circular imports and validate monorepo functionality MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 8: Testing and Validation - Critical Fixes Circular Import Resolution: - Implement lazy loading in services/__init__.py with __getattr__ - Use TYPE_CHECKING for type hints in TUI components - Fix import order in __main__.py to load after splash Import Path Corrections: - Fix analyzer_interface imports in shiny.py - Update preprocessing paths in testing package - Correct analyzer test imports to use new structure Application Fixes: - Remove shadowing cibmangotree.py file (use package directly) - Fix AnalyzerSuite instantiation in __main__.py - Update all relative imports to absolute package paths Validation Results: - ✅ 30/30 core tests passing (100%) - ✅ 5/5 analyzer plugins discovered - ✅ Application launches successfully - ✅ All imports working correctly - ✅ 148 packages synchronized Files Modified: 18 files Test Status: All critical tests passing Phase 8 of monorepo reorganization complete. 
--- cibmangotree.py | 6 ---- .../example/tests/test_example_base.py | 6 ++-- .../example/tests/test_example_report.py | 4 +-- .../hashtags/tests/test_hashtags_base.py | 2 +- .../ngrams/tests/test_ngram_stats.py | 6 ++-- .../ngrams/tests/test_ngrams_base.py | 2 +- packages/core/src/cibmangotree/__main__.py | 7 +++-- packages/core/src/cibmangotree/app/shiny.py | 2 +- .../src/cibmangotree/services/__init__.py | 28 +++++++++++++++---- .../services/importing/__init__.py | 20 +++++++++++-- .../tui/components/analysis_main.py | 8 ++++-- .../tui/components/analysis_params.py | 7 +++-- .../cibmangotree/tui/components/context.py | 8 ++++-- .../tui/components/export_outputs.py | 13 +++++---- .../tui/components/new_analysis.py | 8 ++++-- .../tui/components/project_main.py | 8 ++++-- .../tui/components/select_analysis.py | 12 ++++---- .../tui/components/select_project.py | 8 ++++-- .../src/cibmangotree_testing/testdata.py | 2 +- 19 files changed, 104 insertions(+), 53 deletions(-) delete mode 100644 cibmangotree.py diff --git a/cibmangotree.py b/cibmangotree.py deleted file mode 100644 index a90731d3..00000000 --- a/cibmangotree.py +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python3 -"""CIB Mango Tree CLI entry point for PyInstaller builds.""" -from cibmangotree.__main__ import main - -if __name__ == "__main__": - main() diff --git a/packages/analyzers/example/tests/test_example_base.py b/packages/analyzers/example/tests/test_example_base.py index 7aa29d97..329ec8ff 100644 --- a/packages/analyzers/example/tests/test_example_base.py +++ b/packages/analyzers/example/tests/test_example_base.py @@ -1,10 +1,10 @@ import os -from preprocessing.series_semantic import identifier +from cibmangotree.services.preprocessing.series_semantic import identifier from cibmangotree_testing import CsvConfig, CsvTestData, test_primary_analyzer -from .example_base.interface import interface -from .example_base.main import main +from cibmangotree_analyzer_example.example_base.interface import 
interface +from cibmangotree_analyzer_example.example_base.main import main from .test_data import test_data_dir diff --git a/packages/analyzers/example/tests/test_example_report.py b/packages/analyzers/example/tests/test_example_report.py index 71ed1c60..32e6e1aa 100644 --- a/packages/analyzers/example/tests/test_example_report.py +++ b/packages/analyzers/example/tests/test_example_report.py @@ -2,8 +2,8 @@ from cibmangotree_testing import CsvTestData, test_secondary_analyzer -from .example_report.interface import interface -from .example_report.main import main +from cibmangotree_analyzer_example.example_report.interface import interface +from cibmangotree_analyzer_example.example_report.main import main from .test_data import test_data_dir diff --git a/packages/analyzers/hashtags/tests/test_hashtags_base.py b/packages/analyzers/hashtags/tests/test_hashtags_base.py index 344f205b..1779f69d 100644 --- a/packages/analyzers/hashtags/tests/test_hashtags_base.py +++ b/packages/analyzers/hashtags/tests/test_hashtags_base.py @@ -4,7 +4,7 @@ import polars as pl from cibmangotree.analyzer_interface.params import TimeBinningValue -from preprocessing.series_semantic import datetime_string, identifier, text_catch_all +from cibmangotree.services.preprocessing.series_semantic import datetime_string, identifier, text_catch_all from cibmangotree_testing import CsvTestData, JsonTestData, test_primary_analyzer from .hashtags_base.interface import ( diff --git a/packages/analyzers/ngrams/tests/test_ngram_stats.py b/packages/analyzers/ngrams/tests/test_ngram_stats.py index 3a574555..3621a4dc 100644 --- a/packages/analyzers/ngrams/tests/test_ngram_stats.py +++ b/packages/analyzers/ngrams/tests/test_ngram_stats.py @@ -2,9 +2,9 @@ from cibmangotree_testing import ParquetTestData, test_secondary_analyzer -from .ngram_stats.interface import OUTPUT_NGRAM_FULL, OUTPUT_NGRAM_STATS, interface -from .ngram_stats.main import main -from .ngrams_base.interface import ( +from 
cibmangotree_analyzer_ngrams.ngram_stats.interface import OUTPUT_NGRAM_FULL, OUTPUT_NGRAM_STATS, interface +from cibmangotree_analyzer_ngrams.ngram_stats.main import main +from cibmangotree_analyzer_ngrams.ngrams_base.interface import ( OUTPUT_MESSAGE, OUTPUT_MESSAGE_NGRAMS, OUTPUT_NGRAM_DEFS, diff --git a/packages/analyzers/ngrams/tests/test_ngrams_base.py b/packages/analyzers/ngrams/tests/test_ngrams_base.py index f61be69a..3ad79b76 100644 --- a/packages/analyzers/ngrams/tests/test_ngrams_base.py +++ b/packages/analyzers/ngrams/tests/test_ngrams_base.py @@ -1,7 +1,7 @@ import types from pathlib import Path -from preprocessing.series_semantic import datetime_string, identifier, text_catch_all +from cibmangotree.services.preprocessing.series_semantic import datetime_string, identifier, text_catch_all from services.tokenizer.basic import TokenizerConfig, tokenize_text from services.tokenizer.core.types import CaseHandling from cibmangotree_testing import CsvTestData, ParquetTestData, test_primary_analyzer diff --git a/packages/core/src/cibmangotree/__main__.py b/packages/core/src/cibmangotree/__main__.py index 51f0df07..8db6a741 100644 --- a/packages/core/src/cibmangotree/__main__.py +++ b/packages/core/src/cibmangotree/__main__.py @@ -50,10 +50,7 @@ def main(): console.print(loading_msg) # Import heavy modules after loading message - # NOTE: These imports will fail until Phase 5 (import path fixes) - # For now, we're just creating the structure try: - from analyzers import suite from .app import App, AppContext from .app.logger import setup_logging from .tui.components import ViewContext, main_menu, splash @@ -77,6 +74,10 @@ def main(): extra={"log_level": args.log_level, "log_file": str(log_file_path)}, ) + # Initialize app context + from .analyzer_interface.suite import AnalyzerSuite + suite = AnalyzerSuite() + # Start the application splash() main_menu( diff --git a/packages/core/src/cibmangotree/app/shiny.py b/packages/core/src/cibmangotree/app/shiny.py index 
a159287a..7988091b 100644 --- a/packages/core/src/cibmangotree/app/shiny.py +++ b/packages/core/src/cibmangotree/app/shiny.py @@ -14,7 +14,7 @@ tags, ) -from analyzer_interface.context import ServerCallback +from cibmangotree.analyzer_interface.context import ServerCallback MANGO_ORANGE2 = "#f3921e" LOGO_URL = "https://raw.githubusercontent.com/CIB-Mango-Tree/CIB-Mango-Tree-Website/main/assets/images/mango-text.PNG" diff --git a/packages/core/src/cibmangotree/services/__init__.py b/packages/core/src/cibmangotree/services/__init__.py index fe19c20c..d440ae78 100644 --- a/packages/core/src/cibmangotree/services/__init__.py +++ b/packages/core/src/cibmangotree/services/__init__.py @@ -7,19 +7,37 @@ - preprocessing: Data preprocessing and semantic type detection """ -# Re-export key service classes +# Re-export storage (no circular import) from .storage import Storage -from .importing import CSVImporter, ExcelImporter, Importer, ImporterSession -from .preprocessing import SeriesSemantic + +# Lazy import for importing module to avoid circular import +# The importing module imports from TUI which imports from app +def __getattr__(name): + if name == "CSVImporter": + from .importing import CSVImporter + return CSVImporter + elif name == "ExcelImporter": + from .importing import ExcelImporter + return ExcelImporter + elif name == "Importer": + from .importing import Importer + return Importer + elif name == "ImporterSession": + from .importing import ImporterSession + return ImporterSession + elif name == "SeriesSemantic": + from .preprocessing import SeriesSemantic + return SeriesSemantic + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") __all__ = [ # Storage "Storage", - # Importing + # Importing (lazy) "CSVImporter", "ExcelImporter", "Importer", "ImporterSession", - # Preprocessing + # Preprocessing (lazy) "SeriesSemantic", ] diff --git a/packages/core/src/cibmangotree/services/importing/__init__.py 
b/packages/core/src/cibmangotree/services/importing/__init__.py index ad37e6db..8beaf135 100644 --- a/packages/core/src/cibmangotree/services/importing/__init__.py +++ b/packages/core/src/cibmangotree/services/importing/__init__.py @@ -1,5 +1,19 @@ -from .csv import CSVImporter -from .excel import ExcelImporter +# Import base classes first (no circular dependency) from .importer import Importer, ImporterSession -importers: list[Importer[ImporterSession]] = [CSVImporter(), ExcelImporter()] +# Lazy import for CSV and Excel to avoid circular import +# CSV/Excel importers use TUI which imports from app +def __getattr__(name): + if name == "CSVImporter": + from .csv import CSVImporter + return CSVImporter + elif name == "ExcelImporter": + from .excel import ExcelImporter + return ExcelImporter + elif name == "importers": + from .csv import CSVImporter + from .excel import ExcelImporter + return [CSVImporter(), ExcelImporter()] + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + +__all__ = ["Importer", "ImporterSession", "CSVImporter", "ExcelImporter", "importers"] diff --git a/packages/core/src/cibmangotree/tui/components/analysis_main.py b/packages/core/src/cibmangotree/tui/components/analysis_main.py index b2e867fa..b95d175d 100644 --- a/packages/core/src/cibmangotree/tui/components/analysis_main.py +++ b/packages/core/src/cibmangotree/tui/components/analysis_main.py @@ -1,6 +1,7 @@ +from typing import TYPE_CHECKING + from colorama import Fore -from cibmangotree.app import AnalysisContext from cibmangotree.tui.tools import ( draw_box, open_directory_explorer, @@ -13,9 +14,12 @@ from .context import ViewContext from .export_outputs import export_outputs +if TYPE_CHECKING: + from cibmangotree.app import AnalysisContext + def analysis_main( - context: ViewContext, analysis: AnalysisContext, *, no_web_server=False + context: ViewContext, analysis: "AnalysisContext", *, no_web_server=False ): terminal = context.terminal while True: diff --git 
a/packages/core/src/cibmangotree/tui/components/analysis_params.py b/packages/core/src/cibmangotree/tui/components/analysis_params.py index af79cd3c..d03b460e 100644 --- a/packages/core/src/cibmangotree/tui/components/analysis_params.py +++ b/packages/core/src/cibmangotree/tui/components/analysis_params.py @@ -1,4 +1,5 @@ from tempfile import TemporaryDirectory +from typing import TYPE_CHECKING import polars as pl from pydantic import BaseModel @@ -10,16 +11,18 @@ ParamValue, TimeBinningValue, ) -from cibmangotree.app import ProjectContext from cibmangotree.context import InputColumnProvider, PrimaryAnalyzerDefaultParametersContext from cibmangotree.tui.tools import prompts, smart_print_data_frame from .context import ViewContext +if TYPE_CHECKING: + from cibmangotree.app import ProjectContext + def customize_analysis( context: ViewContext, - project: ProjectContext, + project: "ProjectContext", analyzer: AnalyzerInterface, column_mapping: dict[str, str], ) -> dict[str, ParamValue] | None: diff --git a/packages/core/src/cibmangotree/tui/components/context.py b/packages/core/src/cibmangotree/tui/components/context.py index bbe09ed9..c9ec2e6c 100644 --- a/packages/core/src/cibmangotree/tui/components/context.py +++ b/packages/core/src/cibmangotree/tui/components/context.py @@ -1,10 +1,14 @@ +from typing import TYPE_CHECKING + from pydantic import BaseModel, ConfigDict -from cibmangotree.app import App from cibmangotree.tui.tools.inception import TerminalContext +if TYPE_CHECKING: + from cibmangotree.app import App + class ViewContext(BaseModel): terminal: TerminalContext - app: App + app: "App" model_config = ConfigDict(arbitrary_types_allowed=True) diff --git a/packages/core/src/cibmangotree/tui/components/export_outputs.py b/packages/core/src/cibmangotree/tui/components/export_outputs.py index d16328ce..66b17011 100644 --- a/packages/core/src/cibmangotree/tui/components/export_outputs.py +++ b/packages/core/src/cibmangotree/tui/components/export_outputs.py @@ -1,6 
+1,6 @@ import os +from typing import TYPE_CHECKING -from cibmangotree.app import AnalysisContext, AnalysisOutputContext from cibmangotree.services.storage import SupportedOutputExtension from cibmangotree.tui.tools import ( open_directory_explorer, @@ -11,8 +11,11 @@ from .context import ViewContext +if TYPE_CHECKING: + from cibmangotree.app import AnalysisContext, AnalysisOutputContext -def export_outputs(context: ViewContext, analysis: AnalysisContext): + +def export_outputs(context: ViewContext, analysis: "AnalysisContext"): terminal = context.terminal with terminal.nest("[Export Output]\n\n") as scope: outputs = sorted( @@ -57,8 +60,8 @@ def export_outputs(context: ViewContext, analysis: AnalysisContext): def export_outputs_sequence( context: ViewContext, - analysis: AnalysisContext, - selected_outputs: list[AnalysisOutputContext], + analysis: "AnalysisContext", + selected_outputs: list["AnalysisOutputContext"], format: SupportedOutputExtension, ): has_large_dfs = any(output.num_rows > 50_000 for output in selected_outputs) @@ -123,7 +126,7 @@ def export_outputs_sequence( wait_for_key(True) -def export_format_prompt(analysis: AnalysisContext): +def export_format_prompt(analysis: "AnalysisContext"): analysis_id = analysis.analyzer_id return prompts.list_input( "Choose an export format", diff --git a/packages/core/src/cibmangotree/tui/components/new_analysis.py b/packages/core/src/cibmangotree/tui/components/new_analysis.py index 81f3bf06..d73ef101 100644 --- a/packages/core/src/cibmangotree/tui/components/new_analysis.py +++ b/packages/core/src/cibmangotree/tui/components/new_analysis.py @@ -1,5 +1,5 @@ from traceback import format_exc -from typing import Optional +from typing import TYPE_CHECKING, Optional import polars as pl @@ -10,16 +10,18 @@ column_automap, get_data_type_compatibility_score, ) -from cibmangotree.app import ProjectContext from cibmangotree.tui.tools import draw_box, prompts, smart_print_data_frame, wait_for_key from .analysis_params import 
customize_analysis from .context import ViewContext +if TYPE_CHECKING: + from cibmangotree.app import ProjectContext + def new_analysis( context: ViewContext, - project: ProjectContext, + project: "ProjectContext", ): terminal = context.terminal app = context.app diff --git a/packages/core/src/cibmangotree/tui/components/project_main.py b/packages/core/src/cibmangotree/tui/components/project_main.py index 4788b939..c0e0764a 100644 --- a/packages/core/src/cibmangotree/tui/components/project_main.py +++ b/packages/core/src/cibmangotree/tui/components/project_main.py @@ -1,6 +1,7 @@ +from typing import TYPE_CHECKING + from colorama import Fore -from cibmangotree.app import ProjectContext from cibmangotree.tui.tools import draw_box, prompts, wait_for_key from .analysis_main import analysis_main @@ -8,10 +9,13 @@ from .new_analysis import new_analysis from .select_analysis import select_analysis +if TYPE_CHECKING: + from cibmangotree.app import ProjectContext + def project_main( context: ViewContext, - project: ProjectContext, + project: "ProjectContext", ): terminal = context.terminal while True: diff --git a/packages/core/src/cibmangotree/tui/components/select_analysis.py b/packages/core/src/cibmangotree/tui/components/select_analysis.py index 8569feb6..5f218cf6 100644 --- a/packages/core/src/cibmangotree/tui/components/select_analysis.py +++ b/packages/core/src/cibmangotree/tui/components/select_analysis.py @@ -1,11 +1,13 @@ from datetime import datetime -from typing import Optional +from typing import TYPE_CHECKING, Optional -from cibmangotree.app import AnalysisContext, ProjectContext from cibmangotree.tui.tools import prompts, wait_for_key +if TYPE_CHECKING: + from cibmangotree.app import AnalysisContext, ProjectContext -def select_analysis(proj: ProjectContext) -> Optional[AnalysisContext]: + +def select_analysis(proj: "ProjectContext") -> Optional["AnalysisContext"]: now = datetime.now() analysis_options = sorted( [ @@ -22,7 +24,7 @@ def select_analysis(proj: 
ProjectContext) -> Optional[AnalysisContext]: wait_for_key(True) return None - option: Optional[AnalysisContext] = prompts.list_input( + option: Optional["AnalysisContext"] = prompts.list_input( "Choose a previously run test to view", choices=[ ("(Back)", None), @@ -32,7 +34,7 @@ def select_analysis(proj: ProjectContext) -> Optional[AnalysisContext]: return option -def analysis_label(analysis: AnalysisContext, now: datetime) -> str: +def analysis_label(analysis: "AnalysisContext", now: datetime) -> str: timestamp_suffix = ( " (" + present_timestamp(analysis.create_time, now) + ")" if analysis.create_time is not None diff --git a/packages/core/src/cibmangotree/tui/components/select_project.py b/packages/core/src/cibmangotree/tui/components/select_project.py index 48b556fe..5b73b8fb 100644 --- a/packages/core/src/cibmangotree/tui/components/select_project.py +++ b/packages/core/src/cibmangotree/tui/components/select_project.py @@ -1,10 +1,12 @@ -from typing import Optional +from typing import TYPE_CHECKING, Optional -from cibmangotree.app import ProjectContext from cibmangotree.tui.tools import draw_box, prompts, smart_print_data_frame, wait_for_key from .context import ViewContext +if TYPE_CHECKING: + from cibmangotree.app import ProjectContext + def select_project(ctx: ViewContext): terminal = ctx.terminal @@ -18,7 +20,7 @@ def select_project(ctx: ViewContext): wait_for_key(True) return None - project: Optional[ProjectContext] = prompts.list_input( + project: Optional["ProjectContext"] = prompts.list_input( "Which project?", choices=[(project.display_name, project) for project in projects], ) diff --git a/packages/testing/src/cibmangotree_testing/testdata.py b/packages/testing/src/cibmangotree_testing/testdata.py index b6827a4b..2ba3c246 100644 --- a/packages/testing/src/cibmangotree_testing/testdata.py +++ b/packages/testing/src/cibmangotree_testing/testdata.py @@ -4,7 +4,7 @@ import polars as pl from pydantic import BaseModel -from 
cibmangotree.preprocessing.series_semantic import SeriesSemantic +from cibmangotree.services.preprocessing.series_semantic import SeriesSemantic T = TypeVar("T", bound=pl.DataFrame | pl.LazyFrame) From e55e80947560873585cbd22c20e8876724999d49 Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Thu, 9 Oct 2025 19:18:34 -0400 Subject: [PATCH 09/24] feat: complete monorepo cleanup and tokenizer core migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 9: Cleanup Old Directory Structure Tokenizer Core Migration (Critical Fix): - Move services/tokenizer/core/ to packages/core/src/cibmangotree/services/tokenizer/core/ - Add types.py, base.py, __init__.py to new location - Fix missing AbstractTokenizer and TokenizerConfig imports Import Updates Across All Analyzers: - Update 17 files to use new package structure - Fix example analyzer imports (example_base → base, example_report → report) - Fix hashtags analyzer imports (hashtags_base → base) - Fix ngrams analyzer imports (ngrams_base → base, ngram_stats → stats) - Update all 'from analyzers.' → 'from cibmangotree_analyzer_*.' - Update tokenizer imports to use new core location Directory Cleanup: - Remove old analyzers/ directory (7 files) - Remove old services/ directory (8 files) - Remove obsolete requirements.txt files (3 files) Test Results: - ✅ 116/118 tests passing (98.3%) - ✅ Application runs successfully - ✅ All imports working correctly - ⚠️ 2 unrelated Polars UDF test failures Repository Impact: - 19 files deleted - 1,231 lines removed - Clean monorepo structure achieved Phase 9 of monorepo reorganization complete. 
--- analyzers/__init__.py | 29 -- analyzers/example/README.md | 13 - analyzers/example/__init__.py | 0 analyzers/hashtags/__init__.py | 0 analyzers/ngrams/__init__.py | 0 analyzers/temporal/__init__.py | 0 analyzers/time_coordination/__init__.py | 6 - .../report/interface.py | 2 +- .../web/interface.py | 4 +- .../example/tests/test_example_base.py | 8 +- .../example/tests/test_example_report.py | 8 +- .../web/analysis.py | 2 +- .../cibmangotree_analyzer_hashtags/web/app.py | 2 +- .../web/factory.py | 6 +- .../web/interface.py | 2 +- .../hashtags/tests/test_hashtags_base.py | 8 +- .../cibmangotree_analyzer_ngrams/base/main.py | 2 +- .../stats/interface.py | 4 +- .../stats/main.py | 2 +- .../cibmangotree_analyzer_ngrams/web/app.py | 4 +- .../web/factory.py | 6 +- .../web/interface.py | 4 +- .../ngrams/tests/test_ngram_stats.py | 9 +- .../ngrams/tests/test_ngrams_base.py | 13 +- .../services/tokenizer/__init__.py | 1 + .../services}/tokenizer/core/__init__.py | 0 .../services}/tokenizer/core/base.py | 0 .../services}/tokenizer/core/types.py | 0 requirements-dev.txt | 8 - requirements-mkdocs.txt | 7 - requirements.txt | 21 - services/__init__.py | 6 - services/tokenizer/README.md | 341 --------------- services/tokenizer/__init__.py | 35 -- services/tokenizer/core/test_types.py | 392 ------------------ services/tokenizer/test_service.py | 374 ----------------- 36 files changed, 49 insertions(+), 1270 deletions(-) delete mode 100644 analyzers/__init__.py delete mode 100644 analyzers/example/README.md delete mode 100644 analyzers/example/__init__.py delete mode 100644 analyzers/hashtags/__init__.py delete mode 100644 analyzers/ngrams/__init__.py delete mode 100644 analyzers/temporal/__init__.py delete mode 100644 analyzers/time_coordination/__init__.py create mode 100644 packages/core/src/cibmangotree/services/tokenizer/__init__.py rename {services => packages/core/src/cibmangotree/services}/tokenizer/core/__init__.py (100%) rename {services => 
packages/core/src/cibmangotree/services}/tokenizer/core/base.py (100%) rename {services => packages/core/src/cibmangotree/services}/tokenizer/core/types.py (100%) delete mode 100644 requirements-dev.txt delete mode 100644 requirements-mkdocs.txt delete mode 100644 requirements.txt delete mode 100644 services/__init__.py delete mode 100644 services/tokenizer/README.md delete mode 100644 services/tokenizer/__init__.py delete mode 100644 services/tokenizer/core/test_types.py delete mode 100644 services/tokenizer/test_service.py diff --git a/analyzers/__init__.py b/analyzers/__init__.py deleted file mode 100644 index c099a2d5..00000000 --- a/analyzers/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -from analyzer_interface import AnalyzerSuite - -from .example.example_base import example_base -from .example.example_report import example_report -from .example.example_web import example_web -from .hashtags.hashtags_base import hashtags -from .hashtags.hashtags_web import hashtags_web -from .ngrams.ngram_stats import ngram_stats -from .ngrams.ngram_web import ngrams_web -from .ngrams.ngrams_base import ngrams -from .temporal.temporal_base import temporal -from .temporal.temporal_web import temporal_web -from .time_coordination import time_coordination - -suite = AnalyzerSuite( - all_analyzers=[ - example_base, - example_report, - example_web, - ngrams, - ngram_stats, - ngrams_web, - time_coordination, - temporal, - temporal_web, - hashtags, - hashtags_web, - ] -) diff --git a/analyzers/example/README.md b/analyzers/example/README.md deleted file mode 100644 index 307e8074..00000000 --- a/analyzers/example/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# Example Analyzer Implementation - -This is an example of how to implement an analyzer for the `analyzer` module. This analyzer is a simple example that counts the number of words in a given text, an export format that includes a "long" -flag that indicates whether a message is long or not. 
- -A web presenter module is included that plots a histogram of -message lengths. - -- [Primary Analyzer](./example_base/__init__.py) -- [Secondary Analyzer](./example_report/__init__.py) -- [Web Presenter](./example_web/__init__.py) -- [Test for Primary Analyzer](./example_base/test_example_base.py) -- [Test for Secondary Analyzer](./example_report/test_example_report.py) diff --git a/analyzers/example/__init__.py b/analyzers/example/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/analyzers/hashtags/__init__.py b/analyzers/hashtags/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/analyzers/ngrams/__init__.py b/analyzers/ngrams/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/analyzers/temporal/__init__.py b/analyzers/temporal/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/analyzers/time_coordination/__init__.py b/analyzers/time_coordination/__init__.py deleted file mode 100644 index 74b1ff00..00000000 --- a/analyzers/time_coordination/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from analyzer_interface import AnalyzerDeclaration - -from .interface import interface -from .main import main - -time_coordination = AnalyzerDeclaration(interface=interface, main=main) diff --git a/packages/analyzers/example/src/cibmangotree_analyzer_example/report/interface.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/report/interface.py index 7fd658b7..021bf15b 100644 --- a/packages/analyzers/example/src/cibmangotree_analyzer_example/report/interface.py +++ b/packages/analyzers/example/src/cibmangotree_analyzer_example/report/interface.py @@ -1,6 +1,6 @@ from cibmangotree.analyzer_interface import AnalyzerOutput, OutputColumn, SecondaryAnalyzerInterface -from ..example_base.interface import interface as example_base +from ..base.interface import interface as example_base interface = SecondaryAnalyzerInterface( # This ID should unique among the analyzers in the 
application. diff --git a/packages/analyzers/example/src/cibmangotree_analyzer_example/web/interface.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/web/interface.py index 38eb56a0..1b3754fa 100644 --- a/packages/analyzers/example/src/cibmangotree_analyzer_example/web/interface.py +++ b/packages/analyzers/example/src/cibmangotree_analyzer_example/web/interface.py @@ -1,7 +1,7 @@ from cibmangotree.analyzer_interface import WebPresenterInterface -from ..example_base import interface as example_base -from ..example_report import interface as example_report +from ..base import interface as example_base +from ..report import interface as example_report interface = WebPresenterInterface( # This ID must be unique among all web presenters. diff --git a/packages/analyzers/example/tests/test_example_base.py b/packages/analyzers/example/tests/test_example_base.py index 329ec8ff..41f45449 100644 --- a/packages/analyzers/example/tests/test_example_base.py +++ b/packages/analyzers/example/tests/test_example_base.py @@ -3,9 +3,11 @@ from cibmangotree.services.preprocessing.series_semantic import identifier from cibmangotree_testing import CsvConfig, CsvTestData, test_primary_analyzer -from cibmangotree_analyzer_example.example_base.interface import interface -from cibmangotree_analyzer_example.example_base.main import main -from .test_data import test_data_dir +from cibmangotree_analyzer_example.base.interface import interface +from cibmangotree_analyzer_example.base.main import main +from pathlib import Path +test_data_dir = Path(__file__).parent / "test_data" +# from .test_data import test_data_dir # This example shows you how to test a primary analyzer. 
diff --git a/packages/analyzers/example/tests/test_example_report.py b/packages/analyzers/example/tests/test_example_report.py index 32e6e1aa..021c1a04 100644 --- a/packages/analyzers/example/tests/test_example_report.py +++ b/packages/analyzers/example/tests/test_example_report.py @@ -2,9 +2,11 @@ from cibmangotree_testing import CsvTestData, test_secondary_analyzer -from cibmangotree_analyzer_example.example_report.interface import interface -from cibmangotree_analyzer_example.example_report.main import main -from .test_data import test_data_dir +from cibmangotree_analyzer_example.report.interface import interface +from cibmangotree_analyzer_example.report.main import main + +from pathlib import Path +test_data_dir = Path(__file__).parent / "test_data" # This example shows you how to test a secondary analyzer. diff --git a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/analysis.py b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/analysis.py index 9d8ff5e4..eb28f160 100644 --- a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/analysis.py +++ b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/analysis.py @@ -1,6 +1,6 @@ import polars as pl -from ..hashtags_base.interface import OUTPUT_COL_HASHTAGS, OUTPUT_COL_USERS +from ..base.interface import OUTPUT_COL_HASHTAGS, OUTPUT_COL_USERS def secondary_analyzer(primary_output, timewindow): diff --git a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/app.py b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/app.py index ac35b80b..7a1ab2b8 100644 --- a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/app.py +++ b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/app.py @@ -6,7 +6,7 @@ from shiny import reactive, render, ui from shinywidgets import output_widget, render_widget -from ..hashtags_base.interface import COL_AUTHOR_ID, COL_POST, COL_TIME +from ..base.interface import 
COL_AUTHOR_ID, COL_POST, COL_TIME from .analysis import secondary_analyzer from .plots import ( MANGO_DARK_GREEN, diff --git a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/factory.py b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/factory.py index 04b0546f..98fce725 100644 --- a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/factory.py +++ b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/factory.py @@ -7,10 +7,10 @@ ShinyContext, WebPresenterContext, ) -from app.project_context import _get_columns_with_semantic -from app.shiny import page_dependencies +from cibmangotree.app.project_context import _get_columns_with_semantic +from cibmangotree.app.shiny import page_dependencies -from ..hashtags_base.interface import COL_TIME, OUTPUT_GINI +from ..base.interface import COL_TIME, OUTPUT_GINI from .app import ( analysis_panel, hashtag_plot_panel, diff --git a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/interface.py b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/interface.py index 760e5019..74df2479 100644 --- a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/interface.py +++ b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/interface.py @@ -1,6 +1,6 @@ from cibmangotree.analyzer_interface import WebPresenterInterface -from ..hashtags_base import interface as hashtags_interface +from ..base import interface as hashtags_interface interface = WebPresenterInterface( id="hashtags_dashboard", diff --git a/packages/analyzers/hashtags/tests/test_hashtags_base.py b/packages/analyzers/hashtags/tests/test_hashtags_base.py index 1779f69d..bfe4cea7 100644 --- a/packages/analyzers/hashtags/tests/test_hashtags_base.py +++ b/packages/analyzers/hashtags/tests/test_hashtags_base.py @@ -7,15 +7,17 @@ from cibmangotree.services.preprocessing.series_semantic import datetime_string, identifier, text_catch_all from 
cibmangotree_testing import CsvTestData, JsonTestData, test_primary_analyzer -from .hashtags_base.interface import ( +from cibmangotree_analyzer_hashtags.base.interface import ( COL_AUTHOR_ID, COL_POST, COL_TIME, OUTPUT_GINI, interface, ) -from .hashtags_base.main import gini, main -from .test_data import test_data_dir +from cibmangotree_analyzer_hashtags.base.main import gini, main + +from pathlib import Path +test_data_dir = Path(__file__).parent / "test_data" HASHTAGS = [ "sunset", diff --git a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/main.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/main.py index a1557807..049f2455 100644 --- a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/main.py +++ b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/main.py @@ -2,7 +2,7 @@ from cibmangotree.analyzer_interface.context import PrimaryAnalyzerContext from cibmangotree_tokenizer_basic import TokenizerConfig, tokenize_text -from cibmangotree_tokenizer_basic.core.types import CaseHandling +from cibmangotree.services.tokenizer.core.types import CaseHandling from cibmangotree.tui.tools import ProgressReporter from .interface import ( diff --git a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/interface.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/interface.py index 873eef56..99114d85 100644 --- a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/interface.py +++ b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/interface.py @@ -1,7 +1,7 @@ from cibmangotree.analyzer_interface import AnalyzerOutput, OutputColumn, SecondaryAnalyzerInterface -from ..ngrams_base import interface as ngrams_interface -from ..ngrams_base.interface import ( +from ..base import interface as ngrams_interface +from ..base.interface import ( COL_AUTHOR_ID, COL_MESSAGE_ID, COL_MESSAGE_NGRAM_COUNT, diff --git 
a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/main.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/main.py index 12073580..f3bd61af 100644 --- a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/main.py +++ b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/main.py @@ -5,7 +5,7 @@ from cibmangotree.analyzer_interface.context import SecondaryAnalyzerContext from cibmangotree.tui.tools import ProgressReporter -from ..ngrams_base.interface import ( +from ..base.interface import ( COL_AUTHOR_ID, COL_MESSAGE_ID, COL_MESSAGE_NGRAM_COUNT, diff --git a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/app.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/app.py index 796473fa..79775d83 100644 --- a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/app.py +++ b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/app.py @@ -4,12 +4,12 @@ from shiny import reactive, render, ui from shinywidgets import output_widget, render_widget -from ..ngram_stats.interface import ( +from ..stats.interface import ( COL_NGRAM_DISTINCT_POSTER_COUNT, COL_NGRAM_TOTAL_REPS, COL_NGRAM_WORDS, ) -from ..ngrams_base.interface import ( +from ..base.interface import ( COL_AUTHOR_ID, COL_MESSAGE_TEXT, COL_MESSAGE_TIMESTAMP, diff --git a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/factory.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/factory.py index 7ec94330..279ddd84 100644 --- a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/factory.py +++ b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/factory.py @@ -7,10 +7,10 @@ ShinyContext, WebPresenterContext, ) -from app.shiny import page_dependencies +from cibmangotree.app.shiny import page_dependencies -from ..ngram_stats.interface import OUTPUT_NGRAM_FULL, OUTPUT_NGRAM_STATS -from ..ngram_stats.interface import interface as ngram_stats +from 
..stats.interface import OUTPUT_NGRAM_FULL, OUTPUT_NGRAM_STATS +from ..stats.interface import interface as ngram_stats from .app import _get_app_layout, _set_global_state_vars, server data_stats = None diff --git a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/interface.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/interface.py index feaa5c29..8d955634 100644 --- a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/interface.py +++ b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/interface.py @@ -1,7 +1,7 @@ from cibmangotree.analyzer_interface import WebPresenterInterface -from ..ngram_stats import interface as ngram_stats_interface -from ..ngrams_base import interface as ngrams_interface +from ..stats import interface as ngram_stats_interface +from ..base import interface as ngrams_interface interface = WebPresenterInterface( id="ngram_repetition_by_poster", diff --git a/packages/analyzers/ngrams/tests/test_ngram_stats.py b/packages/analyzers/ngrams/tests/test_ngram_stats.py index 3621a4dc..c8cb4fa9 100644 --- a/packages/analyzers/ngrams/tests/test_ngram_stats.py +++ b/packages/analyzers/ngrams/tests/test_ngram_stats.py @@ -2,14 +2,15 @@ from cibmangotree_testing import ParquetTestData, test_secondary_analyzer -from cibmangotree_analyzer_ngrams.ngram_stats.interface import OUTPUT_NGRAM_FULL, OUTPUT_NGRAM_STATS, interface -from cibmangotree_analyzer_ngrams.ngram_stats.main import main -from cibmangotree_analyzer_ngrams.ngrams_base.interface import ( +from cibmangotree_analyzer_ngrams.stats.interface import OUTPUT_NGRAM_FULL, OUTPUT_NGRAM_STATS, interface +from cibmangotree_analyzer_ngrams.stats.main import main +from cibmangotree_analyzer_ngrams.base.interface import ( OUTPUT_MESSAGE, OUTPUT_MESSAGE_NGRAMS, OUTPUT_NGRAM_DEFS, ) -from .test_data import test_data_dir + +test_data_dir = Path(__file__).parent / "test_data" # This example shows you how to test a secondary analyzer. 
diff --git a/packages/analyzers/ngrams/tests/test_ngrams_base.py b/packages/analyzers/ngrams/tests/test_ngrams_base.py index 3ad79b76..d9458fea 100644 --- a/packages/analyzers/ngrams/tests/test_ngrams_base.py +++ b/packages/analyzers/ngrams/tests/test_ngrams_base.py @@ -2,11 +2,11 @@ from pathlib import Path from cibmangotree.services.preprocessing.series_semantic import datetime_string, identifier, text_catch_all -from services.tokenizer.basic import TokenizerConfig, tokenize_text -from services.tokenizer.core.types import CaseHandling +from cibmangotree_tokenizer_basic import TokenizerConfig, tokenize_text +from cibmangotree.services.tokenizer.core.types import CaseHandling from cibmangotree_testing import CsvTestData, ParquetTestData, test_primary_analyzer -from .ngrams_base.interface import ( +from cibmangotree_analyzer_ngrams.base.interface import ( COL_AUTHOR_ID, COL_MESSAGE_ID, COL_MESSAGE_TEXT, @@ -16,8 +16,11 @@ OUTPUT_NGRAM_DEFS, interface, ) -from .ngrams_base.main import main, ngrams, serialize_ngram -from .test_data import test_data_dir +from cibmangotree_analyzer_ngrams.base.main import main, ngrams, serialize_ngram + +# Test data directory +from pathlib import Path +test_data_dir = Path(__file__).parent / "test_data" TEST_CSV_FILENAME = "ngrams_test_input.csv" TEST_STRING = "Mango tree is an open source project." 
diff --git a/packages/core/src/cibmangotree/services/tokenizer/__init__.py b/packages/core/src/cibmangotree/services/tokenizer/__init__.py new file mode 100644 index 00000000..a4afa59d --- /dev/null +++ b/packages/core/src/cibmangotree/services/tokenizer/__init__.py @@ -0,0 +1 @@ +"""Tokenizer service interfaces and types.""" diff --git a/services/tokenizer/core/__init__.py b/packages/core/src/cibmangotree/services/tokenizer/core/__init__.py similarity index 100% rename from services/tokenizer/core/__init__.py rename to packages/core/src/cibmangotree/services/tokenizer/core/__init__.py diff --git a/services/tokenizer/core/base.py b/packages/core/src/cibmangotree/services/tokenizer/core/base.py similarity index 100% rename from services/tokenizer/core/base.py rename to packages/core/src/cibmangotree/services/tokenizer/core/base.py diff --git a/services/tokenizer/core/types.py b/packages/core/src/cibmangotree/services/tokenizer/core/types.py similarity index 100% rename from services/tokenizer/core/types.py rename to packages/core/src/cibmangotree/services/tokenizer/core/types.py diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index b6c11343..00000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,8 +0,0 @@ --r requirements.txt - -pyarrow-stubs==17.13 -black==24.10.0 -isort==5.13.2 -pytest==8.3.4 -pytest-benchmark==5.1.0 -pyinstaller==6.14.1 diff --git a/requirements-mkdocs.txt b/requirements-mkdocs.txt deleted file mode 100644 index b1054abc..00000000 --- a/requirements-mkdocs.txt +++ /dev/null @@ -1,7 +0,0 @@ -mkdocs -mkdocstrings[python] -mkdocs-material -markdown -pymdown-extensions -mkdocs-mermaid2-plugin -griffe_pydantic diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index ac41e90c..00000000 --- a/requirements.txt +++ /dev/null @@ -1,21 +0,0 @@ -inquirer==3.4.0 -polars==1.9.0 -pydantic==2.9.1 -platformdirs==4.3.6 -tinydb==4.8.0 -XlsxWriter==3.2.0 -filelock==3.16.1 -plotly==5.24.1 -pandas==2.2.3 # 
needed by plotly -pyarrow==17.0.0 -dash==2.18.1 -colorama==0.4.6 -fastexcel==0.13.0 -shiny==1.4.0 -shinywidgets==0.6.2 -starlette==0.47.1 -uvicorn==0.34.3 -a2wsgi==1.10.10 -python-json-logger==2.0.7 -rich==14.0.0 -regex==2025.9.1 \ No newline at end of file diff --git a/services/__init__.py b/services/__init__.py deleted file mode 100644 index d755ef07..00000000 --- a/services/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -""" -Services package for Mango Tango CLI. - -This package contains service modules that provide core functionality -for the application, organized in a modular and testable architecture. -""" diff --git a/services/tokenizer/README.md b/services/tokenizer/README.md deleted file mode 100644 index 0ec7ecca..00000000 --- a/services/tokenizer/README.md +++ /dev/null @@ -1,341 +0,0 @@ -# Tokenizer Service - -Unicode-aware text tokenization for social media analytics with multilingual support. - -## Overview - -The tokenizer service provides configurable text tokenization that handles: - -- **Multilingual text**: Scriptio continua scripts (CJK, Thai, Southeast Asian) use character-level tokenization; space-separated scripts (Latin, Arabic) use word-level tokenization -- **Social media entities**: Hashtags, mentions, URLs, emails preserved as single tokens or completely excluded -- **Unicode normalization**: Proper handling of combined characters and emojis -- **Performance optimized**: Single-pass regex processing with 33% performance improvement over multi-pass approaches - -## Quick Start - -### Simple Usage - -```python -from services.tokenizer import tokenize_text - -text = "Hello @user! 
Check out #python https://example.com 🚀" -tokens = tokenize_text(text) -# Result: ['hello', '@user', 'check', 'out', '#python', 'https://example.com'] - -### Custom Configuration - -```python -from services.tokenizer import BasicTokenizer, TokenizerConfig -from services.tokenizer.core import CaseHandling - -config = TokenizerConfig( - case_handling=CaseHandling.PRESERVE, - extract_hashtags=True, - include_urls=True, - include_emails=True, - include_emoji=True, - min_token_length=2 -) - -tokenizer = BasicTokenizer(config) -tokens = tokenizer.tokenize("Social media text #analysis @researcher") -``` - -## Core Concepts - -### Abstract Interface - -All tokenizers implement `AbstractTokenizer`: - -```python -from services.tokenizer.core.base import AbstractTokenizer - -class CustomTokenizer(AbstractTokenizer): - def tokenize(self, text: str) -> list[str]: - # Your implementation here - pass -``` - -### Configuration-Driven Processing - -The `TokenizerConfig` dataclass controls all tokenization behavior: - -- **Text preprocessing**: Case handling, Unicode normalization -- **Token filtering**: What types of content to include/exclude -- **Social media handling**: How to treat hashtags, mentions, URLs -- **Output control**: Length limits, whitespace handling - -### Single-Pass Tokenization - -The `BasicTokenizer` uses an optimized single-pass regex approach: - -- One comprehensive pattern matches all enabled token types in priority order -- Disabled entities (URLs/emails) are excluded via preprocessing to prevent fragmentation -- Tokens extracted in document order -- Social media entities matched before general words to ensure proper precedence - -## Configuration Reference - -### Social Media Entity Behavior - -The tokenizer handles social media entities differently based on configuration: - -**extract_hashtags / extract_mentions:** - -- `True`: Preserve as single tokens (e.g., `#hashtag`, `@user`) -- `False`: Split into component words (e.g., `hashtag`, `user`) - 
-**include_urls / include_emails:** - -- `True`: Preserve as single tokens (e.g., `https://site.com`, `user@domain.com`) -- `False`: Completely exclude from output (no fragmentation into components) - -This design prevents URL/email fragmentation while allowing hashtag/mention content extraction. - -### TokenizerConfig Options - -```python -@dataclass -class TokenizerConfig: - # Language detection - fallback_language_family: LanguageFamily = LanguageFamily.MIXED - - # Token type filtering - include_punctuation: bool = False - include_numeric: bool = True - include_emoji: bool = False - - # Text preprocessing - case_handling: CaseHandling = CaseHandling.LOWERCASE - normalize_unicode: bool = True - - # Social media features - extract_hashtags: bool = True - extract_mentions: bool = True - include_urls: bool = True - include_emails: bool = True - - # Output control - min_token_length: int = 1 - max_token_length: Optional[int] = None - strip_whitespace: bool = True -``` - -### Enum Values - -**CaseHandling:** - -- `PRESERVE` - Keep original case -- `LOWERCASE` - Convert to lowercase -- `UPPERCASE` - Convert to uppercase -- `NORMALIZE` - Smart case normalization - -**LanguageFamily:** - -- `LATIN` - Space-separated languages -- `CJK` - Character-based languages -- `ARABIC` - Right-to-left scripts -- `MIXED` - Multiple script families -- `UNKNOWN` - Language detection failed - -## API Reference - -### Factory Functions - -```python -# Simple tokenization with optional config -def tokenize_text(text: str, config: TokenizerConfig = None) -> list[str] - -# Create configured tokenizer instance -def create_basic_tokenizer(config: TokenizerConfig = None) -> BasicTokenizer -``` - -### AbstractTokenizer Interface - -```python -class AbstractTokenizer: - def __init__(self, config: TokenizerConfig = None) - def tokenize(self, text: str) -> list[str] # Main tokenization method - - @property - def config(self) -> TokenizerConfig # Access configuration - - # Protected methods for 
subclassing - def _preprocess_text(self, text: str) -> str - def _postprocess_tokens(self, tokens: list[str]) -> list[str] -``` - -### BasicTokenizer Implementation - -The main implementation provides: - -- Unicode-aware multilingual tokenization -- Social media entity preservation -- Configurable preprocessing and postprocessing -- Support for mixed-script content - -## Usage Patterns - -### Preserving Original Case - -```python -config = TokenizerConfig( - case_handling=CaseHandling.PRESERVE, - include_emoji=True, - min_token_length=1 -) -``` - -### Content-Only Tokenization - -```python -config = TokenizerConfig( - extract_hashtags=False, # Split hashtags to get content words - extract_mentions=False, # Split mentions to get usernames - include_urls=False, # Completely exclude URLs (no fragmentation) - include_emails=False, # Completely exclude emails (no fragmentation) - include_punctuation=False -) -``` - -### Strict Filtering - -```python -config = TokenizerConfig( - include_punctuation=False, - include_numeric=False, - include_emoji=False, - min_token_length=3, - max_token_length=20 -) -``` - -## Integration Examples - -### Basic Integration - -```python -from services.tokenizer import create_basic_tokenizer, TokenizerConfig - -# Use default configuration -tokenizer = create_basic_tokenizer() -tokens = tokenizer.tokenize(text) - -# Or with custom configuration -config = TokenizerConfig(min_token_length=2) -tokenizer = create_basic_tokenizer(config) -tokens = tokenizer.tokenize(text) -``` - -### Configuration Factory Pattern - -```python -from services.tokenizer import TokenizerConfig -from services.tokenizer.core import CaseHandling - -def create_custom_config(): - return TokenizerConfig( - case_handling=CaseHandling.PRESERVE, - include_emoji=True, - min_token_length=1 - ) - -config = create_custom_config() -``` - -## Extending the Service - -### Creating Custom Tokenizers - -```python -from services.tokenizer.core.base import AbstractTokenizer -from 
services.tokenizer.core.types import TokenizerConfig - -class CustomTokenizer(AbstractTokenizer): - """Custom tokenizer implementation.""" - - def __init__(self, config: TokenizerConfig = None): - super().__init__(config) - # Custom initialization - - def tokenize(self, text: str) -> list[str]: - """Implement custom tokenization logic.""" - if not text: - return [] - - # Apply preprocessing - processed_text = self._preprocess_text(text) - - # Your tokenization logic here - tokens = custom_tokenize_logic(processed_text) - - # Apply postprocessing - return self._postprocess_tokens(tokens) -``` - -### Plugin Registration - -Add new tokenizers to the service interface: - -```python -# In services/tokenizer/__init__.py -from .custom_tokenizer import CustomTokenizer -from .core.types import TokenizerConfig - -def create_custom_tokenizer(config: TokenizerConfig = None) -> CustomTokenizer: - return CustomTokenizer(config) -``` - -## Implementation Notes - -### Architecture Decisions - -- **Single comprehensive regex**: All enabled token types extracted in one pass -- **Configuration-driven patterns**: Regex built based on enabled features to eliminate post-processing -- **Preprocessing exclusion**: Disabled URLs/emails removed before tokenization to prevent fragmentation -- **Order preservation**: Tokens returned in document sequence -- **Abstract base class**: Enables multiple tokenizer implementations - -### Performance Characteristics - -- **33% performance improvement**: Single-pass regex approach eliminates post-processing filtering loops -- **Compiled regex patterns**: Cached per configuration for optimal reuse -- **Minimal string copying**: Efficient processing with reduced memory allocation -- **Lightweight configuration**: Fast instantiation and comparison operations - -### Unicode Handling - -- NFKC normalization for consistent character representation -- Proper handling of combining characters and diacritics -- Emoji detection across Unicode ranges -- 
Mixed-script content support - -## Module Structure - -```bash -services/tokenizer/ -├── __init__.py # Public API exports -├── core/ # Core interfaces and types -│ ├── __init__.py # Core type exports -│ ├── base.py # AbstractTokenizer interface -│ └── types.py # Configuration and enums -├── basic/ # BasicTokenizer implementation -│ ├── __init__.py # Implementation exports -│ ├── tokenizer.py # Main BasicTokenizer class -│ └── patterns.py # Regex pattern construction -└── README.md # This documentation -``` - -## Testing - -The service includes comprehensive tests: - -- `test_service.py` - Integration tests -- `core/test_types.py` - Configuration tests -- `basic/test_basic_tokenizer.py` - Implementation tests - -Run tests with: - -```bash -pytest services/tokenizer/ -v -``` diff --git a/services/tokenizer/__init__.py b/services/tokenizer/__init__.py deleted file mode 100644 index 6fd6ecdc..00000000 --- a/services/tokenizer/__init__.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Tokenizer service for Unicode-aware text tokenization. - -This service provides tokenization capabilities for social media analytics, -with support for multilingual content and entity preservation. 
-""" - -# Basic implementation -from .basic import BasicTokenizer, create_basic_tokenizer, tokenize_text - -# Core interfaces and types -from .core import ( - AbstractTokenizer, - CaseHandling, - LanguageFamily, - TokenizerConfig, - TokenList, - TokenType, -) - -# Main exports -__all__ = [ - # Core interfaces - "AbstractTokenizer", - "TokenizerConfig", - # Core types - "TokenList", - "LanguageFamily", - "TokenType", - "CaseHandling", - # Basic implementation - "BasicTokenizer", - "create_basic_tokenizer", - "tokenize_text", -] diff --git a/services/tokenizer/core/test_types.py b/services/tokenizer/core/test_types.py deleted file mode 100644 index 4ecd74b2..00000000 --- a/services/tokenizer/core/test_types.py +++ /dev/null @@ -1,392 +0,0 @@ -#!/usr/bin/env python3 -""" -Tests for core tokenizer types and configuration. - -This module tests: -- TokenizerConfig Pydantic model and validation -- Enum types (LanguageFamily, TokenType, CaseHandling, etc.) -- Type system edge cases and defaults -""" - -from typing import Optional - -import pytest - -from .types import CaseHandling, LanguageFamily, TokenizerConfig, TokenList, TokenType - - -class TestTokenizerConfig: - """Test TokenizerConfig Pydantic model and validation.""" - - def test_default_config(self): - """Test default configuration values.""" - config = TokenizerConfig() - - # Language detection defaults (optimized for performance) - assert config.fallback_language_family == LanguageFamily.MIXED - - # Space handling defaults - - # Token type filtering defaults - assert config.include_punctuation is False - assert config.include_numeric is True - assert config.include_emoji is False - - # Text preprocessing defaults - assert config.case_handling == CaseHandling.LOWERCASE - assert config.normalize_unicode is True - - # Social media defaults - assert config.extract_hashtags is True - assert config.extract_mentions is True - assert config.include_urls is True - assert config.include_emails is True - - # Output 
formatting defaults - assert config.min_token_length == 1 - assert config.max_token_length is None - assert config.strip_whitespace is True - - def test_custom_config(self): - """Test custom configuration values.""" - config = TokenizerConfig( - fallback_language_family=LanguageFamily.ARABIC, - include_punctuation=True, - include_numeric=False, - include_emoji=False, - case_handling=CaseHandling.PRESERVE, - normalize_unicode=False, - extract_hashtags=False, - extract_mentions=False, - include_urls=False, - include_emails=True, - min_token_length=2, - max_token_length=100, - strip_whitespace=False, - ) - - # Verify all custom values are set correctly - assert config.fallback_language_family == LanguageFamily.ARABIC - assert config.include_punctuation is True - assert config.include_numeric is False - assert config.include_emoji is False - assert config.case_handling == CaseHandling.PRESERVE - assert config.normalize_unicode is False - assert config.extract_hashtags is False - assert config.extract_mentions is False - assert config.include_urls is False - assert config.include_emails is True - assert config.min_token_length == 2 - assert config.max_token_length == 100 - assert config.strip_whitespace is False - - def test_config_mutability(self): - """Test that configuration can be modified after creation (dataclass is mutable by default).""" - config = TokenizerConfig() - - # Should be able to modify (dataclass is mutable by default) - original_min_length = config.min_token_length - config.min_token_length = 5 - assert config.min_token_length == 5 - assert config.min_token_length != original_min_length - - # Test modification of other fields to ensure true mutability - config.include_emoji = not config.include_emoji - config.case_handling = CaseHandling.UPPERCASE - assert config.case_handling == CaseHandling.UPPERCASE - - def test_config_type_hints(self): - """Test that type hints are correctly specified.""" - config = TokenizerConfig() - - # Test boolean fields - 
assert isinstance(config.include_punctuation, bool) - assert isinstance(config.normalize_unicode, bool) - - # Test enum fields - assert isinstance(config.fallback_language_family, LanguageFamily) - assert isinstance(config.case_handling, CaseHandling) - - # Test optional fields - assert config.max_token_length is None or isinstance( - config.max_token_length, int - ) - - # Test integer fields - assert isinstance(config.min_token_length, int) - - def test_social_media_presets(self): - """Test common social media configuration presets.""" - # Preset 1: Full social media extraction - social_config = TokenizerConfig( - extract_hashtags=True, - extract_mentions=True, - include_urls=True, - include_emails=True, - include_emoji=True, - case_handling=CaseHandling.LOWERCASE, - ) - - assert social_config.extract_hashtags - assert social_config.extract_mentions - assert social_config.include_urls - assert social_config.include_emails - assert social_config.include_emoji - - # Preset 2: Clean text only (no social entities) - clean_config = TokenizerConfig( - extract_hashtags=False, - extract_mentions=False, - include_urls=False, - include_emails=False, - include_emoji=False, - include_punctuation=False, - case_handling=CaseHandling.LOWERCASE, - ) - - assert not clean_config.extract_hashtags - assert not clean_config.extract_mentions - assert not clean_config.include_urls - assert not clean_config.include_emails - assert not clean_config.include_emoji - assert not clean_config.include_punctuation - - -class TestEnumTypes: - """Test enum types and their values.""" - - def test_language_family_enum(self): - """Test LanguageFamily enum values.""" - # Test all enum values exist - assert hasattr(LanguageFamily, "LATIN") - assert hasattr(LanguageFamily, "ARABIC") - assert hasattr(LanguageFamily, "MIXED") - assert hasattr(LanguageFamily, "UNKNOWN") - - # Test enum values - assert LanguageFamily.LATIN.value == "latin" - assert LanguageFamily.ARABIC.value == "arabic" - assert 
LanguageFamily.MIXED.value == "mixed" - assert LanguageFamily.UNKNOWN.value == "unknown" - - # Test enum comparison - assert LanguageFamily.LATIN != LanguageFamily.ARABIC - assert LanguageFamily.LATIN == LanguageFamily.LATIN - - def test_token_type_enum(self): - """Test TokenType enum values.""" - expected_types = [ - "WORD", - "PUNCTUATION", - "NUMERIC", - "EMOJI", - "HASHTAG", - "MENTION", - "URL", - "EMAIL", - "WHITESPACE", - ] - - for type_name in expected_types: - assert hasattr(TokenType, type_name) - - # Test specific values - assert TokenType.WORD.value == "word" - assert TokenType.HASHTAG.value == "hashtag" - assert TokenType.MENTION.value == "mention" - assert TokenType.URL.value == "url" - assert TokenType.EMAIL.value == "email" - assert TokenType.EMOJI.value == "emoji" - - def test_case_handling_enum(self): - """Test CaseHandling enum values.""" - expected_cases = ["PRESERVE", "LOWERCASE", "UPPERCASE", "NORMALIZE"] - - for case_name in expected_cases: - assert hasattr(CaseHandling, case_name) - - # Test values - assert CaseHandling.PRESERVE.value == "preserve" - assert CaseHandling.LOWERCASE.value == "lowercase" - assert CaseHandling.UPPERCASE.value == "uppercase" - assert CaseHandling.NORMALIZE.value == "normalize" - - -class TestTypeAliases: - """Test type aliases and their usage.""" - - def test_token_list_type(self): - """Test TokenList type alias.""" - # TokenList should be equivalent to list[str] - token_list: TokenList = ["word1", "word2", "word3"] - - assert isinstance(token_list, list) - assert all(isinstance(token, str) for token in token_list) - - # Empty list should be valid - empty_list: TokenList = [] - assert isinstance(empty_list, list) - - -class TestConfigurationValidation: - """Test configuration validation and edge cases.""" - - def test_min_max_token_length_validation(self): - """Test minimum and maximum token length validation.""" - # Valid configurations - config1 = TokenizerConfig(min_token_length=1, max_token_length=None) - 
assert config1.min_token_length == 1 - assert config1.max_token_length is None - - config2 = TokenizerConfig(min_token_length=1, max_token_length=10) - assert config2.min_token_length == 1 - assert config2.max_token_length == 10 - - # Edge cases that should be allowed (validation might be in tokenizer) - config3 = TokenizerConfig(min_token_length=0) # Zero length - assert config3.min_token_length == 0 - - config4 = TokenizerConfig(min_token_length=100) # Large minimum - assert config4.min_token_length == 100 - - def test_boolean_combinations(self): - """Test various boolean configuration combinations.""" - # All social features enabled - config_all = TokenizerConfig( - extract_hashtags=True, - extract_mentions=True, - include_urls=True, - include_emails=True, - include_emoji=True, - include_punctuation=True, - include_numeric=True, - ) - - social_features = [ - config_all.extract_hashtags, - config_all.extract_mentions, - config_all.include_urls, - config_all.include_emails, - ] - include_features = [ - config_all.include_emoji, - config_all.include_punctuation, - config_all.include_numeric, - ] - - assert all(social_features) - assert all(include_features) - - # All features disabled - config_none = TokenizerConfig( - extract_hashtags=False, - extract_mentions=False, - include_urls=False, - include_emails=False, - include_emoji=False, - include_punctuation=False, - include_numeric=False, - ) - - social_features_none = [ - config_none.extract_hashtags, - config_none.extract_mentions, - config_none.include_urls, - config_none.include_emails, - ] - include_features_none = [ - config_none.include_emoji, - config_none.include_punctuation, - config_none.include_numeric, - ] - - assert not any(social_features_none) - assert not any(include_features_none) - - -class TestConfigurationUseCases: - """Test configurations for common use cases.""" - - def test_research_analysis_config(self): - """Test configuration suitable for research/academic analysis.""" - config = 
TokenizerConfig( - # Clean text processing - extract_hashtags=False, - extract_mentions=False, - include_urls=False, - include_emails=False, - include_emoji=False, - include_punctuation=False, - # Consistent casing - case_handling=CaseHandling.LOWERCASE, - normalize_unicode=True, - # Filter very short tokens - min_token_length=2, - ) - - # Verify research-friendly settings - assert not config.extract_hashtags - assert not config.extract_mentions - assert not config.include_emoji - assert config.case_handling == CaseHandling.LOWERCASE - assert config.min_token_length >= 2 - - def test_social_media_monitoring_config(self): - """Test configuration for social media monitoring.""" - config = TokenizerConfig( - # Extract all social entities - extract_hashtags=True, - extract_mentions=True, - include_urls=True, - include_emails=True, - include_emoji=True, - # Keep some formatting - include_punctuation=True, - case_handling=CaseHandling.PRESERVE, - # Include very short tokens (acronyms, etc.) - min_token_length=1, - # Handle multilingual content - normalize_unicode=True, - ) - - # Verify social media settings - assert config.extract_hashtags - assert config.extract_mentions - assert config.include_urls - assert config.include_emoji - assert config.case_handling == CaseHandling.PRESERVE - assert config.min_token_length == 1 - - def test_content_analysis_config(self): - """Test configuration for content analysis (no social entities).""" - config = TokenizerConfig( - # Pure content focus - extract_hashtags=False, - extract_mentions=False, - include_urls=False, - include_emails=False, - include_emoji=False, - # Clean text processing - include_punctuation=False, - case_handling=CaseHandling.LOWERCASE, - normalize_unicode=True, - # Standard filtering - min_token_length=1, - include_numeric=True, - ) - - # Verify content analysis settings - social_extractions = [ - config.extract_hashtags, - config.extract_mentions, - config.include_urls, - config.include_emails, - 
config.include_emoji, - ] - assert not any(social_extractions) - assert config.case_handling == CaseHandling.LOWERCASE - assert config.normalize_unicode - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/services/tokenizer/test_service.py b/services/tokenizer/test_service.py deleted file mode 100644 index 9cccbd95..00000000 --- a/services/tokenizer/test_service.py +++ /dev/null @@ -1,374 +0,0 @@ -#!/usr/bin/env python3 -""" -Comprehensive tests for the tokenizer service. - -This module tests the tokenizer service API, including: -- Service-level functionality -- Multilingual text handling -- Social media entity extraction -- Configuration options -- Integration with n-gram processing -""" - -from typing import Dict, List - -import pytest - -from .basic import BasicTokenizer, create_basic_tokenizer, tokenize_text - -# Core interfaces and types -from .core import AbstractTokenizer, LanguageFamily, TokenizerConfig, TokenType -from .core.types import CaseHandling - - -class TestTokenizerService: - """Test the main tokenizer service API functions.""" - - def test_tokenize_text_basic(self): - """Test basic tokenize_text function.""" - text = "Hello world" - result = tokenize_text(text) - - assert isinstance(result, list) - assert all(isinstance(token, str) for token in result) - assert "hello" in result - assert "world" in result - - def test_tokenize_text_with_config(self): - """Test tokenize_text with custom configuration.""" - text = "Hello World" - config = TokenizerConfig(case_handling=CaseHandling.PRESERVE) - result = tokenize_text(text, config) - - assert "Hello" in result - assert "World" in result - - def test_create_basic_tokenizer(self): - """Test basic tokenizer creation.""" - tokenizer = create_basic_tokenizer() - assert isinstance(tokenizer, BasicTokenizer) - - # Test with custom config - config = TokenizerConfig(min_token_length=2) - tokenizer_custom = create_basic_tokenizer(config) - assert isinstance(tokenizer_custom, 
BasicTokenizer) - - def test_tokenize_text_empty_input(self): - """Test tokenizer behavior with empty/None input.""" - assert tokenize_text("") == [] - assert tokenize_text(" ") == [] - assert tokenize_text("\n\t ") == [] - - def test_tokenize_text_none_config(self): - """Test tokenizer with None config (should use defaults).""" - text = "Test text" - result = tokenize_text(text) # Use default config - assert isinstance(result, list) - assert len(result) > 0 - - -class TestMultilingualTokenization: - """Test basic multilingual tokenization through service API (smoke tests).""" - - def test_latin_text_smoke(self): - """Test basic Latin script text tokenization via service API.""" - text = "Hello world café" - result = tokenize_text(text) - - assert isinstance(result, list) - assert len(result) > 0 - # Should be lowercase by default - assert all(token.islower() or not token.isalpha() for token in result) - assert "hello" in result - assert "world" in result - - def test_mixed_script_smoke(self): - """Test mixed script text tokenization via service API.""" - text = "Hello你好World" - result = tokenize_text(text) - - assert isinstance(result, list) - assert len(result) > 0 - - # CRITICAL: Should handle scripts with predictable tokenization - # Latin script should be lowercased and space-separated - assert "hello" in result, f"Latin text 'hello' not found in result: {result}" - assert "world" in result, f"Latin text 'world' not found in result: {result}" - - # CJK should be character-level tokenized - assert "你" in result, f"Chinese character '你' not found in result: {result}" - assert "好" in result, f"Chinese character '好' not found in result: {result}" - - -class TestSocialMediaEntities: - """Test basic social media entity extraction through service API (smoke test).""" - - def test_combined_social_entities_smoke(self): - """Test service API with multiple social media entities enabled.""" - text = "@user check #hashtag https://example.com 🎉" - - config = 
TokenizerConfig( - extract_mentions=True, - extract_hashtags=True, - include_urls=True, - include_emoji=True, - ) - tokenizer = create_basic_tokenizer(config) - result = tokenizer.tokenize(text) - - # Service API should handle entity extraction - assert isinstance(result, list) - # Should extract social media entities - assert "@user" in result - assert "#hashtag" in result - assert "https://example.com" in result - assert "check" in result - - # CRITICAL: Emoji should be preserved when enabled - assert "🎉" in result, f"Emoji should be preserved when enabled: {result}" - - def test_combined_social_entities_disabled(self): - """Test service API with all social media entities disabled.""" - text = "@user check #hashtag https://example.com 🎉" - - config = TokenizerConfig( - extract_mentions=False, - extract_hashtags=False, - include_urls=False, - include_emoji=False, - ) - tokenizer = create_basic_tokenizer(config) - result = tokenizer.tokenize(text) - - # Service API should handle disabled entities - assert isinstance(result, list) - - # Basic words should be present - assert "check" in result - - # Social media entities should NOT be preserved intact - assert "@user" not in result, "Mentions should be disabled" - assert "#hashtag" not in result, "Hashtags should be disabled" - assert "https://example.com" not in result, "URLs should be disabled" - assert "🎉" not in result, "Emojis should be disabled" - - # Components should be tokenized separately - assert ( - "user" in result or "hashtag" in result - ), "Entity content should be tokenized as words" - - -class TestTokenizerConfiguration: - """Test basic configuration options through service API.""" - - def test_case_handling_options_via_api(self): - """Test different case handling options through service API.""" - text = "Hello WORLD Test" - - # Test API with lowercase config - config_lower = TokenizerConfig(case_handling=CaseHandling.LOWERCASE) - result_lower = tokenize_text(text, config_lower) - assert "hello" in 
result_lower - assert "world" in result_lower - - # Test API with preserve config - config_preserve = TokenizerConfig(case_handling=CaseHandling.PRESERVE) - result_preserve = tokenize_text(text, config_preserve) - assert "Hello" in result_preserve - assert "WORLD" in result_preserve - - def test_min_token_length_via_api(self): - """Test minimum token length filtering through service API.""" - text = "a bb ccc dddd" - - # Test API with different min lengths - config_1 = TokenizerConfig(min_token_length=1) - result_1 = tokenize_text(text, config_1) - assert "a" in result_1 - - config_3 = TokenizerConfig(min_token_length=3) - result_3 = tokenize_text(text, config_3) - assert "a" not in result_3 - assert "ccc" in result_3 - - -class TestNgramParameterValidation: - """Test n-gram parameter validation and edge cases.""" - - def test_valid_ngram_ranges(self): - """Test valid n-gram parameter ranges.""" - from analyzers.ngrams.ngrams_base.main import ngrams - - tokens = ["word1", "word2", "word3", "word4", "word5"] - - # Valid ranges - valid_ranges = [ - (1, 1), - (1, 5), - (3, 5), - (2, 15), - (15, 15), - ] - - for min_n, max_n in valid_ranges: - result = list(ngrams(tokens, min_n, max_n)) - assert isinstance(result, list) - if min_n <= len(tokens): - assert len(result) > 0 - - def test_edge_case_ngram_ranges(self): - """Test edge cases for n-gram ranges.""" - from analyzers.ngrams.ngrams_base.main import ngrams - - tokens = ["word1", "word2", "word3"] - - # Edge cases - edge_cases = [ - (1, 10), # max_n larger than token count - (5, 5), # min_n larger than token count - (3, 3), # exact token count - ] - - for min_n, max_n in edge_cases: - result = list(ngrams(tokens, min_n, max_n)) - assert isinstance(result, list) - - def test_ngram_default_parameters(self): - """Test default n-gram parameters used in analyzer.""" - # These should match the defaults in the analyzer - default_min_n = 3 - default_max_n = 5 - - # Verify these are reasonable defaults - assert 1 <= 
default_min_n <= 15 - assert default_min_n <= default_max_n <= 15 - - def test_invalid_ngram_ranges(self): - """Test behavior with invalid n-gram ranges.""" - from analyzers.ngrams.ngrams_base.main import ngrams - - tokens = ["word1", "word2", "word3"] - - # These should not crash but may return empty results - invalid_ranges = [ - (0, 5), # min_n = 0 - (3, 2), # min_n > max_n - (-1, 5), # negative min_n - ] - - for min_n, max_n in invalid_ranges: - try: - result = list(ngrams(tokens, min_n, max_n)) - assert isinstance(result, list) - except (ValueError, TypeError): - # Some invalid ranges might raise exceptions, which is okay - pass - - -class TestTokenizerIntegration: - """Test integration between tokenizer and n-gram processing.""" - - def test_tokenizer_ngram_pipeline(self): - """Test full pipeline from text to n-grams.""" - from analyzers.ngrams.ngrams_base.main import ngrams, serialize_ngram - - text = "This is a test sentence for tokenization." - - # Tokenize - config = TokenizerConfig( - case_handling=CaseHandling.LOWERCASE, - extract_hashtags=False, - extract_mentions=False, - include_urls=False, - min_token_length=1, - ) - tokens = tokenize_text(text, config) - - # Generate n-grams - ngram_list = list(ngrams(tokens, min=2, max=3)) - - # Serialize n-grams - serialized = [serialize_ngram(ngram) for ngram in ngram_list] - - assert len(tokens) > 0 - assert len(ngram_list) > 0 - assert len(serialized) > 0 - assert all(isinstance(s, str) for s in serialized) - - def test_social_media_text_pipeline(self): - """Test pipeline with social media text.""" - from analyzers.ngrams.ngrams_base.main import ngrams - - text = "Great work @team! 
Check out #progress https://example.com 🎉" - - # Configure for social media analysis - config = TokenizerConfig( - case_handling=CaseHandling.LOWERCASE, - extract_hashtags=True, - extract_mentions=True, - include_urls=True, - include_emoji=True, - min_token_length=1, - ) - tokens = tokenize_text(text, config) - - # Should include social entities - assert any("@" in token for token in tokens) # mentions - assert any("#" in token for token in tokens) # hashtags - - # Generate n-grams from the tokens - ngram_list = list(ngrams(tokens, min=1, max=2)) - assert len(ngram_list) > 0 - - def test_multilingual_pipeline(self): - """Test pipeline with multilingual content.""" - from analyzers.ngrams.ngrams_base.main import ngrams - - text = "Hello 你好 world 世界" - - config = TokenizerConfig( - case_handling=CaseHandling.LOWERCASE, - min_token_length=1, - ) - tokens = tokenize_text(text, config) - - # Should handle mixed scripts - assert len(tokens) >= 3 - - # Generate n-grams - ngram_list = list(ngrams(tokens, min=2, max=2)) - assert len(ngram_list) > 0 - - def test_deterministic_results(self): - """Test that tokenization results are deterministic.""" - text = "Test text for deterministic results" - config = TokenizerConfig(case_handling=CaseHandling.LOWERCASE) - - # Run multiple times - results = [tokenize_text(text, config) for _ in range(5)] - - # All results should be identical - first_result = results[0] - for result in results[1:]: - assert result == first_result - - def test_performance_reasonable(self): - """Test that tokenization performance is reasonable for large text.""" - import time - - # Create a moderately large text - text = "This is a test sentence. 
" * 1000 # ~25KB of text - - config = TokenizerConfig() - - start_time = time.time() - result = tokenize_text(text, config) - end_time = time.time() - - # Should complete in reasonable time (less than 1 second for 25KB) - assert end_time - start_time < 1.0 - assert len(result) > 1000 # Should produce many tokens - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) From c198f1da1324dcd8bc92b66e6bb2acb042843ec2 Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Thu, 9 Oct 2025 22:01:40 -0400 Subject: [PATCH 10/24] fix: pin dependency versions to match pre-reorganization state Issue: UV installed latest compatible versions (polars 1.34.0) instead of the original pinned versions (polars 1.9.0), causing 2 test failures due to stricter UDF type inference in newer Polars. Solution: Pin core dependencies to exact versions from requirements.txt: - polars==1.9.0 (was >=1.9.0, UV installed 1.34.0) - pandas==2.2.3 (was >=2.2.3, UV installed 2.3.3) - pyarrow==17.0.0 (was >=17.0.0, UV installed 21.0.0) Test Results: - Before: 116/118 passing (2 UDF errors) - After: 118/118 passing (100%) Files Modified: - packages/core/pyproject.toml: Pin polars, pandas, pyarrow versions - packages/testing/src/cibmangotree_testing/testdata.py: Revert to original Note: Future dependency upgrades should be done intentionally with testing, not automatically via minimum version constraints. 
--- packages/core/pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/core/pyproject.toml b/packages/core/pyproject.toml index e0c2cc96..3f245358 100644 --- a/packages/core/pyproject.toml +++ b/packages/core/pyproject.toml @@ -20,9 +20,9 @@ classifiers = [ # Core runtime dependencies dependencies = [ # Data processing - "polars>=1.9.0", - "pandas>=2.2.3", # needed by plotly - "pyarrow>=17.0.0", + "polars==1.9.0", # Pinned to avoid UDF API changes in 1.34+ + "pandas==2.2.3", # needed by plotly + "pyarrow==17.0.0", # Pinned for compatibility # Storage & persistence "tinydb>=4.8.0", From 009c123149a181daab0449037ad6641d44d5ae27 Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Thu, 9 Oct 2025 22:09:16 -0400 Subject: [PATCH 11/24] cleanup Signed-off-by: Joe Karow <58997957+JoeKarow@users.noreply.github.com> --- GUIDE.md | 1751 ------------------------------------------- reorg-plan.md | 1975 ------------------------------------------------- 2 files changed, 3726 deletions(-) delete mode 100644 GUIDE.md delete mode 100644 reorg-plan.md diff --git a/GUIDE.md b/GUIDE.md deleted file mode 100644 index a6d44bd2..00000000 --- a/GUIDE.md +++ /dev/null @@ -1,1751 +0,0 @@ -# Mango Tango CLI - Developer Onboarding Guide - -**Analysis Date:** October 7, 2025 - ---- - -## TL;DR - Quick Start - -**What is this?** A Python CLI tool for detecting coordinated inauthentic behavior in social media data using pluggable analyzers. - -**Get running in 3 steps:** -```bash -python -m venv venv && ./bootstrap.sh # Setup -python cibmangotree.py # Run -# Import sample data → Select analyzer → View results -``` - -**Want to create an analyzer?** -1. Copy `analyzers/example/` structure -2. Define interface in `interface.py` (inputs, outputs, params) -3. Implement analysis in `main.py` (Polars-based) -4. 
Register in `analyzers/__init__.py` - -**Key concepts:** -- **Primary analyzers**: Process raw data → output files -- **Secondary analyzers**: Post-process primary outputs -- **Web presenters**: Visualize results in Dash/Shiny -- **Everything is Parquet**: CSV/Excel imports → Parquet → Analysis → Parquet → Web viz - -**Critical gotcha:** Always call `input_reader.preprocess()` before using input data - it maps user columns to your schema. - ---- - -## Table of Contents - -- [Overview](#overview) -- [Getting Started](#getting-started) - - [Prerequisites](#prerequisites) - - [Initial Setup](#initial-setup) - - [First Run Experience](#first-run-experience) - - [Quick Validation](#quick-validation) -- [Architecture](#architecture) - - [High-Level Design](#high-level-design) - - [Directory Structure](#directory-structure) - - [Core Abstractions](#core-abstractions) - - [Data Flow](#data-flow) - - [Extension Points](#extension-points) -- [How To](#how-to) - - [Run Tests](#run-tests) - - [Add a Web Presenter](#add-a-web-presenter) - - [Add a Secondary Analyzer](#add-a-secondary-analyzer) - - [Debug an Analyzer](#debug-an-analyzer) - - [Export Analysis Results](#export-analysis-results) - - [Change Storage Location](#change-storage-location) - - [Work with Large Datasets](#work-with-large-datasets) -- [Key Insights](#key-insights) - - [Design Decisions](#design-decisions) - - [Conventions and Patterns](#conventions-and-patterns) - - [Gotchas and Non-Obvious Behavior](#gotchas-and-non-obvious-behavior) - - [Common Pitfalls](#common-pitfalls) - - [Modern vs Legacy Patterns](#modern-vs-legacy-patterns) - - [Notable Implementation Details](#notable-implementation-details) -- [Dependencies & Integrations](#dependencies--integrations) -- [Open Questions & Uncertainties](#open-questions--uncertainties) -- [Team & Contribution Workflow](#team--contribution-workflow) -- [Additional Resources](#additional-resources) -- [Quick Reference](#quick-reference) - ---- - -## Overview - 
-**Mango Tango CLI** is a Python command-line tool for detecting **Coordinated Inauthentic Behavior (CIB)** in social media data. The application provides a plugin-based architecture where analysts can create modular "analyzers" that process social media datasets to detect patterns of coordination, manipulation, or suspicious behavior. - -**What makes this unique:** -- **Plugin architecture**: Analyzers are self-contained modules that declare their inputs, outputs, and parameters -- **Terminal-based UI**: Rich interactive menus built with `inquirer` and `rich` libraries -- **Data pipeline**: Import CSV/Excel → Run analysis → View results in web dashboards -- **Storage abstraction**: Projects and analysis results persist in user data directories using TinyDB - -**Tech Stack:** -- Python 3.12 (required) -- Polars (primary data processing) -- Pydantic (data modeling and validation) -- TinyDB (lightweight JSON database) -- Shiny/Dash (web-based result visualization) -- Rich (terminal UI) - ---- - -## Getting Started - -### Prerequisites -- **Python 3.12** (strict requirement) -- Virtual environment tools - -### Initial Setup - -1. **Create virtual environment:** - ```bash - python -m venv venv - ``` - -2. **Run bootstrap script:** - - PowerShell: `./bootstrap.ps1` - - Bash: `./bootstrap.sh` - - This installs dependencies and sets up pre-commit hooks for `isort` and `black` formatting. - -3. **Start the application:** - ```bash - python -m cibmangotree - # OR - python cibmangotree.py - ``` - -### First Run Experience - -The entry point is `cibmangotree.py` which: -1. Shows a loading message ("🥭 CIB Mango Tree is starting...") -2. Lazy-loads heavy imports (analyzers, components) -3. Initializes storage in platform-specific user data directory -4. Sets up logging -5. Displays splash screen -6. 
Launches the main menu - -**Main Menu Flow:** -- Import dataset for new project → Select analysis → View results -- Review existing project → Select analysis → View results - -### Quick Validation - -After setup, run the application and you should see: -- The mango emoji splash screen -- An interactive menu asking what you'd like to do -- No errors in the console - ---- - -## Architecture - -### High-Level Design - -```mermaid -graph TB - Entry[cibmangotree.py] --> MainMenu[components/main_menu] - MainMenu --> Import[Import Dataset] - MainMenu --> Review[Review Project] - - Import --> NewProject[Create Project] - NewProject --> Storage[(Storage Layer)] - - Review --> SelectProject[Select Project] - SelectProject --> AnalysisMenu[Analysis Menu] - - AnalysisMenu --> RunAnalyzer[Run Analyzer] - RunAnalyzer --> PrimaryAnalyzer[Primary Analyzer] - PrimaryAnalyzer --> SecondaryAnalyzers[Secondary Analyzers] - SecondaryAnalyzers --> WebPresenter[Web Presenter] - - Storage --> TinyDB[(TinyDB)] - Storage --> ParquetFiles[(Parquet Files)] -``` - -### Directory Structure - -**Key directories explained:** - -``` -mango-tango-cli/ -├── cibmangotree.py # Entry point - starts the app -├── app/ # Core application models and context -│ ├── app.py # App class - orchestrates projects -│ ├── app_context.py # Application-level context -│ ├── project_context.py # Project-level context -│ └── logger.py # Logging setup -├── components/ # UI components (terminal screens) -│ ├── main_menu.py # Top-level menu -│ ├── project_main.py # Project management screen -│ ├── analysis_main.py # Analysis execution screen -│ └── select_analysis.py # Analyzer selection UI -├── analyzers/ # ⭐ Plugin analyzers (add new ones here) -│ ├── __init__.py # Registers all analyzers in suite -│ ├── example/ # Example analyzer (reference this!) 
-│ │ ├── example_base/ # Primary analyzer -│ │ ├── example_report/ # Secondary analyzer -│ │ └── example_web/ # Web presenter -│ ├── hashtags/ # Hashtag analysis -│ ├── ngrams/ # N-gram analysis -│ └── temporal/ # Time-based analysis -├── analyzer_interface/ # ⭐ Core analyzer framework -│ ├── declaration.py # Analyzer registration classes -│ ├── suite.py # Analyzer suite management -│ ├── context.py # Runtime context for analyzers -│ └── params.py # Parameter type definitions -├── storage/ # Data persistence layer -│ └── __init__.py # Storage class - file/DB operations -├── importing/ # CSV/Excel import logic -├── terminal_tools/ # Terminal UI utilities -└── meta/ # Version info -``` - -**What goes where:** -- **New analyzers** → `analyzers/{analyzer_name}/` (copy structure from `example/`) -- **UI screens** → `components/` -- **Business logic** → `app/` -- **Data models** → Use Pydantic models inline or in relevant modules - -### Core Abstractions - -#### 1. **Analyzer Plugin System** - -The architecture centers around three types of analyzers: - -**Primary Analyzer** (`AnalyzerDeclaration`): -- Entry point for analysis -- Declares input columns, parameters, outputs -- Processes raw data → generates output files -- Example: `analyzers/example/example_base/` - -**Secondary Analyzer** (`SecondaryAnalyzerDeclaration`): -- Consumes output from primary analyzer -- Can depend on other secondary analyzers (topologically sorted) -- Example: Generate statistics from primary results - -**Web Presenter** (`WebPresenterDeclaration`): -- Visualizes analyzer results -- Creates Dash or Shiny apps -- Example: `analyzers/example/example_web/` - -#### 2. 
**Storage Layer** - -`storage/Storage` class manages: -- **Projects**: Imported datasets (stored as Parquet) -- **Analyses**: Analysis runs with parameters and results -- **TinyDB**: Metadata (JSON file in user data dir) -- **File System**: Organized directory structure per project - -**Directory layout** (in user data dir): -``` -projects/ -└── {project_id}/ - ├── input.parquet # Imported data - └── analysis/ - └── {analysis_id}/ - ├── primary_outputs/ # Primary analyzer results - ├── secondary_outputs/ # Secondary analyzer results - ├── exports/ # User exports - └── web_presenters/ # Web presenter state -``` - -#### 3. **Context Pattern** - -The app uses context objects to pass state through the UI layers: - -- `ViewContext` → Terminal + App instance -- `AppContext` → Storage + Analyzer Suite -- `ProjectContext` → Project model + App context -- `AnalysisContext` → Analysis model + Project context -- `PrimaryAnalyzerContext` → Input/output paths, params for analyzer - -**Why contexts?** They provide type-safe, structured access to dependencies without global state. - -### Data Flow - -**Import → Analyze → Visualize:** - -```mermaid -sequenceDiagram - participant User - participant UI as Terminal UI - participant App - participant Storage - participant Analyzer - participant WebServer - - User->>UI: Import CSV/Excel - UI->>App: create_project() - App->>Storage: Save as Parquet - - User->>UI: Select Analyzer - UI->>User: Map columns to analyzer inputs - UI->>User: Configure parameters - - User->>UI: Run Analysis - UI->>Analyzer: Execute with context - Analyzer->>Storage: Write outputs (Parquet) - - User->>UI: View Results - UI->>WebServer: Launch web presenter - WebServer->>Storage: Read output files - WebServer->>User: Display interactive dashboard -``` - -**Key insight:** Everything is Parquet-based. CSV/Excel → Parquet → Analysis → Parquet → Web visualization. - -### Extension Points - -#### Adding a New Analyzer - -1. 
**Create directory structure:** - ``` - analyzers/my_analyzer/ - ├── __init__.py - ├── interface.py # Declare inputs/outputs/params - └── main.py # Analysis logic - ``` - -2. **Define interface** (`interface.py`): - ```python - from analyzer_interface import AnalyzerInterface, AnalyzerInput, InputColumn, ... - - interface = AnalyzerInterface( - id="my_analyzer", - name="My Analyzer", - input=AnalyzerInput(columns=[...]), - outputs=[...], - params=[...], - ) - ``` - -3. **Implement analysis** (`main.py`): - ```python - def main(context: PrimaryAnalyzerContext): - input_reader = context.input() - df = input_reader.preprocess(pl.read_parquet(input_reader.parquet_path)) - - # Your analysis logic here - result = df.select(...) - - result.write_parquet(context.output("my_output").parquet_path) - ``` - -4. **Register** (`__init__.py`): - ```python - from analyzer_interface import AnalyzerDeclaration - from .interface import interface - from .main import main - - my_analyzer = AnalyzerDeclaration( - interface=interface, - main=main, - is_distributed=False, # Set True when ready for production - ) - ``` - -5. **Add to suite** (`analyzers/__init__.py`): - ```python - from .my_analyzer import my_analyzer - - suite = AnalyzerSuite(all_analyzers=[..., my_analyzer]) - ``` - ---- - -## How To - -### Run Tests - -```bash -# Run all tests -pytest - -# Run with verbose output -pytest -v - -# Run specific test file -pytest analyzers/example/test_example_base.py -``` - -Tests use the pattern: `test_{analyzer_name}.py` files alongside analyzer code. - -### Add a Web Presenter - -See `analyzers/example/example_web/` for the pattern: - -```python -from analyzer_interface import WebPresenterDeclaration, WebPresenterInterface - -def factory(context: WebPresenterContext): - # Create Dash app or Shiny app - # Access analyzer outputs via context.base.table(output_id) - df = pl.read_parquet(context.base.table("output_id").parquet_path) - - # Build your visualization - ... 
- -web_presenter = WebPresenterDeclaration( - interface=WebPresenterInterface(...), - factory=factory, - name=__name__, # Important for asset path resolution - shiny=False, # True for Shiny, False for Dash -) -``` - -**Assets:** Place CSS/JS/images in `assets/` folder next to your web presenter module. Dash will serve them automatically. - -### Add a Secondary Analyzer - -Secondary analyzers process primary analyzer outputs: - -```python -from analyzer_interface import SecondaryAnalyzerDeclaration, SecondaryAnalyzerInterface - -def main(context: SecondaryAnalyzerContext): - # Access primary analyzer outputs - primary_output = pl.read_parquet( - context.base.table("primary_output_id").parquet_path - ) - - # Access parameters from primary analyzer - param_value = context.base_params.get("param_id") - - # Process and write output - result = process(primary_output) - result.write_parquet(context.output("my_output").parquet_path) - -secondary = SecondaryAnalyzerDeclaration( - interface=SecondaryAnalyzerInterface( - id="my_secondary", - base_analyzer=primary_interface, # Reference to primary - depends_on=[], # Other secondary analyzers this depends on - ... - ), - main=main, -) -``` - -### Debug an Analyzer - -1. **Check logs:** Logs are in `{user_data_dir}/logs/mangotango.log` - ```bash - # Run with debug logging - python cibmangotree.py --log-level DEBUG - - # Tail the log file - tail -f ~/Library/Application\ Support/MangoTango/logs/mangotango.log - ``` - -2. **Print debugging:** Use `print()` or `rich.print()` - they'll show in terminal - -3. **Inspect data:** Load Parquet files directly: - ```python - import polars as pl - - # Find your data in user data directory - df = pl.read_parquet("path/to/output.parquet") - print(df.head()) - print(df.schema) - ``` - -4. 
**Test without UI:** Write unit tests to run your analyzer directly: - ```python - # See analyzers/example/test_example_base.py for pattern - ``` - -### Export Analysis Results - -**Via UI:** Project → Analysis → Export Outputs - -**Supported formats:** CSV, Excel (XLSX), JSON, Parquet - -**Chunking:** For large datasets, configure chunk size in settings to split exports into multiple files (e.g., for Excel's row limit). - -### Change Storage Location - -Storage uses `platformdirs` to find user data directory. To override, modify `Storage.__init__()` in `storage/__init__.py`: - -```python -# Default (platform-specific) -self.user_data_dir = platformdirs.user_data_dir( - appname=app_name, appauthor=app_author, ensure_exists=True -) - -# Custom location -self.user_data_dir = "/custom/path/to/data" -``` - -### Work with Large Datasets - -**Best practices:** - -1. **Use lazy evaluation:** - ```python - # Good - lazy evaluation - df = pl.scan_parquet(path) - result = df.filter(...).select(...).collect() - - # Avoid - loads everything into memory - df = pl.read_parquet(path) - ``` - -2. **Stream outputs:** - ```python - # Use sink_parquet for streaming writes - df.lazy().sink_parquet(output_path) - ``` - -3. **Batch processing:** If needed, use `iter_batches()` on PyArrow ParquetFile - -4. **Set sorted hints:** - ```python - df = df.sort(COL_TIMESTAMP) - df = df.lazy().set_sorted(COL_TIMESTAMP) - # Now group_by_dynamic and other operations can optimize - ``` - -5. **Use filters early in lazy chain:** - ```python - # Good - filter before expensive operations - df = pl.scan_parquet(path).filter(...).group_by(...).collect() - - # Bad - filter after expensive operations - df = pl.scan_parquet(path).group_by(...).collect() - df = df.filter(...) - ``` - -### Create a Custom Parameter Type - -While the framework provides `IntegerParam` and `TimeBinningParam`, you can extend it: - -1. 
**Define param model** in `analyzer_interface/params.py`: - ```python - class MyCustomParam(BaseModel): - type: Literal["my_custom"] = "my_custom" - # Your config fields here - - class MyCustomValue(BaseModel): - # Runtime value structure - pass - - # Update unions - ParamType = Union[TimeBinningParam, IntegerParam, MyCustomParam] - ParamValue = Union[TimeBinningValue, int, MyCustomValue] - ``` - -2. **Add UI handler** in `components/analysis_params.py` to prompt user for value - -3. **Use in analyzer:** - ```python - AnalyzerParam( - id="my_param", - type=MyCustomParam(...), - default=MyCustomValue(...) - ) - ``` - -### Profile Analyzer Performance - -To find bottlenecks in your analyzer: - -1. **Use time tracking:** - ```python - import time - - start = time.time() - # ... operation ... - print(f"Operation took {time.time() - start:.2f}s") - ``` - -2. **Check query plan:** - ```python - lazy_df = pl.scan_parquet(path).filter(...).select(...) - print(lazy_df.explain()) # Shows optimization plan - ``` - -3. **Monitor memory:** - ```python - import psutil - process = psutil.Process() - print(f"Memory: {process.memory_info().rss / 1024**2:.2f} MB") - ``` - -4. 
**Profile with cProfile:** - ```bash - python -m cProfile -o output.prof cibmangotree.py - python -m pstats output.prof - ``` - -### Handle Missing/Invalid Data - -**Strategy 1: Filter out invalids** -```python -df = df.filter( - pl.col(COL_TEXT).is_not_null() & - (pl.col(COL_TEXT).str.len_chars() > 0) -) -``` - -**Strategy 2: Fill with defaults** -```python -df = df.with_columns( - pl.col(COL_COUNT).fill_null(0) -) -``` - -**Strategy 3: Validate and warn** -```python -invalid_count = df.filter(pl.col(COL_ID).is_null()).height -if invalid_count > 0: - warnings.warn(f"{invalid_count} rows have null IDs and will be excluded") -df = df.filter(pl.col(COL_ID).is_not_null()) -``` - -### Use Default Parameters Dynamically - -The `default_params` function in `AnalyzerDeclaration` can inspect input data: - -```python -def compute_defaults(context: PrimaryAnalyzerContext) -> dict[str, ParamValue]: - input_reader = context.input() - df = input_reader.preprocess(pl.read_parquet(input_reader.parquet_path)) - - # Example: set default window based on data time span - time_span = df.select( - (pl.col("timestamp").max() - pl.col("timestamp").min()) - ).item() - - if time_span > timedelta(days=365): - window = TimeBinningValue(unit="month", amount=1) - else: - window = TimeBinningValue(unit="day", amount=1) - - return {"time_window": window} - -analyzer = AnalyzerDeclaration( - interface=interface, - main=main, - default_params=compute_defaults, # Called when creating new analysis -) -``` - -### Debug Import Issues - -If your CSV/Excel import isn't working: - -1. **Check file encoding:** - ```bash - file -I yourfile.csv - # If not UTF-8, convert: - iconv -f ISO-8859-1 -t UTF-8 yourfile.csv > yourfile_utf8.csv - ``` - -2. **Inspect with sample:** - ```python - import polars as pl - df = pl.read_csv("yourfile.csv", n_rows=10) - print(df) - print(df.schema) - ``` - -3. **Check delimiter:** - The importer auto-detects, but you can override in the UI's manual config mode. - -4. 
**Look at logs:** - Logs at `{user_data_dir}/logs/mangotango.log` show detailed import errors. - ---- - -## Key Insights - -### Design Decisions - -**Why Polars instead of Pandas?** -- Performance: Much faster for large datasets -- Arrow-native: Better memory efficiency -- API: More consistent and explicit - -**Why TinyDB instead of SQLite?** -- Simplicity: JSON file, no schema migrations -- Portability: Easy to inspect/debug -- Sufficient for metadata storage (not for analysis data) - -**Why plugin architecture?** -- Extensibility: Analysts can add analyzers without modifying core -- Isolation: Each analyzer is self-contained -- Discoverability: All analyzers auto-registered in suite - -**Why separate primary/secondary analyzers?** -- Reusability: Secondary analyzers can work across different primaries -- Dependency management: Topological sort ensures correct execution order -- Clarity: Separates data generation from post-processing - -### Conventions and Patterns - -#### Naming Conventions - -- **Analyzer IDs:** Use `snake_case` (e.g., `time_coordination`) -- **File names:** Match Python conventions (`my_module.py`) -- **Output IDs:** Descriptive snake_case (e.g., `character_count`) - -#### Code Style - -- **Formatting:** `black` (enforced by pre-commit) -- **Import sorting:** `isort` (enforced by pre-commit) -- **Type hints:** Strongly encouraged, especially for public APIs -- **Pydantic models:** Use for all data structures crossing boundaries - -#### Analyzer Patterns - -**Always do this:** -1. Call `input_reader.preprocess()` before using input data -2. Write outputs to paths from `context.output(id).parquet_path` -3. Match declared columns in interface exactly - -**Never do this:** -- Don't read files outside of provided context paths -- Don't modify the input data files -- Don't use global state - -### Gotchas and Non-Obvious Behavior - -1. 
**Column mapping is user-driven:** Your analyzer declares what columns it needs, but users map their CSV columns to your schema. Don't assume column names! - -2. **Preprocessing is mandatory:** The `input_reader.preprocess()` call transforms user data to match your interface. Skip it and you'll get wrong column names or types. - -3. **`is_distributed` flag:** Analyzers with `is_distributed=False` only show in development mode. Set to `True` when ready for end users. - -4. **Parquet everywhere:** All analyzer I/O uses Parquet. Don't try to write CSV/JSON in analyzer main logic (that's for exports only). - -5. **Context paths are temporary during execution:** The context provides paths - use them, don't construct your own. - -6. **TinyDB is single-file:** All metadata in one JSON file (`db.json`). Database locks prevent concurrent access. - -7. **Bootstrap scripts required:** Dependencies include compiled packages (like Polars). The bootstrap script ensures proper installation. - -8. **Module registration:** After creating an analyzer, you MUST add it to `analyzers/__init__.py` suite, or it won't be discovered. - -9. **Output column order matters:** Columns in output DataFrames should match the order declared in the interface for consistency. - -10. **Web presenter state persistence:** Web presenters can store state in `context.state_dir`, which persists between runs. Useful for caching expensive computations. - -11. **Sample data location:** Sample datasets are in `sample_data/` directory: - - `fake_data.csv`: Small synthetic dataset for testing - - `reddit_vm.csv`: Larger real-world Reddit data (618KB) - -12. **Timezone handling in datetime columns:** The preprocessing system detects timezones, warns if multiple are found, then strips them. All datetimes become timezone-naive in analysis. If you need timezone-aware analysis, you must handle it in your analyzer logic. - -13. 
**ProgressReporter uses multiprocessing:** This means you can't pass non-picklable objects through it. Keep progress updates simple (floats only). - -14. **AnalyzerSuite has a typo:** `primary_anlyzers` (not `primary_analyzers`). This is internal so doesn't affect your code, but you'll see it in tracebacks. - -15. **Test helpers must be called from your test functions:** The `@pytest.mark.skip()` on helper functions means you MUST call them from your own test functions. They won't auto-run. - -16. **Column `internal` flag:** Outputs with `internal=True` don't show in export UI. Use for intermediate results that users don't need. - -17. **Parameter backfill vs default:** - - `default`: Value for NEW analyses - - `backfill_value`: Value for OLD analyses created before param existed - - Don't confuse them! - -18. **Polars lazy evaluation pitfalls:** - - Calling `.describe()`, `.head()`, etc. on lazy frames triggers collection - - Some operations force eager evaluation (check docs) - - Use `.collect()` explicitly when you want materialization - -19. **File selector state persists:** The app remembers the last directory you browsed. This is stored in TinyDB under `file_selector_state`. - -20. **temp_dir is NOT persistent:** The `context.temp_dir` is cleaned up after analysis. Only use it for temporary files during execution. For persistent state, use `context.state_dir` (web presenters only). - -### Common Pitfalls - -**❌ Don't do this:** - -```python -# Forgetting to preprocess -df = pl.read_parquet(input_reader.parquet_path) -result = df.select("message_text") # May not exist! 
- -# Hardcoding column names from CSV -df = df.filter(pl.col("Tweet") == "...") # User's column name - -# Using .collect() on huge datasets -df = pl.read_parquet(huge_file) # Loads all into RAM -``` - -**✅ Do this instead:** - -```python -# Always preprocess -df = input_reader.preprocess(pl.read_parquet(input_reader.parquet_path)) -result = df.select("message_text") # Guaranteed to exist - -# Use interface column names -df = df.filter(pl.col("message_text") == "...") - -# Use lazy evaluation -df = pl.scan_parquet(huge_file).filter(...).collect() -``` - -### Modern vs Legacy Patterns - -**Modern (emulate this):** -- ✅ Pydantic models for data validation -- ✅ Type hints everywhere -- ✅ Polars for data processing -- ✅ Context objects for dependency injection -- ✅ Declarative analyzer interfaces - -**Legacy (avoid in new code):** -- ❌ V1 analyzers (in `_bootstrap_analyses_v1()`) - only for backward compatibility -- ❌ Direct file path manipulation -- ❌ Global state or singletons - -### Notable Implementation Details - -#### Entry Point & Startup Optimization - -**Lazy loading in entry point:** -Heavy imports (analyzers, components) are deferred until after the loading message displays. This makes startup feel faster. The entry point (`cibmangotree.py`) follows this pattern: -1. Show loading message early (uses `rich.Console`) -2. Import heavy modules (analyzers suite, components) -3. Initialize storage & logging -4. Display splash screen -5. Launch main menu - -**Multiprocessing freeze support:** -`freeze_support()` call at the top of `cibmangotree.py` enables PyInstaller packaging for distributable executables. Required for Windows packaging. - -**Windows ANSI support:** -`enable_windows_ansi_support()` ensures colored terminal output works on Windows terminals that don't natively support ANSI escape codes. 
- -#### Analyzer Lifecycle & Dependency Management - -**Topological sorting for secondary analyzers:** -`find_toposorted_secondary_analyzers()` in `AnalyzerSuite` performs depth-first traversal to resolve dependencies. The algorithm: -1. Visits each secondary analyzer -2. Recursively visits its dependencies first -3. Appends to result list only after all dependencies are visited -4. Uses `visited_ids` set to prevent duplicate visits - -This ensures secondary analyzers always run after their dependencies. - -**Cached properties in AnalyzerSuite:** -The suite uses `@cached_property` extensively to avoid re-computing lookups: -- `primary_anlyzers` (note the typo - used internally) -- `_primary_analyzers_lookup` -- `_secondary_analyzers` -- `_secondary_analyzers_by_base` -- `web_presenters_by_primary` - -These caches persist for the lifetime of the app, improving performance. - -**Development vs. Distributed mode:** -The `is_development()` function checks for a `VERSION` file. If absent, the app is in development mode and shows ALL analyzers. In distributed mode (VERSION file exists), only analyzers with `is_distributed=True` are visible. This lets developers test analyzers before releasing them to end users. - -#### Column Mapping & Data Preprocessing - -**Column mapping and preprocessing:** -The `input_reader.preprocess()` call is the most critical and complex part of the analyzer interface. It: -1. **Renames columns** from user's schema to analyzer's expected names (via column mapping dict) -2. **Converts data types** using semantic inference (see `preprocessing/series_semantic.py`) -3. 
**Applies transformations** specified in the column interface - -The preprocessing logic handles: -- Native datetime/date columns (already correct type) -- Datetime strings with timezone info (extracts and warns about multiple timezones) -- Unix timestamps (seconds or milliseconds, auto-detected) -- URLs (strips whitespace, validates http/https) -- Identifiers (validates alphanumeric with allowed chars) -- Generic text/integer/float/boolean (catch-all fallbacks) - -**Column name hints & fuzzy matching:** -The `column_automap()` function in `analyzer_interface/column_automap.py` implements intelligent column matching: -1. Scores each (user column, expected column) pair based on data type compatibility -2. Boosts score by +10 if `name_hints` match (all words in hint must be in column name) -3. Selects best match for each expected column - -This means better hints → better auto-mapping → less manual work for users. - -**Data type compatibility scoring:** -The system scores data type conversions (see `analyzer_interface/data_type_compatibility.py`): -- Exact matches get highest score -- Compatible types get lower scores (e.g., integer → float) -- Incompatible types return `None` (excluded from matching) - -**Semantic data type inference:** -`preprocessing/series_semantic.py` defines `SeriesSemantic` classes that: -1. Check column type matches expected structural type -2. Sample data (default 100 rows) for validation -3. Attempt conversion with `try_convert` function -4. Validate results meet threshold (default 80% valid) - -For example, `datetime_string` tries to parse with timezone handling, warns if multiple timezones detected, strips TZ info, and validates result is non-null. 
- -#### Storage & File Management - -**Storage paths are platform-aware:** -Uses `platformdirs` to respect OS conventions: -- macOS: `~/Library/Application Support/MangoTango/` -- Windows: `%APPDATA%/MangoTango/` -- Linux: `~/.local/share/MangoTango/` - -**File locking for database:** -TinyDB is protected by `FileLock` (from `filelock` package) to prevent concurrent access from multiple app instances. Lock file is in temp directory (`platformdirs.user_cache_dir`). - -**V1 analyzer migration:** -The `_bootstrap_analyses_v1()` method in Storage handles permanent backward compatibility. Old analyses stored in `analyzers/` directory (legacy) are auto-migrated to `analysis/` with `__v1__` prefix in database IDs. This runs on every app startup within the database lock. - -**Parquet everywhere:** -All analyzer I/O uses Parquet format because: -- Columnar storage (efficient for analytics) -- Built-in compression -- Schema preservation -- Fast with Polars -- Supports streaming writes (`sink_parquet`) - -**Export chunking for large datasets:** -The `_export_output()` method supports chunking via `export_chunk_size` setting: -1. If chunk size set, calculates number of chunks needed -2. Uses PyArrow's `iter_batches()` for memory-efficient iteration -3. Collects batches into chunks of specified size -4. Writes separate files: `output_0.csv`, `output_1.csv`, etc. -5. Yields progress fraction after each chunk - -This is critical for Excel exports (1M row limit) and memory-constrained environments. - -#### Terminal UI & Progress Reporting - -**Progress reporting with multiprocessing:** -`ProgressReporter` uses multiprocessing to show animated progress: -- Spawns separate process for UI updates -- Shares progress value via `multiprocessing.Value` (double precision) -- Uses `multiprocessing.Event` for done signal -- Displays bouncing bar animation (see `_spinner_frames`) -- Updates every 0.1 seconds - -The context manager pattern (`with ProgressReporter(...) 
as progress:`) ensures cleanup even on errors. - -**Terminal context nesting:** -The `TerminalContext` in `terminal_tools.inception` allows nested UI contexts. Each level can add prefixes/decorations without components knowing about parent contexts. This enables the hierarchical menu structure. - -#### Web Presenter Architecture - -**Dash vs. Shiny support:** -Web presenters can use either framework: -- **Dash** (legacy): Plotly-based, React under the hood -- **Shiny** (modern): Python port of R Shiny, more Pythonic - -The `WebPresenterDeclaration` has a `shiny` boolean flag. The factory function returns different types: -- Dash: Modifies `context.dash_app` directly -- Shiny: Returns `FactoryOutputContext` with `ShinyContext` containing panel + server handler - -**Asset serving:** -For Dash presenters, the `server_name` parameter (typically `__name__`) determines asset path resolution. Assets in `assets/` folder adjacent to module are auto-served at `/assets/` URL. - -**State persistence:** -Web presenters can store persistent state in `context.state_dir`. This directory is unique per project/analyzer/presenter combo and survives between runs. Useful for: -- Cached computations -- User preferences -- Session state - -#### Testing Infrastructure - -**Test helpers in `testing/` module:** -- `test_primary_analyzer()`: Runs analyzer with test data, compares outputs -- `test_secondary_analyzer()`: Tests with primary outputs + dependencies -- `CsvTestData`, `JsonTestData`, `ExcelTestData`: Load test fixtures -- `PolarsTestData`: Programmatically created test data -- `compare_dfs()`: DataFrame comparison with helpful diff output - -**Pytest skip decorators:** -Test helper functions have `@pytest.mark.skip()` to prevent pytest from running them directly (they're meant to be called from actual test functions). 
- -**Test data semantics:** -The `semantics` parameter in test data lets you specify column type conversions (e.g., `{"message_id": identifier}`) to match how real data would be preprocessed. - -#### Services & Utilities - -**Tokenizer service (new addition):** -The `services/tokenizer/` module provides a pluggable tokenization framework: -- `AbstractTokenizer`: Base class for all tokenizers -- `TokenizerConfig`: Configuration for case handling, emoji inclusion, length filtering -- `_preprocess_text()`: Unicode normalization, case handling -- `_postprocess_tokens()`: Whitespace stripping, emoji filtering, length filtering -- `_is_emoji()`: Sophisticated emoji detection covering multiple Unicode ranges - -This service appears designed for future n-gram or text analysis features. - -**Polars performance patterns:** -Real-world analyzers (hashtags, time_coordination) demonstrate best practices: -1. Use `.lazy()` for query building -2. Call `.collect()` only once at the end -3. Use `group_by_dynamic()` for time-windowed aggregations -4. Use `sink_parquet()` for streaming writes -5. Set `.set_sorted()` hint when data is sorted (enables optimizations) - -#### Error Handling & Validation - -**Draft analysis flag:** -When an analysis fails, `is_draft=True` is set in the database. This: -- Prevents export and web presenter options -- Shows warning in UI -- Allows user to delete or re-run -- Persists partial outputs for debugging - -**Parameter validation:** -The `ParamType` system (in `analyzer_interface/params.py`) supports: -- `IntegerParam`: min/max bounds validation -- `TimeBinningParam`: structured time window config -- Future: extensible for more param types - -Parameters are validated before being passed to analyzers, so analyzer code can trust the types. 
- -**Column schema validation:** -The system validates that: -- Analyzer outputs match declared schema (column names and order) -- Test outputs match interface specs -- Column mappings cover all required columns - -Missing any of these causes clear error messages. - ---- - -## Real-World Analyzer Examples & Patterns - -### Hashtag Analyzer Deep Dive - -The hashtag analyzer (`analyzers/hashtags/`) demonstrates several advanced techniques: - -**Gini Coefficient for Coordination Detection:** -- Uses inequality measure to detect trending/coordinated hashtag usage -- Formula: `(n + 1 - 2 * sum(cumulative_counts) / total) / n` -- High Gini → few hashtags dominate → potential coordination -- Low Gini → even distribution → organic activity - -**Dynamic Time Windows:** -- Uses `group_by_dynamic()` with configurable windows (via `TimeBinningParam`) -- Sliding windows via `every` and `period` parameters -- Sorts data first and calls `.set_sorted()` for optimization - -**Hashtag Extraction Strategy:** -1. Check if `#` symbols exist in data -2. If yes: Extract with regex `r"(#\S+)"` -3. If no: Raise error (assumes pre-extracted format) -4. Explode list of hashtags for per-tag analysis - -**Smoothing Results:** -Applies rolling mean with window size 3 to reduce noise in Gini time series. Creates both raw and smoothed versions. - -**DateTime Conversion Handling:** -Explicitly checks if timestamp column is already `pl.Datetime`, converts if needed. This makes the analyzer more robust to preprocessing variations. - -### Time Coordination Analyzer Deep Dive - -The time_coordination analyzer (`analyzers/time_coordination/`) shows co-occurrence detection: - -**Sliding Window Approach:** -- Window size: 15 minutes -- Step size: 5 minutes (windows overlap) -- Users posting in same window are "co-occurring" - -**Self-Join Pattern:** -```python -df = df.join(df, on=COL_TIMESTAMP, how="inner") -``` -This creates all pairs of users within each time window. 
Clever use of Polars join to generate combinations. - -**Frequency Aggregation:** -Groups by user pairs, counts co-occurrences, sorts by frequency descending. High-frequency pairs are most suspicious. - -**Data Cleaning:** -Filters out null user IDs and timestamps before analysis. Essential for real-world data with missing values. - -### N-Grams Analyzer (Multi-Module) - -The ngrams analyzer suite demonstrates the multi-module pattern: - -**Primary Analyzer (`ngrams_base/`):** -- Extracts n-grams from text -- Tokenizes using pluggable tokenizer service -- Configurable n-gram size (unigrams, bigrams, trigrams, etc.) - -**Secondary Analyzer (`ngram_stats/`):** -- Computes statistics on n-gram outputs -- Depends on primary analyzer outputs -- Generates frequency distributions, top terms, etc. - -**Web Presenter (`ngram_web/`):** -- Visualizes n-gram distributions -- Interactive filtering and exploration -- Uses Shiny framework (modern approach) - -This pattern shows how to build complex analyses from composable pieces. - -### Temporal Analyzer - -The temporal analyzer shows time-series aggregation: - -**Message Volume Over Time:** -Groups messages by time bins, counts per bin. Simple but effective for identifying activity patterns. - -**Temporal Bar Plot Web Presenter:** -Separate web presenter for temporal visualization. Shows how web presenters can be shared across analyzers or analyzer-specific. - -### Common Patterns Across Analyzers - -**1. Defensive Data Loading:** -```python -df = df.filter(pl.col(COL_ID).is_not_null() & pl.col(COL_TIME).is_not_null()) -``` -Always filter out nulls in critical columns. - -**2. Lazy Then Collect:** -```python -df = df.lazy() -# ... transformations ... -df = df.collect() -df.write_parquet(output_path) -``` -Build query lazily, execute once, materialize for output. - -**3. Progress Reporting for Long Operations:** -```python -with ProgressReporter("Processing data") as progress: - # ... work ... 
- progress.update(0.5) # 50% done - # ... more work ... - progress.update(1.0) # Done -``` - -**4. Column Constant Usage:** -Define column names as constants in `interface.py`, import in `main.py`: -```python -# interface.py -COL_USER_ID = "user_id" - -# main.py -from .interface import COL_USER_ID -df.select(pl.col(COL_USER_ID)) # Type-safe, refactorable -``` - -**5. Explicit DateTime Handling:** -Check type, convert if needed: -```python -if not isinstance(df.schema[COL_TIME], pl.Datetime): - df = df.with_columns(pl.col(COL_TIME).str.to_datetime().alias(COL_TIME)) -``` - -## Dependencies & Integrations - -### Key Dependencies - -| Dependency | Purpose | Why It Matters | -|------------|---------|----------------| -| **polars** | Data processing | Core analysis engine, replacing Pandas | -| **pydantic** | Data validation | Type-safe models, runtime validation | -| **inquirer** | Terminal UI | Interactive prompts and menus | -| **rich** | Terminal formatting | Beautiful console output, progress bars | -| **platformdirs** | Cross-platform paths | User data directory location | -| **tinydb** | JSON database | Lightweight metadata storage | -| **dash** | Web dashboards | Interactive visualizations (legacy) | -| **shiny** | Web dashboards | Modern Python web framework | -| **pyarrow** | Parquet support | Columnar file format backend | -| **xlsxwriter** | Excel export | Writing analysis results to .xlsx | -| **fastexcel** | Excel import | Fast CSV/Excel reading | - -### Development Dependencies - -- **pytest**: Testing framework -- **black**: Code formatter (enforced) -- **isort**: Import sorter (enforced) -- **pyinstaller**: Packaging for executables - -### External Integrations - -**None currently.** The tool is self-contained and processes local files. 
- -**Potential integrations** (based on codebase hints): -- Social media APIs (for data import) -- Cloud storage (for large datasets) - -### Configuration - -**Command-line arguments:** -```bash -python cibmangotree.py --log-level DEBUG # Set logging verbosity -python cibmangotree.py --noop # Test mode, exits immediately -``` - -**No configuration files** - settings stored in TinyDB. - ---- - -## Performance Considerations - -### Memory Management - -**Analyzer memory profile:** -- Input data loaded once via `input_reader.preprocess()` -- Polars uses lazy evaluation → minimal memory until `.collect()` -- Output writes use `sink_parquet()` for streaming (no full materialization) - -**Typical memory footprint:** -- Small datasets (<10k rows): <100 MB -- Medium datasets (10k-1M rows): 100 MB - 1 GB -- Large datasets (>1M rows): 1 GB+ (use lazy evaluation!) - -**Memory optimization techniques:** -1. **Never read entire input eagerly:** Use `pl.scan_parquet()` not `pl.read_parquet()` -2. **Filter early:** Apply filters before expensive operations -3. **Use streaming writes:** `sink_parquet()` and `sink_csv()` don't materialize -4. **Batch processing:** For huge datasets, use `iter_batches()` from PyArrow -5. **Clear intermediate results:** Delete large DataFrames when done - -### Execution Time - -**Benchmark reference (on M1 Mac, 10k rows):** -- CSV import: ~0.1s -- Column mapping + preprocessing: ~0.05s -- Simple aggregation (hashtag count): ~0.2s -- Complex aggregation (Gini coefficient): ~0.5s -- Parquet write: ~0.05s -- **Total typical analysis: <1s for 10k rows** - -**Scaling factors:** -- CSV import: O(n) -- Polars operations: typically O(n) to O(n log n) -- Self-joins (like time_coordination): O(n²) in worst case (use carefully!) -- Group-by operations: O(n log n) - -**Optimization tips:** -1. **Use Polars native operations:** Much faster than Python loops -2. **Avoid row-by-row processing:** Vectorize with Polars expressions -3. 
**Use `.explain()` on lazy frames:** Check if query plan is efficient -4. **Profile with ProgressReporter:** Identify slow sections -5. **Consider parallelization:** For independent operations, Polars uses all cores - -### Storage & I/O - -**File sizes:** -- Parquet compression ratio: typically 3-5x smaller than CSV -- TinyDB size: negligible (<1 MB even with 100s of analyses) -- Web presenter state: varies by analyzer - -**I/O optimization:** -- Parquet read/write is fast (columnar format, optimized for analytics) -- TinyDB uses file locks (slight overhead on concurrent access) -- Export operations can be slow for large datasets (use chunking) - -### Web Presenter Performance - -**Dash/Shiny considerations:** -- Loading 1M+ rows in browser: too slow, pre-aggregate first -- Recommended max rows for interactive tables: ~10k -- Use server-side filtering for large datasets -- Cache expensive computations in `context.state_dir` - -**Pattern for large datasets:** -```python -# Don't load full dataset in web presenter -# Instead, pre-aggregate in secondary analyzer -# and load small aggregated result -``` - -## Open Questions & Uncertainties - -### Areas Needing Clarification - -1. **What determines a "good" CIB detection?** The example analyzers show the pattern, but domain knowledge of what patterns indicate coordinated behavior isn't obvious from code. The hashtag analyzer provides a good example: it uses Gini coefficient to measure inequality in hashtag distribution as a proxy for coordination events. - -2. **Web presenter choice (Dash vs Shiny)?** Both frameworks are supported. Shiny appears to be the more modern choice (note the `shiny` flag in `WebPresenterDeclaration`). Dash is legacy but still functional. - -3. **Tokenizer service future:** The `services/tokenizer/` module exists but isn't heavily used yet. It appears designed for future n-gram or text analysis features. The architecture supports pluggable tokenizers with configurable preprocessing. - -4. 
**React dashboard API:** The `FactoryOutputContext` has an `api` field for REST API output (presumably for React dashboards). This feature doesn't appear to be actively used yet. - -5. **UI component testing:** Tests are co-located with analyzers (e.g., `test_example_base.py`). UI components likely require manual testing or end-to-end tests (not heavily present in current codebase). - -6. **Production deployment strategy:** The codebase supports PyInstaller packaging (`freeze_support()`) but there's no documented deployment process. Likely distributed as standalone executable. - -### Clarifications from Code Exploration - -✅ **`is_development()` function:** Defined in `meta/get_version.py`. Returns `True` if no `VERSION` file exists, indicating development mode. - -✅ **Windows ANSI support:** Imported from `terminal_tools.utils` - enables color output on Windows terminals. - -✅ **Splash function:** Defined in `components/splash.py`. Shows ASCII art logo (three sizes based on terminal width) and a mango tree. Adaptive to terminal size. - -✅ **Typo in `AnalyzerSuite.primary_anlyzers`:** This is indeed a typo (`anlyzers` vs `analyzers`). It's used internally so doesn't affect external API. - -### Questions a New Developer Might Have - -**Q: How do I test my analyzer without going through the full UI?** -- A: Write pytest unit tests like `test_example_base.py`. You can construct a mock `PrimaryAnalyzerContext` and call your `main()` function directly. The testing framework provides `test_primary_analyzer()` helper that handles setup/teardown. Import test data with `CsvTestData`, `PolarsTestData`, etc. - -**Q: What's the performance profile for large datasets (millions of rows)?** -- A: Polars is optimized for large datasets. Typical analysis on 10k rows: <1s. For millions of rows, use lazy evaluation (`scan_parquet`, `lazy()`) religiously. The storage layer uses `sink_parquet()` for streaming writes. Avoid `.collect()` on full datasets. 
Self-joins (like time_coordination) can be O(n²) - use carefully on large data. - -**Q: How do I contribute an analyzer back to the project?** -- A: See `CONTRIBUTING.md`. Fork → feature branch from `develop` → PR targeting `develop`. Set `is_distributed=False` initially, then `True` when ready for production. Follow pre-commit hooks (black, isort). Include tests and sample data. - -**Q: Can analyzers call external services or are they sandboxed?** -- A: No sandboxing - analyzers are regular Python code and can call external services, APIs, etc. Use appropriate error handling. Be mindful of rate limits and network failures. Consider adding parameters for API keys (though storing secrets safely is currently not built into the framework). - -**Q: What happens if an analyzer crashes mid-execution?** -- A: No automatic rollback. Partial outputs may exist on disk (in `primary_outputs/` or `secondary_outputs/`). The analysis is marked as draft (`is_draft=True`) in the database. The UI shows a warning and prevents export/web presenter access. Users can delete and re-run. For debugging, check logs at `{user_data_dir}/logs/mangotango.log`. - -**Q: How do I localize/internationalize the UI?** -- A: Currently not supported - all strings are hardcoded in English. This would be a good contribution opportunity! You'd need to: - 1. Extract strings to translation files (e.g., using `gettext`) - 2. Add language selection in settings - 3. Update all UI components to use translated strings - -**Q: Can I run multiple instances of the app simultaneously?** -- A: Yes, with caveats. File locks on TinyDB prevent corruption. However, you might see lock contention (slowness) when both instances access the database. Each instance can work on different projects concurrently without issues. - -**Q: How do I access data from one analyzer in another?** -- A: Use secondary analyzers with the `depends_on` field. 
Secondary analyzers can access: - - Primary analyzer outputs via `context.base.table(output_id)` - - Other secondary analyzer outputs via `context.dependency(interface).table(output_id)` - - Primary analyzer parameters via `context.base_params` - -**Q: What if my analyzer needs gigabytes of temporary storage?** -- A: Use `context.temp_dir` for temporary files during analysis. This directory is cleaned up after execution. For large intermediate results, consider: - 1. Writing to temp_dir as Parquet (compressed) - 2. Using lazy frames to avoid materialization - 3. Streaming processing with batches - 4. Breaking into multiple secondary analyzers (each gets its own temp_dir) - -**Q: Can I use Pandas instead of Polars?** -- A: Technically yes (Polars DataFrames can convert to/from Pandas), but strongly discouraged. The entire framework is optimized for Polars. You'd lose performance benefits and might hit memory issues. If you must, use `.to_pandas()` and `.from_pandas()` sparingly. - -**Q: How do I debug issues with column mapping?** -- A: Check the column mapping dict stored in `AnalysisModel.column_mapping`. You can inspect it via: - ```python - analysis = context.storage.list_project_analyses(project_id)[0] - print(analysis.column_mapping) - ``` - This shows which user columns map to which analyzer columns. - -**Q: What's the difference between `temp_dir` and `state_dir`?** -- A: - - `temp_dir`: Available in all analyzer contexts, temporary (cleaned up after run), unique per execution - - `state_dir`: Only in web presenter contexts, persistent (survives reruns), unique per project/analyzer/presenter combo - - Use temp_dir for intermediate processing, state_dir for caching in web presenters - -**Q: Can I create a web presenter that works with multiple analyzers?** -- A: Not directly - each web presenter is tied to one primary analyzer via `base_analyzer` field. However, you could: - 1. Create a secondary analyzer that combines outputs from multiple primaries - 2. 
Create a web presenter for that secondary analyzer - 3. Or create separate presenter instances for each analyzer (more common pattern) - ---- - -## Troubleshooting Common Issues - -### Analyzer Won't Show Up - -**Problem:** You created an analyzer but it doesn't appear in the UI. - -**Solutions:** -1. **Check registration:** Did you add it to `analyzers/__init__.py` suite? - ```python - from .my_analyzer import my_analyzer - suite = AnalyzerSuite(all_analyzers=[..., my_analyzer]) - ``` - -2. **Check `is_distributed` flag:** In development mode, all analyzers show. In distributed mode (VERSION file exists), only analyzers with `is_distributed=True` appear. - -3. **Restart the app:** The suite is loaded once at startup. Restart after adding new analyzers. - -4. **Check for Python syntax errors:** Run `python -m py_compile analyzers/my_analyzer/__init__.py` to check for errors. - -### Column Mapping Fails - -**Problem:** User's columns don't map to your analyzer's expected columns. - -**Solutions:** -1. **Improve name hints:** Add more variations users might use - ```python - name_hints=["user", "author", "username", "screen_name", "screen name", "poster"] - ``` - -2. **Check data type compatibility:** Ensure your column's `data_type` can convert from user's data type (see `data_type_compatibility.py`) - -3. **Test with sample data:** Import one of the sample datasets and see which columns auto-match - -4. **Manual mapping:** Users can always override auto-mapping manually in the UI - -### Preprocessing Errors - -**Problem:** `input_reader.preprocess()` raises errors or returns wrong types. - -**Solutions:** -1. **Check for null values:** The preprocessing may fail on columns with many nulls - ```python - # Before preprocess - df = df.filter(pl.col("column").is_not_null()) - ``` - -2. **Verify data types:** Use `df.schema` to check structural types before preprocessing - -3. 
**Look at semantic inference logs:** Check logs for warnings about timezone handling, type conversion failures - -4. **Test semantic inference directly:** - ```python - from preprocessing.series_semantic import infer_series_semantic - semantic = infer_series_semantic(df["column"]) - print(semantic.semantic_name if semantic else "No match") - ``` - -### Polars Performance Issues - -**Problem:** Analysis is very slow or runs out of memory. - -**Solutions:** -1. **Use lazy evaluation:** - ```python - # Before - df = pl.read_parquet(path) # Loads everything - result = df.filter(...).select(...) - - # After - df = pl.scan_parquet(path) # Lazy - result = df.filter(...).select(...).collect() # Execute once - ``` - -2. **Check query plan:** - ```python - print(df.lazy().filter(...).select(...).explain()) - ``` - Look for expensive operations like full scans that could be optimized. - -3. **Filter early, select late:** - ```python - # Good - df.filter(...).select(["col1", "col2"]) - - # Bad - df.select(["col1", "col2", "col3", ...]).filter(...) - ``` - -4. **Avoid unnecessary collects:** - ```python - # Bad - multiple collects - df1 = df.lazy().filter(...).collect() - df2 = df1.lazy().select(...).collect() - - # Good - single collect - df2 = df.lazy().filter(...).select(...).collect() - ``` - -### Web Presenter Not Loading - -**Problem:** Web presenter fails to start or shows blank page. - -**Solutions:** -1. **Check for exceptions in terminal:** Dash/Shiny errors appear in console - -2. **Verify output files exist:** - ```python - import os - print(os.path.exists(context.base.table("output_id").parquet_path)) - ``` - -3. **Test data loading separately:** - ```python - df = pl.read_parquet(context.base.table("output_id").parquet_path) - print(df.head()) - ``` - -4. **Check port conflicts:** Default port 8050 might be in use. Kill other processes or change port. - -5. 
**Clear state directory:** Corrupted cache might cause issues - ```bash - rm -rf {state_dir}/* - ``` - -### TinyDB Lock Timeouts - -**Problem:** "Lock timeout" errors when accessing the database. - -**Solutions:** -1. **Close other app instances:** Only one instance should write at a time - -2. **Check for stale locks:** - ```bash - rm {user_cache_dir}/db.lock - ``` - -3. **Increase lock timeout:** Modify `FileLock` timeout in `storage/__init__.py` (default is usually sufficient) - -### Import Failures - -**Problem:** CSV/Excel import fails or produces wrong results. - -**Solutions:** -1. **Check file encoding:** - ```bash - file -I yourfile.csv - ``` - Non-UTF-8 files need conversion. - -2. **Inspect with Polars directly:** - ```python - import polars as pl - df = pl.read_csv("file.csv", n_rows=10) - print(df) - ``` - -3. **Try manual import configuration:** Use the UI's manual config to specify separator, encoding, etc. - -4. **Check for malformed rows:** Some CSV files have inconsistent column counts - -5. **Use Excel if CSV parsing fails:** Excel import is often more robust - -### Test Failures - -**Problem:** Your analyzer tests fail with cryptic errors. - -**Solutions:** -1. **Ensure you're calling the test function:** - ```python - def test_my_analyzer(): - test_primary_analyzer(...) # Must call the helper - ``` - -2. **Check test data schema matches interface:** - ```python - # Input CSV must have columns matching interface column names - # Output CSV must have columns matching declared output columns - ``` - -3. **Use semantics for non-string types:** - ```python - from preprocessing.series_semantic import identifier, datetime_string - - input=CsvTestData( - path, - semantics={"user_id": identifier, "timestamp": datetime_string} - ) - ``` - -4. **Check for floating point precision issues:** Use approximate comparisons for floats - -5. 
**Run with verbose mode:** - ```bash - pytest -v -s analyzers/my_analyzer/test_my_analyzer.py - ``` - -## Team & Contribution Workflow - -### Git Workflow - -This project uses **Git Flow** with `develop` as the integration branch: - -``` -main (production releases) - ↑ -develop (integration branch) ← TARGET YOUR PRs HERE - ↑ -feature/* or bugfix/* (your work) -``` - -**Important:** Always branch from and PR into `develop`, NOT `main`. - -### Pre-commit Hooks - -Automatically run on commit: -- **isort**: Sorts imports -- **black**: Formats code - -Manual run: -```bash -isort . -black . -``` - -### Code Review Expectations - -From `CONTRIBUTING.md`: -1. Automated CI/CD checks (tests, quality) -2. Manual review by maintainers -3. Approval required before merge -4. PRs must target `develop` branch - -### Commit Message Format - -Use conventional commits: -``` -feat(analyzer): add temporal correlation analyzer - -- Implement sliding window correlation -- Add configurable time windows -- Include statistical significance tests - -Fixes #42 -``` - -Types: `feat`, `fix`, `docs`, `style`, `refactor`, `test`, `chore` - ---- - -## Additional Resources - -- **Technical Documentation:** [https://civictechdc.github.io/mango-tango-cli](https://civictechdc.github.io/mango-tango-cli) -- **Development Guide:** `docs/dev-guide.md` (referenced in README) -- **Contributing Guide:** `CONTRIBUTING.md` (detailed contribution workflow) -- **AI Assistant Context:** - - Claude Code users: See `CLAUDE.md` + Serena MCP integration - - Cursor users: See `.cursorrules` + `.ai-context/` - - Other tools: See `.ai-context/README.md` -- **License:** PolyForm Noncommercial License 1.0.0 (non-commercial use only) -- **Community:** [Civic Tech DC Slack](https://civictechdc.slack.com) - ---- - -## Quick Reference - -### Useful Commands - -```bash -# Run application -python cibmangotree.py - -# Run with debug logging -python cibmangotree.py --log-level DEBUG - -# Run tests -pytest - -# Run specific test 
-pytest analyzers/example/test_example_base.py - -# Format code -black . -isort . - -# Check version -python cibmangotree.py --noop -``` - -### Key Files for New Developers - -| File | Purpose | -|------|---------| -| `cibmangotree.py` | Application entry point | -| `analyzers/example/` | Reference implementation of all analyzer types | -| `analyzer_interface/` | Core framework for building analyzers | -| `CONTRIBUTING.md` | Detailed contribution guide | -| `storage/__init__.py` | Data persistence layer | -| `components/main_menu.py` | UI flow starting point | - -### Common Directories - -| Directory | Contents | -|-----------|----------| -| `~/.local/share/MangoTango/` (Linux) | User data, projects, databases | -| `~/Library/Application Support/MangoTango/` (macOS) | User data, projects, databases | -| `%APPDATA%/MangoTango/` (Windows) | User data, projects, databases | -| `{user_data_dir}/logs/` | Application logs | -| `{user_data_dir}/projects/` | Project data and analysis results | - ---- - -**Next Steps:** -1. Run the application and import sample data from `sample_data/` -2. Explore the example analyzer in `analyzers/example/` -3. Read through the hashtag analyzer for a real-world example -4. Try creating a simple analyzer following the pattern -5. Check `CONTRIBUTING.md` for contribution guidelines -6. Join the Civic Tech DC Slack for community support - -**Questions?** Open an issue or reach out via the Civic Tech DC Slack workspace. - ---- - -## Summary: What Makes This Codebase Unique - -After deep analysis, here are the standout characteristics that define Mango Tango CLI: - -### Architectural Strengths - -1. **Declarative Analyzer Interface**: The separation of interface declaration (`AnalyzerInterface`) from implementation (`main()`) is brilliant. Analyzers declare what they need (inputs, outputs, params) separately from how they compute. 
This enables: - - Automatic UI generation for column mapping - - Type validation before execution - - Self-documenting analyzer capabilities - - Easy composition (secondary analyzers, web presenters) - -2. **Sophisticated Column Mapping**: The three-layer column mapping system is more advanced than typical data tools: - - **Layer 1**: Name hint matching (fuzzy, word-based) - - **Layer 2**: Data type compatibility scoring - - **Layer 3**: Semantic type inference with sampling - - This handles diverse social media data formats elegantly. - -3. **Polars-First Architecture**: Unlike most Python data tools stuck on Pandas, this aggressively optimizes for Polars: - - Lazy evaluation throughout - - Streaming writes via `sink_parquet()` - - Platform-native performance - - Handles datasets larger than RAM - -4. **Context-Based Dependency Injection**: The nested context pattern (`ViewContext` → `AppContext` → `ProjectContext` → `AnalysisContext`) provides type-safe dependency passing without global state or singletons. Clean and testable. - -### Design Patterns Worth Studying - -1. **Topological Sort for Dependencies**: Secondary analyzers form a DAG, resolved at runtime with depth-first search. Elegant solution to dependency ordering. - -2. **Multiprocessing Progress Reporting**: The `ProgressReporter` spawns a separate process for UI updates, keeping the main thread focused on computation. Shows careful attention to UX. - -3. **Storage Abstraction**: The `Storage` class abstracts TinyDB + Parquet + filesystem into a clean interface. File locking prevents corruption, platform-aware paths respect OS conventions. - -4. **Semantic Type System**: The `SeriesSemantic` framework in `preprocessing/` is a mini type system for data validation and conversion. Extensible and well-designed. - -### Development Philosophy - -This codebase demonstrates several strong principles: - -- **Pragmatism over Purity**: Uses Pydantic for validation, but doesn't force everything into models. 
Balances type safety with flexibility. -- **Progressive Enhancement**: V1 analyzer migration shows commitment to backward compatibility. Development mode (`is_distributed=False`) lets you test before releasing. -- **Performance as Default**: Lazy evaluation, streaming I/O, cached properties everywhere. Performance is built-in, not bolted-on. -- **Testability**: Context objects, test helpers, co-located tests. Easy to test analyzers without UI. -- **Incremental Complexity**: Simple analyzers (example) → moderate (hashtags) → complex (ngrams multi-module). Good learning progression. - -### Gotchas to Remember - -The three most important things to internalize: - -1. **Always preprocess**: `input_reader.preprocess()` is mandatory. Skip it and everything breaks. -2. **Lazy then collect**: Build Polars queries lazily, execute once. Multiple collects = performance death. -3. **Register in suite**: New analyzers won't appear until added to `analyzers/__init__.py`. - -### Where to Start Contributing - -Best entry points for new contributors: - -1. **Easy**: Add name hints to existing analyzers (improves column mapping) -2. **Moderate**: Create a new simple analyzer (template pattern, n-gram, sentiment) -3. **Advanced**: Improve semantic type inference (add new `SeriesSemantic` types) -4. **Expert**: Add new parameter types (extends framework capabilities) - -### Final Thoughts - -This is a well-architected codebase that successfully balances: -- Academic rigor (CIB detection) with practical usability (terminal UI) -- Performance (Polars, lazy evaluation) with developer experience (rich contexts, testing) -- Flexibility (plugin analyzers) with structure (declarative interfaces) -- Innovation (semantic type inference) with pragmatism (backward compatibility) - -The codebase shows signs of thoughtful refactoring over time (note the V1 migration code, the typo in `primary_anlyzers` that persists, the dual Dash/Shiny support). It's actively evolving but maintains stability. 
- -**For developers**: Study the example analyzer thoroughly. The patterns there (constants in interface, ProgressReporter, lazy evaluation) are the "house style" you should emulate. - -**For analysts**: The tool abstracts complexity well. Focus on the domain logic (what is CIB?) and let the framework handle the infrastructure. - -**For contributors**: Read `CONTRIBUTING.md`, join the Slack, and start with a small PR. The community is welcoming to newcomers. - ---- - -**This guide was generated through deep codebase analysis on October 7, 2025. It reflects the state of the codebase at that time. For the latest updates, always check the official documentation and CONTRIBUTING.md.** diff --git a/reorg-plan.md b/reorg-plan.md deleted file mode 100644 index f7352b4b..00000000 --- a/reorg-plan.md +++ /dev/null @@ -1,1975 +0,0 @@ -# CIB Mango Tree CLI - Monorepo Reorganization Plan - -**Status**: Ready for Implementation -**Date**: 2025-10-09 -**Goal**: Transform current flat structure into modern Python monorepo with plugin architecture - ---- - -## Table of Contents - -- [Overview](#overview) -- [Proposed Structure](#proposed-structure) -- [Package Organization](#package-organization) -- [Plugin Architecture](#plugin-architecture) -- [Configuration Strategy](#configuration-strategy) -- [Import Path Migration](#import-path-migration) -- [PyInstaller Compatibility](#pyinstaller-compatibility) -- [Migration Steps](#migration-steps) -- [Testing Strategy](#testing-strategy) -- [Risk Mitigation](#risk-mitigation) -- [Success Criteria](#success-criteria) - ---- - -## Overview - -### Goals - -1. **Modularization**: Organize code into logical packages with clear boundaries -2. **Plugin System**: Enable external analyzer/tokenizer contributions without core changes -3. **Modern Tooling**: Adopt `uv` for fast, reliable dependency management -4. **Clean Architecture**: Separate concerns (core, ui, services, plugins) -5. 
**Maintainability**: Improve contributor experience and code navigation -6. **PyInstaller Compatible**: Maintain binary build support for releases - -### Key Changes - -- **Directory Structure**: Move to `packages/` with plugin architecture -- **Build System**: Modern `pyproject.toml` with workspace configuration -- **Package Manager**: Migrate from `pip` + `requirements.txt` to `uv` workspace -- **Plugin Discovery**: Hybrid system (entry points + registry for frozen builds) -- **UI Organization**: Consolidate terminal UI under `tui/`, prepare for `gui/` (NiceGUI) -- **Simplified Naming**: Analyzer subdirectories use `base/`, `stats/`, `web/` - -### Design Constraints - -- **PyInstaller**: Must work in frozen executable builds -- **Volunteer-Friendly**: Clear structure for contributors of all skill levels -- **Backward Compatible**: Existing data and workflows must continue working - ---- - -## Proposed Structure - -```bash -cibmangotree/ -├── pyproject.toml # Root workspace config (centralized) -├── uv.lock # Unified dependency lock -├── README.md -├── CONTRIBUTING.md -├── LICENSE -├── bootstrap.sh # Updated to use `uv sync` -├── cibmangotree.py # Backward-compat stub for PyInstaller -├── pyinstaller.spec # Updated build spec -├── .gitignore -├── .github/workflows/ -├── docs/ -├── sample_data/ -└── packages/ - │ - ├── core/ # Core application & framework - │ ├── pyproject.toml - │ ├── tests/ - │ └── src/ - │ └── cibmangotree/ # Package name defines import path - │ ├── __init__.py - │ ├── __main__.py # Entry point - │ ├── _frozen_plugins.py # Auto-generated (pyinstaller.spec) - │ │ - │ ├── app/ # Main application - │ │ ├── __init__.py - │ │ ├── app.py - │ │ ├── logger.py - │ │ ├── app_context.py - │ │ ├── project_context.py - │ │ ├── analysis_context.py - │ │ ├── analysis_output_context.py - │ │ ├── analysis_webserver_context.py - │ │ ├── settings_context.py - │ │ ├── shiny.py - │ │ └── utils.py - │ │ - │ ├── analyzer_interface/ # Analyzer framework - │ │ ├── 
__init__.py - │ │ ├── column_automap.py - │ │ ├── context.py - │ │ ├── data_type_compatibility.py - │ │ ├── declaration.py - │ │ ├── interface.py - │ │ ├── params.py - │ │ └── suite.py - │ │ - │ ├── tui/ # Terminal User Interface - │ │ ├── __init__.py - │ │ │ - │ │ ├── components/ # Was: components/ - │ │ │ ├── __init__.py - │ │ │ ├── main_menu.py - │ │ │ ├── analysis_main.py - │ │ │ ├── analysis_params.py - │ │ │ ├── analysis_web_server.py - │ │ │ ├── context.py - │ │ │ ├── export_outputs.py - │ │ │ ├── new_analysis.py - │ │ │ ├── new_project.py - │ │ │ ├── project_main.py - │ │ │ ├── select_analysis.py - │ │ │ ├── select_project.py - │ │ │ └── splash.py - │ │ │ - │ │ └── tools/ # Was: terminal_tools/ - │ │ ├── __init__.py - │ │ ├── inception.py - │ │ ├── progress.py - │ │ ├── prompts.py - │ │ └── utils.py - │ │ - │ ├── gui/ # Future: NiceGUI interface - │ │ └── __init__.py # Placeholder - │ │ - │ ├── services/ # Core services - │ │ ├── __init__.py - │ │ │ - │ │ ├── storage/ # Was: storage/ - │ │ │ ├── __init__.py - │ │ │ └── file_selector.py - │ │ │ - │ │ ├── importing/ # Was: importing/ - │ │ │ ├── __init__.py - │ │ │ ├── importer.py - │ │ │ ├── csv.py - │ │ │ └── excel.py - │ │ │ - │ │ ├── preprocessing/ # Was: preprocessing/ - │ │ │ ├── __init__.py - │ │ │ └── series_semantic.py - │ │ │ - │ │ └── tokenizer/ # Abstract interfaces only - │ │ ├── __init__.py - │ │ ├── types.py - │ │ └── base.py - │ │ - │ ├── context/ # Context objects - │ │ └── __init__.py - │ │ - │ ├── meta/ # Version & metadata - │ │ ├── __init__.py - │ │ └── get_version.py - │ │ - │ └── plugin_system/ # Plugin discovery - │ ├── __init__.py - │ └── discovery.py - │ - ├── tokenizers/ - │ └── basic/ # Plugin: basic tokenizer - │ ├── pyproject.toml - │ ├── tests/ - │ │ └── test_basic_tokenizer.py - │ └── src/ - │ └── cibmangotree_tokenizer_basic/ - │ ├── __init__.py - │ ├── tokenizer.py - │ └── patterns.py - │ - ├── analyzers/ - │ ├── example/ # Plugin: example analyzer - │ │ ├── pyproject.toml - 
│ │ ├── tests/ - │ │ │ ├── test_data/ - │ │ │ ├── test_example_base.py - │ │ │ └── test_example_report.py - │ │ └── src/ - │ │ └── cibmangotree_analyzer_example/ - │ │ ├── __init__.py - │ │ │ - │ │ ├── base/ # Was: example_base/ - │ │ │ ├── __init__.py - │ │ │ ├── interface.py - │ │ │ ├── main.py - │ │ │ └── default_params.py - │ │ │ - │ │ ├── report/ # Was: example_report/ - │ │ │ ├── __init__.py - │ │ │ ├── interface.py - │ │ │ └── main.py - │ │ │ - │ │ └── web/ # Was: example_web/ - │ │ ├── __init__.py - │ │ ├── interface.py - │ │ └── factory.py - │ │ - │ ├── hashtags/ # Plugin: hashtags analyzer - │ │ ├── pyproject.toml - │ │ ├── tests/ - │ │ │ ├── test_data/ - │ │ │ └── test_hashtags_base.py - │ │ └── src/ - │ │ └── cibmangotree_analyzer_hashtags/ - │ │ ├── __init__.py - │ │ │ - │ │ ├── base/ # Was: hashtags_base/ - │ │ │ ├── __init__.py - │ │ │ ├── interface.py - │ │ │ └── main.py - │ │ │ - │ │ └── web/ # Was: hashtags_web/ - │ │ ├── __init__.py - │ │ ├── interface.py - │ │ ├── factory.py - │ │ ├── app.py - │ │ ├── analysis.py - │ │ └── plots.py - │ │ - │ ├── ngrams/ # Plugin: n-grams analyzer - │ │ ├── pyproject.toml - │ │ ├── tests/ - │ │ │ ├── test_data/ - │ │ │ ├── test_ngrams_base.py - │ │ │ └── test_ngram_stats.py - │ │ └── src/ - │ │ └── cibmangotree_analyzer_ngrams/ - │ │ ├── __init__.py - │ │ │ - │ │ ├── base/ # Was: ngrams_base/ - │ │ │ ├── __init__.py - │ │ │ ├── interface.py - │ │ │ └── main.py - │ │ │ - │ │ ├── stats/ # Was: ngram_stats/ - │ │ │ ├── __init__.py - │ │ │ ├── interface.py - │ │ │ └── main.py - │ │ │ - │ │ └── web/ # Was: ngram_web/ - │ │ ├── __init__.py - │ │ ├── interface.py - │ │ ├── factory.py - │ │ └── app.py - │ │ - │ ├── temporal/ # Plugin: temporal analyzer - │ │ ├── pyproject.toml - │ │ ├── tests/ - │ │ └── src/ - │ │ └── cibmangotree_analyzer_temporal/ - │ │ ├── __init__.py - │ │ │ - │ │ ├── base/ # Was: temporal_base/ - │ │ │ ├── __init__.py - │ │ │ ├── interface.py - │ │ │ └── main.py - │ │ │ - │ │ └── web/ # Was: 
temporal_web/ - │ │ ├── __init__.py - │ │ └── interface.py - │ │ - │ └── time_coordination/ # Plugin: time coordination - │ ├── pyproject.toml - │ ├── tests/ - │ └── src/ - │ └── cibmangotree_analyzer_time_coordination/ - │ ├── __init__.py - │ ├── interface.py - │ └── main.py - │ - └── testing/ # Test utilities - ├── pyproject.toml - ├── tests/ - └── src/ - └── cibmangotree_testing/ - ├── __init__.py - ├── comparers.py - ├── context.py - ├── testdata.py - └── testers.py -``` - ---- - -## Package Organization - -### Package Count: ~10 Packages - -1. **core** - Framework, app, UI, services -2. **tokenizers/basic** - Basic tokenizer implementation -3. **analyzers/example** - Example analyzer for contributors -4. **analyzers/hashtags** - Hashtag analysis -5. **analyzers/ngrams** - N-gram analysis -6. **analyzers/temporal** - Temporal pattern analysis -7. **analyzers/time_coordination** - Time coordination detection -8. **testing** - Test utilities - -### Package Dependency Graph - -```text -cibmangotree (core) - ↓ -├── cibmangotree_tokenizer_basic -├── cibmangotree_testing - ↓ -└── cibmangotree_analyzer_* (all analyzers) - ├── example - ├── hashtags - ├── ngrams (also depends on tokenizer_basic) - ├── temporal - └── time_coordination -``` - ---- - -## Plugin Architecture - -### Design: Hybrid Discovery System - -**Challenge**: Entry points don't work in PyInstaller frozen builds -**Solution**: Hybrid system that works in both development and frozen modes, with **dynamic generation** at build time - -### Implementation - -#### 1. 
Plugin Registry - -```python -# cibmangotree/plugin_system/discovery.py - -import sys -import importlib.metadata -from typing import List -from cibmangotree.analyzer_interface import AnalyzerDeclaration - -class AnalyzerRegistry: - """Central registry that works in both frozen and installed modes.""" - _analyzers: List[AnalyzerDeclaration] = [] - - @classmethod - def register(cls, analyzer: AnalyzerDeclaration) -> AnalyzerDeclaration: - """Register an analyzer (used in frozen builds).""" - cls._analyzers.append(analyzer) - return analyzer - - @classmethod - def discover(cls) -> List[AnalyzerDeclaration]: - """Discover analyzers - works in both modes.""" - if getattr(sys, 'frozen', False): - # Frozen (PyInstaller): use explicit registry - return cls._analyzers - else: - # Installed: auto-discover via entry points - from cibmangotree.app.logger import get_logger - logger = get_logger(__name__) - - analyzers = [] - for ep in importlib.metadata.entry_points(group='cibmangotree.analyzers'): - try: - analyzer = ep.load() - analyzers.append(analyzer) - except Exception as e: - logger.warning(f"Failed to load analyzer {ep.name}: {e}") - return analyzers -``` - -#### 2. Dynamic Frozen Plugin Generation (Build-Time) - -The `pyinstaller.spec` file automatically generates `_frozen_plugins.py` based on installed packages with entry points. **No manual maintenance required!** - -The spec file: - -1. Discovers all plugins via entry points at build time -2. Auto-generates `_frozen_plugins.py` with appropriate imports -3. Auto-generates `hiddenimports` list for PyInstaller -4. 
Prints build report showing what's being bundled - -```python -# Excerpt from pyinstaller.spec (see full version in PyInstaller Compatibility section) - -def discover_plugins(group): - """Discover all plugins for a given entry point group.""" - plugins = [] - for ep in importlib.metadata.entry_points(group=group): - module_path, attr_name = ep.value.split(':') - package_name = module_path.split('.')[0] - plugins.append({ - 'name': ep.name, - 'module': module_path, - 'attr': attr_name, - 'package': package_name, - }) - return plugins - -# Discover plugins at build time -analyzers = discover_plugins('cibmangotree.analyzers') -tokenizers = discover_plugins('cibmangotree.tokenizers') - -# Generate _frozen_plugins.py automatically -frozen_plugins_path = generate_frozen_plugins( - analyzers, tokenizers, - 'packages/core/src/cibmangotree/_frozen_plugins.py' -) - -# Generate hiddenimports automatically -plugin_hiddenimports = get_plugin_hiddenimports(analyzers + tokenizers) -``` - -**Auto-generated `_frozen_plugins.py` example:** - -```python -""" -Auto-generated frozen plugin loader for PyInstaller. -Generated during build - DO NOT EDIT MANUALLY. - -This file is automatically generated by pyinstaller.spec based on -installed packages with cibmangotree plugin entry points. -""" - -from cibmangotree.plugin_system.discovery import AnalyzerRegistry - -# Import all bundled analyzers -from cibmangotree_analyzer_hashtags.base import hashtags -from cibmangotree_analyzer_hashtags.web import hashtags_web -from cibmangotree_analyzer_ngrams.base import ngrams -from cibmangotree_analyzer_ngrams.stats import ngram_stats -from cibmangotree_analyzer_ngrams.web import ngrams_web -# ... etc - -# Register all analyzers -_analyzers = [ - hashtags, # hashtags - hashtags_web, # hashtags_web - ngrams, # ngrams - # ... etc -] - -for analyzer in _analyzers: - AnalyzerRegistry.register(analyzer) -``` - -#### 3. 
Application Startup - -```python -# cibmangotree/__main__.py - -import sys -from cibmangotree.plugin_system.discovery import AnalyzerRegistry -from cibmangotree.analyzer_interface import AnalyzerSuite - -def main(): - # Load frozen plugins if running as executable - if getattr(sys, 'frozen', False): - import cibmangotree._frozen_plugins - - # Discover analyzers (uses registry in frozen mode, entry points otherwise) - analyzers = AnalyzerRegistry.discover() - suite = AnalyzerSuite(all_analyzers=analyzers) - - # ... rest of application initialization -``` - -### Benefits - -✅ **Zero Maintenance** - Automatically discovers and bundles all installed plugins -✅ **No Hardcoding** - Entry points are single source of truth -✅ **Development Mode** - Auto-discovery via entry points, install only what you need -✅ **Frozen Mode** - Auto-generated imports, PyInstaller bundles correctly -✅ **External Plugins** - Contributors can create separate packages -✅ **Selective Bundling** - Only bundles analyzers installed during build -✅ **Build Reports** - Shows exactly what's being bundled - -### Adding New Plugins - -**Developer workflow:** - -```bash -# 1. Create analyzer package with entry points in pyproject.toml -# 2. Install it in workspace -uv sync - -# 3. Build - automatically discovered and bundled! -uv run pyinstaller pyinstaller.spec -``` - -**No changes to spec file or frozen plugins needed!** Everything is discovered and generated automatically at build time. - ---- - -## Configuration Strategy - -### Centralized Configuration (Root `pyproject.toml`) - -All tool configurations, version constraints, and dev dependencies defined once at root. 
- -```toml -[project] -name = "cibmangotree-workspace" -version = "0.1.0" -requires-python = ">=3.12" -description = "CIB Mango Tree CLI - Social Media Data Analysis Tool" - -[tool.uv.workspace] -members = [ - "packages/core", - "packages/testing", - "packages/tokenizers/basic", - "packages/analyzers/example", - "packages/analyzers/hashtags", - "packages/analyzers/ngrams", - "packages/analyzers/temporal", - "packages/analyzers/time_coordination", -] - -# Centralized version constraints - all packages inherit these -[tool.uv.workspace.dependencies] -# Data processing -polars = ">=1.9.0" -pandas = ">=2.2.3" -pyarrow = ">=17.0.0" - -# Models & validation -pydantic = ">=2.9.1" - -# Storage -tinydb = ">=4.8.0" -platformdirs = ">=4.3.6" -filelock = ">=3.16.1" - -# Terminal UI -inquirer = ">=3.4.0" -rich = ">=14.0.0" -colorama = ">=0.4.6" - -# Web frameworks -dash = ">=2.18.1" -plotly = ">=5.24.1" -shiny = ">=1.4.0" -shinywidgets = ">=0.6.2" -starlette = ">=0.47.1" -uvicorn = ">=0.34.3" - -# Import/Export -xlsxwriter = ">=3.2.0" -fastexcel = ">=0.13.0" - -# Text processing -regex = ">=2025.9.1" - -# Utilities -python-json-logger = ">=2.0.7" -a2wsgi = ">=1.10.10" - -# Development tools -[tool.uv] -dev-dependencies = [ - "black>=24.10.0", - "isort>=5.13.2", - "pytest>=8.3.4", - "pytest-benchmark>=5.1.0", - "pyinstaller>=6.14.1", - "pyarrow-stubs>=17.13", -] - -# Tool configurations - inherited by all packages -[tool.black] -line-length = 88 -target-version = ["py312"] - -[tool.isort] -profile = "black" - -[tool.pytest.ini_options] -pythonpath = ["."] -testpaths = ["packages"] - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" -``` - -### Package-Specific Configuration (Minimal) - -Each package only defines: name, version, description, dependencies, and entry points. 
- -#### Core Package - -```toml -# packages/core/pyproject.toml - -[project] -name = "cibmangotree" -version = "0.1.0" -description = "CIB Mango Tree CLI - Social Media Analysis Tool" -requires-python = ">=3.12" -dependencies = [ - # Data - "polars", - "pandas", - "pyarrow", - - # Models - "pydantic", - "platformdirs", - - # Storage - "tinydb", - "filelock", - - # Terminal UI - "inquirer", - "rich", - "colorama", - - # Web frameworks - "dash", - "plotly", - "shiny", - "shinywidgets", - "starlette", - "uvicorn", - - # Import/Export - "xlsxwriter", - "fastexcel", - - # Utils - "python-json-logger", - "regex", - "a2wsgi", -] - -[project.scripts] -cibmangotree = "cibmangotree.__main__:main" - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" -``` - -#### Tokenizer Plugin - -```toml -# packages/tokenizers/basic/pyproject.toml - -[project] -name = "cibmangotree-tokenizer-basic" -version = "0.1.0" -description = "Basic tokenizer implementation" -requires-python = ">=3.12" -dependencies = [ - "cibmangotree", - "regex", -] - -# Plugin entry points - auto-discovered by core in dev mode -[project.entry-points."cibmangotree.tokenizers"] -basic = "cibmangotree_tokenizer_basic:BasicTokenizer" - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" -``` - -#### Analyzer Plugin (with Entry Points) - -```toml -# packages/analyzers/hashtags/pyproject.toml - -[project] -name = "cibmangotree-analyzer-hashtags" -version = "0.1.0" -description = "Hashtag analysis for CIB Mango Tree" -requires-python = ">=3.12" -dependencies = [ - "cibmangotree", - "cibmangotree-testing", - "polars", -] - -# Plugin entry points - auto-discovered by core in dev mode -[project.entry-points."cibmangotree.analyzers"] -hashtags = "cibmangotree_analyzer_hashtags.base:hashtags" -hashtags_web = "cibmangotree_analyzer_hashtags.web:hashtags_web" - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" -``` - -#### Testing Utilities - -```toml 
-# packages/testing/pyproject.toml - -[project] -name = "cibmangotree-testing" -version = "0.1.0" -description = "Testing utilities for CIB Mango Tree" -requires-python = ">=3.12" -dependencies = [ - "cibmangotree", - "polars", - "pytest", -] - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" -``` - ---- - -## Import Path Migration - -### Core Package Imports - -**Before:** - -```python -from app import App -from app.logger import get_logger -from analyzer_interface import AnalyzerInterface, AnalyzerSuite -from analyzer_interface.context import PrimaryAnalyzerContext -from context import AnalysisContext -from meta import get_version -``` - -**After:** - -```python -from cibmangotree.app import App -from cibmangotree.app.logger import get_logger -from cibmangotree.analyzer_interface import AnalyzerInterface, AnalyzerSuite -from cibmangotree.analyzer_interface.context import PrimaryAnalyzerContext -from cibmangotree.context import AnalysisContext -from cibmangotree.meta import get_version -``` - -### Service Imports - -**Before:** - -```python -from storage import Storage -from services.tokenizer.core import AbstractTokenizer -from services.tokenizer.basic import BasicTokenizer -from preprocessing import series_semantic -from preprocessing.series_semantic import infer_series_semantic -from importing import ImporterSession -``` - -**After:** - -```python -from cibmangotree.services.storage import Storage -from cibmangotree.services.tokenizer.core import AbstractTokenizer -from cibmangotree_tokenizer_basic import BasicTokenizer -from cibmangotree.services.preprocessing import series_semantic -from cibmangotree.services.preprocessing.series_semantic import infer_series_semantic -from cibmangotree.services.importing import ImporterSession -``` - -### UI Imports - -**Before:** - -```python -from components import main_menu, splash -from components.main_menu import main_menu -from terminal_tools import ProgressReporter -from 
terminal_tools.inception import TerminalContext -``` - -**After:** - -```python -from cibmangotree.tui.components import main_menu, splash -from cibmangotree.tui.components.main_menu import main_menu -from cibmangotree.tui.tools import ProgressReporter -from cibmangotree.tui.tools.inception import TerminalContext -``` - -### Testing Imports - -**Before:** - -```python -from testing import test_primary_analyzer, CsvTestData -from testing.testdata import PolarsTestData -from testing.comparers import compare_dfs -``` - -**After:** - -```python -from cibmangotree_testing import test_primary_analyzer, CsvTestData -from cibmangotree_testing.testdata import PolarsTestData -from cibmangotree_testing.comparers import compare_dfs -``` - -### Analyzer Internal Imports (Simplified Names) - -**Before (inside `analyzers/hashtags/`):** - -```python -from .hashtags_base import hashtags -from .hashtags_web import hashtags_web -from .hashtags_base.interface import COL_TEXT, COL_TIMESTAMP -``` - -**After (inside `packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/`):** - -```python -from .base import hashtags -from .web import hashtags_web -from .base.interface import COL_TEXT, COL_TIMESTAMP -``` - -**Before (inside `analyzers/ngrams/`):** - -```python -from .ngrams_base import ngrams -from .ngram_stats import ngram_stats -from .ngram_web import ngrams_web -``` - -**After (inside `packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/`):** - -```python -from .base import ngrams -from .stats import ngram_stats -from .web import ngrams_web -``` - ---- - -## PyInstaller Compatibility - -### Challenge - -PyInstaller bundles Python code into a single executable. 
Standard plugin discovery mechanisms (entry points via `importlib.metadata`) don't work because: - -- No package metadata available at runtime -- No `site-packages` directory -- Entry points aren't accessible - -### Solution: Hybrid Discovery + Dynamic Generation - -The spec file dynamically discovers all plugins and generates the frozen plugin loader at build time. - -#### 1. Dynamic PyInstaller Spec - -```python -# pyinstaller.spec - -from PyInstaller.utils.hooks import copy_metadata -from PyInstaller.building.api import EXE, PYZ -from PyInstaller.building.build_main import Analysis -import sys -import os -import site -import importlib.metadata -from pathlib import Path - -site_packages_path = None -block_cipher = None - -for site_path in site.getsitepackages(): - if 'site-packages' in site_path: - site_packages_path = site_path - break - -if site_packages_path is None: - raise RuntimeError("site-packages directory not found") - - -# ============================================================================ -# DYNAMIC PLUGIN DISCOVERY -# ============================================================================ - -def discover_plugins(group): - """ - Discover all plugins for a given entry point group. - Returns list of dicts with plugin metadata. - """ - plugins = [] - try: - for ep in importlib.metadata.entry_points(group=group): - module_path, attr_name = ep.value.split(':') - package_name = module_path.split('.')[0] - - plugins.append({ - 'name': ep.name, - 'module': module_path, - 'attr': attr_name, - 'package': package_name, - 'value': ep.value, - }) - except Exception as e: - print(f"Warning: Failed to discover plugins for {group}: {e}") - - return plugins - - -def generate_frozen_plugins(analyzers, tokenizers, output_path): - """ - Generate the frozen plugins loader file dynamically. - This file imports and registers all plugins. 
- """ - lines = [ - '"""', - 'Auto-generated frozen plugin loader for PyInstaller.', - 'Generated during build - DO NOT EDIT MANUALLY.', - '', - 'This file is automatically generated by pyinstaller.spec based on', - 'installed packages with cibmangotree plugin entry points.', - '"""', - '', - 'from cibmangotree.plugin_system.discovery import AnalyzerRegistry', - '', - ] - - # Import analyzers - if analyzers: - lines.append('# Import all bundled analyzers') - for plugin in analyzers: - lines.append(f"from {plugin['module']} import {plugin['attr']}") - lines.append('') - - # Import tokenizers (if we add tokenizer registry later) - if tokenizers: - lines.append('# Import all bundled tokenizers') - for plugin in tokenizers: - lines.append(f"from {plugin['module']} import {plugin['attr']}") - lines.append('') - - # Register analyzers - if analyzers: - lines.append('# Register all analyzers') - lines.append('_analyzers = [') - for plugin in analyzers: - lines.append(f" {plugin['attr']}, # {plugin['name']}") - lines.append(']') - lines.append('') - lines.append('for analyzer in _analyzers:') - lines.append(' AnalyzerRegistry.register(analyzer)') - lines.append('') - - # Write file - output_path = Path(output_path) - output_path.parent.mkdir(parents=True, exist_ok=True) - - with open(output_path, 'w') as f: - f.write('\n'.join(lines)) - - print(f"Generated frozen plugins loader: {output_path}") - print(f" - {len(analyzers)} analyzers") - print(f" - {len(tokenizers)} tokenizers") - - return output_path - - -def get_plugin_hiddenimports(plugins): - """ - Generate hiddenimports list for PyInstaller from plugin metadata. 
- """ - imports = [] - for plugin in plugins: - # Add main package - imports.append(plugin['package']) - # Add specific module - imports.append(plugin['module']) - - # Add common submodules for analyzers - if 'analyzer' in plugin['package']: - base_pkg = plugin['package'] - # Try to add common submodules - for submodule in ['base', 'stats', 'web', 'report']: - imports.append(f"{base_pkg}.{submodule}") - - return imports - - -# Discover all plugins from installed packages -print("Discovering plugins...") -analyzers = discover_plugins('cibmangotree.analyzers') -tokenizers = discover_plugins('cibmangotree.tokenizers') - -print(f"Found {len(analyzers)} analyzer(s):") -for a in analyzers: - print(f" - {a['name']}: {a['value']}") - -print(f"Found {len(tokenizers)} tokenizer(s):") -for t in tokenizers: - print(f" - {t['name']}: {t['value']}") - -# Generate frozen plugins file -frozen_plugins_path = generate_frozen_plugins( - analyzers, - tokenizers, - 'packages/core/src/cibmangotree/_frozen_plugins.py' -) - -# Generate hiddenimports -plugin_hiddenimports = [] -plugin_hiddenimports.extend(get_plugin_hiddenimports(analyzers)) -plugin_hiddenimports.extend(get_plugin_hiddenimports(tokenizers)) - -print(f"\nGenerated {len(plugin_hiddenimports)} hidden imports") - -# ============================================================================ -# PYINSTALLER CONFIGURATION -# ============================================================================ - -a = Analysis( - ['cibmangotree.py'], - pathex=['packages/core/src'], - binaries=[], - datas=[ - # Version file - *( - [('./VERSION', '.')] - if os.path.exists('VERSION') else [] - ), - - # Metadata - *copy_metadata('readchar'), - - # Static assets - (os.path.join(site_packages_path, 'shiny/www'), 'shiny/www'), - (os.path.join(site_packages_path, 'shinywidgets/static'), 'shinywidgets/static'), - - # App assets - ('packages/core/src/cibmangotree/app/web_static', 'cibmangotree/app/web_static'), - 
('packages/core/src/cibmangotree/app/web_templates', 'cibmangotree/app/web_templates'), - ], - hiddenimports=[ - # Standard hidden imports - 'readchar', - 'numpy', - 'numpy.core.multiarray', - 'shiny', - 'shiny.ui', - 'shiny.server', - 'htmltools', - 'starlette', - 'uvicorn', - 'uvicorn.logging', - 'uvicorn.loops', - 'uvicorn.loops.auto', - 'uvicorn.protocols', - 'uvicorn.protocols.http', - 'uvicorn.protocols.http.auto', - 'uvicorn.protocols.websockets', - 'uvicorn.protocols.websockets.auto', - 'uvicorn.lifespan', - 'uvicorn.lifespan.on', - 'asyncio', - 'websockets', - 'websockets.legacy', - 'websockets.legacy.server', - 'polars', - 'plotly', - 'linkify_it', - 'markdown_it', - 'mdit_py_plugins', - 'mdurl', - 'uc_micro', - 'pythonjsonlogger', - 'pythonjsonlogger.jsonlogger', - - # Core package - 'cibmangotree', - 'cibmangotree.app', - 'cibmangotree.analyzer_interface', - 'cibmangotree.tui.components', - 'cibmangotree.tui.tools', - 'cibmangotree.services.storage', - 'cibmangotree.services.importing', - 'cibmangotree.services.preprocessing', - 'cibmangotree.plugin_system', - - # Frozen plugin loader (auto-generated) - 'cibmangotree._frozen_plugins', - - # Testing utilities (if bundled) - 'cibmangotree_testing', - - # DYNAMICALLY DISCOVERED PLUGINS - *plugin_hiddenimports, - ], - hookspath=[], - runtime_hooks=[], - excludes=[], -) - -pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher) - -if sys.platform == "darwin": - exe = EXE( - pyz, - a.scripts, - a.binaries, - a.zipfiles, - a.datas, - name='cibmangotree', - debug=False, - strip=True, - upx=True, - console=True, - entitlements_file="./mango.entitlements", - codesign_identity=os.getenv('APPLE_APP_CERT_ID'), - ) -else: - exe = EXE( - pyz, - a.scripts, - a.binaries, - a.zipfiles, - a.datas, - name='cibmangotree', - debug=False, - strip=False, - upx=True, - console=True, - ) -``` - -#### 2. 
Backward Compatibility Stub - -Keep root-level `cibmangotree.py` for PyInstaller entry point: - -```python -# cibmangotree.py -""" -Entry point stub for backward compatibility with PyInstaller. -""" - -from cibmangotree.__main__ import main - -if __name__ == "__main__": - main() -``` - ---- - -## Migration Steps - -### Phase 1: Setup Monorepo Structure - -**Tasks:** - -1. Create `packages/` directory at root -2. Create root `pyproject.toml` with workspace configuration: - - Define workspace members - - Add centralized dependency versions - - Configure tools (black, isort, pytest) - - Add dev dependencies - -3. Test workspace setup: - - ```bash - uv sync --dry-run - ``` - -**Success Criteria:** - -- `packages/` directory exists -- Root `pyproject.toml` is valid -- `uv` command available and functional - ---- - -### Phase 2: Extract Core Package - -**Tasks:** - -1. Create directory structure: - - ```bash - mkdir -p packages/core/src/cibmangotree - mkdir -p packages/core/tests - ``` - -2. Move and reorganize core modules: - - `app/` → `packages/core/src/cibmangotree/app/` - - `analyzer_interface/` → `packages/core/src/cibmangotree/analyzer_interface/` - - `components/` → `packages/core/src/cibmangotree/tui/components/` - - `terminal_tools/` → `packages/core/src/cibmangotree/tui/tools/` - - `context/` → `packages/core/src/cibmangotree/context/` - - `meta/` → `packages/core/src/cibmangotree/meta/` - - `storage/` → `packages/core/src/cibmangotree/services/storage/` - - `importing/` → `packages/core/src/cibmangotree/services/importing/` - - `preprocessing/` → `packages/core/src/cibmangotree/services/preprocessing/` - - `services/tokenizer/core/` → `packages/core/src/cibmangotree/services/tokenizer/core/` - -3. Create placeholder: - - ```bash - mkdir -p packages/core/src/cibmangotree/gui - touch packages/core/src/cibmangotree/gui/__init__.py - ``` - -4. 
Create plugin system: - - ```bash - mkdir -p packages/core/src/cibmangotree/plugin_system - ``` - - Create `discovery.py` with `AnalyzerRegistry` class (see Plugin Architecture section) - -5. Note: `_frozen_plugins.py` will be auto-generated by `pyinstaller.spec` during builds - - Do not create this file manually - - Add to `.gitignore` (see Phase 6) - -6. Create `packages/core/pyproject.toml` (see Configuration Strategy section) - -7. Update `__main__.py` to use plugin discovery - -8. Update internal imports within core package - -**Success Criteria:** - -- Core package structure complete -- `uv sync` installs core package -- Can import `cibmangotree.*` modules - ---- - -### Phase 3: Extract Plugin Packages - -**For each plugin (tokenizer, analyzers):** - -#### 3.1 Basic Tokenizer - -```bash -mkdir -p packages/tokenizers/basic/src/cibmangotree_tokenizer_basic -mkdir -p packages/tokenizers/basic/tests -``` - -Move: - -- `services/tokenizer/basic/` → `packages/tokenizers/basic/src/cibmangotree_tokenizer_basic/` -- `services/tokenizer/basic/test_*.py` → `packages/tokenizers/basic/tests/` - -Create `packages/tokenizers/basic/pyproject.toml` - -#### 3.2 Example Analyzer - -```bash -mkdir -p packages/analyzers/example/src/cibmangotree_analyzer_example/{base,report,web} -mkdir -p packages/analyzers/example/tests/test_data -``` - -Move and rename: - -- `analyzers/example/example_base/` → `packages/analyzers/example/src/cibmangotree_analyzer_example/base/` -- `analyzers/example/example_report/` → `packages/analyzers/example/src/cibmangotree_analyzer_example/report/` -- `analyzers/example/example_web/` → `packages/analyzers/example/src/cibmangotree_analyzer_example/web/` -- `analyzers/example/test_*.py` → `packages/analyzers/example/tests/` -- `analyzers/example/test_data/` → `packages/analyzers/example/tests/test_data/` - -Update internal imports: - -```python -# Change from: -from .example_base import example_base - -# To: -from .base import example_base -``` - -Create 
`packages/analyzers/example/pyproject.toml` with entry points - -#### 3.3 Hashtags Analyzer - -Similar process: - -```bash -mkdir -p packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/{base,web} -mkdir -p packages/analyzers/hashtags/tests/test_data -``` - -Move and rename subdirectories, update imports, create pyproject.toml - -#### 3.4 Ngrams Analyzer - -```bash -mkdir -p packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/{base,stats,web} -mkdir -p packages/analyzers/ngrams/tests/test_data -``` - -Move and rename subdirectories, update imports, create pyproject.toml - -#### 3.5 Temporal Analyzer - -```bash -mkdir -p packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/{base,web} -mkdir -p packages/analyzers/temporal/tests -``` - -Move and rename subdirectories, update imports, create pyproject.toml - -#### 3.6 Time Coordination Analyzer - -```bash -mkdir -p packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination -mkdir -p packages/analyzers/time_coordination/tests -``` - -Move files, update imports, create pyproject.toml - -**Success Criteria:** - -- All plugin packages created -- Entry points defined -- `uv sync` completes successfully -- Can import plugins from new paths - ---- - -### Phase 4: Extract Testing Package - -**Tasks:** - -```bash -mkdir -p packages/testing/src/cibmangotree_testing -mkdir -p packages/testing/tests -``` - -Move: - -- `testing/` → `packages/testing/src/cibmangotree_testing/` - -Create `packages/testing/pyproject.toml` - -**Success Criteria:** - -- Testing package created -- Can import from `cibmangotree_testing` - ---- - -### Phase 5: Update All Imports - -**Systematic Import Updates:** - -1. Create search/replace mapping document -2. 
Use automated tools where possible: - - ```bash - # Example: update app imports - find packages -name "*.py" -type f -exec sed -i.bak \ - 's/from app import/from cibmangotree.app import/g' {} + - - # Example: update component imports - find packages -name "*.py" -type f -exec sed -i.bak \ - 's/from components import/from cibmangotree.tui.components import/g' {} + - ``` - -3. Manual review for complex cases: - - Relative imports - - Dynamic imports - - String-based imports - -4. Update imports in each package: - - Core package - - Each plugin package - - Testing package - -**Success Criteria:** - -- No import errors when running `uv run cibmangotree --help` -- All tests can import required modules - ---- - -### Phase 6: Update PyInstaller Spec for Dynamic Plugin Discovery - -**Tasks:** - -1. Update `pyinstaller.spec` with dynamic plugin discovery functions (see PyInstaller Compatibility section): - - Add `discover_plugins()` function - - Add `generate_frozen_plugins()` function - - Add `get_plugin_hiddenimports()` function - - Add plugin discovery calls at build time - - Update `hiddenimports` to include `*plugin_hiddenimports` - -2. Create/keep root `cibmangotree.py` stub for backward compatibility - -3. Add `.gitignore` entry for auto-generated file: - - ```gitignore - # Auto-generated by pyinstaller.spec - packages/core/src/cibmangotree/_frozen_plugins.py - ``` - -**Success Criteria:** - -- PyInstaller spec auto-discovers plugins at build time -- Generates `_frozen_plugins.py` automatically -- Generates `hiddenimports` list automatically -- Build outputs show discovered plugins -- Root stub exists - -**Estimated Time:** 1 hour - ---- - -### Phase 7: Update CI/CD & Development Tooling - -**Tasks:** - -1. 
Update GitHub Actions workflows (`.github/workflows/*.yml`): - - ```yaml - - name: Install uv - run: curl -LsSf https://astral.sh/uv/install.sh | sh - - - name: Install dependencies - run: uv sync - - - name: Run tests - run: uv run pytest - - - name: Format check - run: | - uv run black --check packages/ - uv run isort --check packages/ - - - name: Build executable - run: uv run pyinstaller pyinstaller.spec - ``` - -2. Update `bootstrap.sh`: - - ```bash - #!/bin/bash - - # Install uv if not present - if ! command -v uv &> /dev/null; then - echo "Installing uv..." - curl -LsSf https://astral.sh/uv/install.sh | sh - fi - - # Sync all workspace packages - echo "Syncing workspace..." - uv sync - - echo "Bootstrap complete. Run 'uv run cibmangotree' to start." - ``` - -3. Update `.gitignore`: - - ```gitignore - # uv - .venv/ - uv.lock - - # Python - __pycache__/ - *.py[cod] - *$py.class - *.so - .Python - build/ - develop-eggs/ - dist/ - downloads/ - eggs/ - .eggs/ - lib/ - lib64/ - parts/ - sdist/ - var/ - wheels/ - *.egg-info/ - .installed.cfg - *.egg - - # Project specific - venv/ - __private__ - /analysis_outputs - /site - VERSION - *.DS_Store - .env* - ``` - -4. Update documentation: - - `README.md` - Installation and setup - - `CLAUDE.md` - Code navigation examples - - `GUIDE.md` - Architecture references - - `.ai-context/README.md` - Structure overview - - `.ai-context/architecture-overview.md` - Package structure - - Create `CONTRIBUTING.md` - Contributor guide - -**Success Criteria:** - -- CI/CD pipeline passes -- Bootstrap script works -- Documentation accurate and complete - ---- - -### Phase 8: Testing & Validation - -**Tasks:** - -1. **Unit Testing:** - - ```bash - # Test entire workspace - uv run pytest - - # Test specific packages - uv run pytest packages/core/tests - uv run pytest packages/analyzers/hashtags/tests - ``` - -2. **Integration Testing:** - - ```bash - # Run application - uv run cibmangotree --help - uv run cibmangotree --noop - ``` - -3. 
**PyInstaller Build Testing:** - - ```bash - # Build executable - uv run pyinstaller pyinstaller.spec - - # Test executable - dist/cibmangotree --help - dist/cibmangotree --noop - ``` - -4. **Manual Testing:** - - Launch application - - Create new project - - Import sample data - - Run each analyzer - - Export results - - Launch web presenters - -5. **Cross-Platform Testing:** - - Test on Windows (via CI or local) - - Test on macOS (via CI or local) - - Test on Linux (via CI or local) - -6. **Fix Issues:** - - Import errors - - Path resolution issues - - Plugin discovery problems - - PyInstaller build failures - -**Success Criteria:** - -- All tests pass -- Application runs successfully -- All analyzers work -- Web presenters launch -- PyInstaller builds work on all platforms - ---- - -### Phase 9: Cleanup & Documentation - -**Tasks:** - -1. Remove old directory structure: - - ```bash - # Remove old directories (MUST BE EMPTY OR ONLY CONTAIN UNUSED FILES) - trash app/ analyzer_interface/ components/ terminal_tools/ - trash analyzers/ storage/ importing/ preprocessing/ services/ - trash testing/ context/ meta/ - ``` - -2. Update all documentation references - -3. Create migration guide for contributors - -4. Update `.ai-context/` files - -5. 
Final review of all changes - -**Success Criteria:** - -- Old structure removed -- Documentation complete -- No broken references - ---- - -**Recommended Approach:** Work in feature branch, commit after each phase - ---- - -## Testing Strategy - -### Unit Tests - -```bash -# Run all tests -uv run pytest - -# Run tests for specific package -uv run pytest packages/core/tests -uv run pytest packages/analyzers/hashtags/tests - -# Run specific test file -uv run pytest packages/analyzers/hashtags/tests/test_hashtags_base.py - -# Run with coverage -uv run pytest --cov=cibmangotree --cov-report=html -``` - -### Integration Tests - -```bash -# Test CLI entry point -uv run cibmangotree --help -uv run cibmangotree --noop - -# Test in development mode -uv run python -m cibmangotree - -# Test plugin discovery -uv run python -c "from cibmangotree.plugin_system import AnalyzerRegistry; print(len(AnalyzerRegistry.discover()))" -``` - -### Build Tests - -```bash -# Test PyInstaller build -uv run pyinstaller pyinstaller.spec - -# Test frozen executable -dist/cibmangotree --help -dist/cibmangotree --noop - -# Test on all platforms (via CI) -# - Windows 2022 -# - macOS 13 (x86) -# - macOS 15 (arm64) -``` - -### Manual Testing Checklist - -- [ ] Launch application -- [ ] Create new project -- [ ] Import CSV data -- [ ] Import Excel data -- [ ] Run hashtags analyzer -- [ ] Run ngrams analyzer -- [ ] Run temporal analyzer -- [ ] Run time coordination analyzer -- [ ] View analysis results -- [ ] Export results to XLSX -- [ ] Export results to CSV -- [ ] Launch hashtags web presenter -- [ ] Launch ngrams web presenter -- [ ] Launch temporal web presenter -- [ ] All web presenters display correctly - ---- - -## Risk Mitigation - -### Import Rewrites - -**Risk:** Breaking imports during migration - -**Mitigation:** - -- Work in feature branch -- Commit after each package migration -- Use automated search/replace tools -- Manual review of complex imports -- Test after each phase -- Keep 
import mapping document - -### PyInstaller Compatibility - -**Risk:** Frozen builds not working - -**Mitigation:** - -- Hybrid plugin discovery system -- Explicit imports in `_frozen_plugins.py` -- Comprehensive `hiddenimports` list -- Test builds frequently during migration -- Keep backward-compatible entry point - -### Dependency Conflicts - -**Risk:** Version conflicts between packages - -**Mitigation:** - -- Centralized version constraints -- Workspace-level dependency resolution -- Test `uv sync` frequently -- Document any version-specific requirements - -### Testing Gaps - -**Risk:** Missing test coverage during migration - -**Mitigation:** - -- Run full test suite after each phase -- Test at package and workspace level -- Manual testing of critical workflows -- Compare test coverage before/after - -### CI/CD Breaking - -**Risk:** GitHub Actions workflows fail - -**Mitigation:** - -- Update CI/CD in same commit as migration -- Test workflows in feature branch -- Have rollback plan ready -- Document new CI/CD setup - -### Contributor Confusion - -**Risk:** Contributors struggle with new structure - -**Mitigation:** - -- Update documentation immediately -- Create migration guide -- Update AI context files -- Clear package boundaries and naming -- Include example analyzer for reference - -### Data Compatibility - -**Risk:** Breaking existing user data - -**Mitigation:** - -- Keep storage format unchanged -- Test with existing projects -- Maintain backward compatibility -- Document any breaking changes - ---- - -## Success Criteria - -### Technical Metrics - -- ✅ All packages have valid `pyproject.toml` -- ✅ `uv sync` completes without errors -- ✅ Full test suite passes (maintain 100% coverage) -- ✅ `uv run cibmangotree` launches successfully -- ✅ All analyzers auto-discovered (dev mode) -- ✅ All analyzers bundled correctly (frozen mode) -- ✅ CI/CD pipeline passes on all platforms -- ✅ PyInstaller builds work on Windows/macOS/Linux - -### Code Quality Metrics - 
-- ✅ Black and isort pass on all code -- ✅ No circular dependencies -- ✅ Clear import paths -- ✅ Each package has minimal dependencies -- ✅ Plugin architecture works in both modes - -### Functional Metrics - -- ✅ Can import CSV/Excel data -- ✅ Can run all existing analyzers -- ✅ Can export results in all formats -- ✅ Web presenters launch correctly -- ✅ All existing features work as before -- ✅ No data loss or corruption - -### Developer Experience Metrics - -- ✅ Bootstrap time < 2 minutes -- ✅ Clear package boundaries -- ✅ Simple pyproject.toml files (< 30 lines) -- ✅ Documentation updated and accurate -- ✅ Easy to understand structure for new contributors - ---- - -## Development Workflow (Post-Migration) - -### Initial Setup - -```bash -# Clone repository -git clone https://github.com/civictech/cibmangotree.git -cd cibmangotree - -# Run bootstrap script -./bootstrap.sh - -# Or manually: -curl -LsSf https://astral.sh/uv/install.sh | sh -uv sync -``` - -### Daily Development - -```bash -# Sync workspace (after pulling changes) -uv sync - -# Run application -uv run cibmangotree - -# Run tests -uv run pytest - -# Run specific test -uv run pytest packages/analyzers/hashtags/tests/test_hashtags_base.py - -# Format code -uv run black packages/ -uv run isort packages/ - -# Build executable -uv run pyinstaller pyinstaller.spec -``` - -### Adding New Analyzer - -1. Create package structure: - - ```bash - mkdir -p packages/analyzers/my_analyzer/src/cibmangotree_analyzer_my_analyzer/{base,web} - mkdir -p packages/analyzers/my_analyzer/tests - ``` - -2. Create `packages/analyzers/my_analyzer/pyproject.toml`: - - ```toml - [project] - name = "cibmangotree-analyzer-my-analyzer" - version = "0.1.0" - description = "My analyzer" - dependencies = [ - "cibmangotree", - "cibmangotree-testing", - "polars", - ] - - [project.entry-points."cibmangotree.analyzers"] - my_analyzer = "cibmangotree_analyzer_my_analyzer.base:my_analyzer" - ``` - -3. 
Add to root workspace: - - ```toml - # Edit pyproject.toml - [tool.uv.workspace] - members = [ - # ... existing ... - "packages/analyzers/my_analyzer", - ] - ``` - -4. Sync workspace: - - ```bash - uv sync - ``` - -5. Implement analyzer following existing patterns - -6. Add to frozen plugins (for releases): - - ```python - # Edit cibmangotree/_frozen_plugins.py - from cibmangotree_analyzer_my_analyzer.base import my_analyzer - AnalyzerRegistry.register(my_analyzer) - ``` - -7. Add to PyInstaller spec: - - ```python - # Edit pyinstaller.spec hiddenimports - 'cibmangotree_analyzer_my_analyzer', - 'cibmangotree_analyzer_my_analyzer.base', - ``` - ---- - -## Appendix: Quick Reference - -### Package Structure - -| Package | Path | Purpose | -|---------|------|---------| -| core | `packages/core/` | Framework, app, UI, services | -| tokenizer-basic | `packages/tokenizers/basic/` | Basic tokenizer implementation | -| analyzer-example | `packages/analyzers/example/` | Example for contributors | -| analyzer-hashtags | `packages/analyzers/hashtags/` | Hashtag analysis | -| analyzer-ngrams | `packages/analyzers/ngrams/` | N-gram analysis | -| analyzer-temporal | `packages/analyzers/temporal/` | Temporal patterns | -| analyzer-time-coordination | `packages/analyzers/time_coordination/` | Coordination detection | -| testing | `packages/testing/` | Test utilities | - -### Import Cheat Sheet - -```python -# Core -from cibmangotree.app import App -from cibmangotree.app.logger import get_logger - -# Analyzer framework -from cibmangotree.analyzer_interface import AnalyzerInterface - -# UI -from cibmangotree.tui.components.main_menu import main_menu -from cibmangotree.tui.tools import ProgressReporter - -# Services -from cibmangotree.services.storage import Storage -from cibmangotree.services.importing import ImporterSession - -# Plugins -from cibmangotree_tokenizer_basic import BasicTokenizer -from cibmangotree_analyzer_hashtags.base import hashtags - -# Testing -from 
cibmangotree_testing import test_primary_analyzer -``` - -### Common Commands - -```bash -# Setup -uv sync - -# Run -uv run cibmangotree - -# Test -uv run pytest -uv run pytest packages/core/tests -uv run pytest -k test_hashtags - -# Format -uv run black packages/ -uv run isort packages/ - -# Build -uv run pyinstaller pyinstaller.spec - -# Build specific package -uv build -p packages/core -``` - ---- - -**Document Version**: 2.0 -**Last Updated**: 2025-10-09 -**Status**: Ready for Implementation From 2397ee37460f5eac5aea33265f589e0aa79159f9 Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Thu, 9 Oct 2025 22:34:01 -0400 Subject: [PATCH 12/24] docs: update all documentation for UV monorepo structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Comprehensive update of all documentation to reflect the repository reorganization to a UV workspace monorepo structure. ## Path Updates - Updated 100+ file path references from flat structure to monorepo - `app/` → `packages/core/src/cibmangotree/app/` - `analyzers/` → `packages/analyzers/*/src/cibmangotree_analyzer_*/` - `services/` → `packages/core/src/cibmangotree/services/` - `importing/` → `packages/core/src/cibmangotree/services/importing/` ## Import Statement Fixes - Updated all code examples to use correct monorepo imports - `from app.logger` → `from cibmangotree.app.logger` - `from analyzer_interface` → `from cibmangotree.analyzer_interface` - `from testing` → `from cibmangotree_testing` ## Command Updates - Updated all command examples to use UV workflow - `python -m mangotango` → `uv run cibmangotree` - Added `uv run` prefix to pytest, black, isort, pyinstaller - `pip install` → `uv sync` or `uv add` ## Structural Updates - Added monorepo context notes to key documents - Updated project structure diagrams to show packages/ organization - Rewrote setup guides to emphasize UV-first workflow - Added comprehensive UV workspace 
documentation ## Cleanup - Removed all Serena MCP references (obsolete tooling) - Removed references to deleted .serena/ directory - Updated IDE integration paths (.venv instead of venv) ## Files Updated (11 total) Critical: - .ai-context/symbol-reference.md - .ai-context/setup-guide.md - docs/guides/get-started/installation.md High Priority: - .ai-context/README.md - .ai-context/architecture-overview.md - docs/guides/contributing/analyzers.md - docs/guides/contributing/logging.md - docs/guides/design-philosophy/core-domain.md - docs/guides/design-philosophy/edge-domain.md Minor: - README.md - CONTRIBUTING.md All documentation now accurately reflects the UV workspace monorepo structure with packages organized under packages/ directory. --- .ai-context/README.md | 15 +- .ai-context/architecture-overview.md | 10 +- .ai-context/setup-guide.md | 379 ++++++++++++++----- .ai-context/symbol-reference.md | 74 ++-- CONTRIBUTING.md | 8 +- README.md | 7 +- docs/guides/contributing/analyzers.md | 106 +++--- docs/guides/contributing/logging.md | 36 +- docs/guides/design-philosophy/core-domain.md | 18 +- docs/guides/design-philosophy/edge-domain.md | 34 +- docs/guides/get-started/installation.md | 194 ++++++---- 11 files changed, 575 insertions(+), 306 deletions(-) diff --git a/.ai-context/README.md b/.ai-context/README.md index f5cb5edc..803c24b8 100644 --- a/.ai-context/README.md +++ b/.ai-context/README.md @@ -29,7 +29,7 @@ consistent UX while allowing easy contribution of new analyzers. 
UV workspace monorepo with 8 packages: -``` +```bash packages/ ├── core/ # cibmangotree - Main application ├── importing/ # cibmangotree-importing - Data I/O @@ -96,7 +96,7 @@ Dependency injection through context objects: ### Code Organization - Domain-driven module structure -- Interface-first analyzer design +- Interface-first analyzer design - Context-based dependency injection - Test co-location with implementation @@ -112,21 +112,14 @@ Dependency injection through context objects: ### For Development 1. **Setup**: See @.ai-context/setup-guide.md -2. **Architecture**: See @.ai-context/architecture-overview.md +2. **Architecture**: See @.ai-context/architecture-overview.md 3. **Symbol Reference**: See @.ai-context/symbol-reference.md 4. **Development Guide**: See @docs/dev-guide.md ### For AI Assistants -- **Claude Code users**: See @CLAUDE.md (includes Serena integration) +- **Claude Code users**: See @CLAUDE.md - **Cursor users**: See @.cursorrules -- **Deep semantic analysis**: Explore @.serena/memories/ - -### Quick References - -- **Commands**: @.serena/memories/suggested_commands.md -- **Style Guide**: @.serena/memories/code_style_conventions.md -- **Task Checklist**: @.serena/memories/task_completion_checklist.md ## External Dependencies diff --git a/.ai-context/architecture-overview.md b/.ai-context/architecture-overview.md index bb1e5920..461c2ffd 100644 --- a/.ai-context/architecture-overview.md +++ b/.ai-context/architecture-overview.md @@ -1,5 +1,7 @@ # Architecture Overview +> **Note:** This project uses a UV workspace monorepo structure. All packages are organized under `packages/`, with the core application in `packages/core/src/cibmangotree/` and supporting packages (analyzers, services, importing, testing) in their respective `packages/` subdirectories. 
+ ## High-Level Component Diagram ```mermaid @@ -36,7 +38,7 @@ flowchart TD ### Application Layer (`app/`) -Central orchestration and workspace management +Central orchestration and workspace management (located in `packages/core/src/cibmangotree/app/`) Key Classes: @@ -50,7 +52,7 @@ Key Classes: ### View Layer (`components/`) -Terminal UI components using inquirer +Terminal UI components using inquirer (located in `packages/core/src/cibmangotree/tui/components/`) Key Components: @@ -62,7 +64,7 @@ Key Components: ### Model Layer (`storage/`) -Data persistence and state management +Data persistence and state management (located in `packages/core/src/cibmangotree/storage/`) Key Classes: @@ -79,7 +81,7 @@ Reusable services that support analyzers and data processing Key Services: -- **Tokenizer Service** (`services/tokenizer/`) - Unicode-aware scriptio continua tokenization +- **Tokenizer Service** - Unicode-aware scriptio continua tokenization (located in `packages/core/src/cibmangotree/services/tokenizer/`) - `AbstractTokenizer` - Base interface for tokenizer implementations - `TokenizerConfig` - Configuration for tokenization behavior - `BasicTokenizer` - Core implementation with character-level and word-level tokenization diff --git a/.ai-context/setup-guide.md b/.ai-context/setup-guide.md index c6d4715e..f8ae1858 100644 --- a/.ai-context/setup-guide.md +++ b/.ai-context/setup-guide.md @@ -7,6 +7,7 @@ - **Python 3.12** - Required for all features to work correctly - **Git** - For version control and contributing - **Terminal/Command Line** - Application runs in terminal interface +- **UV** - Python package manager (installed automatically by bootstrap scripts) ### System Requirements @@ -23,19 +24,9 @@ git clone https://github.com/CIB-Mango-Tree/mango-tango-cli.git cd mango-tango-cli ``` -### 2. Create Virtual Environment +### 2. 
Bootstrap Development Environment -```bash -python -m venv venv -``` - -**Verify Python version**: - -```bash -python --version # Should show Python 3.12.x -``` - -### 3. Bootstrap Development Environment +The bootstrap script installs UV (if not present) and sets up the entire development environment automatically. **macOS/Linux**: @@ -51,39 +42,100 @@ python --version # Should show Python 3.12.x The bootstrap script will: -- Activate the virtual environment -- Install all dependencies from `requirements-dev.txt` -- Set up pre-commit hooks for code formatting +- Install UV package manager (if not already installed) +- Create and manage `.venv/` virtual environment automatically +- Install all workspace dependencies via `uv sync --all-extras` +- Verify installation by importing the application -### 4. Verify Installation +### 3. Verify Installation ```bash -python -m mangotango --noop +uv run cibmangotree --noop ``` Should output: "No-op flag detected. Exiting successfully." ## Development Environment Setup +### UV Workflow + +This project uses **UV** as the primary package manager. UV automatically manages the virtual environment and dependencies. + +**Key UV Commands**: + +```bash +# Install/sync all dependencies +uv sync + +# Install with extras (docs, dev, etc.) +uv sync --all-extras + +# Run the application +uv run cibmangotree + +# Run tests +uv run pytest + +# Format code +uv run black . +uv run isort . + +# Build executable +uv run pyinstaller pyinstaller.spec +``` + +**Virtual Environment Management**: + +- UV creates and manages `.venv/` automatically +- No need to manually activate the virtual environment when using `uv run` +- For manual activation (if needed): + - macOS/Linux: `source .venv/bin/activate` + - Windows: `.venv\Scripts\activate` + ### Dependencies Overview -**Production Dependencies** (`requirements.txt`): +All dependencies are defined in `pyproject.toml` files within the monorepo workspace. 
+ +**Workspace Structure** (`pyproject.toml`): + +```toml +[tool.uv.workspace] +members = [ + "packages/core", # cibmangotree - main application + "packages/testing", # cibmangotree-testing - testing utilities + "packages/tokenizers/basic", # cibmangotree-tokenizer-basic + "packages/analyzers/example", # cibmangotree-analyzers-example + "packages/analyzers/hashtags", + "packages/analyzers/ngrams", + "packages/analyzers/temporal", + "packages/analyzers/time_coordination", +] +``` -- `polars==1.9.0` - Primary data processing -- `pydantic==2.9.1` - Data validation and models -- `inquirer==3.4.0` - Interactive terminal prompts -- `tinydb==4.8.0` - Lightweight JSON database -- `dash==2.18.1` - Web dashboard framework -- `shiny==1.4.0` - Modern web UI framework -- `plotly==5.24.1` - Data visualization -- `XlsxWriter==3.2.0` - Excel export functionality +**Development Dependencies**: + +```toml +[dependency-groups] +dev = [ + "black>=24.10.0", + "isort>=5.13.2", + "pytest>=8.3.4", + "pytest-benchmark>=5.1.0", + "pyinstaller>=6.14.1", + "pyarrow-stubs>=17.13", +] +``` -**Development Dependencies** (`requirements-dev.txt`): +**Production Dependencies**: Defined in individual package `pyproject.toml` files: -- `black==24.10.0` - Code formatter -- `isort==5.13.2` - Import organizer -- `pytest==8.3.4` - Testing framework -- `pyinstaller==6.14.1` - Executable building +- `polars` - Primary data processing +- `pydantic` - Data validation and models +- `inquirer` - Interactive terminal prompts +- `tinydb` - Lightweight JSON database +- `dash` - Web dashboard framework +- `shiny` - Modern web UI framework +- `plotly` - Data visualization +- `XlsxWriter` - Excel export functionality ### Code Formatting Setup @@ -91,34 +143,44 @@ The project uses automatic code formatting: - **Black**: Code style and formatting - **isort**: Import organization -- **Pre-commit hooks**: Automatic formatting on commit +- **UV**: Runs formatters via `uv run` **Manual formatting**: ```bash -isort . 
-black . +uv run isort . +uv run black . ``` -### Project Structure Setup +### Project Structure After installation, your project structure should be: ```bash mango-tango-cli/ -├── venv/ # Virtual environment -├── .serena/ # Serena semantic analysis -│ └── memories/ # Project knowledge base +├── .venv/ # UV-managed virtual environment +├── packages/ # Monorepo workspace packages +│ ├── core/ # cibmangotree - main application +│ │ └── src/cibmangotree/ +│ │ ├── app/ # Application logic & terminal UI +│ │ ├── storage/ # Data persistence layer +│ │ ├── components/ # Terminal UI components +│ │ └── analyzers.py # Analyzer discovery & registry +│ ├── testing/ # cibmangotree-testing - testing utilities +│ ├── tokenizers/ # Tokenizer implementations +│ │ └── basic/ # cibmangotree-tokenizer-basic +│ └── analyzers/ # Analysis modules (plugins) +│ ├── example/ # cibmangotree-analyzers-example +│ ├── hashtags/ # cibmangotree-analyzers-hashtags +│ ├── ngrams/ # cibmangotree-analyzers-ngrams +│ ├── temporal/ # cibmangotree-analyzers-temporal +│ └── time_coordination/ # cibmangotree-analyzers-time-coordination ├── docs/ # Documentation -│ ├── ai-context/ # AI assistant context -│ └── dev-guide.md # Development guide -├── app/ # Application layer -├── analyzers/ # Analysis modules -├── components/ # Terminal UI components -├── storage/ # Data persistence -├── importing/ # Data import modules -├── requirements*.txt # Dependencies -└── mangotango.py # Main entry point +├── .ai-context/ # AI assistant context +├── pyproject.toml # Workspace configuration & tool settings +├── uv.lock # UV lock file (dependency resolution) +├── bootstrap.sh # macOS/Linux setup script +└── bootstrap.ps1 # Windows setup script ``` ## Database and Storage Setup @@ -144,19 +206,30 @@ No manual database setup required. 
### Basic Usage ```bash -# Activate virtual environment (if not already active) -source venv/bin/activate # macOS/Linux -venv\Scripts\activate # Windows - -# Start the application -python -m mangotango +# Start the application (UV manages venv automatically) +uv run cibmangotree ``` ### Development Mode ```bash # Run with debugging/development flags -python -m mangotango --noop # Test mode, exits immediately +uv run cibmangotree --noop # Test mode, exits immediately +``` + +### Manual Virtual Environment Activation (Optional) + +If you prefer to activate the virtual environment manually: + +```bash +# Activate virtual environment +source .venv/bin/activate # macOS/Linux +.venv\Scripts\activate # Windows + +# Then run without 'uv run' prefix +cibmangotree +pytest +black . ``` ## Testing Setup @@ -165,22 +238,28 @@ python -m mangotango --noop # Test mode, exits immediately ```bash # Run all tests -pytest +uv run pytest # Run specific test file -pytest analyzers/hashtags/test_hashtags_analyzer.py +uv run pytest packages/analyzers/hashtags/tests/test_hashtags_analyzer.py # Run with verbose output -pytest -v +uv run pytest -v # Run specific test function -pytest analyzers/hashtags/test_hashtags_analyzer.py::test_gini +uv run pytest packages/analyzers/hashtags/tests/test_hashtags_analyzer.py::test_gini + +# Stop on first failure +uv run pytest -x + +# Run tests matching a pattern +uv run pytest -k "hashtag" ``` ### Test Data -- Test data is co-located with analyzers in `test_data/` directories -- Each analyzer should include its own test files +- Test data is co-located with analyzers in `tests/` directories +- Each analyzer includes its own test files and test data - Tests use sample data to verify functionality ## Build Setup (Optional) @@ -189,14 +268,14 @@ pytest analyzers/hashtags/test_hashtags_analyzer.py::test_gini ```bash # Build standalone executable -pyinstaller pyinstaller.spec +uv run pyinstaller pyinstaller.spec # Output will be in dist/ directory ``` ### 
Build Requirements -- Included in `requirements-dev.txt` +- Included in development dependencies - Used primarily for release distribution - Not required for development @@ -206,32 +285,23 @@ pyinstaller pyinstaller.spec **VS Code** (`.vscode/` configuration): -- Python interpreter: `./venv/bin/python` +- Python interpreter: `./.venv/bin/python` - Black formatter integration - isort integration - pytest test discovery **PyCharm**: -- Interpreter: Project virtual environment +- Interpreter: Project virtual environment (`.venv/`) - Code style: Black - Import optimizer: isort ### Git Configuration -**Pre-commit Hooks**: - -```bash -# Hooks are set up automatically by bootstrap script -# Manual setup if needed: -pip install pre-commit -pre-commit install -``` - **Git Flow**: -- Branch from `develop` (not `main`) -- Feature branches: `feature/name` +- Branch from `main` for new features +- Feature branches: `feature/name` or `username/issue-description` - Bug fixes: `bugfix/name` ## Troubleshooting @@ -244,37 +314,54 @@ pre-commit install # Check Python version python --version -# If not 3.12, install Python 3.12 and recreate venv -python3.12 -m venv venv +# If not 3.12, install Python 3.12 and re-run bootstrap +# UV will detect the correct Python version +./bootstrap.sh # macOS/Linux +./bootstrap.ps1 # Windows +``` + +**UV Not Found**: + +```bash +# Install UV manually (if bootstrap script fails) +# macOS/Linux: +curl -LsSf https://astral.sh/uv/install.sh | sh + +# Windows (PowerShell): +powershell -c "irm https://astral.sh/uv/install.ps1 | iex" + +# Then run: +uv sync --all-extras ``` **Import Errors**: ```bash -# Ensure virtual environment is activated -source venv/bin/activate # macOS/Linux -venv\Scripts\activate # Windows +# Re-sync dependencies +uv sync --all-extras -# Reinstall dependencies -pip install -r requirements-dev.txt +# If issues persist, remove .venv and re-bootstrap +rm -rf .venv +./bootstrap.sh # macOS/Linux +./bootstrap.ps1 # Windows ``` 
**Formatting Errors in CI**: ```bash # Run formatters locally before committing -isort . -black . +uv run isort . +uv run black . ``` **Test Failures**: ```bash # Ensure test data is present -ls analyzers/*/test_data/ +ls packages/analyzers/*/tests/test_data/ # Check if specific analyzer tests pass -pytest analyzers/hashtags/ -v +uv run pytest packages/analyzers/hashtags/ -v ``` ### Environment Variables @@ -294,5 +381,121 @@ pytest analyzers/hashtags/ -v **Development Performance**: -- Use `pytest -x` to stop on first failure -- Use `pytest -k pattern` to run specific test patterns +- Use `uv run pytest -x` to stop on first failure +- Use `uv run pytest -k pattern` to run specific test patterns +- Use `uv run pytest --lf` to re-run last failed tests + +## UV Workspace Deep Dive + +### Understanding UV Workspaces + +UV manages this project as a **monorepo workspace** with multiple packages: + +- **Workspace root**: `pyproject.toml` defines workspace members +- **Package members**: Each package has its own `pyproject.toml` +- **Shared dependencies**: Defined at workspace root +- **Lock file**: `uv.lock` ensures reproducible builds + +### Adding New Packages + +To add a new analyzer or package to the workspace: + +1. Create package directory: `packages/analyzers/my-analyzer/` +2. Add `pyproject.toml` with package metadata +3. Add to workspace members in root `pyproject.toml`: + + ```toml + [tool.uv.workspace] + members = [ + # ... existing members + "packages/analyzers/my-analyzer", + ] + ``` + +4. 
Run `uv sync` to update workspace + +### Dependency Management + +**Add production dependency to a package**: + +```bash +# Navigate to package directory +cd packages/analyzers/hashtags/ + +# Add dependency +uv add polars +``` + +**Add development dependency to workspace**: + +Edit root `pyproject.toml`: + +```toml +[dependency-groups] +dev = [ + "black>=24.10.0", + "your-new-dev-tool>=1.0.0", +] +``` + +Then run: `uv sync` + +### UV Lock File + +- `uv.lock` contains exact versions of all dependencies +- Committed to version control for reproducibility +- Updated automatically when dependencies change +- Ensures consistent environments across all developers + +## Quick Reference + +### Essential Commands + +```bash +# Setup +./bootstrap.sh # Initial setup (macOS/Linux) +./bootstrap.ps1 # Initial setup (Windows) + +# Development +uv run cibmangotree # Run application +uv run pytest # Run tests +uv run black . # Format code +uv run isort . # Organize imports + +# Dependency management +uv sync # Sync dependencies +uv sync --all-extras # Sync with all extras (dev, docs) +uv add package-name # Add dependency to package + +# Building +uv run pyinstaller pyinstaller.spec # Build executable +``` + +### Directory Navigation + +```bash +# Core application +packages/core/src/cibmangotree/ + +# Analyzers +packages/analyzers/hashtags/ +packages/analyzers/ngrams/ +packages/analyzers/temporal/ + +# Testing utilities +packages/testing/ + +# Tokenizers +packages/tokenizers/basic/ + +# Documentation +docs/ +.ai-context/ +``` + +### Getting Help + +- **Development Guide**: `docs/dev-guide.md` +- **AI Context**: `.ai-context/README.md` +- **Architecture**: `.ai-context/architecture-overview.md` +- **UV Documentation**: diff --git a/.ai-context/symbol-reference.md b/.ai-context/symbol-reference.md index 64525698..4ad419d3 100644 --- a/.ai-context/symbol-reference.md +++ b/.ai-context/symbol-reference.md @@ -1,12 +1,13 @@ # Symbol Reference Guide > **Note**: This reference is generated 
from semantic code analysis and reflects the actual codebase structure. Update as the codebase evolves. +> **Monorepo Structure**: This is a UV workspace monorepo with packages organized in the `packages/` directory. Core application is `cibmangotree`, with analyzers and services as separate packages. ## Core Domain Objects ### Application Layer (`app/`) -#### `App` class - `app/app.py:10` +#### `App` class - `packages/core/src/cibmangotree/app/app.py:10` Main application controller and workspace orchestrator @@ -15,14 +16,14 @@ Main application controller and workspace orchestrator - `create_project(name, input_file) -> ProjectModel` - Initialize new project - `file_selector_state() -> AppFileSelectorStateManager` - File picker state -#### `AppContext` class - `app/app_context.py` +#### `AppContext` class - `packages/core/src/cibmangotree/app/app_context.py` Application-wide dependency injection container - Provides storage, analyzer suite, and core services - Used throughout the application for accessing shared resources -#### `ProjectContext` class - `app/project_context.py` +#### `ProjectContext` class - `packages/core/src/cibmangotree/app/project_context.py` Project-specific operations and column semantic mapping @@ -30,16 +31,16 @@ Project-specific operations and column semantic mapping - Maps user data columns to analyzer requirements - `UserInputColumn` - Column metadata with semantic types -#### `AnalysisContext` class - `app/analysis_context.py` +#### `AnalysisContext` class - `packages/core/src/cibmangotree/app/analysis_context.py` Analysis execution environment - `AnalysisRunProgressEvent` - Progress tracking for long-running analyses - Provides file paths, preprocessing functions, and progress callbacks -### Storage Layer (`storage/`) +### Storage Layer (`services/storage/`) -#### `Storage` class - `storage/__init__.py:60` +#### `Storage` class - `packages/core/src/cibmangotree/services/storage/__init__.py:60` Main data persistence and workspace 
management @@ -78,9 +79,9 @@ Export Operations: - `FileSelectionState` - File picker UI state - `TableStats` - Data statistics and preview information -### View Layer (`components/`) +### View Layer (`tui/components/`) -#### `ViewContext` class - `components/context.py` +#### `ViewContext` class - `packages/core/src/cibmangotree/tui/components/context.py` UI state management and terminal context @@ -103,9 +104,9 @@ UI state management and terminal context ## Service Layer -### Data Import (`importing/`) +### Data Import (`services/importing/`) -#### `Importer` base class - `importing/importer.py` +#### `Importer` base class - `packages/core/src/cibmangotree/services/importing/importer.py` Base interface for data importers @@ -114,40 +115,40 @@ Base interface for data importers #### Concrete Importers -- `CSVImporter` - `importing/csv.py` - CSV file import with encoding detection -- `ExcelImporter` - `importing/excel.py` - Excel file import with sheet selection +- `CSVImporter` - `packages/core/src/cibmangotree/services/importing/csv.py` - CSV file import with encoding detection +- `ExcelImporter` - `packages/core/src/cibmangotree/services/importing/excel.py` - Excel file import with sheet selection -### Analyzer System (`analyzers/`) +### Analyzer System (`packages/analyzers/`) #### Built-in Analyzers **Primary Analyzers** (core data processing): -- `hashtags` - `analyzers/hashtags/main.py:main()` - Hashtag extraction and analysis -- `ngrams` - `analyzers/ngrams/main.py:main()` - N-gram generation and tokenization -- `temporal` - `analyzers/temporal/main.py:main()` - Time-based aggregation -- `time_coordination` - `analyzers/time_coordination/main.py:main()` - User coordination analysis +- `hashtags` - `packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/base/main.py:main()` - Hashtag extraction and analysis +- `ngrams` - `packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/main.py:main()` - N-gram generation and tokenization +- `temporal` - 
`packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/base/main.py:main()` - Time-based aggregation +- `time_coordination` - `packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination/base/main.py:main()` - User coordination analysis **Secondary Analyzers** (result transformation): -- `ngram_stats` - `analyzers/ngram_stats/main.py:main()` - N-gram statistics calculation +- `ngram_stats` - `packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/main.py:main()` - N-gram statistics calculation - `hashtags_web/analysis.py:secondary_analyzer()` - Hashtag summary statistics **Web Presenters** (interactive dashboards): -- `hashtags_web` - `analyzers/hashtags_web/factory.py:factory()` - Hashtag dashboard -- `ngram_web` - `analyzers/ngram_web/factory.py:factory()` - N-gram exploration dashboard -- `temporal_barplot` - `analyzers/temporal_barplot/factory.py:factory()` - Temporal visualization +- `hashtags_web` - `packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/web/factory.py:factory()` - Hashtag dashboard +- `ngram_web` - `packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/factory.py:factory()` - N-gram exploration dashboard +- `temporal_barplot` - `packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/factory.py:factory()` - Temporal visualization #### Analyzer Registration -- `analyzers.suite` - `analyzers/__init__.py` - Central registry of all analyzers +- `analyzers.suite` - `packages/core/src/cibmangotree/plugin_system/__init__.py` - Central registry of all analyzers ### Tokenizer Service (`services/tokenizer/`) Unicode-aware text tokenization with scriptio continua (character-level) and space-separated script support, plus social media entity preservation. 
-#### Core Interface - `services/tokenizer/core/base.py` +#### Core Interface - `packages/core/src/cibmangotree/services/tokenizer/core/base.py` **`AbstractTokenizer` class** @@ -159,7 +160,7 @@ Base interface for all tokenizer implementations: - `_preprocess_text(text: str) -> str` - Apply preprocessing (case, normalization) - `_postprocess_tokens(tokens: list[str]) -> list[str]` - Filter and clean tokens -#### Configuration Types - `services/tokenizer/core/types.py` +#### Configuration Types - `packages/core/src/cibmangotree/services/tokenizer/core/types.py` **`TokenizerConfig` dataclass** @@ -177,7 +178,7 @@ Comprehensive tokenization configuration: - `TokenType` - Token classifications (WORD, HASHTAG, MENTION, URL, EMOJI, etc.) - `CaseHandling` - Case transformation options (PRESERVE, LOWERCASE, UPPERCASE, NORMALIZE) -#### Basic Implementation - `services/tokenizer/basic/tokenizer.py` +#### Basic Implementation - `packages/tokenizers/basic/src/cibmangotree_tokenizer_basic/tokenizer.py` **`BasicTokenizer` class** @@ -190,7 +191,7 @@ Core tokenizer implementation with Unicode awareness: - Configurable preprocessing and postprocessing - Single-pass regex-based token extraction with order preservation -#### Pattern Matching - `services/tokenizer/basic/patterns.py` +#### Pattern Matching - `packages/tokenizers/basic/src/cibmangotree_tokenizer_basic/patterns.py` **Pattern Functions:** @@ -204,7 +205,7 @@ Core tokenizer implementation with Unicode awareness: - `LINGUISTIC_PATTERNS` - Language-specific tokenization patterns - `FORMATTING_PATTERNS` - Text formatting and structure patterns -#### Service API - `services/tokenizer/__init__.py` +#### Service API - `packages/core/src/cibmangotree/services/tokenizer/__init__.py` **Convenience Functions:** @@ -221,7 +222,7 @@ Core tokenizer implementation with Unicode awareness: ### Main Application -- `mangotango.py` - Application bootstrap and initialization +- `packages/core/src/cibmangotree/__main__.py` - Application 
bootstrap and initialization - `freeze_support()` - Multiprocessing setup - `enable_windows_ansi_support()` - Terminal color support - Storage initialization with app metadata @@ -229,8 +230,9 @@ Core tokenizer implementation with Unicode awareness: ### Module Entry Point -- `python -m mangotango` - Standard execution command -- `python -m mangotango --noop` - No-operation mode for testing +- `uv run cibmangotree` - Standard execution command via UV +- `python -m cibmangotree` - Alternative execution via Python module +- `uv run cibmangotree --noop` - No-operation mode for testing ## Integration Points @@ -276,7 +278,7 @@ Application-wide structured JSON logging with configurable levels and automatic **Usage Pattern:** ```python -from app.logger import get_logger +from cibmangotree.app.logger import get_logger logger = get_logger(__name__) logger.info("Message", extra={"context": "value"}) ``` @@ -285,19 +287,19 @@ logger.info("Message", extra={"context": "value"}) - `parquet_row_count(path) -> int` - Efficient row counting for large files -### Storage Utilities (`storage/__init__.py`) +### Storage Utilities (`services/storage/__init__.py`) - `collect_dataframe_chunks(paths) -> polars.DataFrame` - Combine multiple parquet files - `TableStats` - Data statistics and preview generation -### File Management (`storage/file_selector.py`) +### File Management (`services/storage/file_selector.py`) - `FileSelectorStateManager` - File picker state persistence - `AppFileSelectorStateManager` - Application-specific file selection ## Testing Infrastructure -### Test Utilities (`testing/`) +### Test Utilities (`packages/testing/`) - Primary analyzer testing framework - Secondary analyzer testing framework @@ -305,8 +307,8 @@ logger.info("Message", extra={"context": "value"}) ### Example Tests -- `analyzers/hashtags/test_hashtags_analyzer.py` - Hashtag analyzer tests -- `analyzers/example/test_example_base.py` - Example analyzer tests +- 
`packages/analyzers/hashtags/tests/test_hashtags_analyzer.py` - Hashtag analyzer tests +- `packages/analyzers/example/tests/test_example_base.py` - Example analyzer tests - Test data directories co-located with analyzers ## Development Patterns diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 30923d66..849a32f0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -40,7 +40,7 @@ Before contributing, familiarize yourself with: The Mango Tango CLI is a modular, extensible Python terminal application organized as a UV workspace monorepo: -``` +```bash packages/ ├── core/ # Core application (cibmangotree) ├── importing/ # Data import/export @@ -50,6 +50,7 @@ packages/ ``` Three main architectural domains: + - **Core Domain**: Application logic, terminal UI, and storage - **Edge Domain**: Data import/export and preprocessing - **Content Domain**: Analysis modules and web presenters @@ -77,6 +78,7 @@ Run the bootstrap script for your platform: ``` This will: + - Install UV package manager (if not present) - Install all project dependencies - Set up the development environment @@ -158,7 +160,7 @@ git branch -d feature/your-feature-name - Input: Primary + secondary outputs - Output: Dash/Shiny web applications -See the [analyzer example](analyzers/example/README.md) for implementation guidance. +See the [analyzer example](packages/analyzers/example/README.md) for implementation guidance. ### 🔧 Core Improvements @@ -330,7 +332,7 @@ Types: `feat`, `fix`, `docs`, `style`, `refactor`, `test`, `chore` This project includes comprehensive AI assistant integration: -- **Claude Code users**: See `CLAUDE.md` + Serena MCP integration +- **Claude Code users**: See `CLAUDE.md` - **Cursor users**: See `.cursorrules` + `.ai-context/` - **Other AI tools**: Start with `.ai-context/README.md` diff --git a/README.md b/README.md index b7f2da2b..d77a72c9 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,4 @@ +

CIB Mango Tree

An Interactive Command Line and Dashboard Tool for Detecting Coordinated Inauthentic Behavior in Datasets of Online Activity

@@ -49,6 +50,7 @@ Run the bootstrap script for your platform: ``` This will: + - Install UV package manager (if not present) - Install all project dependencies - Set up the development environment @@ -64,7 +66,7 @@ uv run cibmangotree This is a UV workspace monorepo with the following packages: -``` +```bash packages/ ├── core/ # Core application (cibmangotree) │ ├── src/cibmangotree/ # Main application code @@ -119,11 +121,10 @@ uv sync --upgrade # Upgrade dependencies This repository includes hybrid AI documentation enhanced with semantic code analysis: -- **For Claude Code users**: See `CLAUDE.md` + Serena MCP integration +- **For Claude Code users**: See `CLAUDE.md` - **Important**: Always start sessions with "Read the initial instructions" - **For Cursor users**: See `.cursorrules` + `.ai-context/` - **For other AI tools**: See `.ai-context/README.md` -- **For deep semantic analysis**: Serena memories in `.serena/memories/` ### Quick Start for Contributors diff --git a/docs/guides/contributing/analyzers.md b/docs/guides/contributing/analyzers.md index 53bbf03f..d268e4c9 100644 --- a/docs/guides/contributing/analyzers.md +++ b/docs/guides/contributing/analyzers.md @@ -29,9 +29,9 @@ Every primary analyzer must define an interface that specifies: - Output tables the analyzer produces ```python -from analyzer_interface import ( +from cibmangotree.analyzer_interface import ( AnalyzerInput, - AnalyzerInterface, + AnalyzerInterface, AnalyzerOutput, AnalyzerParam, InputColumn, @@ -45,14 +45,14 @@ interface = AnalyzerInterface( name="Example Analyzer", short_description="Counts characters in messages", long_description=""" -This analyzer demonstrates the basic structure by counting +This analyzer demonstrates the basic structure by counting characters in each message and marking long messages. 
""", input=AnalyzerInput( columns=[ InputColumn( name="message_id", - human_readable_name="Unique Message ID", + human_readable_name="Unique Message ID", data_type="identifier", description="The unique identifier of the message", name_hints=["post", "message", "tweet", "id"] @@ -60,7 +60,7 @@ characters in each message and marking long messages. InputColumn( name="message_text", human_readable_name="Message Text", - data_type="text", + data_type="text", description="The text content of the message", name_hints=["message", "text", "content", "body"] ) @@ -78,7 +78,7 @@ characters in each message and marking long messages. outputs=[ AnalyzerOutput( id="character_count", - name="Character Count Per Message", + name="Character Count Per Message", internal=True, # Not shown in export list columns=[ OutputColumn(name="message_id", data_type="integer"), @@ -95,18 +95,18 @@ The main function receives a context object with access to input data and output ```python import polars as pl -from analyzer_interface.context import PrimaryAnalyzerContext +from cibmangotree.analyzer_interface.context import PrimaryAnalyzerContext from terminal_tools import ProgressReporter def main(context: PrimaryAnalyzerContext): # Read and preprocess input data input_reader = context.input() df_input = input_reader.preprocess(pl.read_parquet(input_reader.parquet_path)) - + # Access parameters fudge_factor = context.params.get("fudge_factor") assert isinstance(fudge_factor, int), "Fudge factor must be an integer" - + # Perform analysis with progress reporting with ProgressReporter("Counting characters") as progress: df_count = df_input.select( @@ -117,7 +117,7 @@ def main(context: PrimaryAnalyzerContext): .alias("character_count") ) progress.update(1.0) - + # Write output to specified path df_count.write_parquet(context.output("character_count").parquet_path) ``` @@ -127,7 +127,7 @@ def main(context: PrimaryAnalyzerContext): Finally, create the analyzer declaration: ```python -from 
analyzer_interface import AnalyzerDeclaration +from cibmangotree.analyzer_interface import AnalyzerDeclaration from .interface import interface from .main import main @@ -147,19 +147,19 @@ Secondary analyzers process the output of primary analyzers to create user-frien Secondary analyzers specify their base primary analyzer and their own outputs: ```python -from analyzer_interface import AnalyzerOutput, OutputColumn, SecondaryAnalyzerInterface +from cibmangotree.analyzer_interface import AnalyzerOutput, OutputColumn, SecondaryAnalyzerInterface from ..example_base.interface import interface as example_base interface = SecondaryAnalyzerInterface( id="example_report", - version="0.1.0", + version="0.1.0", name="Example Report", short_description="Adds 'is_long' flag to character count analysis", base_analyzer=example_base, # Reference to primary analyzer outputs=[ AnalyzerOutput( id="example_report", - name="Example Report", + name="Example Report", columns=[ OutputColumn(name="message_id", data_type="integer"), OutputColumn(name="character_count", data_type="integer"), @@ -176,22 +176,22 @@ Secondary analyzers read primary outputs and create enhanced results: ```python import polars as pl -from analyzer_interface.context import SecondaryAnalyzerContext +from cibmangotree.analyzer_interface.context import SecondaryAnalyzerContext def main(context: SecondaryAnalyzerContext): # Read primary analyzer output df_character_count = pl.read_parquet( context.base.table("character_count").parquet_path ) - + # Add derived columns df_export = df_character_count.with_columns( pl.col("character_count").gt(100).alias("is_long") ) - + # Access primary analyzer parameters if needed fudge_factor = context.base_params.get("fudge_factor") - + # Write enhanced output df_export.write_parquet(context.output("example_report").parquet_path) ``` @@ -203,12 +203,12 @@ Web presenters create interactive dashboards using either Dash or Shiny framewor ### Interface Definition ```python -from 
analyzer_interface import WebPresenterInterface +from cibmangotree.analyzer_interface import WebPresenterInterface from ..example_base import interface as example_base from ..example_report import interface as example_report interface = WebPresenterInterface( - id="example_web", + id="example_web", version="0.1.0", name="Message Length Histogram", short_description="Shows distribution of message lengths", @@ -224,19 +224,19 @@ For more interactive dashboards: ```python from shiny import reactive, render, ui from shinywidgets import output_widget, render_widget -from analyzer_interface.context import WebPresenterContext, FactoryOutputContext, ShinyContext +from cibmangotree.analyzer_interface.context import WebPresenterContext, FactoryOutputContext, ShinyContext def factory(context: WebPresenterContext) -> FactoryOutputContext: # Load data df = pl.read_parquet(context.base.table("character_count").parquet_path) - + # Define UI components analysis_panel = ui.card( ui.card_header("Character Count Analysis"), ui.input_checkbox("show_details", "Show detailed view", value=False), output_widget("histogram", height="400px") ) - + def server(input, output, session): @render_widget def histogram(): @@ -244,11 +244,11 @@ def factory(context: WebPresenterContext) -> FactoryOutputContext: show_details = input.show_details() # ... create plotly figure ... return fig - + @render.text def summary(): return f"Total messages: {len(df)}" - + return FactoryOutputContext( shiny=ShinyContext( server_handler=server, @@ -267,41 +267,41 @@ from ..utils.pop import pop_unnecessary_fields def api_factory(context: WebPresenterContext, options: Optional[dict[str, Any]] = None): """ Provides structured data for React dashboards via API endpoints. - + Args: context: WebPresenterContext with access to analyzer outputs options: Optional parameters from API requests (filters, etc.) 
- + Returns: Dict with presenter metadata and processed data arrays """ # Extract API options/filters filter_value = options.get("matcher", "") if options else "" - + # Load data data_frame = pl.read_parquet(context.base.table("character_count").parquet_path) - + # Apply filters if provided if filter_value: # Apply filtering logic based on the filter_value data_frame = data_frame.filter(pl.col("message_text").str.contains(filter_value)) - + # Build presenter model with metadata presenter_model = context.web_presenter.model_dump() - + # Add visualization configuration presenter_model["figure_type"] = "histogram" presenter_model["axis"] = { "x": {"label": "Message Character Count", "value": "message_character_count"}, "y": {"label": "Number of Messages", "value": "number_of_messages"} } - + # Add data arrays for the frontend presenter_model["x"] = data_frame["character_count"].to_list() - + # Remove internal fields not needed by frontend return FactoryOutputContext( - api=pop_unnecessary_fields(presenter_model) + api=pop_unnecessary_fields(presenter_model) ) ``` @@ -312,7 +312,7 @@ For analyzers with multiple outputs, return a dictionary with different data vie ```python def api_factory(context: WebPresenterContext, options: Optional[dict[str, Any]] = None): filter_value = options.get("matcher", "") if options else "" - + # Load different data sources df_stats = pl.read_parquet( context.dependency(ngram_stats).table(OUTPUT_NGRAM_STATS).parquet_path @@ -320,18 +320,18 @@ def api_factory(context: WebPresenterContext, options: Optional[dict[str, Any]] df_full = pl.read_parquet( context.dependency(ngram_stats).table(OUTPUT_NGRAM_FULL).parquet_path ) - + # Apply filtering to both datasets if filter_value: matcher = create_word_matcher(filter_value, pl.col(COL_NGRAM_WORDS)) if matcher is not None: df_stats = df_stats.filter(matcher) df_full = df_full.filter(matcher) - + # Create separate presenter models for each output stats_model = context.web_presenter.model_dump() 
full_model = context.web_presenter.model_dump() - + # Configure stats view stats_model.update({ "figure_type": "scatter", @@ -347,14 +347,14 @@ def api_factory(context: WebPresenterContext, options: Optional[dict[str, Any]] "y": { "total_repetition": df_stats[COL_NGRAM_TOTAL_REPS].to_list(), "amplification_factor": ( - df_stats[COL_NGRAM_TOTAL_REPS] / + df_stats[COL_NGRAM_TOTAL_REPS] / df_stats[COL_NGRAM_DISTINCT_POSTER_COUNT] ).to_list() }, "ngrams": df_stats[COL_NGRAM_WORDS].to_list() }) - - # Configure full data view + + # Configure full data view full_model.update({ "figure_type": "scatter", "ids": df_full[COL_NGRAM_ID].to_list(), @@ -363,13 +363,13 @@ def api_factory(context: WebPresenterContext, options: Optional[dict[str, Any]] "users": df_full[COL_AUTHOR_ID].to_list(), # ... additional fields for detailed view }) - + return FactoryOutputContext( - api={ - "default_output": OUTPUT_NGRAM_STATS, - OUTPUT_NGRAM_STATS: pop_unnecessary_fields(stats_model), - OUTPUT_NGRAM_FULL: pop_unnecessary_fields(full_model) - } + api={ + "default_output": OUTPUT_NGRAM_STATS, + OUTPUT_NGRAM_STATS: pop_unnecessary_fields(stats_model), + OUTPUT_NGRAM_FULL: pop_unnecessary_fields(full_model) + } ) ``` @@ -408,7 +408,7 @@ curl "/api/presenters/ngram_repetition_by_poster/download/csv" ### Testing Primary Analyzers ```python -from testing import CsvTestData, test_primary_analyzer +from cibmangotree_testing import CsvTestData, test_primary_analyzer from .interface import interface from .main import main @@ -430,7 +430,7 @@ def test_example_analyzer(): ### Testing Secondary Analyzers ```python -from testing import test_secondary_analyzer, ParquetTestData +from cibmangotree_testing import test_secondary_analyzer, ParquetTestData def test_example_report(): test_secondary_analyzer( @@ -478,18 +478,18 @@ def test_example_report(): ## Adding to the Suite -Register all analyzers in `analyzers/__init__.py`: +Register all analyzers in `packages/core/src/cibmangotree/analyzers.py`: ```python 
-from analyzer_interface import AnalyzerSuite +from cibmangotree.analyzer_interface import AnalyzerSuite from .example.example_base import example_base -from .example.example_report import example_report +from .example.example_report import example_report from .example.example_web import example_web suite = AnalyzerSuite( all_analyzers=[ example_base, - example_report, + example_report, example_web, # ... other analyzers ] diff --git a/docs/guides/contributing/logging.md b/docs/guides/contributing/logging.md index 6221f243..5b28a4f3 100644 --- a/docs/guides/contributing/logging.md +++ b/docs/guides/contributing/logging.md @@ -15,7 +15,7 @@ The application uses a structured JSON logging system that provides consistent l #### Basic Usage ```python -from app.logger import get_logger +from cibmangotree.app.logger import get_logger # Get a logger for your module logger = get_logger(__name__) @@ -48,25 +48,25 @@ logger.critical("A very serious error occurred, program may not be able to conti When developing analyzers, add logging to help with debugging and monitoring: ```python -from app.logger import get_logger +from cibmangotree.app.logger import get_logger def main(context): logger = get_logger(__name__) - + logger.info("Starting analysis", extra={ "input_path": str(context.input_path), "output_path": str(context.output_path) }) - + try: # Your analysis code here result = perform_analysis(context) - + logger.info("Analysis completed successfully", extra={ "records_processed": len(result), "execution_time": time.time() - start_time }) - + except Exception as e: logger.error("Analysis failed", extra={ "error": str(e), @@ -78,14 +78,14 @@ def main(context): ### Logging Best Practices 1. 
**Use Appropriate Log Levels**: - + - `DEBUG`: Detailed diagnostic information, only useful when debugging - `INFO`: General information about program execution - `WARNING`: Something unexpected happened, but the program continues - `ERROR`: A serious problem occurred - `CRITICAL`: A very serious error occurred, program may not be able to continue 2. **Include Context with `extra` Parameter**: - + ```python logger.info("Processing file", extra={ "filename": filename, @@ -93,18 +93,18 @@ def main(context): "record_count": record_count }) ``` - + 3. **Log Exceptions Properly**: - + ```python try: risky_operation() except Exception as e: logger.error("Operation failed", exc_info=True) # Includes stack trace ``` - + 4. **Avoid Logging Sensitive Information**: - + - Never log passwords, API keys, or personal data - Be cautious with user-provided data @@ -114,13 +114,13 @@ Users can control log verbosity when running the application: ```shell # Default INFO level -python -m mangotango +uv run cibmangotree # Verbose DEBUG level for troubleshooting -python -m mangotango --log-level DEBUG +uv run cibmangotree --log-level DEBUG # Only show warnings and errors in log file -python -m mangotango --log-level WARNING +uv run cibmangotree --log-level WARNING ``` ### Log File Management @@ -136,15 +136,15 @@ When writing tests that involve logging: ```python import logging -from app.logger import get_logger +from cibmangotree.app.logger import get_logger def test_my_function_logs_correctly(caplog): with caplog.at_level(logging.INFO): my_function() - + assert "Expected log message" in caplog.text ``` # Next Steps -Once you finish reading this it's recommended to check out the [architecture](./architecture.md) section. \ No newline at end of file +Once you finish reading this it's recommended to check out the [architecture](./architecture.md) section. 
diff --git a/docs/guides/design-philosophy/core-domain.md b/docs/guides/design-philosophy/core-domain.md index 0394a8a7..34be590a 100644 --- a/docs/guides/design-philosophy/core-domain.md +++ b/docs/guides/design-philosophy/core-domain.md @@ -2,13 +2,13 @@ ### Application -The Application lives inside the `app` directory in the project root. This is responsible for defining and executing all capabilities of the application's workspace. Any extension or modification of the application's workspace capabilities should be done here. +The Application lives in `packages/core/src/cibmangotree/app/`. This is responsible for defining and executing all capabilities of the application's workspace. Any extension or modification of the application's workspace capabilities should be done here. The application code should be free of specific storage implementation and be agnostic about the specifics of the terminal interface and the available analyzers. Here's what the entrypoint for the application module looks like -**./app/__init__.py**: +**packages/core/src/cibmangotree/app/__init__.py**: ```python from .analysis_context import AnalysisContext @@ -22,13 +22,13 @@ from .settings_context import SettingsContext ### Terminal Components -The Terminal Components live inside the `terminal_tools` inside the project root. Their main responsibility is user flow, rendering the terminal interface, and handling user input. +The Terminal Components live in `packages/core/src/cibmangotree/tui/`. Their main responsibility is user flow, rendering the terminal interface, and handling user input. Terminal utilities and tools are located in `packages/core/src/cibmangotree/tui/tools/`. The user flow understandably depends on the set of capabilities offered by the [Application](#application), so an adjustment there may require an adjustment here. 
-Here's what the entrypoint for the termnal module looks like +Here's what the entrypoint for the terminal tools module looks like -**./terminal_tools/__init__.py** +**packages/core/src/cibmangotree/tui/tools/__init__.py** ```python from .progress import ProgressReporter @@ -39,17 +39,21 @@ from .utils import ( enable_windows_ansi_support, open_directory_explorer, print_ascii_table, + print_dialog_section_title, + smart_print_data_frame, wait_for_key, ) ``` ### Storage IO -The Storage IO lives Inside the `storage` directory inside the project root. It is responsible for interacting directly with the file system where the workspace data and data files are stored. It makes decisions on paths, intermediate file formats, and database schema and implementation. It should know as little as possible about how the data is used and should be agnostic about the specifics of the terminal interface and the available analyzers. +The Storage IO lives in `packages/core/src/cibmangotree/storage/`. It is responsible for interacting directly with the file system where the workspace data and data files are stored. It makes decisions on paths, intermediate file formats, and database schema and implementation. It should know as little as possible about how the data is used and should be agnostic about the specifics of the terminal interface and the available analyzers. Here's what the entrypoint for the storage module looks like -**./storage/__init__.py**: +**packages/core/src/cibmangotree/storage/__init__.py**: + +> **Note:** The code listing below (lines 55-627) is the actual implementation of the Storage module, not just example code. ```python import math diff --git a/docs/guides/design-philosophy/edge-domain.md b/docs/guides/design-philosophy/edge-domain.md index 83c6af1f..7333306a 100644 --- a/docs/guides/design-philosophy/edge-domain.md +++ b/docs/guides/design-philosophy/edge-domain.md @@ -4,29 +4,45 @@ The Edge domain governs data import and export. 
### Importers -The Importers live inside the `importing` directory inside the project root. Each importer offers a new way to import data into the workspace. The importers should be agnostic about the available analyzers. However, the Importers currently provide a terminal user flow so that their options can be customized by the user—a necessity since each importer may expose different sets of options and may have different UX approaches for their configuration. +The Importers are now part of the core package services, located at `packages/core/src/cibmangotree/services/importing/`. Each importer offers a new way to import data into the workspace. The importers should be agnostic about the available analyzers. However, the Importers currently provide a terminal user flow so that their options can be customized by the user—a necessity since each importer may expose different sets of options and may have different UX approaches for their configuration. The importers eventually write data to a parquet file, whose path is provisioned by the application. 
Here's what the entrypoint for the importer module looks like -**./importing/__init__.py**: +**packages/core/src/cibmangotree/services/importing/__init__.py**: ```python -from .csv import CSVImporter -from .excel import ExcelImporter +# Import base classes first (no circular dependency) from .importer import Importer, ImporterSession -importers: list[Importer[ImporterSession]] = [CSVImporter(), ExcelImporter()] +# Lazy import for CSV and Excel to avoid circular import +# CSV/Excel importers use TUI which imports from app +def __getattr__(name): + if name == "CSVImporter": + from .csv import CSVImporter + return CSVImporter + elif name == "ExcelImporter": + from .excel import ExcelImporter + return ExcelImporter + elif name == "importers": + from .csv import CSVImporter + from .excel import ExcelImporter + return [CSVImporter(), ExcelImporter()] + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + +__all__ = ["Importer", "ImporterSession", "CSVImporter", "ExcelImporter", "importers"] ``` +> **Note**: The importing module uses lazy imports to avoid circular dependencies, as the CSV and Excel importers depend on TUI components which import from the main app. ### Semantic Preprocessor -The Semantic Preprocessor lives inside the `preprocessing` directory inside the project root. It defines all the column data semantics—a kind of type system that is used to guide the user in selecting the right columns for the right analysis. It is agnostic about the specific analyzers but does depend on them in a generic way—the available semantics exist to support the needs of analyzers and will be extended as necessary. +The Semantic Preprocessor is now part of the core package services, located at `packages/core/src/cibmangotree/services/preprocessing/`. It defines all the column data semantics—a kind of type system that is used to guide the user in selecting the right columns for the right analysis. 
It is agnostic about the specific analyzers but does depend on them in a generic way—the available semantics exist to support the needs of analyzers and will be extended as necessary. Here's what the entrypoint for the preprocessing module looks like -**./preprocessing/series_semantic.py**: +**packages/core/src/cibmangotree/services/preprocessing/series_semantic.py**: ```python from datetime import datetime @@ -35,7 +51,7 @@ from typing import Callable, Type, Union import polars as pl from pydantic import BaseModel -from analyzer_interface import DataType +from cibmangotree.analyzer_interface import DataType class SeriesSemantic(BaseModel): @@ -177,4 +193,4 @@ Once you finish reading this section it would be a good idea to review the other - [Core Domain](./core-domain.md) - [Content Domain](./content-domain.md) - [Shiny Dashboards](../dashboards/shiny.md) -- [React Dashboards](../dashboards/react.md) \ No newline at end of file +- [React Dashboards](../dashboards/react.md) diff --git a/docs/guides/get-started/installation.md b/docs/guides/get-started/installation.md index d9ceeb01..7fdb1b96 100644 --- a/docs/guides/get-started/installation.md +++ b/docs/guides/get-started/installation.md @@ -3,6 +3,7 @@ ## Required Software - **Python 3.12** - Required for all features to work correctly +- **UV** - Modern Python package manager (automatically installed by bootstrap script) - Node.JS (20.0.0 or above) - Required for the React dashboards to work correctly - **Git** - For version control and contributing @@ -37,7 +38,7 @@ which ### Windows ```PowerShell -where.exe +where.exe ``` # Installation @@ -49,19 +50,7 @@ git clone https://github.com/CIB-Mango-Tree/mango-tango-cli.git cd mango-tango-cli ``` -## 2. Create Virtual Environment - -```bash -python -m venv venv -``` - -**Verify Python version**: - -```bash -python --version # Should show Python 3.12.x -``` - -## 3. Bootstrap Development Environment +## 2. 
Bootstrap Development Environment **Mac OS/Linux (Bash)**: @@ -77,58 +66,79 @@ python --version # Should show Python 3.12.x The bootstrap script will: -- Activate the virtual environment -- Install all dependencies from `requirements-dev.txt` +- Install UV package manager (if not already installed) +- Create `.venv/` virtual environment using UV +- Install all workspace dependencies via `uv sync` - Set up pre-commit hooks for code formatting -## 4. Verify Installation +**Manual UV Setup** (if needed): + +```bash +# Install UV +curl -LsSf https://astral.sh/uv/install.sh | sh # macOS/Linux +# or +pip install uv + +# Sync dependencies +uv sync +``` + +## 3. Verify Installation ```bash -python -m mangotango --noop +uv run cibmangotree --noop ``` Should output: "No-op flag detected. Exiting successfully." # Activating Virtual Environment -After Completing the Installation the following commands can be used to activate -the virtual environment in order to work with the project. +UV automatically manages the virtual environment. When you use `uv run`, it +activates the `.venv/` environment automatically. + +**Manual Activation** (if needed): **Mac OS/Linux (Bash)**: ```bash -source ./venv/bin/activate +source .venv/bin/activate ``` **PowerShell (Windows)**: ```PowerShell -./env/bin/Activate.ps1 +.venv/Scripts/Activate.ps1 ``` +**Note**: With UV, you typically don't need to manually activate the environment. +Use `uv run ` instead. + # Development Environment Setup ## Dependencies Overview -**Production Dependencies** (`requirements.txt`): +Dependencies are managed by UV using `pyproject.toml` in a workspace configuration. +Run `uv sync` to install all dependencies across all packages. 
+ +**Core Production Dependencies**: -- `polars==1.9.0` - Primary data processing -- `pydantic==2.9.1` - Data validation and models -- `inquirer==3.4.0` - Interactive terminal prompts -- `tinydb==4.8.0` - Lightweight JSON database -- `dash==2.18.1` - Web dashboard framework -- `shiny==1.4.0` - Modern web UI framework -- `plotly==5.24.1` - Data visualization -- `XlsxWriter==3.2.0` - Excel export functionality +- `polars>=1.9.0` - Primary data processing +- `pydantic>=2.9.1` - Data validation and models +- `inquirer>=3.4.0` - Interactive terminal prompts +- `tinydb>=4.8.0` - Lightweight JSON database +- `dash>=2.18.1` - Web dashboard framework +- `shiny>=1.4.0` - Modern web UI framework +- `plotly>=5.24.1` - Data visualization +- `XlsxWriter>=3.2.0` - Excel export functionality -**Development Dependencies** (`requirements-dev.txt`): +**Development Dependencies**: -- `black==24.10.0` - Code formatter -- `isort==5.13.2` - Import organizer -- `pytest==8.3.4` - Testing framework -- `pyinstaller==6.14.1` - Executable building +- `black>=24.10.0` - Code formatter +- `isort>=5.13.2` - Import organizer +- `pytest>=8.3.4` - Testing framework +- `pyinstaller>=6.14.1` - Executable building -**React Dashboard Dependencies** (app/web_templates/package.json): +**React Dashboard Dependencies** (packages/core/src/cibmangotree/app/web_templates/package.json): - typescript: 5.7.3 - vite: 6.3.5 @@ -141,6 +151,22 @@ source ./venv/bin/activate - tailwindcss: 4.0.6 - lucide-react: 0.475.0 +**Managing Dependencies**: + +```bash +# Install/sync all workspace dependencies +uv sync + +# Add a new dependency to core package +uv add polars --package cibmangotree + +# Add a development dependency +uv add --dev pytest + +# Update all dependencies +uv sync --upgrade +``` + ## Code Formatting Setup The project uses automatic code formatting: @@ -152,8 +178,8 @@ The project uses automatic code formatting: **Manual formatting**: ```bash -isort . -black . +uv run isort . +uv run black . 
``` ## Project Structure Setup @@ -162,19 +188,27 @@ After installation, your project structure should be: ```bash mango-tango-cli/ -├── venv/ # Virtual environment -├── .serena/ # Serena semantic analysis -│ └── memories/ # Project knowledge base +├── .venv/ # UV-managed virtual environment +├── .ai-context/ # AI assistant context documentation ├── docs/ # Documentation -│ ├── ai-context/ # AI assistant context +│ ├── guides/ # User guides │ └── dev-guide.md # Development guide -├── app/ # Application layer -├── analyzers/ # Analysis modules -├── components/ # Terminal UI components -├── storage/ # Data persistence -├── importing/ # Data import modules -├── requirements*.txt # Dependencies -└── mangotango.py # Main entry point +├── packages/ # UV workspace packages +│ ├── core/ # cibmangotree - Main application +│ │ └── src/cibmangotree/ +│ │ ├── app/ # Application layer +│ │ ├── components/ # Terminal UI components +│ │ └── storage/ # Data persistence +│ ├── importing/ # cibmangotree-importing - Data I/O +│ ├── services/ # cibmangotree-services - Shared services +│ ├── testing/ # cibmangotree-testing - Testing utilities +│ └── analyzers/ # Analysis modules (plugins) +│ ├── hashtags/ # Hashtag analysis +│ ├── ngrams/ # N-gram analysis +│ ├── temporal/ # Temporal patterns +│ └── example/ # Example analyzer template +├── pyproject.toml # UV workspace configuration +└── uv.lock # UV lockfile ``` # Database and Storage Setup @@ -201,14 +235,14 @@ No manual database setup required. ```bash # Start the application -python -m mangotango +uv run cibmangotree ``` ## Development Mode ```bash # Run with debugging/development flags -python -m mangotango --noop # Test mode, exits immediately +uv run cibmangotree --noop # Test mode, exits immediately ``` ## Development Mode for The React Dashboards @@ -219,14 +253,14 @@ react dashboards that are currently in development. 
**npm**: ```bash -cd ./app/web_templates +cd packages/core/src/cibmangotree/app/web_templates npm run dev ``` **pnpm**: ```bash -cd ./app/web_templates +cd packages/core/src/cibmangotree/app/web_templates pnpm dev ``` @@ -236,23 +270,24 @@ pnpm dev ```bash # Run all tests -pytest +uv run pytest # Run specific test file -pytest analyzers/hashtags/test_hashtags_analyzer.py +uv run pytest packages/analyzers/hashtags/tests/test_hashtags_analyzer.py # Run with verbose output -pytest -v +uv run pytest -v # Run specific test function -pytest analyzers/hashtags/test_hashtags_analyzer.py::test_gini +uv run pytest packages/analyzers/hashtags/tests/test_hashtags_analyzer.py::test_gini ``` ## Test Data - Test data is co-located with analyzers in `test_data/` directories -- Each analyzer should include its own test files +- Each analyzer package includes its own test files - Tests use sample data to verify functionality +- Example: `packages/analyzers/hashtags/test_data/` # Build Setup (Optional) @@ -260,7 +295,7 @@ pytest analyzers/hashtags/test_hashtags_analyzer.py::test_gini ```bash # Build standalone executable -pyinstaller pyinstaller.spec +uv run pyinstaller pyinstaller.spec # Output will be in dist/ directory ``` @@ -270,18 +305,20 @@ pyinstaller pyinstaller.spec **npm**: ```bash +cd packages/core/src/cibmangotree/app/web_templates npm run build ``` **pnpm**: ```bash +cd packages/core/src/cibmangotree/app/web_templates pnpm build ``` ## Build Requirements -- Included in `requirements-dev.txt` +- Managed by UV through `pyproject.toml` - Used primarily for release distribution - Not required for development @@ -291,16 +328,18 @@ pnpm build **VS Code** (`.vscode/` configuration): -- Python interpreter: `./venv/bin/python` +- Python interpreter: `./.venv/bin/python` - Black formatter integration - isort integration - pytest test discovery +- UV extension (optional): Better UV integration **PyCharm**: -- Interpreter: Project virtual environment +- Interpreter: `.venv` virtual 
environment - Code style: Black - Import optimizer: isort +- External tools: Configure UV commands ## Git Configuration @@ -309,7 +348,9 @@ pnpm build ```bash # Hooks are set up automatically by bootstrap script # Manual setup if needed: -pip install pre-commit +uv add --dev pre-commit +# or if you need to install directly +uv run pip install pre-commit pre-commit install ``` @@ -370,7 +411,7 @@ if you run into this issue. The commands needed to run the installation manually from the project root are as such. ```bash -cd ./app/web_templates +cd packages/core/src/cibmangotree/app/web_templates npm install --legacy-peer-deps ``` @@ -379,30 +420,35 @@ npm install --legacy-peer-deps **Import Errors**: ```bash -# Ensure virtual environment is activated -source venv/bin/activate # macOS/Linux -venv\Scripts\Activate.ps1 # Windows +# Ensure dependencies are synced +uv sync + +# Verify installation +uv run cibmangotree --noop -# Reinstall dependencies -pip install -r requirements-dev.txt +# Check UV environment +uv pip list ``` **Formatting Errors in CI**: ```bash # Run formatters locally before committing -isort . -black . +uv run isort . +uv run black . 
``` **Test Failures**: ```bash # Ensure test data is present -ls analyzers/*/test_data/ +ls packages/analyzers/*/test_data/ # Check if specific analyzer tests pass -pytest analyzers/hashtags/ -v +uv run pytest packages/analyzers/hashtags/ -v + +# Verify all dependencies +uv sync ``` ## Environment Variables From d60695e93da84e5a739d27a41c1871c37ec78bc4 Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Thu, 9 Oct 2025 22:41:00 -0400 Subject: [PATCH 13/24] format --- .../report/interface.py | 6 ++++- .../example/tests/test_example_base.py | 6 ++--- .../example/tests/test_example_report.py | 5 ++--- .../hashtags/tests/test_hashtags_base.py | 11 ++++++---- .../cibmangotree_analyzer_ngrams/base/main.py | 2 +- .../stats/interface.py | 6 ++++- .../cibmangotree_analyzer_ngrams/web/app.py | 10 ++++----- .../web/interface.py | 2 +- .../ngrams/tests/test_ngram_stats.py | 11 ++++++---- .../ngrams/tests/test_ngrams_base.py | 15 ++++++++----- packages/core/src/cibmangotree/__init__.py | 22 +++++++++---------- packages/core/src/cibmangotree/__main__.py | 3 ++- .../analyzer_interface/context.py | 6 ++--- .../analyzer_interface/declaration.py | 4 ++-- .../src/cibmangotree/app/project_context.py | 5 ++++- .../core/src/cibmangotree/context/__init__.py | 14 +++++++++--- .../src/cibmangotree/services/__init__.py | 7 ++++++ .../services/importing/__init__.py | 5 +++++ .../tui/components/analysis_params.py | 5 ++++- .../tui/components/new_analysis.py | 7 +++++- .../tui/components/select_project.py | 7 +++++- .../src/cibmangotree_testing/context.py | 10 +++++++-- .../basic/tests/test_basic_tokenizer.py | 2 +- 23 files changed, 115 insertions(+), 56 deletions(-) diff --git a/packages/analyzers/example/src/cibmangotree_analyzer_example/report/interface.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/report/interface.py index 021bf15b..f79b05c2 100644 --- 
a/packages/analyzers/example/src/cibmangotree_analyzer_example/report/interface.py +++ b/packages/analyzers/example/src/cibmangotree_analyzer_example/report/interface.py @@ -1,4 +1,8 @@ -from cibmangotree.analyzer_interface import AnalyzerOutput, OutputColumn, SecondaryAnalyzerInterface +from cibmangotree.analyzer_interface import ( + AnalyzerOutput, + OutputColumn, + SecondaryAnalyzerInterface, +) from ..base.interface import interface as example_base diff --git a/packages/analyzers/example/tests/test_example_base.py b/packages/analyzers/example/tests/test_example_base.py index 41f45449..239567d4 100644 --- a/packages/analyzers/example/tests/test_example_base.py +++ b/packages/analyzers/example/tests/test_example_base.py @@ -1,11 +1,11 @@ import os +from pathlib import Path from cibmangotree.services.preprocessing.series_semantic import identifier -from cibmangotree_testing import CsvConfig, CsvTestData, test_primary_analyzer - from cibmangotree_analyzer_example.base.interface import interface from cibmangotree_analyzer_example.base.main import main -from pathlib import Path +from cibmangotree_testing import CsvConfig, CsvTestData, test_primary_analyzer + test_data_dir = Path(__file__).parent / "test_data" # from .test_data import test_data_dir diff --git a/packages/analyzers/example/tests/test_example_report.py b/packages/analyzers/example/tests/test_example_report.py index 021c1a04..a0dda0b4 100644 --- a/packages/analyzers/example/tests/test_example_report.py +++ b/packages/analyzers/example/tests/test_example_report.py @@ -1,11 +1,10 @@ import os - -from cibmangotree_testing import CsvTestData, test_secondary_analyzer +from pathlib import Path from cibmangotree_analyzer_example.report.interface import interface from cibmangotree_analyzer_example.report.main import main +from cibmangotree_testing import CsvTestData, test_secondary_analyzer -from pathlib import Path test_data_dir = Path(__file__).parent / "test_data" diff --git 
a/packages/analyzers/hashtags/tests/test_hashtags_base.py b/packages/analyzers/hashtags/tests/test_hashtags_base.py index bfe4cea7..728ca08d 100644 --- a/packages/analyzers/hashtags/tests/test_hashtags_base.py +++ b/packages/analyzers/hashtags/tests/test_hashtags_base.py @@ -1,12 +1,15 @@ import os +from pathlib import Path import numpy as np import polars as pl from cibmangotree.analyzer_interface.params import TimeBinningValue -from cibmangotree.services.preprocessing.series_semantic import datetime_string, identifier, text_catch_all -from cibmangotree_testing import CsvTestData, JsonTestData, test_primary_analyzer - +from cibmangotree.services.preprocessing.series_semantic import ( + datetime_string, + identifier, + text_catch_all, +) from cibmangotree_analyzer_hashtags.base.interface import ( COL_AUTHOR_ID, COL_POST, @@ -15,8 +18,8 @@ interface, ) from cibmangotree_analyzer_hashtags.base.main import gini, main +from cibmangotree_testing import CsvTestData, JsonTestData, test_primary_analyzer -from pathlib import Path test_data_dir = Path(__file__).parent / "test_data" HASHTAGS = [ diff --git a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/main.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/main.py index 049f2455..d083b2e8 100644 --- a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/main.py +++ b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/base/main.py @@ -1,9 +1,9 @@ import polars as pl from cibmangotree.analyzer_interface.context import PrimaryAnalyzerContext -from cibmangotree_tokenizer_basic import TokenizerConfig, tokenize_text from cibmangotree.services.tokenizer.core.types import CaseHandling from cibmangotree.tui.tools import ProgressReporter +from cibmangotree_tokenizer_basic import TokenizerConfig, tokenize_text from .interface import ( COL_AUTHOR_ID, diff --git a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/interface.py 
b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/interface.py index 99114d85..ec40077e 100644 --- a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/interface.py +++ b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/stats/interface.py @@ -1,4 +1,8 @@ -from cibmangotree.analyzer_interface import AnalyzerOutput, OutputColumn, SecondaryAnalyzerInterface +from cibmangotree.analyzer_interface import ( + AnalyzerOutput, + OutputColumn, + SecondaryAnalyzerInterface, +) from ..base import interface as ngrams_interface from ..base.interface import ( diff --git a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/app.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/app.py index 79775d83..e954fe61 100644 --- a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/app.py +++ b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/app.py @@ -4,11 +4,6 @@ from shiny import reactive, render, ui from shinywidgets import output_widget, render_widget -from ..stats.interface import ( - COL_NGRAM_DISTINCT_POSTER_COUNT, - COL_NGRAM_TOTAL_REPS, - COL_NGRAM_WORDS, -) from ..base.interface import ( COL_AUTHOR_ID, COL_MESSAGE_TEXT, @@ -16,6 +11,11 @@ COL_NGRAM_ID, COL_NGRAM_LENGTH, ) +from ..stats.interface import ( + COL_NGRAM_DISTINCT_POSTER_COUNT, + COL_NGRAM_TOTAL_REPS, + COL_NGRAM_WORDS, +) MANGO_DARK_GREEN = "#609949" CLICKED_COLOR = "red" diff --git a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/interface.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/interface.py index 8d955634..2ff306fe 100644 --- a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/interface.py +++ b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/web/interface.py @@ -1,7 +1,7 @@ from cibmangotree.analyzer_interface import WebPresenterInterface -from ..stats import interface as ngram_stats_interface from ..base import interface as ngrams_interface +from 
..stats import interface as ngram_stats_interface interface = WebPresenterInterface( id="ngram_repetition_by_poster", diff --git a/packages/analyzers/ngrams/tests/test_ngram_stats.py b/packages/analyzers/ngrams/tests/test_ngram_stats.py index c8cb4fa9..458f29b4 100644 --- a/packages/analyzers/ngrams/tests/test_ngram_stats.py +++ b/packages/analyzers/ngrams/tests/test_ngram_stats.py @@ -1,14 +1,17 @@ from pathlib import Path -from cibmangotree_testing import ParquetTestData, test_secondary_analyzer - -from cibmangotree_analyzer_ngrams.stats.interface import OUTPUT_NGRAM_FULL, OUTPUT_NGRAM_STATS, interface -from cibmangotree_analyzer_ngrams.stats.main import main from cibmangotree_analyzer_ngrams.base.interface import ( OUTPUT_MESSAGE, OUTPUT_MESSAGE_NGRAMS, OUTPUT_NGRAM_DEFS, ) +from cibmangotree_analyzer_ngrams.stats.interface import ( + OUTPUT_NGRAM_FULL, + OUTPUT_NGRAM_STATS, + interface, +) +from cibmangotree_analyzer_ngrams.stats.main import main +from cibmangotree_testing import ParquetTestData, test_secondary_analyzer test_data_dir = Path(__file__).parent / "test_data" diff --git a/packages/analyzers/ngrams/tests/test_ngrams_base.py b/packages/analyzers/ngrams/tests/test_ngrams_base.py index d9458fea..8a2667f9 100644 --- a/packages/analyzers/ngrams/tests/test_ngrams_base.py +++ b/packages/analyzers/ngrams/tests/test_ngrams_base.py @@ -1,11 +1,14 @@ import types + +# Test data directory from pathlib import Path -from cibmangotree.services.preprocessing.series_semantic import datetime_string, identifier, text_catch_all -from cibmangotree_tokenizer_basic import TokenizerConfig, tokenize_text +from cibmangotree.services.preprocessing.series_semantic import ( + datetime_string, + identifier, + text_catch_all, +) from cibmangotree.services.tokenizer.core.types import CaseHandling -from cibmangotree_testing import CsvTestData, ParquetTestData, test_primary_analyzer - from cibmangotree_analyzer_ngrams.base.interface import ( COL_AUTHOR_ID, COL_MESSAGE_ID, @@ -17,9 
+20,9 @@ interface, ) from cibmangotree_analyzer_ngrams.base.main import main, ngrams, serialize_ngram +from cibmangotree_testing import CsvTestData, ParquetTestData, test_primary_analyzer +from cibmangotree_tokenizer_basic import TokenizerConfig, tokenize_text -# Test data directory -from pathlib import Path test_data_dir = Path(__file__).parent / "test_data" TEST_CSV_FILENAME = "ngrams_test_input.csv" diff --git a/packages/core/src/cibmangotree/__init__.py b/packages/core/src/cibmangotree/__init__.py index 17bce114..1d9f8cbe 100644 --- a/packages/core/src/cibmangotree/__init__.py +++ b/packages/core/src/cibmangotree/__init__.py @@ -33,17 +33,6 @@ "Storage", ] -# Import core application components -from .app import ( - App, - AppContext, - AnalysisContext, - AnalysisOutputContext, - AnalysisWebServerContext, - ProjectContext, - SettingsContext, -) - # Import analyzer interface from .analyzer_interface import ( AnalyzerInterface, @@ -55,5 +44,16 @@ SecondaryAnalyzerContext, ) +# Import core application components +from .app import ( + AnalysisContext, + AnalysisOutputContext, + AnalysisWebServerContext, + App, + AppContext, + ProjectContext, + SettingsContext, +) + # Import storage service from .services.storage import Storage diff --git a/packages/core/src/cibmangotree/__main__.py b/packages/core/src/cibmangotree/__main__.py index 8db6a741..3a6c17c9 100644 --- a/packages/core/src/cibmangotree/__main__.py +++ b/packages/core/src/cibmangotree/__main__.py @@ -53,9 +53,9 @@ def main(): try: from .app import App, AppContext from .app.logger import setup_logging - from .tui.components import ViewContext, main_menu, splash from .meta import get_version from .services.storage import Storage + from .tui.components import ViewContext, main_menu, splash from .tui.tools.inception import TerminalContext # Initialize storage @@ -76,6 +76,7 @@ def main(): # Initialize app context from .analyzer_interface.suite import AnalyzerSuite + suite = AnalyzerSuite() # Start the 
application diff --git a/packages/core/src/cibmangotree/analyzer_interface/context.py b/packages/core/src/cibmangotree/analyzer_interface/context.py index d257fb18..a8a0d12b 100644 --- a/packages/core/src/cibmangotree/analyzer_interface/context.py +++ b/packages/core/src/cibmangotree/analyzer_interface/context.py @@ -138,9 +138,9 @@ def parquet_path(self) -> str: class InputTableReader(TableReader): @abstractmethod - def preprocess[ - PolarsDataFrameLike - ](self, df: PolarsDataFrameLike) -> PolarsDataFrameLike: + def preprocess[PolarsDataFrameLike]( + self, df: PolarsDataFrameLike + ) -> PolarsDataFrameLike: """ Given the manually loaded user input dataframe, apply column mapping and semantic transformations to give the input dataframe that the analyzer diff --git a/packages/core/src/cibmangotree/analyzer_interface/declaration.py b/packages/core/src/cibmangotree/analyzer_interface/declaration.py index 4209b32a..1dc231b6 100644 --- a/packages/core/src/cibmangotree/analyzer_interface/declaration.py +++ b/packages/core/src/cibmangotree/analyzer_interface/declaration.py @@ -27,7 +27,7 @@ def __init__( is_distributed: bool = False, default_params: Callable[[PrimaryAnalyzerContext], dict[str, ParamValue]] = ( lambda _: dict() - ) + ), ): """Creates a primary analyzer declaration @@ -48,7 +48,7 @@ def __init__( **interface.model_dump(), entry_point=main, default_params=default_params, - is_distributed=is_distributed + is_distributed=is_distributed, ) diff --git a/packages/core/src/cibmangotree/app/project_context.py b/packages/core/src/cibmangotree/app/project_context.py index 08e6fc2d..3c22de01 100644 --- a/packages/core/src/cibmangotree/app/project_context.py +++ b/packages/core/src/cibmangotree/app/project_context.py @@ -5,7 +5,10 @@ from cibmangotree.analyzer_interface import ParamValue from cibmangotree.analyzer_interface import UserInputColumn as BaseUserInputColumn -from cibmangotree.services.preprocessing.series_semantic import SeriesSemantic, 
infer_series_semantic +from cibmangotree.services.preprocessing.series_semantic import ( + SeriesSemantic, + infer_series_semantic, +) from cibmangotree.services.storage import AnalysisModel, ProjectModel from .app_context import AppContext diff --git a/packages/core/src/cibmangotree/context/__init__.py b/packages/core/src/cibmangotree/context/__init__.py index 0cf35b93..c16dd0eb 100644 --- a/packages/core/src/cibmangotree/context/__init__.py +++ b/packages/core/src/cibmangotree/context/__init__.py @@ -12,15 +12,23 @@ WebPresenterInterface, backfill_param_values, ) -from cibmangotree.analyzer_interface.context import AssetsReader, InputTableReader +from cibmangotree.analyzer_interface.context import ( + AssetsReader, + InputTableReader, +) from cibmangotree.analyzer_interface.context import ( PrimaryAnalyzerContext as BasePrimaryAnalyzerContext, ) from cibmangotree.analyzer_interface.context import ( SecondaryAnalyzerContext as BaseSecondaryAnalyzerContext, ) -from cibmangotree.analyzer_interface.context import TableReader, TableWriter -from cibmangotree.analyzer_interface.context import WebPresenterContext as BaseWebPresenterContext +from cibmangotree.analyzer_interface.context import ( + TableReader, + TableWriter, +) +from cibmangotree.analyzer_interface.context import ( + WebPresenterContext as BaseWebPresenterContext, +) from cibmangotree.services.preprocessing.series_semantic import SeriesSemantic from cibmangotree.services.storage import AnalysisModel, Storage diff --git a/packages/core/src/cibmangotree/services/__init__.py b/packages/core/src/cibmangotree/services/__init__.py index d440ae78..e9c7c4a8 100644 --- a/packages/core/src/cibmangotree/services/__init__.py +++ b/packages/core/src/cibmangotree/services/__init__.py @@ -10,26 +10,33 @@ # Re-export storage (no circular import) from .storage import Storage + # Lazy import for importing module to avoid circular import # The importing module imports from TUI which imports from app def __getattr__(name): if 
name == "CSVImporter": from .importing import CSVImporter + return CSVImporter elif name == "ExcelImporter": from .importing import ExcelImporter + return ExcelImporter elif name == "Importer": from .importing import Importer + return Importer elif name == "ImporterSession": from .importing import ImporterSession + return ImporterSession elif name == "SeriesSemantic": from .preprocessing import SeriesSemantic + return SeriesSemantic raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + __all__ = [ # Storage "Storage", diff --git a/packages/core/src/cibmangotree/services/importing/__init__.py b/packages/core/src/cibmangotree/services/importing/__init__.py index 8beaf135..132af83d 100644 --- a/packages/core/src/cibmangotree/services/importing/__init__.py +++ b/packages/core/src/cibmangotree/services/importing/__init__.py @@ -1,19 +1,24 @@ # Import base classes first (no circular dependency) from .importer import Importer, ImporterSession + # Lazy import for CSV and Excel to avoid circular import # CSV/Excel importers use TUI which imports from app def __getattr__(name): if name == "CSVImporter": from .csv import CSVImporter + return CSVImporter elif name == "ExcelImporter": from .excel import ExcelImporter + return ExcelImporter elif name == "importers": from .csv import CSVImporter from .excel import ExcelImporter + return [CSVImporter(), ExcelImporter()] raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + __all__ = ["Importer", "ImporterSession", "CSVImporter", "ExcelImporter", "importers"] diff --git a/packages/core/src/cibmangotree/tui/components/analysis_params.py b/packages/core/src/cibmangotree/tui/components/analysis_params.py index d03b460e..39158993 100644 --- a/packages/core/src/cibmangotree/tui/components/analysis_params.py +++ b/packages/core/src/cibmangotree/tui/components/analysis_params.py @@ -11,7 +11,10 @@ ParamValue, TimeBinningValue, ) -from cibmangotree.context import InputColumnProvider, 
PrimaryAnalyzerDefaultParametersContext +from cibmangotree.context import ( + InputColumnProvider, + PrimaryAnalyzerDefaultParametersContext, +) from cibmangotree.tui.tools import prompts, smart_print_data_frame from .context import ViewContext diff --git a/packages/core/src/cibmangotree/tui/components/new_analysis.py b/packages/core/src/cibmangotree/tui/components/new_analysis.py index d73ef101..6a2b7ff3 100644 --- a/packages/core/src/cibmangotree/tui/components/new_analysis.py +++ b/packages/core/src/cibmangotree/tui/components/new_analysis.py @@ -10,7 +10,12 @@ column_automap, get_data_type_compatibility_score, ) -from cibmangotree.tui.tools import draw_box, prompts, smart_print_data_frame, wait_for_key +from cibmangotree.tui.tools import ( + draw_box, + prompts, + smart_print_data_frame, + wait_for_key, +) from .analysis_params import customize_analysis from .context import ViewContext diff --git a/packages/core/src/cibmangotree/tui/components/select_project.py b/packages/core/src/cibmangotree/tui/components/select_project.py index 5b73b8fb..73dd37da 100644 --- a/packages/core/src/cibmangotree/tui/components/select_project.py +++ b/packages/core/src/cibmangotree/tui/components/select_project.py @@ -1,6 +1,11 @@ from typing import TYPE_CHECKING, Optional -from cibmangotree.tui.tools import draw_box, prompts, smart_print_data_frame, wait_for_key +from cibmangotree.tui.tools import ( + draw_box, + prompts, + smart_print_data_frame, + wait_for_key, +) from .context import ViewContext diff --git a/packages/testing/src/cibmangotree_testing/context.py b/packages/testing/src/cibmangotree_testing/context.py index 03cabecf..b7b68181 100644 --- a/packages/testing/src/cibmangotree_testing/context.py +++ b/packages/testing/src/cibmangotree_testing/context.py @@ -6,14 +6,20 @@ from pydantic import BaseModel from cibmangotree.analyzer_interface import ParamValue, SecondaryAnalyzerInterface -from cibmangotree.analyzer_interface.context import AssetsReader, InputTableReader 
+from cibmangotree.analyzer_interface.context import ( + AssetsReader, + InputTableReader, +) from cibmangotree.analyzer_interface.context import ( PrimaryAnalyzerContext as BasePrimaryAnalyzerContext, ) from cibmangotree.analyzer_interface.context import ( SecondaryAnalyzerContext as BaseSecondaryAnalyzerContext, ) -from cibmangotree.analyzer_interface.context import TableReader, TableWriter +from cibmangotree.analyzer_interface.context import ( + TableReader, + TableWriter, +) class TestPrimaryAnalyzerContext(BasePrimaryAnalyzerContext): diff --git a/packages/tokenizers/basic/tests/test_basic_tokenizer.py b/packages/tokenizers/basic/tests/test_basic_tokenizer.py index 3f2ee256..3480fb7a 100644 --- a/packages/tokenizers/basic/tests/test_basic_tokenizer.py +++ b/packages/tokenizers/basic/tests/test_basic_tokenizer.py @@ -397,7 +397,7 @@ def test_very_long_text(self): def test_special_characters(self): """Test handling of special Unicode characters.""" tokenizer = BasicTokenizer() - text = "Hello\u00A0world\u2000test" # Non-breaking space and em space + text = "Hello\u00a0world\u2000test" # Non-breaking space and em space result = tokenizer.tokenize(text) expected = ["hello", "world", "test"] From 1119ccabcf81d200b23c2aeb3fa5df05bb19c0d8 Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Tue, 14 Oct 2025 12:48:34 -0400 Subject: [PATCH 14/24] fix: build errors --- pyinstaller.spec | 269 +++++++++++++++++++++++------------------------ run_app.py | 12 +++ 2 files changed, 141 insertions(+), 140 deletions(-) create mode 100644 run_app.py diff --git a/pyinstaller.spec b/pyinstaller.spec index 57d09df1..f7f5f971 100644 --- a/pyinstaller.spec +++ b/pyinstaller.spec @@ -1,14 +1,15 @@ # code: language=python # pyinstaller.spec # This file tells PyInstaller how to bundle the monorepo application with dynamic plugin discovery -from PyInstaller.utils.hooks import copy_metadata -from PyInstaller.building.api import EXE, PYZ -from 
PyInstaller.building.build_main import Analysis -import sys import os import site +import sys from pathlib import Path +from PyInstaller.building.api import EXE, PYZ +from PyInstaller.building.build_main import Analysis +from PyInstaller.utils.hooks import copy_metadata + # Import plugin discovery system try: import importlib.metadata as importlib_metadata @@ -20,6 +21,7 @@ except ImportError: # Plugin Discovery System # ============================================================================ + def discover_plugins(): """ Discover all installed plugins via entry points. @@ -28,10 +30,7 @@ def discover_plugins(): dict: Dictionary with 'analyzers' and 'tokenizers' keys containing lists of plugin metadata (name, module, attr). """ - plugins = { - 'analyzers': [], - 'tokenizers': [] - } + plugins = {"analyzers": [], "tokenizers": []} print("=" * 70) print("DISCOVERING PLUGINS FOR PYINSTALLER BUILD") @@ -41,19 +40,17 @@ def discover_plugins(): try: eps = importlib_metadata.entry_points() # Handle both old (dict) and new (SelectableGroups) API - if hasattr(eps, 'select'): - analyzer_eps = eps.select(group='cibmangotree.analyzers') + if hasattr(eps, "select"): + analyzer_eps = eps.select(group="cibmangotree.analyzers") else: - analyzer_eps = eps.get('cibmangotree.analyzers', []) + analyzer_eps = eps.get("cibmangotree.analyzers", []) for ep in analyzer_eps: - module_path = ep.value.split(':')[0] - attr_name = ep.value.split(':')[1] if ':' in ep.value else None - plugins['analyzers'].append({ - 'name': ep.name, - 'module': module_path, - 'attr': attr_name - }) + module_path = ep.value.split(":")[0] + attr_name = ep.value.split(":")[1] if ":" in ep.value else None + plugins["analyzers"].append( + {"name": ep.name, "module": module_path, "attr": attr_name} + ) print(f" [Analyzer] {ep.name:20s} -> {module_path}") except Exception as e: print(f" Warning: Could not discover analyzer plugins: {e}") @@ -61,25 +58,25 @@ def discover_plugins(): # Discover tokenizer plugins try: 
eps = importlib_metadata.entry_points() - if hasattr(eps, 'select'): - tokenizer_eps = eps.select(group='cibmangotree.tokenizers') + if hasattr(eps, "select"): + tokenizer_eps = eps.select(group="cibmangotree.tokenizers") else: - tokenizer_eps = eps.get('cibmangotree.tokenizers', []) + tokenizer_eps = eps.get("cibmangotree.tokenizers", []) for ep in tokenizer_eps: - module_path = ep.value.split(':')[0] - attr_name = ep.value.split(':')[1] if ':' in ep.value else None - plugins['tokenizers'].append({ - 'name': ep.name, - 'module': module_path, - 'attr': attr_name - }) + module_path = ep.value.split(":")[0] + attr_name = ep.value.split(":")[1] if ":" in ep.value else None + plugins["tokenizers"].append( + {"name": ep.name, "module": module_path, "attr": attr_name} + ) print(f" [Tokenizer] {ep.name:20s} -> {module_path}") except Exception as e: print(f" Warning: Could not discover tokenizer plugins: {e}") - print(f"\nTotal discovered: {len(plugins['analyzers'])} analyzers, " - f"{len(plugins['tokenizers'])} tokenizers") + print( + f"\nTotal discovered: {len(plugins['analyzers'])} analyzers, " + f"{len(plugins['tokenizers'])} tokenizers" + ) print("=" * 70) return plugins @@ -98,7 +95,7 @@ def generate_frozen_plugins(plugins): Returns: str: Path to the generated frozen plugins file """ - frozen_file = Path('packages/core/src/cibmangotree/_frozen_plugins.py') + frozen_file = Path("packages/core/src/cibmangotree/_frozen_plugins.py") content = '''""" Auto-generated frozen plugins for PyInstaller builds. @@ -108,19 +105,21 @@ This file is used when the application is frozen (packaged with PyInstaller) to provide explicit plugin imports, since entry points don't work in frozen apps. 
""" +import sys + # Analyzer plugins - mapping from plugin name to module path ANALYZER_PLUGINS = { ''' - for plugin in plugins['analyzers']: + for plugin in plugins["analyzers"]: content += f" '{plugin['name']}': '{plugin['module']}:{plugin['attr']}',\n" - content += '''}\n + content += """}\n # Tokenizer plugins - mapping from plugin name to module path TOKENIZER_PLUGINS = { -''' +""" - for plugin in plugins['tokenizers']: + for plugin in plugins["tokenizers"]: content += f" '{plugin['name']}': '{plugin['module']}:{plugin['attr']}',\n" content += '''}\n @@ -150,15 +149,15 @@ def get_plugin_hiddenimports(plugins): imports = [] # Add analyzer modules - for plugin in plugins['analyzers']: - base_module = plugin['module'].split('.')[0] - imports.append(plugin['module']) + for plugin in plugins["analyzers"]: + base_module = plugin["module"].split(".")[0] + imports.append(plugin["module"]) imports.append(base_module) # Add tokenizer modules - for plugin in plugins['tokenizers']: - base_module = plugin['module'].split('.')[0] - imports.append(plugin['module']) + for plugin in plugins["tokenizers"]: + base_module = plugin["module"].split(".")[0] + imports.append(plugin["module"]) imports.append(base_module) # Remove duplicates while preserving order @@ -180,7 +179,7 @@ site_packages_path = None block_cipher = None for site_path in site.getsitepackages(): - if 'site-packages' in site_path: + if "site-packages" in site_path: site_packages_path = site_path break @@ -212,126 +211,116 @@ plugin_imports = get_plugin_hiddenimports(discovered_plugins) # ============================================================================ a = Analysis( - ['cibmangotree.py'], # Entry point (imports from cibmangotree package) + ["run_app.py"], # Entry point wrapper with absolute imports pathex=[ - 'packages/core/src', # Core package source + "packages/core/src/cibmangotree", # Core package source ], binaries=[], datas=[ # Version file, if defined - *( - [('./VERSION', '.')] - if 
os.path.exists('VERSION') else [] - ), - + *([("./VERSION", ".")] if os.path.exists("VERSION") else []), # Inquirer depends on readchar as a hidden dependency that requires package metadata - *copy_metadata('readchar'), - + *copy_metadata("readchar"), # Static assets for web servers (from site-packages) - (os.path.join(site_packages_path, 'shiny/www'), 'shiny/www'), - (os.path.join(site_packages_path, 'shinywidgets/static'), 'shinywidgets/static'), - + (os.path.join(site_packages_path, "shiny/www"), "shiny/www"), + ( + os.path.join(site_packages_path, "shinywidgets/static"), + "shinywidgets/static", + ), # Application static assets (from monorepo) - ('packages/core/src/cibmangotree/app/web_static', 'cibmangotree/app/web_static'), - ('packages/core/src/cibmangotree/app/web_templates', 'cibmangotree/app/web_templates'), - + ( + "packages/core/src/cibmangotree/app/web_static", + "cibmangotree/app/web_static", + ), + ( + "packages/core/src/cibmangotree/app/web_templates", + "cibmangotree/app/web_templates", + ), # Include the frozen plugins file - (frozen_plugins_file, 'cibmangotree'), + (frozen_plugins_file, "cibmangotree"), ], hiddenimports=[ # Core package modules - 'cibmangotree', - 'cibmangotree.__main__', - 'cibmangotree.app', - 'cibmangotree.analyzer_interface', - 'cibmangotree.tui', - 'cibmangotree.tui.components', - 'cibmangotree.tui.tools', - 'cibmangotree.services', - 'cibmangotree.services.storage', - 'cibmangotree.services.data_import', - 'cibmangotree.services.tokenizers', - 'cibmangotree.context', - 'cibmangotree.meta', - 'cibmangotree.plugin_system', - 'cibmangotree.plugin_system.analyzer_loader', - 'cibmangotree.plugin_system.tokenizer_loader', - + "cibmangotree", + "cibmangotree.__main__", + "cibmangotree.app", + "cibmangotree.analyzer_interface", + "cibmangotree.tui", + "cibmangotree.tui.components", + "cibmangotree.tui.tools", + "cibmangotree.services", + "cibmangotree.services.storage", + "cibmangotree.services.data_import", + 
"cibmangotree.services.tokenizers", + "cibmangotree.context", + "cibmangotree.meta", + "cibmangotree.plugin_system", + "cibmangotree.plugin_system.analyzer_loader", + "cibmangotree.plugin_system.tokenizer_loader", # Dynamically discovered plugin modules *plugin_imports, - # Terminal UI dependencies - 'readchar', - 'inquirer', - 'rich', - 'colorama', - + "readchar", + "inquirer", + "rich", + "colorama", # Data processing - 'numpy', - 'numpy.core.multiarray', - 'polars', - 'pandas', - 'pyarrow', - + "numpy", + "numpy.core.multiarray", + "polars", + "pandas", + "pyarrow", # Visualization - 'plotly', - 'plotly.graph_objs', - + "plotly", + "plotly.graph_objs", # Web frameworks - 'dash', - 'dash.dependencies', - 'shiny', - 'shiny.ui', - 'shiny.server', - 'shinywidgets', - 'htmltools', - 'starlette', - 'starlette.middleware', - 'starlette.routing', - + "dash", + "dash.dependencies", + "shiny", + "shiny.ui", + "shiny.server", + "shinywidgets", + "htmltools", + "starlette", + "starlette.middleware", + "starlette.routing", # Web server - 'uvicorn', - 'uvicorn.logging', - 'uvicorn.loops', - 'uvicorn.loops.auto', - 'uvicorn.protocols', - 'uvicorn.protocols.http', - 'uvicorn.protocols.http.auto', - 'uvicorn.protocols.websockets', - 'uvicorn.protocols.websockets.auto', - 'uvicorn.lifespan', - 'uvicorn.lifespan.on', - 'asyncio', - 'websockets', - 'websockets.legacy', - 'websockets.legacy.server', - + "uvicorn", + "uvicorn.logging", + "uvicorn.loops", + "uvicorn.loops.auto", + "uvicorn.protocols", + "uvicorn.protocols.http", + "uvicorn.protocols.http.auto", + "uvicorn.protocols.websockets", + "uvicorn.protocols.websockets.auto", + "uvicorn.lifespan", + "uvicorn.lifespan.on", + "asyncio", + "websockets", + "websockets.legacy", + "websockets.legacy.server", # Markdown rendering (for Shiny) - 'linkify_it', - 'markdown_it', - 'mdit_py_plugins', - 'mdurl', - 'uc_micro', - + "linkify_it", + "markdown_it", + "mdit_py_plugins", + "mdurl", + "uc_micro", # Logging - 'pythonjsonlogger', - 
'pythonjsonlogger.jsonlogger', - + "pythonjsonlogger", + "pythonjsonlogger.jsonlogger", # Storage - 'tinydb', - 'platformdirs', - 'filelock', - + "tinydb", + "platformdirs", + "filelock", # Text processing - 'regex', - + "regex", # Data validation - 'pydantic', - 'pydantic.v1', - + "pydantic", + "pydantic.v1", # Import/Export - 'xlsxwriter', - 'fastexcel', + "xlsxwriter", + "fastexcel", ], hookspath=[], runtime_hooks=[], @@ -351,13 +340,13 @@ if sys.platform == "darwin": a.binaries, a.zipfiles, a.datas, - name='cibmangotree', # The name of the executable + name="cibmangotree", # The name of the executable debug=False, strip=True, upx=True, console=True, entitlements_file="./mango.entitlements", - codesign_identity=os.getenv('APPLE_APP_CERT_ID'), + codesign_identity=os.getenv("APPLE_APP_CERT_ID"), ) else: exe = EXE( @@ -366,7 +355,7 @@ else: a.binaries, a.zipfiles, a.datas, - name='cibmangotree', + name="cibmangotree", debug=False, strip=False, upx=True, diff --git a/run_app.py b/run_app.py new file mode 100644 index 00000000..1885c83c --- /dev/null +++ b/run_app.py @@ -0,0 +1,12 @@ +""" +PyInstaller entry point wrapper for CIB Mango Tree CLI. + +This wrapper uses absolute imports to avoid the "attempted relative import +with no known parent package" error that occurs when PyInstaller runs +__main__.py directly. 
+""" + +from cibmangotree.__main__ import main + +if __name__ == "__main__": + main() From 4a911cb882d22484174d0a6815f412ab1de4376a Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Tue, 14 Oct 2025 12:56:14 -0400 Subject: [PATCH 15/24] feat: implement plugin discovery system Replace placeholder plugin system with working implementation: - Add discover_analyzers() to load all analyzer plugins - Support both normal (entry points) and frozen (PyInstaller) environments - Integrate plugin discovery into main application bootstrap - Add structured logging throughout plugin loading - Handle both old and new importlib.metadata APIs This enables the app to automatically discover and load analyzer plugins from installed packages via the cibmangotree.analyzers entry point. --- packages/core/src/cibmangotree/__main__.py | 4 +- .../cibmangotree/plugin_system/__init__.py | 191 ++++++++++++------ 2 files changed, 130 insertions(+), 65 deletions(-) diff --git a/packages/core/src/cibmangotree/__main__.py b/packages/core/src/cibmangotree/__main__.py index 3a6c17c9..76a34a7d 100644 --- a/packages/core/src/cibmangotree/__main__.py +++ b/packages/core/src/cibmangotree/__main__.py @@ -76,8 +76,10 @@ def main(): # Initialize app context from .analyzer_interface.suite import AnalyzerSuite + from .plugin_system import discover_analyzers - suite = AnalyzerSuite() + all_analyzers = discover_analyzers() + suite = AnalyzerSuite(all_analyzers=all_analyzers) # Start the application splash() diff --git a/packages/core/src/cibmangotree/plugin_system/__init__.py b/packages/core/src/cibmangotree/plugin_system/__init__.py index c3a8ccf7..b3f76af9 100644 --- a/packages/core/src/cibmangotree/plugin_system/__init__.py +++ b/packages/core/src/cibmangotree/plugin_system/__init__.py @@ -1,85 +1,148 @@ """ Plugin System for CIB Mango Tree -This module will provide plugin discovery and loading functionality for: +This module provides plugin discovery and loading 
functionality for: - Analyzer plugins (via cibmangotree.analyzers entry point) - Tokenizer plugins (via cibmangotree.tokenizers entry point) -TODO: Phase 6 - Implement Plugin System --------------------------------------- -This is a placeholder for the plugin system that will be implemented -after the monorepo reorganization is complete. - -Planned Features: -1. Plugin Discovery - - Use importlib.metadata to discover installed plugins - - Scan entry points: cibmangotree.analyzers, cibmangotree.tokenizers - - Validate plugin interfaces - -2. Plugin Loading - - Lazy loading of plugins - - Error handling for malformed plugins - - Version compatibility checking - -3. Plugin Registry - - Central registry of available plugins - - Metadata: name, version, description, dependencies - - Conflict detection (duplicate names) - -4. Plugin Lifecycle - - Initialize plugins on demand - - Resource cleanup - - Hot reload support (future) - -Example Usage (planned): -```python -from cibmangotree.plugin_system import discover_plugins, load_plugin - -# Discover all analyzer plugins -analyzers = discover_plugins("cibmangotree.analyzers") - -# Load a specific plugin -hashtag_analyzer = load_plugin("cibmangotree.analyzers", "hashtags") -``` - -Entry Point Format: -```toml -# In analyzer plugin's pyproject.toml -[project.entry-points."cibmangotree.analyzers"] -hashtags = "cibmangotree_analyzer_hashtags:analyzer" -``` +Supports both normal (entry point) and frozen (PyInstaller) environments. """ -# Placeholder - will be implemented in Phase 6 -__all__ = [] +import importlib +import logging +import sys +from typing import TYPE_CHECKING, Union +try: + import importlib.metadata as importlib_metadata +except ImportError: + import importlib_metadata # type: ignore -def discover_plugins(entry_point_group: str) -> list: - """ - Discover plugins for a given entry point group. 
+if TYPE_CHECKING: + from cibmangotree.analyzer_interface import ( + AnalyzerDeclaration, + SecondaryAnalyzerDeclaration, + WebPresenterDeclaration, + ) - TODO: Implement using importlib.metadata.entry_points() +logger = logging.getLogger(__name__) - Args: - entry_point_group: Entry point group name (e.g., "cibmangotree.analyzers") +__all__ = ["discover_analyzers"] - Returns: - List of discovered plugin metadata - """ - raise NotImplementedError("Plugin system not yet implemented (Phase 6)") +def _is_frozen(): + """Check if running in a frozen (PyInstaller) environment.""" + return getattr(sys, "frozen", False) and hasattr(sys, "_MEIPASS") -def load_plugin(entry_point_group: str, plugin_name: str): - """ - Load a specific plugin by name. - TODO: Implement plugin loading and validation +def _load_plugin_interface(module_path: str, attr_name: str): + """ + Load a plugin interface by importing the module and calling the getter. Args: - entry_point_group: Entry point group name - plugin_name: Name of the plugin to load + module_path: Module path (e.g., "cibmangotree_analyzer_hashtags") + attr_name: Attribute name (e.g., "get_interface") + + Returns: + The result of calling the interface getter function + """ + try: + module = importlib.import_module(module_path) + getter = getattr(module, attr_name) + return getter() + except Exception as e: + logger.error( + f"Failed to load plugin {module_path}:{attr_name}", + exc_info=True, + extra={"plugin_module": module_path, "attr": attr_name, "error": str(e)}, + ) + return None + + +def discover_analyzers() -> list[ + Union[ + "AnalyzerDeclaration", + "SecondaryAnalyzerDeclaration", + "WebPresenterDeclaration", + ] +]: + """ + Discover all analyzer plugins and return their declarations. 
+ + Works in both normal and frozen (PyInstaller) environments: + - In normal mode: Uses importlib.metadata entry points + - In frozen mode: Uses _frozen_plugins.ANALYZER_PLUGINS Returns: - Loaded plugin object + List of all analyzer declarations (primary, secondary, and web presenters) """ - raise NotImplementedError("Plugin system not yet implemented (Phase 6)") + all_declarations = [] + + if _is_frozen(): + # Frozen environment: Load from _frozen_plugins + logger.info("Running in frozen environment, loading plugins from _frozen_plugins") + try: + from cibmangotree._frozen_plugins import ANALYZER_PLUGINS + + for plugin_name, plugin_spec in ANALYZER_PLUGINS.items(): + module_path, attr_name = plugin_spec.split(":") + logger.debug( + f"Loading frozen plugin: {plugin_name}", + extra={"plugin_module": module_path, "attr": attr_name}, + ) + interface_dict = _load_plugin_interface(module_path, attr_name) + if interface_dict: + # Each plugin returns {"base": ..., "web": ...} + if "base" in interface_dict: + all_declarations.append(interface_dict["base"]) + if "web" in interface_dict: + all_declarations.append(interface_dict["web"]) + except ImportError as e: + logger.error( + "Failed to import _frozen_plugins in frozen environment", + exc_info=True, + extra={"error": str(e)}, + ) + else: + # Normal environment: Load from entry points + logger.info("Running in normal environment, loading plugins from entry points") + try: + eps = importlib_metadata.entry_points() + # Handle both old (dict) and new (SelectableGroups) API + if hasattr(eps, "select"): + analyzer_eps = eps.select(group="cibmangotree.analyzers") + else: + # Old API - eps is a dict + analyzer_eps = eps.get("cibmangotree.analyzers", []) # type: ignore[attr-defined] + + for ep in analyzer_eps: + logger.debug( + f"Loading plugin from entry point: {ep.name}", + extra={"name": ep.name, "value": ep.value}, + ) + try: + # Load the entry point (calls get_interface()) + interface_dict = ep.load()() + # Each plugin 
returns {"base": ..., "web": ...} + if "base" in interface_dict: + all_declarations.append(interface_dict["base"]) + if "web" in interface_dict: + all_declarations.append(interface_dict["web"]) + except Exception as e: + logger.error( + f"Failed to load plugin entry point: {ep.name}", + exc_info=True, + extra={"name": ep.name, "error": str(e)}, + ) + except Exception as e: + logger.error( + "Failed to discover analyzer plugins", + exc_info=True, + extra={"error": str(e)}, + ) + + logger.info( + f"Discovered {len(all_declarations)} analyzer declarations", + extra={"count": len(all_declarations)}, + ) + return all_declarations From 6dbd7bea3df230fb26699ce14c2b436dc77a1b6e Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Tue, 14 Oct 2025 12:59:33 -0400 Subject: [PATCH 16/24] fix(temporal): correct import paths from temporal_base to base The temporal analyzer was using incorrect import paths that referenced a non-existent 'temporal_base' module. The correct path is 'base' to match the actual directory structure. This fixes ModuleNotFoundError during PyInstaller frozen builds. 
--- .../temporal/src/cibmangotree_analyzer_temporal/web/factory.py | 2 +- .../src/cibmangotree_analyzer_temporal/web/interface.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/factory.py b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/factory.py index 5a98d501..765e8976 100644 --- a/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/factory.py +++ b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/factory.py @@ -5,7 +5,7 @@ from cibmangotree.analyzer_interface.context import WebPresenterContext -from ..temporal_base.interface import ( +from ..base.interface import ( OUTPUT_COL_POST_COUNT, OUTPUT_COL_TIME_INTERVAL_END, OUTPUT_COL_TIME_INTERVAL_START, diff --git a/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/interface.py b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/interface.py index 17757065..bbf0207b 100644 --- a/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/interface.py +++ b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/web/interface.py @@ -1,6 +1,6 @@ from cibmangotree.analyzer_interface import WebPresenterInterface -from ..temporal_base import interface as temporal_interface +from ..base import interface as temporal_interface interface = WebPresenterInterface( id="time_interval_frequencies", From b12431ea916617759c61ec7706f05d46f77d9e37 Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Tue, 14 Oct 2025 13:05:44 -0400 Subject: [PATCH 17/24] fix(plugins): return declarations instead of interfaces from get_interface() All analyzer plugins were returning raw Interface objects instead of Declaration objects, causing Pydantic validation errors in AnalyzerSuite. Declaration objects wrap interfaces with entry point functions (main, factory). 
Changes: - Hashtags: Return hashtags/hashtags_web declarations - Temporal: Return temporal/temporal_web declarations - N-grams: Return ngrams/ngram_stats/ngrams_web declarations - Example: Return example_base/example_report/example_web declarations - Time Coordination: Create and return time_coordination declaration Updated plugin discovery to iterate through all dict values (base, web, stats, report) instead of checking specific keys. Fixes AnalyzerSuite validation error. Application now discovers 11 analyzers correctly (5 primary, 2 secondary, 4 web presenters). --- .../cibmangotree_analyzer_example/__init__.py | 14 ++++++------- .../__init__.py | 10 +++++----- .../cibmangotree_analyzer_ngrams/__init__.py | 14 ++++++------- .../__init__.py | 10 +++++----- .../__init__.py | 11 ++++++++-- .../cibmangotree/plugin_system/__init__.py | 20 +++++++++---------- 6 files changed, 43 insertions(+), 36 deletions(-) diff --git a/packages/analyzers/example/src/cibmangotree_analyzer_example/__init__.py b/packages/analyzers/example/src/cibmangotree_analyzer_example/__init__.py index 8fa95cb9..f4ef46cd 100644 --- a/packages/analyzers/example/src/cibmangotree_analyzer_example/__init__.py +++ b/packages/analyzers/example/src/cibmangotree_analyzer_example/__init__.py @@ -2,17 +2,17 @@ __version__ = "0.1.0" -from .base.interface import interface as base_interface -from .report.interface import interface as report_interface -from .web.interface import interface as web_interface +from .base import example_base +from .report import example_report +from .web import example_web def get_interface(): - """Return the analyzer interface for plugin discovery.""" + """Return the analyzer declarations for plugin discovery.""" return { - "base": base_interface, - "report": report_interface, - "web": web_interface, + "base": example_base, + "report": example_report, + "web": example_web, } diff --git a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/__init__.py 
b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/__init__.py index 4d66cfa3..e88b6603 100644 --- a/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/__init__.py +++ b/packages/analyzers/hashtags/src/cibmangotree_analyzer_hashtags/__init__.py @@ -2,15 +2,15 @@ __version__ = "0.1.0" -from .base.interface import interface as base_interface -from .web.interface import interface as web_interface +from .base import hashtags +from .web import hashtags_web def get_interface(): - """Return the analyzer interface for plugin discovery.""" + """Return the analyzer declarations for plugin discovery.""" return { - "base": base_interface, - "web": web_interface, + "base": hashtags, + "web": hashtags_web, } diff --git a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/__init__.py b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/__init__.py index 186e0a0f..a922e239 100644 --- a/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/__init__.py +++ b/packages/analyzers/ngrams/src/cibmangotree_analyzer_ngrams/__init__.py @@ -2,17 +2,17 @@ __version__ = "0.1.0" -from .base.interface import interface as base_interface -from .stats.interface import interface as stats_interface -from .web.interface import interface as web_interface +from .base import ngrams +from .stats import ngram_stats +from .web import ngrams_web def get_interface(): - """Return the analyzer interface for plugin discovery.""" + """Return the analyzer declarations for plugin discovery.""" return { - "base": base_interface, - "stats": stats_interface, - "web": web_interface, + "base": ngrams, + "stats": ngram_stats, + "web": ngrams_web, } diff --git a/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/__init__.py b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/__init__.py index a02dc6c2..c0b631fc 100644 --- a/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/__init__.py +++ 
b/packages/analyzers/temporal/src/cibmangotree_analyzer_temporal/__init__.py @@ -2,15 +2,15 @@ __version__ = "0.1.0" -from .base.interface import interface as base_interface -from .web.interface import interface as web_interface +from .base import temporal +from .web import temporal_web def get_interface(): - """Return the analyzer interface for plugin discovery.""" + """Return the analyzer declarations for plugin discovery.""" return { - "base": base_interface, - "web": web_interface, + "base": temporal, + "web": temporal_web, } diff --git a/packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination/__init__.py b/packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination/__init__.py index 5c1de6b8..695478a8 100644 --- a/packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination/__init__.py +++ b/packages/analyzers/time_coordination/src/cibmangotree_analyzer_time_coordination/__init__.py @@ -2,12 +2,19 @@ __version__ = "0.1.0" +from cibmangotree.analyzer_interface import AnalyzerDeclaration + from .interface import interface +from .main import main + +time_coordination = AnalyzerDeclaration( + interface=interface, main=main, is_distributed=True +) def get_interface(): - """Return the analyzer interface for plugin discovery.""" - return interface + """Return the analyzer declarations for plugin discovery.""" + return {"base": time_coordination} __all__ = ["get_interface", "__version__"] diff --git a/packages/core/src/cibmangotree/plugin_system/__init__.py b/packages/core/src/cibmangotree/plugin_system/__init__.py index b3f76af9..c65caf3d 100644 --- a/packages/core/src/cibmangotree/plugin_system/__init__.py +++ b/packages/core/src/cibmangotree/plugin_system/__init__.py @@ -92,11 +92,11 @@ def discover_analyzers() -> list[ ) interface_dict = _load_plugin_interface(module_path, attr_name) if interface_dict: - # Each plugin returns {"base": ..., "web": ...} - if "base" in interface_dict: - 
all_declarations.append(interface_dict["base"]) - if "web" in interface_dict: - all_declarations.append(interface_dict["web"]) + # Each plugin returns {"base": ..., "web": ..., "stats": ..., "report": ...} + # Collect all declaration objects from the dict + for key, declaration in interface_dict.items(): + if declaration is not None: + all_declarations.append(declaration) except ImportError as e: logger.error( "Failed to import _frozen_plugins in frozen environment", @@ -123,11 +123,11 @@ def discover_analyzers() -> list[ try: # Load the entry point (calls get_interface()) interface_dict = ep.load()() - # Each plugin returns {"base": ..., "web": ...} - if "base" in interface_dict: - all_declarations.append(interface_dict["base"]) - if "web" in interface_dict: - all_declarations.append(interface_dict["web"]) + # Each plugin returns {"base": ..., "web": ..., "stats": ..., "report": ...} + # Collect all declaration objects from the dict + for key, declaration in interface_dict.items(): + if declaration is not None: + all_declarations.append(declaration) except Exception as e: logger.error( f"Failed to load plugin entry point: {ep.name}", From 73183da1372528b34faf989bb1ae20ee18adb0ac Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Tue, 14 Oct 2025 13:06:33 -0400 Subject: [PATCH 18/24] format --- packages/core/src/cibmangotree/plugin_system/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/core/src/cibmangotree/plugin_system/__init__.py b/packages/core/src/cibmangotree/plugin_system/__init__.py index c65caf3d..e138e046 100644 --- a/packages/core/src/cibmangotree/plugin_system/__init__.py +++ b/packages/core/src/cibmangotree/plugin_system/__init__.py @@ -80,7 +80,9 @@ def discover_analyzers() -> list[ if _is_frozen(): # Frozen environment: Load from _frozen_plugins - logger.info("Running in frozen environment, loading plugins from _frozen_plugins") + logger.info( + "Running in frozen 
environment, loading plugins from _frozen_plugins" + ) try: from cibmangotree._frozen_plugins import ANALYZER_PLUGINS From e61c563043ecbd99440ce43ca95148dd4f21a9d6 Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Tue, 14 Oct 2025 13:50:03 -0400 Subject: [PATCH 19/24] use `uv` in action Signed-off-by: Joe Karow <58997957+JoeKarow@users.noreply.github.com> --- .github/workflows/docs.yml | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index e8fc4242..bd0f4249 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -14,7 +14,7 @@ on: workflow_dispatch: inputs: deploy_to: - description: "Deployment destination" + description: 'Deployment destination' type: choice options: - prod @@ -35,14 +35,26 @@ jobs: with: # ✅ Explicitly checkout the PR branch ref: ${{ github.event.pull_request.head.sha }} + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version-file: '.python-version' + + - name: Install UV + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + + - name: Install dependencies + run: uv sync - name: Build uses: Tiryoh/actions-mkdocs@v0.24.0 if: github.event.action != 'closed' with: # mkdocs_version: 'latest' # option #mkdocs_version: '1.1' # option - requirements: "requirements-mkdocs.txt" # option - configfile: "mkdocs.yml" # option + # requirements: "requirements-mkdocs.txt" # option + configfile: 'mkdocs.yml' # option - name: Fix site permissions if: github.event.action != 'closed' run: | From 3b91619055361056000640380fa5b4d65b5e8859 Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Tue, 14 Oct 2025 13:58:12 -0400 Subject: [PATCH 20/24] add `--all-extras` Signed-off-by: Joe Karow <58997957+JoeKarow@users.noreply.github.com> --- .github/workflows/docs.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
a/.github/workflows/docs.yml b/.github/workflows/docs.yml index bd0f4249..a415e364 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -44,9 +44,8 @@ jobs: uses: astral-sh/setup-uv@v5 with: enable-cache: true - - name: Install dependencies - run: uv sync + run: uv sync --all-extras - name: Build uses: Tiryoh/actions-mkdocs@v0.24.0 if: github.event.action != 'closed' From ee5d0101230a06ed7de305cd855fd8bbb82f0ca0 Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Tue, 14 Oct 2025 14:14:39 -0400 Subject: [PATCH 21/24] docs: update mkdocstring references for reorganized package structure Update all API documentation references to reflect the new module paths after repository reorganization. Changes include: - Core modules: cibmangotree.analyzer_interface, cibmangotree.app, cibmangotree.meta - Service modules: cibmangotree.services.{importing,preprocessing,storage,tokenizer} - TUI modules: cibmangotree.tui.{components,tools} - Testing module: cibmangotree_testing - Tokenizer packages: cibmangotree.services.tokenizer.core, cibmangotree_tokenizer_basic --- docs/reference/analyzer_interface.md | 2 +- docs/reference/app.md | 2 +- docs/reference/components.md | 2 +- docs/reference/importing.md | 2 +- docs/reference/meta.md | 2 +- docs/reference/preprocessing.md | 2 +- docs/reference/storage.md | 2 +- docs/reference/terminal_tools.md | 2 +- docs/reference/testing.md | 2 +- docs/reference/tokenizer.md | 4 +- mkdocs.yml | 66 ++++++++++++++-------------- 11 files changed, 44 insertions(+), 44 deletions(-) diff --git a/docs/reference/analyzer_interface.md b/docs/reference/analyzer_interface.md index 29111a53..b597198a 100644 --- a/docs/reference/analyzer_interface.md +++ b/docs/reference/analyzer_interface.md @@ -1,3 +1,3 @@ -:::analyzer_interface +:::cibmangotree.analyzer_interface options: show_submodules: true diff --git a/docs/reference/app.md b/docs/reference/app.md index 48daaa7a..4544838c 100644 --- 
a/docs/reference/app.md +++ b/docs/reference/app.md @@ -1,3 +1,3 @@ -:::app +:::cibmangotree.app options: show_submodules: true \ No newline at end of file diff --git a/docs/reference/components.md b/docs/reference/components.md index d790b8be..95f89766 100644 --- a/docs/reference/components.md +++ b/docs/reference/components.md @@ -1,3 +1,3 @@ -:::components +:::cibmangotree.tui.components options: show_submodules: true \ No newline at end of file diff --git a/docs/reference/importing.md b/docs/reference/importing.md index 2592c016..3eb352c6 100644 --- a/docs/reference/importing.md +++ b/docs/reference/importing.md @@ -1,3 +1,3 @@ -:::importing +:::cibmangotree.services.importing options: show_submodules: true diff --git a/docs/reference/meta.md b/docs/reference/meta.md index 32bf08ad..cad603a3 100644 --- a/docs/reference/meta.md +++ b/docs/reference/meta.md @@ -1,3 +1,3 @@ -:::meta +:::cibmangotree.meta options: show_submodules: true \ No newline at end of file diff --git a/docs/reference/preprocessing.md b/docs/reference/preprocessing.md index 9ddfbe50..87812058 100644 --- a/docs/reference/preprocessing.md +++ b/docs/reference/preprocessing.md @@ -1,3 +1,3 @@ -:::preprocessing +:::cibmangotree.services.preprocessing options: show_submodules: true \ No newline at end of file diff --git a/docs/reference/storage.md b/docs/reference/storage.md index f77c9ee8..2793a4dd 100644 --- a/docs/reference/storage.md +++ b/docs/reference/storage.md @@ -1,3 +1,3 @@ -:::storage +:::cibmangotree.services.storage options: show_submodules: true \ No newline at end of file diff --git a/docs/reference/terminal_tools.md b/docs/reference/terminal_tools.md index 2683e836..caa395ed 100644 --- a/docs/reference/terminal_tools.md +++ b/docs/reference/terminal_tools.md @@ -1,5 +1,5 @@ # Terminal tools -:::terminal_tools +:::cibmangotree.tui.tools options: show_submodules: true \ No newline at end of file diff --git a/docs/reference/testing.md b/docs/reference/testing.md index 
5bb1e308..74634f0b 100644 --- a/docs/reference/testing.md +++ b/docs/reference/testing.md @@ -1,3 +1,3 @@ -:::testing +:::cibmangotree_testing options: show_submodules: true \ No newline at end of file diff --git a/docs/reference/tokenizer.md b/docs/reference/tokenizer.md index 2b8bf177..69b21a57 100644 --- a/docs/reference/tokenizer.md +++ b/docs/reference/tokenizer.md @@ -1,7 +1,7 @@ -::: services.tokenizer.core +::: cibmangotree.services.tokenizer.core options: group_by_category: false -::: services.tokenizer.basic +::: cibmangotree_tokenizer_basic options: group_by_category: false diff --git a/mkdocs.yml b/mkdocs.yml index 30307361..155b18f8 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,41 +1,41 @@ site_name: CIB Mango Tree site_author: CIB Mango Tree -site_description: "An interactive python terminal UI wrapper for Mango Tree analyzers." +site_description: 'An interactive python terminal UI wrapper for Mango Tree analyzers.' site_url: https://cibmangotree.org repo_url: https://github.com/civictechdc/mango-tango-cli copyright: Copyright © 2025 CIB Mango Tree nav: - - "Home": - - "About": "index.md" - - "License": "license.md" - - "Guides": - - "Getting Started": - - "Overview": "guides/get-started/overview.md" - - "Installation": "guides/get-started/installation.md" - - "Contributing": - - "Contributor Workflow": "guides/contributing/contributing.md" - - "Implementing Analyzers": "guides/contributing/analyzers.md" - - "Dashboards": - - "Shiny": "guides/contributing/dashboards/shiny.md" - - "React (WIP)": "guides/contributing/dashboards/react.md" - - "Logging": "guides/contributing/logging.md" - - "Testing": "guides/contributing/testing.md" - - "Design Philosophy": - - "Architecture": "guides/design-philosophy/architecture.md" - - "Core Domain": "guides/design-philosophy/core-domain.md" - - "Edge Domain": "guides/design-philosophy/edge-domain.md" - - "Content Domain": "guides/design-philosophy/content-domain.md" - - "Reference (CLI)": - - "Analyzer Interface": 
"reference/analyzer_interface.md" - - "Components": "reference/components.md" - - "App": "reference/app.md" - - "Preprocessing": "reference/preprocessing.md" - - "Terminal Tools": "reference/terminal_tools.md" - - "Importing": "reference/importing.md" - - "Meta": "reference/meta.md" - - "Storage": "reference/storage.md" - - "Testing": "reference/testing.md" - - "Tokenizer": "reference/tokenizer.md" + - 'Home': + - 'About': 'index.md' + - 'License': 'license.md' + - 'Guides': + - 'Getting Started': + - 'Overview': 'guides/get-started/overview.md' + - 'Installation': 'guides/get-started/installation.md' + - 'Contributing': + - 'Contributor Workflow': 'guides/contributing/contributing.md' + - 'Implementing Analyzers': 'guides/contributing/analyzers.md' + - 'Dashboards': + - 'Shiny': 'guides/contributing/dashboards/shiny.md' + - 'React (WIP)': 'guides/contributing/dashboards/react.md' + - 'Logging': 'guides/contributing/logging.md' + - 'Testing': 'guides/contributing/testing.md' + - 'Design Philosophy': + - 'Architecture': 'guides/design-philosophy/architecture.md' + - 'Core Domain': 'guides/design-philosophy/core-domain.md' + - 'Edge Domain': 'guides/design-philosophy/edge-domain.md' + - 'Content Domain': 'guides/design-philosophy/content-domain.md' + - 'Reference (CLI)': + - 'Analyzer Interface': 'reference/analyzer_interface.md' + - 'Components': 'reference/components.md' + - 'App': 'reference/app.md' + - 'Preprocessing': 'reference/preprocessing.md' + - 'Terminal Tools': 'reference/terminal_tools.md' + - 'Importing': 'reference/importing.md' + - 'Meta': 'reference/meta.md' + - 'Storage': 'reference/storage.md' + - 'Testing': 'reference/testing.md' + - 'Tokenizer': 'reference/tokenizer.md' theme: name: material @@ -55,7 +55,7 @@ plugins: - mkdocstrings: handlers: python: - # paths: [src] # search packages in the src folder + paths: [packages] options: docstring_style: google extensions: From 7ba690739f4f5f0bcdb9352279c192cad10abd42 Mon Sep 17 00:00:00 2001 From: Joe 
Karow <58997957+JoeKarow@users.noreply.github.com> Date: Tue, 14 Oct 2025 14:21:40 -0400 Subject: [PATCH 22/24] docs: fix broken internal links after documentation reorganization Update all cross-reference links to reflect the new documentation structure where domain docs moved from guides/contributing/domains/ to guides/design-philosophy/. Changes: - Fix 28 broken links across 10 documentation files - Update domain links (core, edge, content) to point to design-philosophy/ - Update dashboard and analyzer links to point to contributing/ subdirectories - Fix contributing and architecture cross-references - Fix installation guide link to contributing workflow This resolves all mkdocs link validation warnings. --- docs/guides/contributing/analyzers.md | 6 +++--- docs/guides/contributing/contributing.md | 4 ++-- docs/guides/contributing/dashboards/react.md | 6 +++--- docs/guides/contributing/dashboards/shiny.md | 6 +++--- docs/guides/contributing/logging.md | 2 +- docs/guides/design-philosophy/architecture.md | 20 +++++++++---------- .../design-philosophy/content-domain.md | 4 ++-- docs/guides/design-philosophy/core-domain.md | 4 ++-- docs/guides/design-philosophy/edge-domain.md | 4 ++-- docs/guides/get-started/installation.md | 2 +- 10 files changed, 29 insertions(+), 29 deletions(-) diff --git a/docs/guides/contributing/analyzers.md b/docs/guides/contributing/analyzers.md index d268e4c9..fabdfd10 100644 --- a/docs/guides/contributing/analyzers.md +++ b/docs/guides/contributing/analyzers.md @@ -502,8 +502,8 @@ This creates a complete analysis pipeline that users can run through the applica Once you finish reading this it would be a good idea to review the sections for each domain. Might also be a good idea to review the sections that discuss implementing [Shiny](https://shiny.posit.co/py/), and [React](https://react.dev) dashboards. 
-- [Core Domain](./domains/core-domain.md) -- [Edge Domain](./domains/edge-domain.md) -- [Content Domain](./domains/content-domain.md) +- [Core Domain](../design-philosophy/core-domain.md) +- [Edge Domain](../design-philosophy/edge-domain.md) +- [Content Domain](../design-philosophy/content-domain.md) - [Shiny Dashboards](./dashboards/shiny.md) - [React Dashboards](./dashboards/react.md) diff --git a/docs/guides/contributing/contributing.md b/docs/guides/contributing/contributing.md index f01332a1..a7194c2e 100644 --- a/docs/guides/contributing/contributing.md +++ b/docs/guides/contributing/contributing.md @@ -1,4 +1,4 @@ -Before following this workflow please refer to our [**Getting Started**](./overview.md) page for instructions on installing dependencies and setting up your development environment. +Before following this workflow please refer to our [**Getting Started**](../get-started/overview.md) page for instructions on installing dependencies and setting up your development environment. # Contributor Workflow @@ -50,5 +50,5 @@ graph TD; # Next Steps -Once you finish reading this it's recommended to check out the [architecture](./architecture.md) section. +Once you finish reading this it's recommended to check out the [architecture](../design-philosophy/architecture.md) section. diff --git a/docs/guides/contributing/dashboards/react.md b/docs/guides/contributing/dashboards/react.md index d71a4128..bf5c7589 100644 --- a/docs/guides/contributing/dashboards/react.md +++ b/docs/guides/contributing/dashboards/react.md @@ -1625,9 +1625,9 @@ This comprehensive guide covers all aspects of building React dashboards for the After this section it would be a good idea to review the sections that discuss implementing [Shiny](https://shiny.posit.co/py/) dashboards. Although once you finish reading this it would also be a good idea to review the sections for each domain. 
-- [Core Domain](../domains/core-domain.md) -- [Edge Domain](../domains/edge-domain.md) -- [Content Domain](../domains/content-domain.md) +- [Core Domain](../../design-philosophy/core-domain.md) +- [Edge Domain](../../design-philosophy/edge-domain.md) +- [Content Domain](../../design-philosophy/content-domain.md) - [Shiny Dashboards](./shiny.md) diff --git a/docs/guides/contributing/dashboards/shiny.md b/docs/guides/contributing/dashboards/shiny.md index 351e86a7..8732a588 100644 --- a/docs/guides/contributing/dashboards/shiny.md +++ b/docs/guides/contributing/dashboards/shiny.md @@ -746,9 +746,9 @@ This comprehensive guide covers all aspects of building Shiny dashboards for you Once you finish reading section be a good idea to review the section that discuss implementing [React](https://react.dev) dashboards. Might also be a good idea to review the sections for each domain. -- [Core Domain](../domains/core-domain.md) -- [Edge Domain](../domains/edge-domain.md) -- [Content Domain](../domains/content-domain.md) +- [Core Domain](../../design-philosophy/core-domain.md) +- [Edge Domain](../../design-philosophy/edge-domain.md) +- [Content Domain](../../design-philosophy/content-domain.md) - [React Dashboards](./react.md) diff --git a/docs/guides/contributing/logging.md b/docs/guides/contributing/logging.md index 5b28a4f3..17074f9a 100644 --- a/docs/guides/contributing/logging.md +++ b/docs/guides/contributing/logging.md @@ -147,4 +147,4 @@ def test_my_function_logs_correctly(caplog): # Next Steps -Once you finish reading this it's recommended to check out the [architecture](./architecture.md) section. +Once you finish reading this it's recommended to check out the [architecture](../design-philosophy/architecture.md) section. 
diff --git a/docs/guides/design-philosophy/architecture.md b/docs/guides/design-philosophy/architecture.md index 6b8220cf..1897ee13 100644 --- a/docs/guides/design-philosophy/architecture.md +++ b/docs/guides/design-philosophy/architecture.md @@ -1,4 +1,4 @@ -Before contributing please refer to our [**Contributor Workflow**](./contributing.md) +Before contributing please refer to our [**Contributor Workflow**](../contributing/contributing.md) ## Application Design Overview The CIB 🥭 application is a terminal-based tool for performing data analysis and visualization. It is designed to be modular and extensible, allowing developers to contribute new analysis modules and visualization components while providing a consistent user experience around data import, preprocessing, and output generation. @@ -10,18 +10,18 @@ The architecture of the CIB 🥭 application is designed to address this problem ## Architecture Overview The application has three "domains": -- The [**Core**](../domains/core-domain.md) domain is responsible for workspace management, user flow, and integration of analysis runs and data import/export in a generic sense. It has three parts that correspond loosely to the MVC paradigm. +- The [**Core**](./core-domain.md) domain is responsible for workspace management, user flow, and integration of analysis runs and data import/export in a generic sense. It has three parts that correspond loosely to the MVC paradigm. - The **Application** defines the workspace logic and exposes generic capabilities for importing and exporting data as well as analyses and dashboards. This is the "controller" part. - The **Terminal** Components render the terminal interface and handle user input. This is the "view" part. - The **Storage IO** persists the workspace data and is responsible for reading and writing data. This is the "model" part. 
The core application provides the context necessary for the other domains to function in a way that allows them to be agnostic about the specifics of the workspace and user flow. -- The [**Edge**](../domains/edge-domain.md) domain is responsible for data import and export while being agnostic about the specific analysis being run. Currently, this consists of the **Importers** and the **Semantic Preprocessor**. +- The [**Edge**](./edge-domain.md) domain is responsible for data import and export while being agnostic about the specific analysis being run. Currently, this consists of the **Importers** and the **Semantic Preprocessor**. > Note that the Storage IO is currently responsible for data export, but we should consider moving this to the Edge domain to allow for more extensibility and looser coupling. -- The [**Content**](../domains/content-domain.md) domain is responsible for the actual data analysis and visualization and is agnostic about data import/export or workspace specifics. This consists of the **Analyzers** (both **Primary** and **Secondary**) as well as the **Web Presenters**. +- The [**Content**](./content-domain.md) domain is responsible for the actual data analysis and visualization and is agnostic about data import/export or workspace specifics. This consists of the **Analyzers** (both **Primary** and **Secondary**) as well as the **Web Presenters**. ```mermaid flowchart TD @@ -51,11 +51,11 @@ Talk to us on the [Civic Tech DC Slack workspace](https://civictechdc.slack.com) It would be recommended to review the sections for each domain, and the section for implementing analyzers. Might also be a good idea to review the sections that discuss implementing [Shiny](https://shiny.posit.co/py/), and [React](https://react.dev) dashboards. 
-- [Core Domain](../domains/core-domain.md) -- [Edge Domain](../domains/edge-domain.md) -- [Content Domain](../domains/content-domain.md) -- [Implementing Analyzers](../analyzers.md) -- [Shiny Dashboards](../dashboards/shiny.md) -- [React Dashboards](../dashboards/react.md) +- [Core Domain](./core-domain.md) +- [Edge Domain](./edge-domain.md) +- [Content Domain](./content-domain.md) +- [Implementing Analyzers](../contributing/analyzers.md) +- [Shiny Dashboards](../contributing/dashboards/shiny.md) +- [React Dashboards](../contributing/dashboards/react.md) diff --git a/docs/guides/design-philosophy/content-domain.md b/docs/guides/design-philosophy/content-domain.md index c399b5b6..7211262d 100644 --- a/docs/guides/design-philosophy/content-domain.md +++ b/docs/guides/design-philosophy/content-domain.md @@ -14,5 +14,5 @@ Once you finish reading this section it would be a good idea to review the other - [Core Domain](./core-domain.md) - [Edge Domain](./edge-domain.md) -- [Shiny Dashboards](../dashboards/shiny.md) -- [React Dashboards](../dashboards/react.md) +- [Shiny Dashboards](../contributing/dashboards/shiny.md) +- [React Dashboards](../contributing/dashboards/react.md) diff --git a/docs/guides/design-philosophy/core-domain.md b/docs/guides/design-philosophy/core-domain.md index 34be590a..10b208a3 100644 --- a/docs/guides/design-philosophy/core-domain.md +++ b/docs/guides/design-philosophy/core-domain.md @@ -636,5 +636,5 @@ Once you finish reading this section it would be a good idea to review the other - [Edge Domain](./edge-domain.md) - [Content Domain](./content-domain.md) -- [Shiny Dashboards](../dashboards/shiny.md) -- [React Dashboards](../dashboards/react.md) +- [Shiny Dashboards](../contributing/dashboards/shiny.md) +- [React Dashboards](../contributing/dashboards/react.md) diff --git a/docs/guides/design-philosophy/edge-domain.md b/docs/guides/design-philosophy/edge-domain.md index 7333306a..daeab5ec 100644 --- 
a/docs/guides/design-philosophy/edge-domain.md +++ b/docs/guides/design-philosophy/edge-domain.md @@ -192,5 +192,5 @@ Once you finish reading this section it would be a good idea to review the other - [Core Domain](./core-domain.md) - [Content Domain](./content-domain.md) -- [Shiny Dashboards](../dashboards/shiny.md) -- [React Dashboards](../dashboards/react.md) +- [Shiny Dashboards](../contributing/dashboards/shiny.md) +- [React Dashboards](../contributing/dashboards/react.md) diff --git a/docs/guides/get-started/installation.md b/docs/guides/get-started/installation.md index 7fdb1b96..e4e5746b 100644 --- a/docs/guides/get-started/installation.md +++ b/docs/guides/get-started/installation.md @@ -461,4 +461,4 @@ uv sync # Next Steps Once you have everything installed and running without any problems, -the next step is to check out the [Contributor Workflow](contributing.md) +the next step is to check out the [Contributor Workflow](../contributing/contributing.md) From 14b873a0587e5d248fa513a785bfac349ea964c9 Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Tue, 14 Oct 2025 14:33:32 -0400 Subject: [PATCH 23/24] docs: fix mkdocs/griffe documentation warnings Resolved 11 griffe warnings to ensure clean documentation builds: - Fixed parameter name mismatches in docstrings (declaration.py) - Added missing type annotations (prompts.py, patterns.py) - Corrected docstring indentation to follow 8-space continuation standard - Removed invalid 'Args: None' sections - Updated return type documentation for consistency --- .../analyzer_interface/declaration.py | 2 +- .../core/src/cibmangotree/tui/tools/prompts.py | 18 ++++++++---------- .../core/src/cibmangotree/tui/tools/utils.py | 13 +++++++------ .../cibmangotree_tokenizer_basic/patterns.py | 9 ++++++--- 4 files changed, 22 insertions(+), 20 deletions(-) diff --git a/packages/core/src/cibmangotree/analyzer_interface/declaration.py 
b/packages/core/src/cibmangotree/analyzer_interface/declaration.py index 1dc231b6..e62f8199 100644 --- a/packages/core/src/cibmangotree/analyzer_interface/declaration.py +++ b/packages/core/src/cibmangotree/analyzer_interface/declaration.py @@ -91,7 +91,7 @@ def __init__( modify the Dash app in the context to add whatever plotting interface the web presenter needs. - server_name (str): + name (str): The server name for the Dash app. Typically, you will use the global variable `__name__` here. diff --git a/packages/core/src/cibmangotree/tui/tools/prompts.py b/packages/core/src/cibmangotree/tui/tools/prompts.py index 43dd55f0..5a3c45d8 100644 --- a/packages/core/src/cibmangotree/tui/tools/prompts.py +++ b/packages/core/src/cibmangotree/tui/tools/prompts.py @@ -16,15 +16,12 @@ from string import ascii_uppercase -def get_drives(): +def get_drives() -> list[str]: """ Returns a list of the logically assigned drives on a windows system. - Args: - None - Returns: - list: A list of drive letters available and accessible on the system. + list[str]: A list of drive letters available and accessible on the system. """ drives = [] @@ -40,17 +37,18 @@ def get_drives(): def file_selector( message: str = "select a file", *, state: Optional[FileSelectorStateManager] = None -): +) -> Optional[str]: """Lets the user select a file from the filesystem. Args: message (str, optional): The prompt message. Defaults to "select a file". - initial_path (str, optional): Where to start the directory listing. - Defaults to current working directory. + state (FileSelectorStateManager, optional): State manager to track directory + navigation. If provided, the selector will start from the last known path. + Defaults to None (starts in current working directory). Returns: - (str, optional): The absolute path selected by the user, or None if the - user cancels the prompt. + str | None: The absolute path selected by the user, or None if the user + cancels the prompt. 
""" initial_dir = state and state.get_current_path() if initial_dir and not os.path.isdir(initial_dir): diff --git a/packages/core/src/cibmangotree/tui/tools/utils.py b/packages/core/src/cibmangotree/tui/tools/utils.py index 63e499a5..11840b1f 100644 --- a/packages/core/src/cibmangotree/tui/tools/utils.py +++ b/packages/core/src/cibmangotree/tui/tools/utils.py @@ -22,7 +22,7 @@ def wait_for_key(prompt: bool = False): Args: prompt (bool, optional): If true, a default text - `Press any key to continue` will be shown. Defaults to False. + `Press any key to continue` will be shown. Defaults to False. """ if prompt: print("Press any key to continue...", end="", flush=True) @@ -86,11 +86,12 @@ def draw_box(text: str, *, padding_spaces: int = 5, padding_lines: int = 1) -> s Args: text (str): The text to be drawn, may be multiline. - ANSI formatting and emojis are not supported, as they mess with - both the character count calculation and the monospace font. - - padding_spaces (int, optional): Extra spaces on either side of the longest line. Defaults to 5. - padding_lines (int, optional): Extra lines above and below the text. Defaults to 1. + ANSI formatting and emojis are not supported, as they mess with + both the character count calculation and the monospace font. + padding_spaces (int, optional): Extra spaces on either side of the longest line. + Defaults to 5. + padding_lines (int, optional): Extra lines above and below the text. + Defaults to 1. Returns: str: The text surrounded by a box. 
diff --git a/packages/tokenizers/basic/src/cibmangotree_tokenizer_basic/patterns.py b/packages/tokenizers/basic/src/cibmangotree_tokenizer_basic/patterns.py index 41073845..37f08393 100644 --- a/packages/tokenizers/basic/src/cibmangotree_tokenizer_basic/patterns.py +++ b/packages/tokenizers/basic/src/cibmangotree_tokenizer_basic/patterns.py @@ -7,7 +7,10 @@ """ import re -from typing import Any, Dict, List +from typing import TYPE_CHECKING, Any, Dict, List + +if TYPE_CHECKING: + from cibmangotree.services.tokenizer.core.types import TokenizerConfig # Try to use the more powerful regex module, fall back to re try: @@ -162,7 +165,7 @@ def get_pattern(self, pattern_name: str) -> Any: raise KeyError(f"Pattern '{pattern_name}' not found") return self._patterns[pattern_name] - def get_comprehensive_pattern(self, config) -> Any: + def get_comprehensive_pattern(self, config: "TokenizerConfig") -> Any: """ Build comprehensive tokenization pattern based on configuration. @@ -223,7 +226,7 @@ def get_comprehensive_pattern(self, config) -> Any: else: return re.compile(r"\S+", re.IGNORECASE) - def get_exclusion_pattern(self, config) -> Any: + def get_exclusion_pattern(self, config: "TokenizerConfig") -> Any: """ Build pattern to identify and skip excluded entities in text. 
From b3caad176cf74f33ebf596b5dff03d6d640bca92 Mon Sep 17 00:00:00 2001 From: Joe Karow <58997957+JoeKarow@users.noreply.github.com> Date: Tue, 14 Oct 2025 14:38:15 -0400 Subject: [PATCH 24/24] update build cmd Signed-off-by: Joe Karow <58997957+JoeKarow@users.noreply.github.com> --- .github/workflows/docs.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index a415e364..f11b790f 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -47,13 +47,14 @@ jobs: - name: Install dependencies run: uv sync --all-extras - name: Build - uses: Tiryoh/actions-mkdocs@v0.24.0 + # uses: Tiryoh/actions-mkdocs@v0.24.0 if: github.event.action != 'closed' - with: - # mkdocs_version: 'latest' # option - #mkdocs_version: '1.1' # option - # requirements: "requirements-mkdocs.txt" # option - configfile: 'mkdocs.yml' # option + run: uv run mkdocs build + # with: + # # mkdocs_version: 'latest' # option + # #mkdocs_version: '1.1' # option + # # requirements: "requirements-mkdocs.txt" # option + # configfile: 'mkdocs.yml' # option - name: Fix site permissions if: github.event.action != 'closed' run: |