diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 0a7f8dd..e4396ee 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -12,7 +12,7 @@ // Update 'VARIANT' to pick a Python version: 3, 3.9, 3.8, 3.7, 3.6. // Append -bullseye or -buster to pin to an OS version. // Use -bullseye variants on local on arm64/Apple Silicon. - "VARIANT": "3.7", + "VARIANT": "3.9", // Options "NODE_VERSION": "lts/*" } @@ -29,12 +29,13 @@ "python.linting.mypyPath": "/usr/local/py-utils/bin/mypy", "python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle", "python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle", - "python.linting.pylintPath": "/usr/local/py-utils/bin/pylint", + "python.linting.pylintPath": "/usr/local/py-utils/bin/pylint" }, // Add the IDs of extensions you want installed when the container is created. "extensions": [ "ms-python.python", - "ms-python.vscode-pylance" + "ms-python.vscode-pylance", + "elagil.pre-commit-helper" ], // Use 'forwardPorts' to make a list of ports inside the container available locally. 
"forwardPorts": [ diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 0f0801a..db981aa 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -7,11 +7,11 @@ env: LC_ALL: "C.UTF-8" jobs: pre-commit: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 strategy: matrix: - os: [ubuntu-20.04, macos-latest] - python-version: [3.7, 3.8] + os: [ubuntu-22.04, macos-latest] + python-version: [3.8, 3.9] steps: - uses: actions/checkout@v2 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index adeb725..6af63a6 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,12 +1,12 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.0.1 + rev: v4.4.0 hooks: - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace - repo: https://github.com/pre-commit/mirrors-yapf - rev: "v0.31.0" + rev: "v0.32.0" hooks: - id: yapf args: ["--style=.style.yapf", "--parallel", "--in-place"] @@ -15,7 +15,7 @@ repos: hooks: - id: seed-isort-config - repo: https://github.com/pycqa/isort - rev: 5.9.3 + rev: 5.12.0 hooks: - id: isort name: isort (python) diff --git a/picatrix/__init__.py b/picatrix/__init__.py index 19c7b3a..ecbe6c8 100644 --- a/picatrix/__init__.py +++ b/picatrix/__init__.py @@ -13,9 +13,12 @@ # limitations under the License. """Sets up Picatrix environment.""" -from typing import Optional, Text, Tuple +from typing import Dict, Optional, Text, Tuple + +import pandas as pd from .lib.namespace import ( + AccessorNamespaceTemplate, FeatureContext, FeatureNamespace, Function, @@ -112,6 +115,39 @@ def new_cell_magic(func: Function, name: Optional[Text] = None): px.add_cell_magic(func, name if name else func.__name__) +_accessor_namespaces: Dict[Text, AccessorNamespaceTemplate] = {} + + +def new_accessor_namespace( + name: Text, docstring: Optional[Text] = None) -> AccessorNamespaceTemplate: + """Adds a new Pandas DataFrame accessor namespace. 
+ + Function returns an AccessorNamespaceTemplate which exposes `.add_accessor` + that can be used to add functions (accessors) to the namespace. + Newly added namespace will later be available as `DataFrame.NAME` + and its accessors as `DataFrame.NAME.ACCESSOR`. + + Args: + name: name of the namespace + docstring: a string describing the functionalities of the namespace + + Returns: + AccessorNamespaceTemplate: a template to be used for spawning accessor + namespaces + """ + if not docstring: + docstring = f"Group of namespaces related to \"{name}\"" + + template = AccessorNamespaceTemplate(docstring) + reg = pd.api.extensions.register_dataframe_accessor # type: ignore + reg(name)(template.create) + + _accessor_namespaces[name] = template + return template + + # shouldn't be exported -del Optional, Text, Tuple # type: ignore -del FeatureContext, FeatureNamespace, Function, RootContext, RootNamespace, +# AccessorNamespaceTemplate and pd must stay bound: new_accessor_namespace +# resolves both at call time, so deleting them raises NameError there. +del Dict, Optional, Text, Tuple +del FeatureContext, FeatureNamespace, Function, RootContext, RootNamespace diff --git a/picatrix/lib/namespace.py b/picatrix/lib/namespace.py index 6c9963a..7b54fe8 100644 --- a/picatrix/lib/namespace.py +++ b/picatrix/lib/namespace.py @@ -14,6 +14,7 @@ """Types and functions defining Picatrix namespacing.""" from difflib import get_close_matches +from functools import wraps from inspect import cleandoc from types import SimpleNamespace from typing import ( @@ -96,16 +97,14 @@ def _as_df_record(name: Text, item: Any, with_doc: bool, return record -class Namespace(SimpleNamespace, Generic[A]): - """Key-value type of structure with items accessible as attributes.""" +class BareNamespace(SimpleNamespace, Generic[A]): + """Minimal key-value type of structure with items accessible as attributes.""" - name: Text __dict__: Dict[Text, A] - def __init__(self, name: Text, docstring: Text, **kwargs: A): + def __init__(self, docstring: Text, **kwargs: Any): super().__init__(**kwargs) self.__doc__ = cleandoc(docstring) - self.name = name def
__setattr__(self, key: Text, value: A): if not key.isidentifier(): @@ -123,7 +122,7 @@ def __setattr__(self, key: Text, value: A): def __getattr__(self, key: Text) -> A: if key in self: - return super().__getattr__(key) + return super().__getattribute__(key) else: - raise NamespaceKeyMissingError(key, self.keys()) + raise NamespaceKeyMissingError(key, iter(self.__dict__.keys())) @@ -139,6 +138,26 @@ def __iter__(self) -> Iterator[Text]: def __contains__(self, key: Text): return self.__dict__.__contains__(key) + def _add(self, key: Text, value: A): + """Adds a new value under the key. + + Raises: + NamespaceKeyExistsError: when required key already exists + NamespaceKeyError: when key is invalid, e.g. isn't Python identifier + """ + setattr(self, key, value) + + +class Namespace(BareNamespace[A]): + """Key-value type of structure with items accessible as attributes.""" + + name: Text + __dict__: Dict[Text, A] + + def __init__(self, name: Text, docstring: Text, **kwargs: A): + super().__init__(docstring, **kwargs) + self.name = name + def keys(self) -> Iterator[Text]: """Iterator over all of the keys in the namespace.""" return iter(self.__dict__.keys()) @@ -183,15 +202,6 @@ def search(self, keyword: Text) -> pandas.DataFrame: return df[df.Name.str.contains(keyword) | # type: ignore df.Docstring.str.contains(keyword)] # type: ignore - def _add(self, key: Text, value: A): - """Adds a new value under the key. - - Raises: - NamespaceKeyExistsError: when required key already exists - NamespaceKeyError: when key is invalid, e.g. isn't Python identifier - """ - setattr(self, key, value) - def get(self, key: Text, default: A) -> A: """Return the value for key if key is in the namespace, else default.""" if key in self: @@ -383,3 +393,78 @@ def add_namespace( ctx = FeatureContext(name=key, docstring=docstring) self._add(name, ctx) return ctx + + +PandasAccessor = Callable[..., Any] +"""Type representation of a pandas DataFrame accessor.
+ +The definition of this type should be `Callable[[pd.DataFrame, ...], Any]` +(meaning first argument is a DataFrame and rest is up to the implementer) +but it isn't allowed by Python typing system.""" + +PandasAccessorValidator = Callable[[pandas.DataFrame], bool] +"""Function validating if an accessor is applicable to a specific DataFrame.""" + +AccessorDef = Tuple[Text, PandasAccessorValidator, PandasAccessor] +"""Definition of the accessor, i.e. name, validator and the accessor itself.""" + + +def _accessor_wrapper(f: PandasAccessor, df: pandas.DataFrame): + + @wraps(f) + def _inner(*args: Any, **kwargs: Any): + return f(df, *args, **kwargs) + + return _inner + + +class AccessorNamespace(BareNamespace[PandasAccessor]): + """Namespace holding pandas DataFrame accessors.""" + + def __init__( + self, docstring: Text, df: pandas.DataFrame, fs: List[AccessorDef]): + super().__init__(docstring) + + valid = False + for name, validate, accessor in fs: + if validate(df): + self._add(name, _accessor_wrapper(accessor, df)) + valid = True + + if not valid: + raise AttributeError( + "DataFrame doesn't match requirements of any of the accessors.") + + +class AccessorNamespaceTemplate: + """Holds parameters to be used to create AccessorNamespace.""" + docstring: Text + functions: Dict[Text, Tuple[PandasAccessorValidator, PandasAccessor]] + + def __init__(self, docstring: Text): + self.docstring = docstring + self.functions = {} + + def add_accessor( + self, + name: Optional[Text] = None, + validator: PandasAccessorValidator = lambda _: True, + ) -> Callable[[PandasAccessor], PandasAccessor]: + """Decorator registering an accessor; returns the accessor unchanged.""" + + def _inner(accessor: PandasAccessor) -> PandasAccessor: + key = name if name else accessor.__name__ + + if key in self.functions: + raise KeyError(f"Accessor \"{key}\" already exists.") + + self.functions[key] = (validator, accessor) + return accessor + + return _inner + + def create(self, df: pandas.DataFrame) -> AccessorNamespace: + """Creates
AccessorNamespace.""" + fs = [(n, v, a) for n, (v, a) in self.functions.items()] + + return AccessorNamespace(self.docstring, df, fs) diff --git a/picatrix/lib/namespace_test.py b/picatrix/lib/namespace_test.py index 580f35a..8fa356b 100644 --- a/picatrix/lib/namespace_test.py +++ b/picatrix/lib/namespace_test.py @@ -21,7 +21,7 @@ import pandas as pd import pytest -from .namespace import Namespace, NamespaceKeyError +from .namespace import AccessorNamespaceTemplate, Namespace, NamespaceKeyError def test_invalid_key(): @@ -107,3 +107,94 @@ def test_to_frame(): ]) got = n.to_frame(with_doc=True) assert want.equals(got) + + +def test_accessor_template_add_accessor(): + """Test adding accessors to AccessorNamespaces through templates.""" + + ant = AccessorNamespaceTemplate("Some docstring") + ant.add_accessor("i_never_attach", validator=lambda _: False)(lambda x: x) + + with pytest.raises(AttributeError): + _ = ant.create( + pd.DataFrame.from_records( + [ + { + "name": "something", + "value": 11 + }, + { + "name": "anything", + "value": 15 + }, + ])) + + ant.add_accessor("i_always_attach", validator=lambda _: True)(lambda x: x) + ant.add_accessor( + "i_conditionally_attach", + validator=lambda df: "abracadabra" in df.columns)(lambda x: x) + + ns1 = ant.create( + pd.DataFrame.from_records( + [ + { + "name": "something", + "value": 11 + }, + { + "name": "anything", + "value": 15 + }, + ])) + + assert "i_always_attach" in ns1 + assert "i_never_attach" not in ns1 + assert "i_conditionally_attach" not in ns1 + + ns2 = ant.create( + pd.DataFrame.from_records( + [ + { + "name": "something", + "value": 11, + "abracadabra": "alakazam" + }, + { + "name": "anything", + "value": 15, + "abracadabra": "hocus pocus" + }, + ])) + + assert "i_always_attach" in ns2 + assert "i_never_attach" not in ns2 + assert "i_conditionally_attach" in ns2 + + +def test_accessor_template_call_accessor(): + """Test calling the accessor in AccessorNamespace created out of template.""" + + ant =
AccessorNamespaceTemplate("Some docstring") + ant.add_accessor( + "echo", validator=lambda _: True)(lambda df, a, b, c: (df, a, b, c)) + + df = pd.DataFrame.from_records( + [ + { + "name": "something", + "value": 11, + "abracadabra": "alakazam" + }, + { + "name": "anything", + "value": 15, + "abracadabra": "hocus pocus" + }, + ]) + a, b, c = 1, 2, 3 + + ns = ant.create(df) + + df_, a_, b_, c_ = ns.echo(a, b, c) + + assert df.equals(df_) and a == a_ and b == b_ and c == c_ diff --git a/picatrix/magics/timesketch.py b/picatrix/magics/timesketch.py index 9a3beb1..68c93e4 100644 --- a/picatrix/magics/timesketch.py +++ b/picatrix/magics/timesketch.py @@ -397,14 +397,14 @@ def timesketch_add_manual_event( 'Unable to convert date string, is it really in ISO 8601 format?') return {} try: - elements.CopyFromString(date_string) + elements.CopyFromStringRFC822(date_string) except ValueError: try: elements.CopyFromStringRFC1123(date_string) except ValueError: logger.error( - 'Unable to convert date string, needs to be in ISO 8601, 1123 or ' - 'in the format YYYY-MM-DD hh:mm:ss.######[+-]##:##') + 'Unable to convert date string, needs to be in ISO 8601, RFC 822' + ' or 1123') return {} date = elements.CopyToDateTimeStringISO8601() diff --git a/picatrix/modules/__init__.py b/picatrix/modules/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/picatrix/modules/formats.py b/picatrix/modules/formats.py new file mode 100644 index 0000000..839971a --- /dev/null +++ b/picatrix/modules/formats.py @@ -0,0 +1,87 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions and magics for handling popular data formats.""" + +import json as _json +from io import StringIO +from typing import Any, Dict, Text + +import pandas + +from picatrix import new_cell_magic + + +@new_cell_magic +def text(cell: Text) -> Text: + """Ingest the cell content as raw text without evaluating escape codes. + + Args: + cell: text to be loaded + + Returns: + cell text + """ + return cell + + +@new_cell_magic +def csv( + cell: Text, + sep: Text = ",", + quotechar: Text = "\"", + doublequote: bool = True, + escapechar: Text = "\\", +) -> pandas.DataFrame: + """Parse cell content as a CSV data and create a pandas DataFrame. + + The function uses pandas.read_csv function to parse the text: + https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html + + Args: + cell: text to be parsed as CSV + sep: Delimiter to use + quotechar: The character used to denote the start and end of a + quoted item. Quoted items can include the delimiter and it + will be ignored. + doublequote: a flag determining whether or not to interpret two + consecutive quotechar elements INSIDE a field as a single + quotechar element. + escapechar: One-character string used to escape other characters. + + Returns: + pandas DataFrame constructed from the CSV. + """ + return pandas.read_csv( # type: ignore + StringIO(cell), + sep=sep, + quotechar=quotechar, + doublequote=doublequote, + escapechar=escapechar, + ) + + +@new_cell_magic +def json(cell: Text) -> Dict[Text, Any]: + """Parse cell content as a JSON object and create a Python dict. 
+ + The function uses json.loads function to parse the text: + https://docs.python.org/3/library/json.html#json.loads + + Args: + cell: text to be parsed as JSON + + Returns: + a Python dict constructed from the JSON + """ + return _json.loads(cell) diff --git a/requirements_runtime.txt b/requirements_runtime.txt index cd3744b..db80bc0 100644 --- a/requirements_runtime.txt +++ b/requirements_runtime.txt @@ -6,6 +6,4 @@ jupyter-contrib-nbextensions>=0.5.1 jupyter-http-over-ws>=0.0.8 jupyter>=1.0.0 matplotlib>=2.2.0 -numpy>=1.19.0 -pandas>=1.1.3 scikit-learn>=1.0