diff --git a/environment.yaml b/environment.yaml new file mode 100644 index 0000000..b17c34f --- /dev/null +++ b/environment.yaml @@ -0,0 +1,526 @@ +name: base +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - _anaconda_depends=2023.09=py311_openblas_1 + - abseil-cpp=20211102.0=hc377ac9_0 + - aiobotocore=2.5.0=py311hca03da5_0 + - aiofiles=22.1.0=py311hca03da5_0 + - aiohttp=3.8.5=py311h80987f9_0 + - aioitertools=0.7.1=pyhd3eb1b0_0 + - aiosignal=1.2.0=pyhd3eb1b0_0 + - aiosqlite=0.18.0=py311hca03da5_0 + - alabaster=0.7.12=pyhd3eb1b0_0 + - anaconda-anon-usage=0.4.2=py311hd6b623d_0 + - anaconda-catalogs=0.2.0=py311hca03da5_0 + - anaconda-client=1.12.1=py311hca03da5_0 + - anaconda-cloud-auth=0.1.3=py311hca03da5_0 + - anaconda-navigator=2.5.1=py311hca03da5_0 + - anaconda-project=0.11.1=py311hca03da5_0 + - anyio=3.5.0=py311hca03da5_0 + - aom=3.6.0=h313beb8_0 + - appdirs=1.4.4=pyhd3eb1b0_0 + - applaunchservices=0.3.0=py311hca03da5_0 + - appnope=0.1.2=py311hca03da5_1001 + - appscript=1.1.2=py311h80987f9_0 + - argon2-cffi=21.3.0=pyhd3eb1b0_0 + - argon2-cffi-bindings=21.2.0=py311h80987f9_0 + - arrow=1.2.3=py311hca03da5_1 + - arrow-cpp=11.0.0=hc7aafb3_2 + - astroid=2.14.2=py311hca03da5_0 + - astropy=5.1=py311ha0d4635_0 + - asttokens=2.0.5=pyhd3eb1b0_0 + - async-timeout=4.0.2=py311hca03da5_0 + - atomicwrites=1.4.0=py_0 + - attrs=22.1.0=py311hca03da5_0 + - automat=20.2.0=py_0 + - autopep8=1.6.0=pyhd3eb1b0_1 + - aws-c-common=0.6.8=h80987f9_1 + - aws-c-event-stream=0.1.6=h313beb8_6 + - aws-checksums=0.1.11=h80987f9_2 + - aws-sdk-cpp=1.8.185=ha71a6ea_1 + - babel=2.11.0=py311hca03da5_0 + - backcall=0.2.0=pyhd3eb1b0_0 + - backports=1.1=pyhd3eb1b0_0 + - backports.functools_lru_cache=1.6.4=pyhd3eb1b0_0 + - backports.tempfile=1.0=pyhd3eb1b0_1 + - backports.weakref=1.0.post1=py_1 + - bcrypt=3.2.0=py311h80987f9_1 + - beautifulsoup4=4.12.2=py311hca03da5_0 + - binaryornot=0.4.4=pyhd3eb1b0_1 + - black=23.3.0=py311hca03da5_0 + - blas=1.0=openblas + - 
bleach=4.1.0=pyhd3eb1b0_0 + - blosc=1.21.3=h313beb8_0 + - bokeh=3.2.1=py311hb6e6a13_0 + - boltons=23.0.0=py311hca03da5_0 + - boost-cpp=1.73.0=h1a28f6b_12 + - botocore=1.29.76=py311hca03da5_0 + - bottleneck=1.3.5=py311ha0d4635_0 + - brotli=1.0.9=h1a28f6b_7 + - brotli-bin=1.0.9=h1a28f6b_7 + - brotlipy=0.7.0=py311h80987f9_1002 + - brunsli=0.1=hc377ac9_1 + - bzip2=1.0.8=h620ffc9_4 + - c-ares=1.19.1=h80987f9_0 + - c-blosc2=2.8.0=h313beb8_0 + - ca-certificates=2025.1.31=hf0a4a13_0 + - cctools=949.0.1=hc179dcd_25 + - cctools_osx-arm64=949.0.1=h332cad3_25 + - certifi=2025.1.31=pyhd8ed1ab_0 + - cffi=1.15.1=py311h80987f9_3 + - cfitsio=3.470=h7f6438f_7 + - chardet=4.0.0=py311hca03da5_1003 + - charls=2.2.0=hc377ac9_0 + - charset-normalizer=2.0.4=pyhd3eb1b0_0 + - click=8.0.4=py311hca03da5_0 + - cloudpickle=2.2.1=py311hca03da5_0 + - clyent=1.2.2=py311hca03da5_1 + - colorama=0.4.6=py311hca03da5_0 + - colorcet=3.0.1=py311hca03da5_0 + - comm=0.1.2=py311hca03da5_0 + - conda=23.7.4=py311hca03da5_0 + - conda-build=3.26.1=py311hca03da5_0 + - conda-content-trust=0.2.0=py311hca03da5_0 + - conda-index=0.3.0=py311hca03da5_0 + - conda-libmamba-solver=23.7.0=py311hca03da5_0 + - conda-pack=0.6.0=pyhd3eb1b0_0 + - conda-package-handling=2.2.0=py311hca03da5_0 + - conda-package-streaming=0.9.0=py311hca03da5_0 + - conda-repo-cli=1.0.75=py311hca03da5_0 + - conda-token=0.4.0=pyhd3eb1b0_0 + - conda-verify=3.4.2=py_1 + - constantly=15.1.0=py311hca03da5_0 + - contourpy=1.0.5=py311h48ca7d4_0 + - cookiecutter=1.7.3=pyhd3eb1b0_0 + - cryptography=41.0.3=py311hd4332d6_0 + - cssselect=1.1.0=pyhd3eb1b0_0 + - curl=8.2.1=h02f6b3c_0 + - cycler=0.11.0=pyhd3eb1b0_0 + - cyrus-sasl=2.1.28=h9131b1a_1 + - cytoolz=0.12.0=py311h80987f9_0 + - dask=2023.6.0=py311hca03da5_0 + - dask-core=2023.6.0=py311hca03da5_0 + - datasets=2.12.0=py311hca03da5_0 + - datashader=0.15.2=py311hca03da5_0 + - datashape=0.5.4=py311hca03da5_1 + - dav1d=1.2.1=h80987f9_0 + - debugpy=1.6.7=py311h313beb8_0 + - decorator=5.1.1=pyhd3eb1b0_0 + - 
defusedxml=0.7.1=pyhd3eb1b0_0 + - diff-match-patch=20200713=pyhd3eb1b0_0 + - dill=0.3.6=py311hca03da5_0 + - distributed=2023.6.0=py311hca03da5_0 + - docstring-to-markdown=0.11=py311hca03da5_0 + - docutils=0.18.1=py311hca03da5_3 + - entrypoints=0.4=py311hca03da5_0 + - et_xmlfile=1.1.0=py311hca03da5_0 + - executing=0.8.3=pyhd3eb1b0_0 + - filelock=3.9.0=py311hca03da5_0 + - flake8=6.0.0=py311hca03da5_0 + - flask=2.2.2=py311hca03da5_0 + - fmt=9.1.0=h48ca7d4_0 + - fonttools=4.25.0=pyhd3eb1b0_0 + - freetype=2.12.1=h1192e45_0 + - frozenlist=1.3.3=py311h80987f9_0 + - fsspec=2023.4.0=py311hca03da5_0 + - future=0.18.3=py311hca03da5_0 + - gensim=4.3.0=py311h6956b77_0 + - gettext=0.21.0=h13f89a0_1 + - gflags=2.2.2=hc377ac9_0 + - giflib=5.2.1=h80987f9_3 + - glib=2.69.1=h514c7bf_2 + - glob2=0.7=pyhd3eb1b0_0 + - glog=0.5.0=hc377ac9_0 + - gmp=6.2.1=hc377ac9_3 + - gmpy2=2.1.2=py311h40f64dc_0 + - greenlet=2.0.1=py311h313beb8_0 + - grpc-cpp=1.48.2=hc60591f_1 + - gst-plugins-base=1.14.1=h313beb8_1 + - gstreamer=1.14.1=h80987f9_1 + - h5py=3.9.0=py311hba6ad2f_0 + - hdf5=1.12.1=h05c076b_3 + - heapdict=1.0.1=pyhd3eb1b0_0 + - holoviews=1.17.1=py311hca03da5_0 + - huggingface_hub=0.15.1=py311hca03da5_0 + - hvplot=0.8.4=py311hca03da5_0 + - hyperlink=21.0.0=pyhd3eb1b0_0 + - icu=68.1=hc377ac9_0 + - idna=3.4=py311hca03da5_0 + - imagecodecs=2023.1.23=py311h5e7c512_0 + - imageio=2.31.1=py311hca03da5_0 + - imagesize=1.4.1=py311hca03da5_0 + - imbalanced-learn=0.10.1=py311hca03da5_1 + - importlib-metadata=6.0.0=py311hca03da5_0 + - importlib_metadata=6.0.0=hd3eb1b0_0 + - incremental=21.3.0=pyhd3eb1b0_0 + - inflection=0.5.1=py311hca03da5_0 + - iniconfig=1.1.1=pyhd3eb1b0_0 + - intake=0.6.8=py311hca03da5_0 + - intervaltree=3.1.0=pyhd3eb1b0_0 + - ipykernel=6.25.0=py311hb6e6a13_0 + - ipython=8.15.0=py311hca03da5_0 + - ipython_genutils=0.2.0=pyhd3eb1b0_1 + - ipywidgets=8.0.4=py311hca03da5_0 + - isort=5.9.3=pyhd3eb1b0_0 + - itemadapter=0.3.0=pyhd3eb1b0_0 + - itemloaders=1.0.4=pyhd3eb1b0_1 + - 
itsdangerous=2.0.1=pyhd3eb1b0_0 + - jaraco.classes=3.2.1=pyhd3eb1b0_0 + - jedi=0.18.1=py311hca03da5_1 + - jellyfish=1.0.1=py311h15d1925_0 + - jinja2=3.1.2=py311hca03da5_0 + - jinja2-time=0.2.0=pyhd3eb1b0_3 + - jmespath=0.10.0=pyhd3eb1b0_0 + - joblib=1.2.0=py311hca03da5_0 + - jpeg=9e=h80987f9_1 + - jq=1.6=h1a28f6b_1 + - json5=0.9.6=pyhd3eb1b0_0 + - jsonpatch=1.32=pyhd3eb1b0_0 + - jsonpointer=2.1=pyhd3eb1b0_0 + - jsonschema=4.17.3=py311hca03da5_0 + - jupyter=1.0.0=py311hca03da5_8 + - jupyter_client=7.4.9=py311hca03da5_0 + - jupyter_console=6.6.3=py311hca03da5_0 + - jupyter_core=5.3.0=py311hca03da5_0 + - jupyter_events=0.6.3=py311hca03da5_0 + - jupyter_server=1.23.4=py311hca03da5_0 + - jupyter_server_fileid=0.9.0=py311hca03da5_0 + - jupyter_server_ydoc=0.8.0=py311hca03da5_1 + - jupyter_ydoc=0.2.4=py311hca03da5_0 + - jupyterlab=3.6.3=py311hca03da5_0 + - jupyterlab_pygments=0.1.2=py_0 + - jupyterlab_server=2.22.0=py311hca03da5_0 + - jupyterlab_widgets=3.0.5=py311hca03da5_0 + - jxrlib=1.1=h1a28f6b_2 + - kaleido-core=0.2.1=h80987f9_0 + - keyring=23.13.1=py311hca03da5_0 + - kiwisolver=1.4.4=py311h313beb8_0 + - krb5=1.20.1=hf3e1bf2_1 + - krona=2.8.1=pl5321hdfd78af_1 + - lazy-object-proxy=1.6.0=py311h80987f9_0 + - lazy_loader=0.2=py311hca03da5_0 + - lcms2=2.12=hba8e193_0 + - ld64=530=hb29bf3f_25 + - ld64_osx-arm64=530=h001ce53_25 + - ldid=2.1.5=h20b2a84_3 + - lerc=3.0=hc377ac9_0 + - libaec=1.0.4=hc377ac9_1 + - libarchive=3.6.2=h62fee54_2 + - libavif=0.11.1=h80987f9_0 + - libboost=1.73.0=h49e8a49_12 + - libbrotlicommon=1.0.9=h1a28f6b_7 + - libbrotlidec=1.0.9=h1a28f6b_7 + - libbrotlienc=1.0.9=h1a28f6b_7 + - libclang=14.0.6=default_h1b80db6_1 + - libclang13=14.0.6=default_h24352ff_1 + - libcurl=8.2.1=h3e2b118_0 + - libcxx=14.0.6=h848a8c0_0 + - libdeflate=1.17=h80987f9_0 + - libedit=3.1.20221030=h80987f9_0 + - libev=4.33=h1a28f6b_1 + - libevent=2.1.12=h02f6b3c_1 + - libffi=3.4.4=hca03da5_0 + - libgfortran=5.0.0=11_3_0_hca03da5_28 + - libgfortran5=11.3.0=h009349e_28 + - 
libiconv=1.16=h1a28f6b_2 + - liblief=0.12.3=h313beb8_0 + - libllvm14=14.0.6=h7ec7a93_3 + - libmamba=1.5.1=h15e39b3_0 + - libmambapy=1.5.1=py311h1c5506f_0 + - libnghttp2=1.52.0=h62f6fdd_1 + - libopenblas=0.3.21=h269037a_0 + - libpng=1.6.39=h80987f9_0 + - libpq=12.15=h02f6b3c_1 + - libprotobuf=3.20.3=h514c7bf_0 + - libsodium=1.0.18=h1a28f6b_0 + - libsolv=0.7.24=h514c7bf_0 + - libspatialindex=1.9.3=hc377ac9_0 + - libssh2=1.10.0=h02f6b3c_2 + - libthrift=0.15.0=h73c2103_2 + - libtiff=4.5.1=h313beb8_0 + - libwebp=1.3.2=ha3663a8_0 + - libwebp-base=1.3.2=h80987f9_0 + - libxml2=2.10.4=h372ba2a_0 + - libxslt=1.1.37=habca612_0 + - libzopfli=1.0.3=hc377ac9_0 + - linkify-it-py=2.0.0=py311hca03da5_0 + - llvm-openmp=14.0.6=hc6e5704_0 + - llvmlite=0.40.0=py311h514c7bf_0 + - locket=1.0.0=py311hca03da5_0 + - lxml=4.9.3=py311h50ffb84_0 + - lz4=4.3.2=py311h80987f9_0 + - lz4-c=1.9.4=h313beb8_0 + - lzo=2.10=h1a28f6b_2 + - markdown=3.4.1=py311hca03da5_0 + - markdown-it-py=2.2.0=py311hca03da5_1 + - markupsafe=2.1.1=py311h80987f9_0 + - mathjax=2.7.5=hca03da5_0 + - matplotlib=3.7.2=py311hca03da5_0 + - matplotlib-base=3.7.2=py311h7aedaa7_0 + - matplotlib-inline=0.1.6=py311hca03da5_0 + - mccabe=0.7.0=pyhd3eb1b0_0 + - mdit-py-plugins=0.3.0=py311hca03da5_0 + - mdurl=0.1.0=py311hca03da5_0 + - mistune=0.8.4=py311h80987f9_1000 + - more-itertools=8.12.0=pyhd3eb1b0_0 + - mpc=1.1.0=h8c48613_1 + - mpfr=4.0.2=h695f6f0_1 + - mpmath=1.3.0=py311hca03da5_0 + - msgpack-python=1.0.3=py311h48ca7d4_0 + - multidict=6.0.2=py311h80987f9_0 + - multipledispatch=0.6.0=py311hca03da5_0 + - multiprocess=0.70.14=py311hca03da5_0 + - munkres=1.1.4=py_0 + - mypy_extensions=1.0.0=py311hca03da5_0 + - mysql=5.7.24=ha71a6ea_2 + - navigator-updater=0.4.0=py311hca03da5_1 + - nbclassic=0.5.5=py311hca03da5_0 + - nbclient=0.5.13=py311hca03da5_0 + - nbconvert=6.5.4=py311hca03da5_0 + - nbformat=5.9.2=py311hca03da5_0 + - ncurses=6.4=h313beb8_0 + - nest-asyncio=1.5.6=py311hca03da5_0 + - networkx=3.1=py311hca03da5_0 + - 
nltk=3.8.1=py311hca03da5_0 + - notebook=6.5.4=py311hca03da5_1 + - notebook-shim=0.2.2=py311hca03da5_0 + - numba=0.57.1=py311h7aedaa7_0 + - numexpr=2.8.4=py311h6dc990b_1 + - numpy=1.24.3=py311hb57d4eb_0 + - numpy-base=1.24.3=py311h1d85a46_0 + - numpydoc=1.5.0=py311hca03da5_0 + - oniguruma=6.9.7.1=h1a28f6b_0 + - openjpeg=2.3.0=h7a6adac_2 + - openpyxl=3.0.10=py311h80987f9_0 + - openssl=3.4.1=h81ee809_0 + - orc=1.7.4=hdca1487_1 + - pandas=2.0.3=py311h7aedaa7_0 + - pandoc=3.6.2=hce30654_0 + - pandocfilters=1.5.0=pyhd3eb1b0_0 + - panel=1.2.3=py311hca03da5_0 + - param=1.13.0=py311hca03da5_0 + - parsel=1.6.0=py311hca03da5_0 + - parso=0.8.3=pyhd3eb1b0_0 + - partd=1.4.0=py311hca03da5_0 + - patch=2.7.6=h1a28f6b_1001 + - pathlib=1.0.1=pyhd3eb1b0_1 + - pathspec=0.10.3=py311hca03da5_0 + - patsy=0.5.3=py311hca03da5_0 + - pcre=8.45=hc377ac9_0 + - pcre2=10.42=hb066dcc_0 + - pep8=1.7.1=py311hca03da5_1 + - perl=5.32.1=7_h4614cfb_perl5 + - pexpect=4.8.0=pyhd3eb1b0_3 + - pickleshare=0.7.5=pyhd3eb1b0_1003 + - pillow=10.4.0=py311h80987f9_0 + - pip=23.2.1=py311hca03da5_0 + - pkce=1.0.3=py311hca03da5_0 + - pkginfo=1.9.6=py311hca03da5_0 + - platformdirs=3.10.0=py311hca03da5_0 + - plotly=5.9.0=py311hca03da5_0 + - pluggy=1.0.0=py311hca03da5_1 + - ply=3.11=py311hca03da5_0 + - poyo=0.5.0=pyhd3eb1b0_0 + - prometheus_client=0.14.1=py311hca03da5_0 + - prompt-toolkit=3.0.36=py311hca03da5_0 + - prompt_toolkit=3.0.36=hd3eb1b0_0 + - protego=0.1.16=py_0 + - psutil=5.9.0=py311h80987f9_0 + - ptyprocess=0.7.0=pyhd3eb1b0_2 + - pure_eval=0.2.2=pyhd3eb1b0_0 + - py-cpuinfo=8.0.0=pyhd3eb1b0_1 + - py-lief=0.12.3=py311h313beb8_0 + - pyarrow=11.0.0=py311h7575258_1 + - pyasn1=0.4.8=pyhd3eb1b0_0 + - pyasn1-modules=0.2.8=py_0 + - pybind11-abi=4=hd3eb1b0_1 + - pycodestyle=2.10.0=py311hca03da5_0 + - pycosat=0.6.4=py311h80987f9_0 + - pycparser=2.21=pyhd3eb1b0_0 + - pyct=0.5.0=py311hca03da5_0 + - pycurl=7.45.2=py311h02f6b3c_1 + - pydantic=1.10.8=py311h80987f9_0 + - pydispatcher=2.0.5=py311hca03da5_2 + - 
pydocstyle=6.3.0=py311hca03da5_0 + - pyerfa=2.0.0=py311h80987f9_0 + - pyflakes=3.0.1=py311hca03da5_0 + - pygments=2.15.1=py311hca03da5_1 + - pyjwt=2.4.0=py311hca03da5_0 + - pylint=2.16.2=py311hca03da5_0 + - pylint-venv=2.3.0=py311hca03da5_0 + - pyls-spyder=0.4.0=pyhd3eb1b0_0 + - pyobjc-core=9.0=py311h3eb5a62_1 + - pyobjc-framework-cocoa=9.0=py311hb094c41_0 + - pyobjc-framework-coreservices=9.0=py311hdd8dd1f_0 + - pyobjc-framework-fsevents=9.0=py311hca03da5_0 + - pyodbc=4.0.34=py311h313beb8_0 + - pyopenssl=23.2.0=py311hca03da5_0 + - pyparsing=3.0.9=py311hca03da5_0 + - pyqt=5.15.7=py311h313beb8_0 + - pyqt5-sip=12.11.0=py311h313beb8_0 + - pyqtwebengine=5.15.7=py311h313beb8_0 + - pyrsistent=0.18.0=py311h80987f9_0 + - pysocks=1.7.1=py311hca03da5_0 + - pytables=3.8.0=py311he080bb3_3 + - pytest=7.4.0=py311hca03da5_0 + - python=3.11.5=hb885b13_0 + - python-dateutil=2.8.2=pyhd3eb1b0_0 + - python-dotenv=0.21.0=py311hca03da5_0 + - python-fastjsonschema=2.16.2=py311hca03da5_0 + - python-json-logger=2.0.7=py311hca03da5_0 + - python-kaleido=0.2.1=py311hca03da5_0 + - python-libarchive-c=2.9=pyhd3eb1b0_1 + - python-lmdb=1.4.1=py311h313beb8_0 + - python-lsp-black=1.2.1=py311hca03da5_0 + - python-lsp-jsonrpc=1.0.0=pyhd3eb1b0_0 + - python-lsp-server=1.7.2=py311hca03da5_0 + - python-slugify=5.0.2=pyhd3eb1b0_0 + - python-snappy=0.6.1=py311h313beb8_0 + - python-tzdata=2023.3=pyhd3eb1b0_0 + - python-xxhash=2.0.2=py311h80987f9_1 + - python.app=3=py311h80987f9_0 + - pytoolconfig=1.2.5=py311hca03da5_1 + - pytz=2023.3.post1=py311hca03da5_0 + - pyviz_comms=2.3.0=py311hca03da5_0 + - pywavelets=1.4.1=py311h80987f9_0 + - pyyaml=6.0=py311h80987f9_1 + - pyzmq=23.2.0=py311h313beb8_0 + - qdarkstyle=3.0.2=pyhd3eb1b0_0 + - qstylizer=0.2.2=py311hca03da5_0 + - qt-main=5.15.2=h9b4df51_9 + - qt-webengine=5.15.9=h2903aaf_7 + - qtawesome=1.2.2=py311hca03da5_0 + - qtconsole=5.4.2=py311hca03da5_0 + - qtpy=2.2.0=py311hca03da5_0 + - qtwebkit=5.212=h19f419d_5 + - queuelib=1.5.0=py311hca03da5_0 + - 
re2=2022.04.01=hc377ac9_0 + - readline=8.2=h1a28f6b_0 + - regex=2022.7.9=py311h80987f9_0 + - reproc=14.2.4=hc377ac9_1 + - reproc-cpp=14.2.4=hc377ac9_1 + - requests=2.31.0=py311hca03da5_0 + - requests-file=1.5.1=pyhd3eb1b0_0 + - requests-toolbelt=1.0.0=py311hca03da5_0 + - responses=0.13.3=pyhd3eb1b0_0 + - rfc3339-validator=0.1.4=py311hca03da5_0 + - rfc3986-validator=0.1.1=py311hca03da5_0 + - rope=1.7.0=py311hca03da5_0 + - rtree=1.0.1=py311hca03da5_0 + - ruamel.yaml=0.17.21=py311h80987f9_0 + - ruamel_yaml=0.17.21=py311h80987f9_0 + - s3fs=2023.4.0=py311hca03da5_0 + - safetensors=0.3.2=py311hf0e4da2_0 + - scikit-image=0.20.0=py311h313beb8_0 + - scikit-learn=1.3.0=py311h7aedaa7_0 + - scipy=1.11.1=py311hc76d9b0_0 + - scrapy=2.8.0=py311hca03da5_0 + - send2trash=1.8.0=pyhd3eb1b0_1 + - service_identity=18.1.0=pyhd3eb1b0_1 + - setuptools=68.0.0=py311hca03da5_0 + - sip=6.6.2=py311h313beb8_0 + - six=1.16.0=pyhd3eb1b0_1 + - smart_open=5.2.1=py311hca03da5_0 + - snappy=1.1.9=hc377ac9_0 + - sniffio=1.2.0=py311hca03da5_1 + - snowballstemmer=2.2.0=pyhd3eb1b0_0 + - sortedcontainers=2.4.0=pyhd3eb1b0_0 + - soupsieve=2.4=py311hca03da5_0 + - sphinx=5.0.2=py311hca03da5_0 + - sphinxcontrib-applehelp=1.0.2=pyhd3eb1b0_0 + - sphinxcontrib-devhelp=1.0.2=pyhd3eb1b0_0 + - sphinxcontrib-htmlhelp=2.0.0=pyhd3eb1b0_0 + - sphinxcontrib-jsmath=1.0.1=pyhd3eb1b0_0 + - sphinxcontrib-qthelp=1.0.3=pyhd3eb1b0_0 + - sphinxcontrib-serializinghtml=1.1.5=pyhd3eb1b0_0 + - spyder=5.4.3=py311hca03da5_1 + - spyder-kernels=2.4.4=py311hca03da5_0 + - sqlalchemy=1.4.39=py311h80987f9_0 + - sqlite=3.41.2=h80987f9_0 + - stack_data=0.2.0=pyhd3eb1b0_0 + - statsmodels=0.14.0=py311hb9f6ed7_0 + - tabulate=0.8.10=py311hca03da5_0 + - tapi=1100.0.11=h8754e6a_1 + - tbb=2021.8.0=h48ca7d4_0 + - tblib=1.7.0=pyhd3eb1b0_0 + - tenacity=8.2.2=py311hca03da5_0 + - terminado=0.17.1=py311hca03da5_0 + - text-unidecode=1.3=pyhd3eb1b0_0 + - textdistance=4.2.1=pyhd3eb1b0_0 + - threadpoolctl=2.2.0=pyh0d69192_0 + - three-merge=0.1.1=pyhd3eb1b0_0 + 
- tifffile=2023.4.12=py311hca03da5_0 + - tinycss2=1.2.1=py311hca03da5_0 + - tk=8.6.12=hb8d0fd4_0 + - tldextract=3.2.0=pyhd3eb1b0_0 + - tokenizers=0.13.2=py311h3dd52b7_1 + - toml=0.10.2=pyhd3eb1b0_0 + - tomlkit=0.11.1=py311hca03da5_0 + - toolz=0.12.0=py311hca03da5_0 + - tornado=6.3.2=py311h80987f9_0 + - tqdm=4.65.0=py311hb6e6a13_0 + - traitlets=5.7.1=py311hca03da5_0 + - transformers=4.32.1=py311hca03da5_0 + - twisted=22.10.0=py311h80987f9_0 + - tzdata=2023c=h04d1e81_0 + - uc-micro-py=1.0.1=py311hca03da5_0 + - ujson=5.4.0=py311h313beb8_0 + - unidecode=1.2.0=pyhd3eb1b0_0 + - unixodbc=2.3.11=h1a28f6b_0 + - urllib3=1.26.16=py311hca03da5_0 + - utf8proc=2.6.1=h1a28f6b_0 + - w3lib=1.21.0=pyhd3eb1b0_0 + - watchdog=2.1.6=py311h80987f9_0 + - wcwidth=0.2.5=pyhd3eb1b0_0 + - webencodings=0.5.1=py311hca03da5_1 + - websocket-client=0.58.0=py311hca03da5_4 + - werkzeug=2.2.3=py311hca03da5_0 + - whatthepatch=1.0.2=py311hca03da5_0 + - wheel=0.38.4=py311hca03da5_0 + - widgetsnbextension=4.0.5=py311hca03da5_0 + - wrapt=1.14.1=py311h80987f9_0 + - wurlitzer=3.0.2=py311hca03da5_0 + - xarray=2023.6.0=py311hca03da5_0 + - xlwings=0.29.1=py311hca03da5_0 + - xxhash=0.8.0=h1a28f6b_3 + - xyzservices=2022.9.0=py311hca03da5_1 + - xz=5.4.2=h80987f9_0 + - y-py=0.5.9=py311ha6e5c4f_0 + - yaml=0.2.5=h1a28f6b_0 + - yaml-cpp=0.7.0=hc377ac9_1 + - yapf=0.31.0=pyhd3eb1b0_0 + - yarl=1.8.1=py311h80987f9_0 + - ypy-websocket=0.8.2=py311hca03da5_0 + - zeromq=4.3.4=hc377ac9_0 + - zfp=1.0.0=h313beb8_0 + - zict=2.2.0=py311hca03da5_0 + - zipp=3.11.0=py311hca03da5_0 + - zlib=1.2.13=h5a0b063_0 + - zlib-ng=2.0.7=h80987f9_0 + - zope=1.0=py311hca03da5_1 + - zope.interface=5.4.0=py311h80987f9_0 + - zstandard=0.19.0=py311h80987f9_0 + - zstd=1.5.5=hd90d995_0 + - pip: + - anndata==0.11.4 + - array-api-compat==1.11.2 + - biopython==1.81 + - ete3==3.1.3 + - legacy-api-wrap==1.4.1 + - natsort==8.4.0 + - orffinder==1.8 + - packaging==24.2 + - pynndescent==0.5.13 + - scanpy==1.11.1 + - seaborn==0.13.2 + - session-info2==0.1.2 + - 
sympy==1.13.1 + - torch==2.6.0 + - torchaudio==2.6.0 + - torchvision==0.21.0 + - typing-extensions==4.13.2 + - umap-learn==0.5.7 +prefix: /Users/markkachanovskii/anaconda3 diff --git a/filter.log b/filter.log new file mode 100644 index 0000000..9daa09a --- /dev/null +++ b/filter.log @@ -0,0 +1,24 @@ +2025-04-19 15:36:38,181 - INFO - Starting filtering of fastq file: /private/var/folders/8y/lpd58rc57495mk0nf4dsz2d80000gn/T/pytest-of-markkachanovskii/pytest-30/test_output_exists0/example_fastq.fastq +2025-04-19 15:36:38,181 - INFO - Parameters: GC=(0, 100), Length=(0, 4294967296), Quality=0 +2025-04-19 15:36:38,193 - INFO - Filtered 4 sequences +2025-04-19 15:36:38,194 - INFO - Starting filtering of fastq file: /private/var/folders/8y/lpd58rc57495mk0nf4dsz2d80000gn/T/pytest-of-markkachanovskii/pytest-30/test_quality_threshold_output0/example_fastq.fastq +2025-04-19 15:36:38,195 - INFO - Parameters: GC=(0, 100), Length=(0, 4294967296), Quality=38 +2025-04-19 15:36:38,196 - INFO - Filtered 3 sequences +2025-04-19 15:36:38,200 - INFO - Starting filtering of fastq file: /private/var/folders/8y/lpd58rc57495mk0nf4dsz2d80000gn/T/pytest-of-markkachanovskii/pytest-30/test_gc_bounds_filtering0/example_fastq.fastq +2025-04-19 15:36:38,200 - INFO - Parameters: GC=(50.1, 100), Length=(0, 4294967296), Quality=0 +2025-04-19 15:36:38,200 - INFO - Filtered 3 sequences +2025-04-19 15:36:38,201 - INFO - Starting filtering of fastq file: /private/var/folders/8y/lpd58rc57495mk0nf4dsz2d80000gn/T/pytest-of-markkachanovskii/pytest-30/test_length_bounds_filtering0/example_fastq.fastq +2025-04-19 15:36:38,201 - INFO - Parameters: GC=(0, 100), Length=(5, 15), Quality=0 +2025-04-19 15:36:38,201 - INFO - Filtered 1 sequences +2025-04-19 15:36:38,202 - INFO - Starting filtering of fastq file: /private/var/folders/8y/lpd58rc57495mk0nf4dsz2d80000gn/T/pytest-of-markkachanovskii/pytest-30/test_empty_result0/example_fastq.fastq +2025-04-19 15:36:38,202 - INFO - Parameters: GC=(0, 100), Length=(0, 
4294967296), Quality=100 +2025-04-19 15:36:38,202 - INFO - No sequences was found the filtering criteria. +2025-04-19 15:36:38,203 - INFO - Starting filtering of fastq file: non_existent.fastq +2025-04-19 15:36:38,203 - INFO - Parameters: GC=(0, 100), Length=(0, 4294967296), Quality=0 +2025-04-19 15:36:38,203 - ERROR - Fastq file 'non_existent.fastq' was not found +2025-04-19 15:36:38,203 - INFO - Starting filtering of fastq file: /private/var/folders/8y/lpd58rc57495mk0nf4dsz2d80000gn/T/pytest-of-markkachanovskii/pytest-30/test_logging_output0/example_fastq.fastq +2025-04-19 15:36:38,204 - INFO - Parameters: GC=(0, 100), Length=(0, 4294967296), Quality=0 +2025-04-19 15:36:38,204 - INFO - Filtered 4 sequences +2025-04-19 15:36:38,204 - INFO - Starting filtering of fastq file: /private/var/folders/8y/lpd58rc57495mk0nf4dsz2d80000gn/T/pytest-of-markkachanovskii/pytest-30/test_invalid_fastq_format0/invalid_fastq.fastq +2025-04-19 15:36:38,204 - INFO - Parameters: GC=(0, 100), Length=(0, 4294967296), Quality=0 +2025-04-19 15:36:38,204 - ERROR - Invalid format in /private/var/folders/8y/lpd58rc57495mk0nf4dsz2d80000gn/T/pytest-of-markkachanovskii/pytest-30/test_invalid_fastq_format0/invalid_fastq.fastq: Lengths of sequence and quality values differs for seq1 (4 and 2). 
# ======================================================================
# filter_fastq.py
# ======================================================================
"""Filter FASTQ records by GC content, sequence length and mean Phred quality."""
import argparse
import logging
from typing import Dict, Optional, Sequence, Tuple, Union

import numpy as np
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction

logger = logging.getLogger(__name__)  # module-level logger
logger.setLevel(logging.INFO)


def _configure_logger() -> None:
    """Attach the file and console handlers to the module logger exactly once."""
    # Guard against duplicated handlers (and duplicated log lines) if this
    # module-level setup runs again while the logger is already configured.
    if logger.handlers:
        return
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    for handler in (logging.FileHandler("filter.log"), logging.StreamHandler()):
        handler.setLevel(logging.INFO)
        handler.setFormatter(formatter)
        logger.addHandler(handler)


_configure_logger()


def filter_fastq(fastq_file: str, gc_bounds: Tuple[float, float] = (0, 100),
                 length_bounds: Tuple[int, int] = (0, 2**32),
                 quality_threshold: float = 0
                 ) -> Union[Dict[str, Tuple[str, str]], str]:
    """
    Filter sequences based on GC content, length, and average quality.

    Args:
        fastq_file: Path to the fastq file.
        gc_bounds: Inclusive GC content range in percentage (min, max).
            Defaults to (0, 100).
        length_bounds: Inclusive sequence length range (min, max).
            Defaults to (0, 2**32).
        quality_threshold: Minimum average quality threshold (Phred scale).
            Defaults to 0.

    Returns:
        A dictionary with the records that passed every filter, in the format
        {name: (sequence, quality)}, where quality is re-encoded from the
        parsed Phred scores with an offset of 33. If several records share an
        id, the last one wins and a warning is logged.
        A string message is returned instead when no record passes the
        filters or when the parser rejects the file with a ValueError.

    Raises:
        FileNotFoundError: If the fastq file does not exist.
    """
    logger.info("Starting filtering of fastq file: %s", fastq_file)
    logger.info("Parameters: GC=%s, Length=%s, Quality=%s",
                gc_bounds, length_bounds, quality_threshold)

    gc_min, gc_max = gc_bounds[0], gc_bounds[1]
    len_min, len_max = length_bounds[0], length_bounds[1]
    filtered_seqs: Dict[str, Tuple[str, str]] = {}

    try:
        for record in SeqIO.parse(fastq_file, "fastq"):
            sequence = str(record.seq)
            gc_content = gc_fraction(record.seq) * 100  # fraction -> percent
            quality_scores = record.letter_annotations["phred_quality"]
            # An empty read has no scores; treat its mean quality as 0.
            avg_quality = np.mean(quality_scores) if quality_scores else 0.0

            if not (gc_min <= gc_content <= gc_max):
                continue
            if not (len_min <= len(sequence) <= len_max):
                continue
            if avg_quality < quality_threshold:
                continue

            if record.id in filtered_seqs:
                logger.warning(
                    "Duplicate record id '%s'; keeping the last occurrence",
                    record.id,
                )
            quality_str = "".join(chr(q + 33) for q in quality_scores)
            filtered_seqs[record.id] = (sequence, quality_str)
    except FileNotFoundError as err:
        logger.error("Fastq file '%s' was not found", fastq_file)
        raise FileNotFoundError(
            f"Fastq file '{fastq_file}' was not found"
        ) from err
    except ValueError as err:
        # Raised while parsing malformed input; partial results are discarded.
        logger.error("Invalid format in %s: %s", fastq_file, err)
        return "Invalid format. Please check the file."

    if not filtered_seqs:
        logger.info("No sequences was found the filtering criteria.")
        return "No sequences was found the filtering criteria. Try another parameters."

    logger.info("Filtered %d sequences", len(filtered_seqs))
    return filtered_seqs


def parse_arguments(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """
    Parse command-line arguments.

    Args:
        argv: Argument list to parse. Defaults to None, in which case
            argparse reads the process command line.

    Returns:
        Namespace with fastq_file, gc_bounds, length_bounds and
        quality_threshold.
    """
    parser = argparse.ArgumentParser(
        description="Filter fastq sequences by GC content, length, and quality."
    )
    parser.add_argument("fastq_file", type=str,
                        help="Path to the input fastq file")
    parser.add_argument("--gc-bounds", type=float, nargs=2, default=(0, 100),
                        metavar=("MIN", "MAX"),
                        help="GC content range (min-max, percentage), default: 0-100")
    parser.add_argument("--length-bounds", type=int, nargs=2, default=(0, 2**32),
                        metavar=("MIN", "MAX"),
                        help="Sequence length range (min-max), default: 0-2^32")
    parser.add_argument("--quality-threshold", type=float, default=0,
                        help="Minimum average Phred quality score, default: 0")
    return parser.parse_args(argv)


def main() -> None:
    """
    Main function to run the fastq filter.

    Prints either the message returned by filter_fastq or every retained
    record. A FileNotFoundError raised by filter_fastq is not caught here.
    """
    args = parse_arguments()
    result = filter_fastq(
        fastq_file=args.fastq_file,
        gc_bounds=tuple(args.gc_bounds),
        length_bounds=tuple(args.length_bounds),
        quality_threshold=args.quality_threshold,
    )
    if isinstance(result, str):
        print(result)
        return

    print(f"{len(result)} sequences were filtered:")
    for seq_id, (seq, qual) in result.items():
        print(f">{seq_id}\n{seq}\nQuality: {qual}")


if __name__ == "__main__":
    main()


# ======================================================================
# test_filter_fastq.py
# ======================================================================
import logging
import os

import numpy as np
import pytest

from filter_fastq import filter_fastq


class TestFilterFastq:
    """
    Test suite for the filter_fastq function in filter_fastq module.
    Tests cover functionality, error handling, logging, and edge cases.
    """

    @pytest.fixture
    def input_file_path(self, tmp_path):
        """
        Fixture to create a temporary FastQ file with four records for testing.
        """
        content = (
            "@seq1\nGACCTTTCCGCAAGCTGTCGC\n+\nIIIIIIIIIIIIIIIIIIIII\n"
            "@seq2\nCATGGTGGCG\n+\nIIIIIIIIII\n"
            "@seq3\nC\n+\nI\n"
            "@seq4\nATCG\n+\n!!!!\n"
        )
        file_path = tmp_path / "example_fastq.fastq"
        file_path.write_text(content)
        return str(file_path)

    @pytest.fixture
    def log_file_path(self):
        """
        Fixture returning the path of the file the filter_fastq logger writes to.

        The path is taken from the logger's FileHandler, so the test does not
        depend on the current working directory.
        """
        module_logger = logging.getLogger(filter_fastq.__module__)
        for handler in module_logger.handlers:
            if isinstance(handler, logging.FileHandler):
                return handler.baseFilename
        pytest.fail("filter_fastq logger has no FileHandler attached")

    def test_output_exists(self, input_file_path):
        """
        Test that filter_fastq returns a non-empty dictionary with default parameters.
        """
        result = filter_fastq(input_file_path)
        assert isinstance(result, dict), "Result should be a dictionary"
        assert len(result) > 0, "Result dictionary should not be empty"

    def test_quality_threshold_output(self, input_file_path):
        """
        Test the correctness of output sequences with quality_threshold=38.
        """
        target_seqs = ['GACCTTTCCGCAAGCTGTCGC', 'CATGGTGGCG', 'C']
        result = filter_fastq(input_file_path, quality_threshold=38)
        assert isinstance(result, dict), "Result should be a dictionary"
        result_seqs = [seq for seq, _ in result.values()]
        assert result_seqs == target_seqs, f"Expected sequences {target_seqs}, but got {result_seqs}"

    def test_gc_bounds_filtering(self, input_file_path):
        """
        Test filtering by GC content (gc_bounds=(50.1, 100)).
        """
        target_seqs = ['GACCTTTCCGCAAGCTGTCGC', 'CATGGTGGCG', 'C']
        result = filter_fastq(input_file_path, gc_bounds=(50.1, 100))
        assert isinstance(result, dict), "Result should be a dictionary"
        result_seqs = [seq for seq, _ in result.values()]
        assert result_seqs == target_seqs, f"Expected sequences {target_seqs}, but got {result_seqs}"

    def test_length_bounds_filtering(self, input_file_path):
        """
        Test filtering by sequence length (length_bounds=(5, 15)).
        """
        target_seqs = ['CATGGTGGCG']
        result = filter_fastq(input_file_path, length_bounds=(5, 15))
        assert isinstance(result, dict), "Result should be a dictionary"
        result_seqs = [seq for seq, _ in result.values()]
        assert result_seqs == target_seqs, f"Expected sequences {target_seqs}, but got {result_seqs}"

    def test_empty_result(self, input_file_path):
        """
        Test that filter_fastq returns a string when no sequences pass the filter.
        """
        result = filter_fastq(input_file_path, quality_threshold=100)
        assert isinstance(result, str), "Result should be a string"
        assert "No sequences was found" in result, "Result should indicate no sequences found"

    def test_file_not_found_error(self):
        """
        Test that filter_fastq raises FileNotFoundError for a non-existent file.
        """
        with pytest.raises(FileNotFoundError, match="Fastq file 'non_existent.fastq' was not found"):
            filter_fastq("non_existent.fastq")

    def test_logging_output(self, input_file_path, log_file_path):
        """
        Test that filter_fastq writes the expected messages to its log file.

        Only the bytes appended during this call are inspected, so messages
        left in the log by earlier tests or runs cannot satisfy the assertions.
        """
        start = os.path.getsize(log_file_path) if os.path.exists(log_file_path) else 0
        result = filter_fastq(input_file_path)
        for handler in logging.getLogger(filter_fastq.__module__).handlers:
            handler.flush()
        assert os.path.exists(log_file_path), "Log file 'filter.log' should be created"
        # Binary mode: `start` is a byte offset taken from os.path.getsize.
        with open(log_file_path, 'rb') as f:
            f.seek(start)
            log_content = f.read().decode("utf-8", errors="replace")
        assert "Starting filtering of fastq file" in log_content, "Log should contain start message"
        assert f"Filtered {len(result)} sequences" in log_content, "Log should contain result message"

    def test_invalid_fastq_format(self, tmp_path):
        """
        Test that filter_fastq handles invalid FastQ format gracefully.
        """
        invalid_content = (
            "@seq1\nATCG\n+\n!!\n"
        )
        file_path = tmp_path / "invalid_fastq.fastq"
        file_path.write_text(invalid_content)
        result = filter_fastq(str(file_path))
        assert isinstance(result, str), "Result should be a string"
        assert "Invalid format" in result, "Result should indicate invalid format"