diff --git a/.gitignore b/.gitignore index d08c2cda36..38c331c5a1 100644 --- a/.gitignore +++ b/.gitignore @@ -24,8 +24,6 @@ test-results node_modules/ .vscode/ /test-env -/ynr/apps/sopn_parsing/tests/data/sopn_baseline.json -/ynr/apps/sopn_parsing/tests/data/sopn_baseline_copy.json # PyCharm .idea/ diff --git a/Makefile b/Makefile deleted file mode 100644 index f8d2865666..0000000000 --- a/Makefile +++ /dev/null @@ -1,45 +0,0 @@ -export DJANGO_SETTINGS_MODULE?=ynr.settings.sopn_testing - - -.PHONY: sopn-runserver -sopn-runserver: - python manage.py runserver - -.PHONY: sopn-shell -sopn-shell: - python manage.py shell_plus - -.PHONY: migrate-db -migrate-db: - python manage.py migrate - -.PHONY: test-sopns -test-sopns: migrate-db - python manage.py sopn_tooling_compare_raw_people --election-slugs= --ballot= --date 2021-05-06 - -.PHONY: download-sopns -download-sopns: - python manage.py migrate --no-input - python manage.py sopn_tooling_create_official_documents --election-slugs= --date 2021-05-06 - -.PHONY: populate-sopn-testing-database -populate-sopn-testing-database: migrate-db - python manage.py candidates_import_from_live_site - -.PHONY: delete-test-sopns -delete-test-sopns: - python manage.py sopn_tooling_clear_existing_objects - rm -rf ./ynr/media/sopn_testing/ - -.PHONY: create-baseline-file -create-baseline-file: - python manage.py sopn_tooling_write_baseline - -.PHONY: copy-baseline-file -copy-baseline-file: - cp ynr/apps/sopn_parsing/tests/data/sopn_baseline.json ynr/apps/sopn_parsing/tests/data/sopn_baseline_copy.json - -.PHONY: prod-import-sopns -prod-import-sopns: - cd deploy; \ - ansible-playbook import_sopns.yml diff --git a/pyproject.toml b/pyproject.toml index 4cbbb16b4a..d6d5661d4a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,12 +41,10 @@ dependencies = [ "markdown-it-py==4.0.0", "nameparser==1.1.2", "ndg-httpsclient==0.5.1", - "openai==1.30.3", # for compatibility with openai==1.30.3 # TODO: review/remove when we upgrade openai package 
"httpx==0.27.0", - "Pillow==10.3.0", "psycopg==3.1.12", "python-dateutil==2.8.2", @@ -58,14 +56,13 @@ dependencies = [ "whitenoise==6.5.0", "sorl-thumbnail-serializer-field", "slacker2", - # SOPN parsing "pdfminer.six==20201018", - "camelot-py[cv]==0.8.2", "pypandoc_binary==1.14", "PyPDF2==2.12.1", "amazon-textract-response-parser==1.0.3", "amazon-textract-helper==0.0.35", + "pandas>=3.0.0", ] [dependency-groups] diff --git a/uv.lock b/uv.lock index ca36a3cc96..c92fd1cf73 100644 --- a/uv.lock +++ b/uv.lock @@ -4,7 +4,9 @@ requires-python = "==3.12.*" resolution-markers = [ "sys_platform == 'darwin'", "platform_machine == 'aarch64' and sys_platform == 'linux'", - "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')", + "sys_platform == 'win32'", + "sys_platform == 'emscripten'", + "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32')", ] [[package]] @@ -323,29 +325,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bc/47/e35f788047c91110f48703a6254e5c84e33111b3291f7b57a653ca00accf/botocore-1.34.162-py3-none-any.whl", hash = "sha256:2d918b02db88d27a75b48275e6fb2506e9adaaddbec1ffa6a8a0898b34e769be", size = 12468049, upload-time = "2024-08-15T19:25:18.301Z" }, ] -[[package]] -name = "camelot-py" -version = "0.8.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "chardet" }, - { name = "click" }, - { name = "numpy" }, - { name = "openpyxl" }, - { name = "pandas" }, - { name = "pdfminer-six" }, - { name = "pypdf2" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/13/d4/cfd74357cf62d3e4c91439074422710df1147a261f09af72d808bfa40cd2/camelot-py-0.8.2.tar.gz", hash = "sha256:8e1e2a8e59c2dbdce9a6790f7007cd091343b136e29d64a45c8fd09c11360057", size = 38173, upload-time = "2020-07-27T12:28:17.846Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/ed/78/123007d6aff9811bd087ff5dd68e06a9df62b2fdbf7685d74c9d6f247921/camelot_py-0.8.2-py3-none-any.whl", hash = "sha256:0b2e612ad0e11190b84a29937bad5d487b540faea408863f01e410d2b976336e", size = 42794, upload-time = "2020-07-27T12:28:15.966Z" }, -] - -[package.optional-dependencies] -cv = [ - { name = "opencv-python" }, -] - [[package]] name = "cattrs" version = "25.3.0" @@ -425,18 +404,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0a/4c/925909008ed5a988ccbb72dcc897407e5d6d3bd72410d69e051fc0c14647/charset_normalizer-3.4.4-py3-none-any.whl", hash = "sha256:7a32c560861a02ff789ad905a2fe94e3f840803362c84fecf1851cb4cf3dc37f", size = 53402, upload-time = "2025-10-14T04:42:31.76Z" }, ] -[[package]] -name = "click" -version = "8.3.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, -] - [[package]] name = "colorama" version = "0.4.6" @@ -843,15 +810,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/28/f0/65101e51dc7c850e7b7581a5d8fa8721a1d7479a0dca6c08386328e19882/editdistance-0.8.1-cp312-cp312-win_amd64.whl", hash = "sha256:09f01ed51746d90178af7dd7ea4ebb41497ef19f53c7f327e864421743dffb0a", size = 79853, upload-time = "2024-02-10T07:44:05.687Z" }, ] -[[package]] -name = "et-xmlfile" -version = "2.0.0" -source = { registry = 
"https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" }, -] - [[package]] name = "executing" version = "2.2.1" @@ -1302,21 +1260,21 @@ wheels = [ [[package]] name = "numpy" -version = "2.3.5" +version = "2.4.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/76/65/21b3bc86aac7b8f2862db1e808f1ea22b028e30a225a34a5ede9bf8678f2/numpy-2.3.5.tar.gz", hash = "sha256:784db1dcdab56bf0517743e746dfb0f885fc68d948aba86eeec2cba234bdf1c0", size = 20584950, upload-time = "2025-11-16T22:52:42.067Z" } +sdist = { url = "https://files.pythonhosted.org/packages/24/62/ae72ff66c0f1fd959925b4c11f8c2dea61f47f6acaea75a08512cdfe3fed/numpy-2.4.1.tar.gz", hash = "sha256:a1ceafc5042451a858231588a104093474c6a5c57dcc724841f5c888d237d690", size = 20721320, upload-time = "2026-01-10T06:44:59.619Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/44/37/e669fe6cbb2b96c62f6bbedc6a81c0f3b7362f6a59230b23caa673a85721/numpy-2.3.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:74ae7b798248fe62021dbf3c914245ad45d1a6b0cb4a29ecb4b31d0bfbc4cc3e", size = 16733873, upload-time = "2025-11-16T22:49:49.84Z" }, - { url = "https://files.pythonhosted.org/packages/c5/65/df0db6c097892c9380851ab9e44b52d4f7ba576b833996e0080181c0c439/numpy-2.3.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ee3888d9ff7c14604052b2ca5535a30216aa0a58e948cdd3eeb8d3415f638769", size = 12259838, 
upload-time = "2025-11-16T22:49:52.863Z" }, - { url = "https://files.pythonhosted.org/packages/5b/e1/1ee06e70eb2136797abe847d386e7c0e830b67ad1d43f364dd04fa50d338/numpy-2.3.5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:612a95a17655e213502f60cfb9bf9408efdc9eb1d5f50535cc6eb365d11b42b5", size = 5088378, upload-time = "2025-11-16T22:49:55.055Z" }, - { url = "https://files.pythonhosted.org/packages/6d/9c/1ca85fb86708724275103b81ec4cf1ac1d08f465368acfc8da7ab545bdae/numpy-2.3.5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:3101e5177d114a593d79dd79658650fe28b5a0d8abeb8ce6f437c0e6df5be1a4", size = 6628559, upload-time = "2025-11-16T22:49:57.371Z" }, - { url = "https://files.pythonhosted.org/packages/74/78/fcd41e5a0ce4f3f7b003da85825acddae6d7ecb60cf25194741b036ca7d6/numpy-2.3.5-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b973c57ff8e184109db042c842423ff4f60446239bd585a5131cc47f06f789d", size = 14250702, upload-time = "2025-11-16T22:49:59.632Z" }, - { url = "https://files.pythonhosted.org/packages/b6/23/2a1b231b8ff672b4c450dac27164a8b2ca7d9b7144f9c02d2396518352eb/numpy-2.3.5-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0d8163f43acde9a73c2a33605353a4f1bc4798745a8b1d73183b28e5b435ae28", size = 16606086, upload-time = "2025-11-16T22:50:02.127Z" }, - { url = "https://files.pythonhosted.org/packages/a0/c5/5ad26fbfbe2012e190cc7d5003e4d874b88bb18861d0829edc140a713021/numpy-2.3.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:51c1e14eb1e154ebd80e860722f9e6ed6ec89714ad2db2d3aa33c31d7c12179b", size = 16025985, upload-time = "2025-11-16T22:50:04.536Z" }, - { url = "https://files.pythonhosted.org/packages/d2/fa/dd48e225c46c819288148d9d060b047fd2a6fb1eb37eae25112ee4cb4453/numpy-2.3.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b46b4ec24f7293f23adcd2d146960559aaf8020213de8ad1909dba6c013bf89c", size = 18542976, upload-time = "2025-11-16T22:50:07.557Z" }, - { url = 
"https://files.pythonhosted.org/packages/05/79/ccbd23a75862d95af03d28b5c6901a1b7da4803181513d52f3b86ed9446e/numpy-2.3.5-cp312-cp312-win32.whl", hash = "sha256:3997b5b3c9a771e157f9aae01dd579ee35ad7109be18db0e85dbdbe1de06e952", size = 6285274, upload-time = "2025-11-16T22:50:10.746Z" }, - { url = "https://files.pythonhosted.org/packages/2d/57/8aeaf160312f7f489dea47ab61e430b5cb051f59a98ae68b7133ce8fa06a/numpy-2.3.5-cp312-cp312-win_amd64.whl", hash = "sha256:86945f2ee6d10cdfd67bcb4069c1662dd711f7e2a4343db5cecec06b87cf31aa", size = 12782922, upload-time = "2025-11-16T22:50:12.811Z" }, - { url = "https://files.pythonhosted.org/packages/78/a6/aae5cc2ca78c45e64b9ef22f089141d661516856cf7c8a54ba434576900d/numpy-2.3.5-cp312-cp312-win_arm64.whl", hash = "sha256:f28620fe26bee16243be2b7b874da327312240a7cdc38b769a697578d2100013", size = 10194667, upload-time = "2025-11-16T22:50:16.16Z" }, + { url = "https://files.pythonhosted.org/packages/78/7f/ec53e32bf10c813604edf07a3682616bd931d026fcde7b6d13195dfb684a/numpy-2.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d3703409aac693fa82c0aee023a1ae06a6e9d065dba10f5e8e80f642f1e9d0a2", size = 16656888, upload-time = "2026-01-10T06:42:40.913Z" }, + { url = "https://files.pythonhosted.org/packages/b8/e0/1f9585d7dae8f14864e948fd7fa86c6cb72dee2676ca2748e63b1c5acfe0/numpy-2.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7211b95ca365519d3596a1d8688a95874cc94219d417504d9ecb2df99fa7bfa8", size = 12373956, upload-time = "2026-01-10T06:42:43.091Z" }, + { url = "https://files.pythonhosted.org/packages/8e/43/9762e88909ff2326f5e7536fa8cb3c49fb03a7d92705f23e6e7f553d9cb3/numpy-2.4.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:5adf01965456a664fc727ed69cc71848f28d063217c63e1a0e200a118d5eec9a", size = 5202567, upload-time = "2026-01-10T06:42:45.107Z" }, + { url = "https://files.pythonhosted.org/packages/4b/ee/34b7930eb61e79feb4478800a4b95b46566969d837546aa7c034c742ef98/numpy-2.4.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = 
"sha256:26f0bcd9c79a00e339565b303badc74d3ea2bd6d52191eeca5f95936cad107d0", size = 6549459, upload-time = "2026-01-10T06:42:48.152Z" }, + { url = "https://files.pythonhosted.org/packages/79/e3/5f115fae982565771be994867c89bcd8d7208dbfe9469185497d70de5ddf/numpy-2.4.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0093e85df2960d7e4049664b26afc58b03236e967fb942354deef3208857a04c", size = 14404859, upload-time = "2026-01-10T06:42:49.947Z" }, + { url = "https://files.pythonhosted.org/packages/d9/7d/9c8a781c88933725445a859cac5d01b5871588a15969ee6aeb618ba99eee/numpy-2.4.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7ad270f438cbdd402c364980317fb6b117d9ec5e226fff5b4148dd9aa9fc6e02", size = 16371419, upload-time = "2026-01-10T06:42:52.409Z" }, + { url = "https://files.pythonhosted.org/packages/a6/d2/8aa084818554543f17cf4162c42f162acbd3bb42688aefdba6628a859f77/numpy-2.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:297c72b1b98100c2e8f873d5d35fb551fce7040ade83d67dd51d38c8d42a2162", size = 16182131, upload-time = "2026-01-10T06:42:54.694Z" }, + { url = "https://files.pythonhosted.org/packages/60/db/0425216684297c58a8df35f3284ef56ec4a043e6d283f8a59c53562caf1b/numpy-2.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:cf6470d91d34bf669f61d515499859fa7a4c2f7c36434afb70e82df7217933f9", size = 18295342, upload-time = "2026-01-10T06:42:56.991Z" }, + { url = "https://files.pythonhosted.org/packages/31/4c/14cb9d86240bd8c386c881bafbe43f001284b7cce3bc01623ac9475da163/numpy-2.4.1-cp312-cp312-win32.whl", hash = "sha256:b6bcf39112e956594b3331316d90c90c90fb961e39696bda97b89462f5f3943f", size = 5959015, upload-time = "2026-01-10T06:42:59.631Z" }, + { url = "https://files.pythonhosted.org/packages/51/cf/52a703dbeb0c65807540d29699fef5fda073434ff61846a564d5c296420f/numpy-2.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:e1a27bb1b2dee45a2a53f5ca6ff2d1a7f135287883a1689e930d44d1ff296c87", size = 12310730, upload-time = 
"2026-01-10T06:43:01.627Z" }, + { url = "https://files.pythonhosted.org/packages/69/80/a828b2d0ade5e74a9fe0f4e0a17c30fdc26232ad2bc8c9f8b3197cf7cf18/numpy-2.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:0e6e8f9d9ecf95399982019c01223dc130542960a12edfa8edd1122dfa66a8a8", size = 10312166, upload-time = "2026-01-10T06:43:03.673Z" }, ] [[package]] @@ -1337,35 +1295,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8c/e9/ceef41cbc47ee82c3da44e47a60780cb628322ffd311043e8c7522990478/openai-1.30.3-py3-none-any.whl", hash = "sha256:f88119c8a848998be533c71ab8aa832446fa72b7ddbc70917c3f5886dc132051", size = 320635, upload-time = "2024-05-24T16:06:14.491Z" }, ] -[[package]] -name = "opencv-python" -version = "4.11.0.86" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/17/06/68c27a523103dad5837dc5b87e71285280c4f098c60e4fe8a8db6486ab09/opencv-python-4.11.0.86.tar.gz", hash = "sha256:03d60ccae62304860d232272e4a4fda93c39d595780cb40b161b310244b736a4", size = 95171956, upload-time = "2025-01-16T13:52:24.737Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/05/4d/53b30a2a3ac1f75f65a59eb29cf2ee7207ce64867db47036ad61743d5a23/opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:432f67c223f1dc2824f5e73cdfcd9db0efc8710647d4e813012195dc9122a52a", size = 37326322, upload-time = "2025-01-16T13:52:25.887Z" }, - { url = "https://files.pythonhosted.org/packages/3b/84/0a67490741867eacdfa37bc18df96e08a9d579583b419010d7f3da8ff503/opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_x86_64.whl", hash = "sha256:9d05ef13d23fe97f575153558653e2d6e87103995d54e6a35db3f282fe1f9c66", size = 56723197, upload-time = "2025-01-16T13:55:21.222Z" }, - { url = "https://files.pythonhosted.org/packages/f3/bd/29c126788da65c1fb2b5fb621b7fed0ed5f9122aa22a0868c5e2c15c6d23/opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:1b92ae2c8852208817e6776ba1ea0d6b1e0a1b5431e971a2a0ddd2a8cc398202", size = 42230439, upload-time = "2025-01-16T13:51:35.822Z" }, - { url = "https://files.pythonhosted.org/packages/2c/8b/90eb44a40476fa0e71e05a0283947cfd74a5d36121a11d926ad6f3193cc4/opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b02611523803495003bd87362db3e1d2a0454a6a63025dc6658a9830570aa0d", size = 62986597, upload-time = "2025-01-16T13:52:08.836Z" }, - { url = "https://files.pythonhosted.org/packages/fb/d7/1d5941a9dde095468b288d989ff6539dd69cd429dbf1b9e839013d21b6f0/opencv_python-4.11.0.86-cp37-abi3-win32.whl", hash = "sha256:810549cb2a4aedaa84ad9a1c92fbfdfc14090e2749cedf2c1589ad8359aa169b", size = 29384337, upload-time = "2025-01-16T13:52:13.549Z" }, - { url = "https://files.pythonhosted.org/packages/a4/7d/f1c30a92854540bf789e9cd5dde7ef49bbe63f855b85a2e6b3db8135c591/opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:085ad9b77c18853ea66283e98affefe2de8cc4c1f43eda4c100cf9b2721142ec", size = 39488044, upload-time = "2025-01-16T13:52:21.928Z" }, -] - -[[package]] -name = "openpyxl" -version = "3.1.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "et-xmlfile" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" }, -] - [[package]] name = "packaging" version = "25.0" @@ -1377,23 +1306,23 @@ wheels = [ [[package]] name = "pandas" -version = "2.3.3" +version = 
"3.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy" }, { name = "python-dateutil" }, - { name = "pytz" }, - { name = "tzdata" }, + { name = "tzdata", marker = "sys_platform == 'emscripten' or sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" } +sdist = { url = "https://files.pythonhosted.org/packages/de/da/b1dc0481ab8d55d0f46e343cfe67d4551a0e14fcee52bd38ca1bd73258d8/pandas-3.0.0.tar.gz", hash = "sha256:0facf7e87d38f721f0af46fe70d97373a37701b1c09f7ed7aeeb292ade5c050f", size = 4633005, upload-time = "2026-01-21T15:52:04.726Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/9c/fb/231d89e8637c808b997d172b18e9d4a4bc7bf31296196c260526055d1ea0/pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53", size = 11597846, upload-time = "2025-09-29T23:19:48.856Z" }, - { url = "https://files.pythonhosted.org/packages/5c/bd/bf8064d9cfa214294356c2d6702b716d3cf3bb24be59287a6a21e24cae6b/pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35", size = 10729618, upload-time = "2025-09-29T23:39:08.659Z" }, - { url = "https://files.pythonhosted.org/packages/57/56/cf2dbe1a3f5271370669475ead12ce77c61726ffd19a35546e31aa8edf4e/pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908", size = 11737212, upload-time = "2025-09-29T23:19:59.765Z" }, - { url = 
"https://files.pythonhosted.org/packages/e5/63/cd7d615331b328e287d8233ba9fdf191a9c2d11b6af0c7a59cfcec23de68/pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89", size = 12362693, upload-time = "2025-09-29T23:20:14.098Z" }, - { url = "https://files.pythonhosted.org/packages/a6/de/8b1895b107277d52f2b42d3a6806e69cfef0d5cf1d0ba343470b9d8e0a04/pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98", size = 12771002, upload-time = "2025-09-29T23:20:26.76Z" }, - { url = "https://files.pythonhosted.org/packages/87/21/84072af3187a677c5893b170ba2c8fbe450a6ff911234916da889b698220/pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084", size = 13450971, upload-time = "2025-09-29T23:20:41.344Z" }, - { url = "https://files.pythonhosted.org/packages/86/41/585a168330ff063014880a80d744219dbf1dd7a1c706e75ab3425a987384/pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b", size = 10992722, upload-time = "2025-09-29T23:20:54.139Z" }, + { url = "https://files.pythonhosted.org/packages/0b/38/db33686f4b5fa64d7af40d96361f6a4615b8c6c8f1b3d334eee46ae6160e/pandas-3.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9803b31f5039b3c3b10cc858c5e40054adb4b29b4d81cb2fd789f4121c8efbcd", size = 10334013, upload-time = "2026-01-21T15:50:34.771Z" }, + { url = "https://files.pythonhosted.org/packages/a5/7b/9254310594e9774906bacdd4e732415e1f86ab7dbb4b377ef9ede58cd8ec/pandas-3.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:14c2a4099cd38a1d18ff108168ea417909b2dea3bd1ebff2ccf28ddb6a74d740", size = 9874154, upload-time = "2026-01-21T15:50:36.67Z" }, + { url = 
"https://files.pythonhosted.org/packages/63/d4/726c5a67a13bc66643e66d2e9ff115cead482a44fc56991d0c4014f15aaf/pandas-3.0.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d257699b9a9960e6125686098d5714ac59d05222bef7a5e6af7a7fd87c650801", size = 10384433, upload-time = "2026-01-21T15:50:39.132Z" }, + { url = "https://files.pythonhosted.org/packages/bf/2e/9211f09bedb04f9832122942de8b051804b31a39cfbad199a819bb88d9f3/pandas-3.0.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:69780c98f286076dcafca38d8b8eee1676adf220199c0a39f0ecbf976b68151a", size = 10864519, upload-time = "2026-01-21T15:50:41.043Z" }, + { url = "https://files.pythonhosted.org/packages/00/8d/50858522cdc46ac88b9afdc3015e298959a70a08cd21e008a44e9520180c/pandas-3.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4a66384f017240f3858a4c8a7cf21b0591c3ac885cddb7758a589f0f71e87ebb", size = 11394124, upload-time = "2026-01-21T15:50:43.377Z" }, + { url = "https://files.pythonhosted.org/packages/86/3f/83b2577db02503cd93d8e95b0f794ad9d4be0ba7cb6c8bcdcac964a34a42/pandas-3.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:be8c515c9bc33989d97b89db66ea0cececb0f6e3c2a87fcc8b69443a6923e95f", size = 11920444, upload-time = "2026-01-21T15:50:45.932Z" }, + { url = "https://files.pythonhosted.org/packages/64/2d/4f8a2f192ed12c90a0aab47f5557ece0e56b0370c49de9454a09de7381b2/pandas-3.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:a453aad8c4f4e9f166436994a33884442ea62aa8b27d007311e87521b97246e1", size = 9730970, upload-time = "2026-01-21T15:50:47.962Z" }, + { url = "https://files.pythonhosted.org/packages/d4/64/ff571be435cf1e643ca98d0945d76732c0b4e9c37191a89c8550b105eed1/pandas-3.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:da768007b5a33057f6d9053563d6b74dd6d029c337d93c6d0d22a763a5c2ecc0", size = 9041950, upload-time = "2026-01-21T15:50:50.422Z" }, ] [[package]] @@ -1436,7 +1365,7 @@ name = "pexpect" version = "4.9.0" source = { registry = 
"https://pypi.org/simple" } dependencies = [ - { name = "ptyprocess" }, + { name = "ptyprocess", marker = "sys_platform != 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450, upload-time = "2023-11-25T09:07:26.339Z" } wheels = [ @@ -2252,7 +2181,6 @@ dependencies = [ { name = "beautifulsoup4" }, { name = "blessed" }, { name = "boto3" }, - { name = "camelot-py", extra = ["cv"] }, { name = "croniter" }, { name = "django" }, { name = "django-cors-headers" }, @@ -2284,6 +2212,7 @@ dependencies = [ { name = "ndg-httpsclient" }, { name = "nh3" }, { name = "openai" }, + { name = "pandas" }, { name = "pdfminer-six" }, { name = "pillow" }, { name = "psycopg" }, @@ -2344,7 +2273,6 @@ requires-dist = [ { name = "beautifulsoup4", specifier = "==4.12.0" }, { name = "blessed", specifier = "==1.20.0" }, { name = "boto3", specifier = "==1.34.105" }, - { name = "camelot-py", extras = ["cv"], specifier = "==0.8.2" }, { name = "croniter", specifier = "==6.0.0" }, { name = "django", specifier = "==5.2.9" }, { name = "django-cors-headers", specifier = "==4.9.0" }, @@ -2376,6 +2304,7 @@ requires-dist = [ { name = "ndg-httpsclient", specifier = "==0.5.1" }, { name = "nh3", specifier = "==0.2.21" }, { name = "openai", specifier = "==1.30.3" }, + { name = "pandas", specifier = ">=3.0.0" }, { name = "pdfminer-six", specifier = "==20201018" }, { name = "pillow", specifier = "==10.3.0" }, { name = "psycopg", specifier = "==3.1.12" }, diff --git a/ynr/apps/bulk_adding/tests/test_bulk_add.py b/ynr/apps/bulk_adding/tests/test_bulk_add.py index 5361cda441..e18589768a 100644 --- a/ynr/apps/bulk_adding/tests/test_bulk_add.py +++ b/ynr/apps/bulk_adding/tests/test_bulk_add.py @@ -752,79 +752,3 @@ def test_bulk_add_person_removes_spaces_from_name(self): self.assertContains(resp, "Review candidates") 
resp = form.submit() self.assertContains(resp, "Bart Simpson") - - def test_fall_back_to_camelot_if_no_textract(self): - data = {"name": "Bart", "party_id": "PP52"} - - raw_people = RawPeople.objects.create( - ballot=self.dulwich_post_ballot, - data=[data], - source_type=RawPeople.SOURCE_PARSED_PDF, - ) - - self.assertEqual( - raw_people.as_form_kwargs(), - { - "initial": [ - { - "name": "Bart", - "party": ["PP52", "PP52"], - "previous_party_affiliations": [], - "source": "", - } - ] - }, - ) - raw_people.delete() - - textract_data = {"name": "Lisa", "party_id": "PP53"} - raw_people = RawPeople.objects.create( - ballot=self.dulwich_post_ballot, - data=[data], - textract_data=[textract_data], - source_type=RawPeople.SOURCE_PARSED_PDF, - ) - - self.assertEqual( - raw_people.as_form_kwargs(), - { - "initial": [ - { - "name": "Lisa", - "party": ["PP53", "PP53"], - "previous_party_affiliations": [], - "source": "", - } - ] - }, - ) - - def test_can_change_parser_in_frontend(self): - """ - Check that a query param can change the parser we use - """ - BallotSOPN.objects.create( - source_url="http://example.com", - ballot=self.dulwich_post_ballot, - uploaded_file="sopn.pdf", - ) - RawPeople.objects.create( - ballot=self.dulwich_post_ballot, - data=[{"name": "Bart", "party_id": "PP52"}], - textract_data=[{"name": "Lisa", "party_id": "PP53"}], - source_type=RawPeople.SOURCE_PARSED_PDF, - ) - response = self.app.get( - "/bulk_adding/sopn/parl.65808.2015-05-07/", user=self.user - ) - form = response.forms["bulk_add_form"] - # This should be the Textract data - self.assertEqual(form.fields["form-0-name"][0].value, "Lisa") - - response = self.app.get( - "/bulk_adding/sopn/parl.65808.2015-05-07/?v1_parser=1", - user=self.user, - ) - form = response.forms["bulk_add_form"] - # This should be the Textract data - self.assertEqual(form.fields["form-0-name"][0].value, "Bart") diff --git a/ynr/apps/bulk_adding/views/sopns.py b/ynr/apps/bulk_adding/views/sopns.py index 
e0003e9c9e..6564db69a5 100644 --- a/ynr/apps/bulk_adding/views/sopns.py +++ b/ynr/apps/bulk_adding/views/sopns.py @@ -123,12 +123,8 @@ def get(self, request, *args, **kwargs): return super().get(request, *args, **kwargs) def get_active_parser(self) -> Optional[SOPNParsingBackends]: - if self.request.GET.get("v1_parser"): - return SOPNParsingBackends.CAMELOT if self.ballot.rawpeople.textract_data: return SOPNParsingBackends.TEXTRACT - if self.ballot.rawpeople.data: - return SOPNParsingBackends.CAMELOT return None def get_context_data(self, **kwargs): diff --git a/ynr/apps/elections/templates/elections/includes/_sopn_debug.html b/ynr/apps/elections/templates/elections/includes/_sopn_debug.html index c61ab936ce..620857b058 100644 --- a/ynr/apps/elections/templates/elections/includes/_sopn_debug.html +++ b/ynr/apps/elections/templates/elections/includes/_sopn_debug.html @@ -7,28 +7,12 @@

Parsing Status

-

Camelot raw Data

- {% if object.sopn.camelotparsedsopn.raw_data %} -
{{ object.sopn.camelotparsedsopn.as_pandas.to_dict|pprint }}
- {% else %} - N/A - {% endif %} - -

Camelot table Data

- {% if object.sopn.camelotparsedsopn.data_as_html %} - {{ object.sopn.camelotparsedsopn.data_as_html|safe }} - {% else %} - N/A - {% endif %} -
- {% if textract_parsed and textract_parsed.as_textractor_document %}

AWS extracted table{{ textract_parsed.as_textractor_document.tables|pluralize }}

diff --git a/ynr/apps/official_documents/models.py b/ynr/apps/official_documents/models.py index 952aedf866..3c283950eb 100644 --- a/ynr/apps/official_documents/models.py +++ b/ynr/apps/official_documents/models.py @@ -4,7 +4,6 @@ from typing import List from candidates.models import Ballot -from django.conf import settings from django.core.files.base import ContentFile from django.core.validators import FileExtensionValidator from django.db import models @@ -258,7 +257,6 @@ def parse(self): """ - from sopn_parsing.helpers.extract_tables import extract_ballot_table from sopn_parsing.helpers.textract_helpers import ( NotUsingAWSException, TextractSOPNHelper, @@ -274,12 +272,6 @@ def parse(self): # There's a cron job that should pick up the result and carry on parsing later. textract_helper.start_detection() - if getattr( - settings, "CAMELOT_ENABLED", False - ) and self.uploaded_file.name.endswith(".pdf"): - # Camelot - extract_ballot_table(self.ballot) - class BallotSOPNHistory(BaseBallotSOPN): ballot = models.ForeignKey( diff --git a/ynr/apps/official_documents/tests/test_upload.py b/ynr/apps/official_documents/tests/test_upload.py index 03423f9ce7..61f015a280 100644 --- a/ynr/apps/official_documents/tests/test_upload.py +++ b/ynr/apps/official_documents/tests/test_upload.py @@ -2,7 +2,6 @@ import textwrap from os.path import dirname, join, realpath from pathlib import Path -from unittest import skipIf from candidates.models import LoggedAction from candidates.tests.auth import TestUserMixin @@ -27,7 +26,6 @@ EXAMPLE_DOCX_FILENAME, EXAMPLE_HTML_FILENAME, ) -from sopn_parsing.tests import should_skip_conversion_tests from webtest import Upload TEST_MEDIA_ROOT = realpath( @@ -114,20 +112,8 @@ def test_upload_authorized(self): with open(self.example_image_filename, "rb") as f: form["uploaded_file"] = Upload("pilot.jpg", f.read()) - # TODO: Add back in - # with patch( - # "official_documents.views.extract_pages_for_ballot" - # ) as extract_pages, patch( - # 
"official_documents.views.extract_ballot_table" - # ) as extract_tables, patch( - # "official_documents.views.parse_raw_data_for_ballot" - # ) as parse_tables: response = form.submit() self.assertEqual(response.status_code, 302) - # TODO: Add back in - # extract_pages.assert_called_once() - # extract_tables.assert_called_once() - # parse_tables.assert_called_once() ballot_sopns = BallotSOPN.objects.all() self.assertEqual(ballot_sopns.count(), 1) @@ -155,9 +141,6 @@ def test_upload_authorized(self): ) self.assertInHTML("Update SOPN", response.text) - @skipIf( - should_skip_conversion_tests(), "Required conversion libs not installed" - ) def test_docx_upload_form_validation(self): self.assertFalse(LoggedAction.objects.exists()) response = self.app.get( @@ -181,26 +164,11 @@ def test_docx_upload_form_validation(self): with open(self.example_docx_filename, "rb") as f: form["uploaded_file"] = Upload("pilot.docx", f.read()) - # TODO: add back in - # with patch( - # "official_documents.views.extract_pages_for_ballot" - # ) as extract_pages, patch( - # "official_documents.views.extract_ballot_table" - # ) as extract_tables, patch( - # "official_documents.views.parse_raw_data_for_ballot" - # ) as parse_tables: response = form.submit() self.assertEqual(response.status_code, 302) - # TODO Add back in - # extract_pages.assert_called_once() - # extract_tables.assert_called_once() - # parse_tables.assert_called_once() self.assertEqual(BallotSOPN.objects.count(), 1) self.assertEqual(response.location, self.ballot.get_sopn_url()) - @skipIf( - should_skip_conversion_tests(), "Required conversion libs not installed" - ) def test_html_upload_form_validation(self): self.assertFalse(LoggedAction.objects.exists()) response = self.app.get( @@ -229,9 +197,6 @@ def test_html_upload_form_validation(self): response.text, ) - @skipIf( - should_skip_conversion_tests(), "Required conversion libs not installed" - ) def test_jpg_form_validation(self): 
self.assertFalse(LoggedAction.objects.exists()) response = self.app.get( @@ -256,9 +221,6 @@ def test_jpg_form_validation(self): self.assertEqual(response.status_code, 302) self.assertEqual(BallotSOPN.objects.count(), 1) - @skipIf( - should_skip_conversion_tests(), "Required conversion libs not installed" - ) def test_update_existing_sopn(self): self.assertFalse(LoggedAction.objects.exists()) response = self.app.get( diff --git a/ynr/apps/sopn_parsing/helpers/extract_tables.py b/ynr/apps/sopn_parsing/helpers/extract_tables.py deleted file mode 100644 index 0b610c7847..0000000000 --- a/ynr/apps/sopn_parsing/helpers/extract_tables.py +++ /dev/null @@ -1,63 +0,0 @@ -import json - -import pandas as pd -from sopn_parsing.helpers.text_helpers import NoTextInDocumentError, clean_text -from sopn_parsing.models import CamelotParsedSOPN - - -def extract_ballot_table(ballot, parse_flavor="lattice"): - """ - Given a OfficialDocument model, update or create a CamelotParsedSOPN model with the - contents of the table as a JSON string. - - :type ballot: candidates.models.Ballot - - """ - import camelot # import here to avoid import error running tests without pdf deps installed - - document = ballot.sopn - try: - tables = camelot.read_pdf( - document.uploaded_file.path, - pages="all", - flavor=parse_flavor, - ) - except (NotImplementedError, AttributeError): - # * NotImplementedError is thrown if the PDF is an image or generally - # unreadable. - # * AttributeError is thrown on some PDFs saying they need a password. 
- # Assume this is a bug in camelot, and ignore these PDFs - raise NoTextInDocumentError() - - # Tables can span pages, camelot assumes they're different tables, so we - # need to join them back together - table_list = [] - for table in tables: - table_list.append(table) - table_list.sort(key=lambda t: (t.page, t.order)) - - if not table_list: - return None - - table_data = table_list.pop(0).df - - for table in table_list: - # It's possible to have the "situation of poll" document on the SOPN - # Ignore any table that contains "polling station" (SOPNs tables don't) - table = table.df - first_row = table.iloc[0].to_string() - - if "polling station" in clean_text(first_row): - break - # Append the continuation table to the first one in the document. - # ignore_index is needed so the e.g table 2 row 1 doesn't replace - # table 1 row 1 - table_data = pd.concat([table_data, table], ignore_index=True) - - if not table_data.empty: - parsed, _ = CamelotParsedSOPN.objects.update_or_create( - sopn=document, - defaults={"raw_data": json.dumps(table_data.to_dict())}, - ) - return parsed - return None diff --git a/ynr/apps/sopn_parsing/helpers/parse_tables.py b/ynr/apps/sopn_parsing/helpers/parse_tables.py index 247d0e7e31..e8152b0816 100644 --- a/ynr/apps/sopn_parsing/helpers/parse_tables.py +++ b/ynr/apps/sopn_parsing/helpers/parse_tables.py @@ -476,20 +476,12 @@ def parse_dataframe(ballot: Ballot, df: DataFrame): def parse_raw_data(ballot: Ballot, reparse=False): """ - Given a Ballot, go and get the Camelot and the AWS Textract dataframes + Given a Ballot, go and get the AWS Textract dataframes and process them """ - camelot_model = getattr(ballot.sopn, "camelotparsedsopn", None) - camelot_data = {} textract_model = getattr(ballot.sopn, "awstextractparsedsopn", None) textract_data = {} - if ( - camelot_model - and camelot_model.raw_data_type == "pandas" - and (reparse or not camelot_model.parsed_data) - ): - camelot_data = parse_dataframe(ballot, camelot_model.as_pandas) if 
( textract_model and textract_model.raw_data @@ -500,7 +492,7 @@ def parse_raw_data(ballot: Ballot, reparse=False): textract_model.parse_raw_data() textract_data = parse_dataframe(ballot, textract_model.as_pandas) - if camelot_data or textract_data: + if textract_data: # Check there isn't a rawpeople object from another (better) source rawpeople_qs = RawPeople.objects.filter(ballot=ballot).exclude( source_type=RawPeople.SOURCE_PARSED_PDF @@ -510,7 +502,7 @@ def parse_raw_data(ballot: Ballot, reparse=False): RawPeople.objects.update_or_create( ballot=ballot, defaults={ - "data": camelot_data or "", + "data": "", "textract_data": textract_data or "", "source": "Parsed from {}".format( ballot.sopn.source_url @@ -525,17 +517,10 @@ def parse_raw_data(ballot: Ballot, reparse=False): return # We've done the parsing, so let's still save the result storage = DefaultStorage() - storage.save( - f"raw_people/camelot_{ballot.ballot_paper_id}.json", - ContentFile(json.dumps(camelot_data, indent=4).encode("utf8")), - ) storage.save( f"raw_people/textract_{ballot.ballot_paper_id}.json", ContentFile(json.dumps(textract_data, indent=4).encode("utf8")), ) - if camelot_model: - ballot.sopn.camelotparsedsopn.status = "parsed" - ballot.sopn.camelotparsedsopn.save() if textract_model: ballot.sopn.awstextractparsedsopn.status = "parsed" ballot.sopn.awstextractparsedsopn.save() diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_extract_tables.py b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_extract_tables.py deleted file mode 100644 index 3a4e091290..0000000000 --- a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_extract_tables.py +++ /dev/null @@ -1,29 +0,0 @@ -from sopn_parsing.helpers.command_helpers import BaseSOPNParsingCommand -from sopn_parsing.helpers.extract_tables import extract_ballot_table -from sopn_parsing.helpers.text_helpers import NoTextInDocumentError - - -class Command(BaseSOPNParsingCommand): - help = """ - Parse tables out of PDFs in 
to CamelotParsedSOPN models for later parsing. - """ - - def handle(self, *args, **options): - qs = self.get_queryset(options) - filter_kwargs = {} - if not options["ballot"] and not options["testing"]: - if not options["reparse"]: - filter_kwargs["sopn__camelotparsedsopn"] = None - - qs = qs.filter(**filter_kwargs) - for ballot in qs: - try: - extract_ballot_table(ballot) - except NoTextInDocumentError: - self.stdout.write( - f"{ballot} raised a NoTextInDocumentError trying to extract tables" - ) - except ValueError: - self.stdout.write( - f"{ballot} raised a ValueError trying extract tables" - ) diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_parse_tables.py b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_parse_tables.py deleted file mode 100644 index 26448b697f..0000000000 --- a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_parse_tables.py +++ /dev/null @@ -1,67 +0,0 @@ -from bulk_adding.models import RawPeople -from sopn_parsing.helpers.command_helpers import BaseSOPNParsingCommand -from sopn_parsing.helpers.parse_tables import parse_raw_data_for_ballot - - -class Command(BaseSOPNParsingCommand): - help = """ - Convert the raw extracted tables on the CamelotParsedSOPN model to a parsed - RawPeople model, and set the status as parsed. 
- - """ - - def build_filter_kwargs(self, options): - """ - Build kwargs used to filter the BallotQuerySet that is parsed - - Always skip any ballots where we do not have a CamelotParsedSOPN to try to - extract candidates from - - When test flag is used, dont make any changes - - When parsing a single ballot, dont make any changes - - When reparsing, only use ballots where we have previously created a - RawPeople object from a CamelotParsedSOPN - - Otherwise filter by unparsed CamelotParsedSOPN objects - """ - # Always skip any ballots where we do not have a CamelotParsedSOPN to try to - # extract candidates from - filter_kwargs = {} - if options.get("testing"): - return filter_kwargs - - if options.get("ballot"): - return filter_kwargs - - if options.get("reparse"): - filter_kwargs[ - "rawpeople__source_type" - ] = RawPeople.SOURCE_PARSED_PDF - return filter_kwargs - - return filter_kwargs - - def handle(self, *args, **options): - # filters that we never change with args. These two would raise - # ValueErrors in the parse_raw_data_for_ballot function - base_qs = self.get_queryset(options) - filter_kwargs = self.build_filter_kwargs(options) - - qs = base_qs.filter(**filter_kwargs) - qs = qs.filter( - candidates_locked=False, # Never parse a locked ballot - suggestedpostlock=None, # Never parse a ballot with lock suggestions - ) - - if not qs.exists(): - msg = ["No ballots to parse found."] - - if options.get("ballot"): - msg.append( - "This ballot might be locked or have lock suggestions" - ) - - self.stderr.write("\n".join(msg)) - - for ballot in qs: - try: - parse_raw_data_for_ballot(ballot, options["reparse"]) - except ValueError as e: - print(str(e)) diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_process_unparsed.py b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_process_unparsed.py index 41db0e1f57..7b38b54b6a 100644 --- a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_process_unparsed.py +++ 
b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_process_unparsed.py @@ -1,4 +1,3 @@ -from django.conf import settings from django.core.management.base import BaseCommand from sopn_parsing.helpers.parse_tables import parse_raw_data_for_ballot from sopn_parsing.helpers.textract_helpers import ( @@ -8,7 +7,6 @@ from sopn_parsing.models import ( AWSTextractParsedSOPN, AWSTextractParsedSOPNStatus, - CamelotParsedSOPN, ) @@ -21,22 +19,16 @@ class Command(BaseCommand): This script picks up where `parse` left off. It manages two cases: - # Camelot - - We expect to have made a `CamelotParsedSOPN` with `raw_data` populated. This will only have - happened if the file is a PDF readable by Camelot. - - We need to parse the `raw_data` into `parsed_data` and then make a `RawData` object for bulk adding. - # AWS Textract - We should have made a `AWSTextractParsedSOPN` with `job_id` populated. Textract is async, - so the initial `parse` just submits the data to AWS and gets a job_id. + We should have made an `AWSTextractParsedSOPN` with `job_id` populated. + Textract is async, so the initial `parse` just submits the data to AWS and + gets a job_id. We need to check if the job ID has finished and pull in the data to `raw_data`. - We're then in the same state as the Camelot method above, we need to parse the `raw_data` into - `parsed_data` and makr a `RawData` object for bulk adding. + We need to parse the `raw_data` into `parsed_data` and make a `RawData` + object for bulk adding. 
""" def handle(self, *args, **options): @@ -45,15 +37,6 @@ def handle(self, *args, **options): "sopn__ballot__candidates_locked": False, } - if getattr(settings, "CAMELOT_ENABLED", False): - # Camelot first - qs = ( - CamelotParsedSOPN.objects.filter(parsed_data=None) - .exclude(raw_data="") - .filter(**current_ballot_kwargs) - ) - self.parse_tables_for_qs(qs) - # Textract qs = AWSTextractParsedSOPN.objects.exclude( status__in=[ diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_clear_existing_objects.py b/ynr/apps/sopn_parsing/management/commands/sopn_tooling_clear_existing_objects.py deleted file mode 100644 index cb68ffdf02..0000000000 --- a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_clear_existing_objects.py +++ /dev/null @@ -1,27 +0,0 @@ -from bulk_adding.models import RawPeople -from django.conf import settings -from django.core.management.base import BaseCommand -from official_documents.models import OfficialDocument - - -class Command(BaseCommand): - """ - Used to quickly delete existing objects used when testing SOPN - parsing so that you can start fresh for example, when you want - to start testing a new set of SOPNs. 
- """ - - def print_deleted(self, deleted_dict): - for object, count in deleted_dict.items(): - self.stdout.write(f"Deleted {count} {object}") - - def handle(self, *args, **options): - if settings.SETTINGS_MODULE != "ynr.settings.sopn_testing": - raise ValueError( - "You are trying to run this command outside of SOPN testing environment" - ) - - deleted_dict = {} - deleted_dict.update(OfficialDocument.objects.all().delete()[1]) - deleted_dict.update(RawPeople.objects.all().delete()[1]) - self.print_deleted(deleted_dict) diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_compare_raw_people.py b/ynr/apps/sopn_parsing/management/commands/sopn_tooling_compare_raw_people.py deleted file mode 100644 index dbe5eb913a..0000000000 --- a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_compare_raw_people.py +++ /dev/null @@ -1,237 +0,0 @@ -import json -import os -from collections import Counter - -from bulk_adding.models import RawPeople -from candidates.models import Ballot -from django.core.management import call_command -from official_documents.models import OfficialDocument -from popolo.models import Membership -from sopn_parsing.helpers.command_helpers import BaseSOPNParsingCommand -from sopn_parsing.models import CamelotParsedSOPN - - -class Command(BaseSOPNParsingCommand): - CORRECT_EXACTLY = "correct_exactly" - NUM_CORRECT_MISSING_PARTIES = "num_correct_some_parties_missing" - NUM_INCORRECT = "num_incorrect" - ZERO_CANDIDATES = "zero_candidates" - - def add_arguments(self, parser): - super().add_arguments(parser) - parser.add_argument("--loud", action="store_true", default=False) - - def handle(self, *args, **options): - """ - - Check we have a baseline file to compare with - - Prepare some OfficialDocuments - - Re-parse the documents - - Loop through the created RawPeople objects, comparing to our baseline - to make sure that we are parsing at least as many people as before - - If no asserts failed, use the data to write a new baseline file - 
""" - - self.loud = options.pop("loud") - - self.candidates_results = { - "correct_exactly": [], - "num_correct_some_parties_missing": [], - "num_incorrect": [], - "zero_candidates": [], - } - - raw_people_file = "ynr/apps/sopn_parsing/tests/data/sopn_baseline.json" - if not os.path.isfile(raw_people_file): - call_command("sopn_tooling_write_baseline") - self.stdout.write("Baseline file didn't exist so one was created") - - options.update({"testing": True}) - - OfficialDocument.objects.update(relevant_pages="") - call_command("sopn_parsing_extract_page_numbers", *args, **options) - CamelotParsedSOPN.objects.all().delete() - call_command("sopn_parsing_extract_tables", *args, **options) - RawPeople.objects.all().delete() - call_command("sopn_parsing_parse_tables", *args, **options) - - with open(raw_people_file) as file: - old_raw_people = json.loads(file.read()) - - self.new_raw_people = {} - for ballot in Ballot.objects.exclude(officialdocument__isnull=True): - ballot_data = old_raw_people.get(ballot.ballot_paper_id, {}) - - self.compare_relevant_pages(ballot=ballot, ballot_data=ballot_data) - - self.compare_raw_people(ballot=ballot, ballot_data=ballot_data) - - # display some overall totals - self.stdout.write( - "Old total 'people' parsed WAS {old}\n" - "New total 'people' parsed IS {new}".format( - old=self.count_people_parsed(old_raw_people), - new=self.count_people_parsed(self.new_raw_people), - ) - ) - - old_raw_people_obj_count = len( - {k: v for k, v in old_raw_people.items() if v["raw_people"]} - ) - new_raw_people_obj_count = RawPeople.objects.count() - style = self.style.SUCCESS - if new_raw_people_obj_count < old_raw_people_obj_count: - style = self.style.ERROR - self.stdout.write( - style( - f"Old RawPeople count: {old_raw_people_obj_count}\n" - f"New total RawPeople count: {new_raw_people_obj_count}" - ) - ) - - for result, ballots in self.candidates_results.items(): - total = len(ballots) - self.stdout.write(f"{total} ballots parsed {result}") - # 
Write a new baseline - call_command("sopn_tooling_write_baseline") - - def compare_relevant_pages(self, ballot, ballot_data): - old_relevant_pages = ballot_data.get("relevant_pages", "") - new_relevant_pages = ballot.sopn.relevant_pages - - if old_relevant_pages != new_relevant_pages: - self.stdout.write( - self.style.WARNING( - f"RELEVANT PAGES CHANGED FROM {old_relevant_pages} to {new_relevant_pages} for {ballot.ballot_paper_id}" - ) - ) - - def compare_raw_people(self, ballot, ballot_data): - try: - raw_people = ballot.rawpeople.data - except RawPeople.DoesNotExist: - raw_people = [] - - old_raw_people_for_ballot = ballot_data.get("raw_people", []) - old_count = len(old_raw_people_for_ballot) - new_count = len(raw_people) - if new_count < old_count: - self.stderr.write( - f"Uh oh, parsed people for {ballot.ballot_paper_id} decreased from {old_count} to {new_count}. Stopping." - ) - - if new_count > old_count: - self.stdout.write( - f"{ballot.ballot_paper_id} increased from {old_count} to {new_count} parsed people.\n" - f"Check the SOPN at https://candidates.democracyclub.org.uk{ballot.get_sopn_url()}." - ) - for person in raw_people: - if person not in old_raw_people_for_ballot: - self.stdout.write(self.style.SUCCESS(person)) - - # when people parsed have changed e.g. different name/different party print it for further checking - changed_people = [ - person - for person in old_raw_people_for_ballot - if person not in raw_people - ] - if changed_people: - self.stdout.write( - self.style.WARNING( - f"Parsed data changed for {ballot.ballot_paper_id}\n" - f"New raw people data:\n" - f"{raw_people}\n" - "Missing people:" - ) - ) - for person in changed_people: - self.stderr.write(str(person)) - - self.new_raw_people[ballot.ballot_paper_id] = {"raw_people": raw_people} - - self.parties_correct(ballot, raw_people) - - def count_people_parsed(self, raw_people_data): - """ - Returns the total number of "people" that were parsed. 
- NB that just because something was parsed, it doesnt mean that it was - accurately parsed. Therefore this total is best used to look for large - changes that should then be checked in detail. - """ - return sum( - [len(data["raw_people"]) for data in raw_people_data.values()] - ) - - def parties_correct(self, ballot, raw_people_for_ballot): - candidates = Membership.objects.filter(ballot=ballot) - if not candidates: - self.stdout.write( - self.style.WARNING( - f"We dont have candidates for {ballot.ballot_paper_id}. Try updating with the live site first?" - ) - ) - - if not raw_people_for_ballot: - self.candidates_results[self.ZERO_CANDIDATES].append( - ballot.ballot_paper_id - ) - return None - - num_candidates_correct = candidates.count() == len( - raw_people_for_ballot - ) - - if self.loud: - if num_candidates_correct: - self.stdout.write( - self.style.SUCCESS( - f"Correct number of people parsed as expected for {ballot.ballot_paper_id}" - ) - ) - else: - self.stdout.write( - self.style.ERROR( - f"Incorrect number of people parsed for {ballot.ballot_paper_id}" - ) - ) - - parsed = sorted( - [person["party_id"] for person in raw_people_for_ballot] - ) - expected = list( - candidates.values_list("party__ec_id", flat=True).order_by( - "party__ec_id" - ) - ) - - if parsed == expected: - return self.candidates_results[self.CORRECT_EXACTLY].append( - ballot.ballot_paper_id - ) - - # count number of each missing party ID as there could be more than one - # missing candidate for a party e.g. 
1 missing Green, 2 missing independents - parsed = Counter(parsed) - expected = Counter(expected) - missing = expected - parsed - if missing: - total = sum(missing.values()) - self.stderr.write( - f"{total} MISSING parties for {ballot.ballot_paper_id} (party_id:num_missing)\n{missing}" - ) - else: - # sometimes we incorrectly parse extra people - often independents - # due to an empty row - extras = parsed - expected - total = sum(extras.values()) - self.stderr.write( - f"{total} EXTRA parties for {ballot.ballot_paper_id}\n{extras}" - ) - - if num_candidates_correct: - return self.candidates_results[ - self.NUM_CORRECT_MISSING_PARTIES - ].append(ballot.ballot_paper_id) - - return self.candidates_results[self.NUM_INCORRECT].append( - ballot.ballot_paper_id - ) diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_create_official_documents.py b/ynr/apps/sopn_parsing/management/commands/sopn_tooling_create_official_documents.py deleted file mode 100644 index e7c3f3e1b2..0000000000 --- a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_create_official_documents.py +++ /dev/null @@ -1,124 +0,0 @@ -import requests -from candidates.models import Ballot -from django.conf import settings -from django.core.files.base import ContentFile -from django.core.management.base import BaseCommand -from elections.models import Election -from official_documents.models import OfficialDocument - - -class Command(BaseCommand): - """This command uses the ballots endpoint to loop over each - ballot and store each sopn pdf (uploaded_file) locally""" - - def add_arguments(self, parser): - parser.add_argument( - "--date", - "-d", - action="store", - help="Election date in ISO format, defaults to 2021-05-06", - default="2021-05-06", - type=str, - ) - parser.add_argument( - "--site_url", - "-u", - action="store", - help="URL of site to download from", - default="https://candidates.democracyclub.org.uk/", - type=str, - ) - parser.add_argument( - "--election-count", - "-c", - 
action="store", - help="URL of site to download from", - default=50, - type=int, - ) - parser.add_argument( - "--election-slugs", "-s", action="store", required=False - ) - - def handle(self, *args, **options): - site_url = options.get("site_url") - election_date = options.get("date") - election_count = options.get("election_count") - - if options["election_slugs"]: - election_slugs = options["election_slugs"].split(",") - else: - election_slugs = Election.objects.filter( - election_date=election_date - ).values_list("slug", flat=True)[:election_count] - - for slug in election_slugs: - url = f"{site_url}api/next/ballots/?has_sopn=1&page_size=200&election_id={slug}&auth_token={settings.YNR_API_KEY}" - self.create_official_documents(url=url) - - def create_official_documents(self, url): - data = requests.get(url=url).json() - try: - next_page = data["next"] - except KeyError: - next_page = None - if "results" in data: - for ballot_data in data["results"]: - ballot = Ballot.objects.get( - ballot_paper_id=ballot_data["ballot_paper_id"] - ) - sopn_data = ballot_data["sopn"] - - # if we already have the SOPN no need to recreate - if ballot.officialdocument_set.filter( - source_url=sopn_data["source_url"] - ).exists(): - self.stdout.write( - f"SOPN already exists for {ballot.ballot_paper_id}" - ) - continue - - # check if we already have an OfficialDocument with this source - # downloaded - official_document = OfficialDocument.objects.filter( - source_url=sopn_data["source_url"] - ).first() - if official_document: - # if so we dont need to redownload the file, we can create a new - # object for this ballot with the same file - self.stdout.write( - f"Found SOPN for source {sopn_data['source_url']}" - ) - OfficialDocument.objects.create( - ballot=ballot, - source_url=sopn_data["source_url"], - uploaded_file=official_document.uploaded_file, - document_type=OfficialDocument.NOMINATION_PAPER, - ) - continue - - # otherwise we dont have this file stored already, so download it 
as - # part of creating the OfficialDocument - self.stdout.write( - f"Downloading SOPN from {sopn_data['uploaded_file']}" - ) - file_response = requests.get(sopn_data["uploaded_file"]) - file_object = ContentFile(content=file_response.content) - official_document = OfficialDocument( - ballot=ballot, - source_url=sopn_data["source_url"], - document_type=OfficialDocument.NOMINATION_PAPER, - ) - file_extension = sopn_data["uploaded_file"].split(".")[-1] - filename = f"{ballot.ballot_paper_id}.{file_extension}" - official_document.uploaded_file.save( - name=filename, content=file_object - ) - else: - self.stdout.write("No results found") - - # this should only be the case where the election object has > 200 - # ballots e.g. parliamentary elections - if next_page: - return self.create_official_documents(url=next_page) - return None diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_write_baseline.py b/ynr/apps/sopn_parsing/management/commands/sopn_tooling_write_baseline.py deleted file mode 100644 index 07ae9309cd..0000000000 --- a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_write_baseline.py +++ /dev/null @@ -1,51 +0,0 @@ -import json -import os - -from bulk_adding.models import RawPeople -from candidates.models import Ballot -from django.core.management.base import BaseCommand -from django.db.models import Q - - -class Command(BaseCommand): - """ - Creates a JSON file to represent ballots that have an Officialdocument. - Only include ballots where: - - The source of the RawPeople is from parsing a PDF - - No RawPeople were created from the OfficialDocument. 
This is so that we - will know if we make make improvements that mean more RawPeople are parsed - from an OfficialDocument - """ - - def add_arguments(self, parser): - parser.add_argument( - "--data", - action="store", - help="Dictionary of raw people to write as a baseline", - ) - - def handle(self, *args, **options): - json_data = options["data"] or {} - - if not json_data: - qs = Ballot.objects.exclude(officialdocument__isnull=True).filter( - Q(rawpeople__source_type=RawPeople.SOURCE_PARSED_PDF) - | Q(rawpeople__isnull=True) - ) - for ballot in qs: - raw_people = getattr(ballot, "rawpeople", []) - try: - raw_people = ballot.rawpeople.data - except RawPeople.DoesNotExist: - raw_people = [] - - json_data[ballot.ballot_paper_id] = { - "raw_people": raw_people, - "relevant_pages": ballot.sopn.relevant_pages, - } - - file_path = os.path.join( - os.getcwd(), "ynr/apps/sopn_parsing/tests/data/sopn_baseline.json" - ) - with open(file_path, "w") as f: - f.write(json.dumps(json_data)) diff --git a/ynr/apps/sopn_parsing/tests/__init__.py b/ynr/apps/sopn_parsing/tests/__init__.py index 0c1d689775..934f393dcf 100644 --- a/ynr/apps/sopn_parsing/tests/__init__.py +++ b/ynr/apps/sopn_parsing/tests/__init__.py @@ -5,12 +5,3 @@ def should_skip_pdf_tests(): return False except ImportError: return True - - -def should_skip_conversion_tests(): - try: - import pypandoc # noqa - - return False - except ImportError: - return True diff --git a/ynr/apps/sopn_parsing/tests/test_extract_tables.py b/ynr/apps/sopn_parsing/tests/test_extract_tables.py index 21a03dfb63..bda0d99fd3 100644 --- a/ynr/apps/sopn_parsing/tests/test_extract_tables.py +++ b/ynr/apps/sopn_parsing/tests/test_extract_tables.py @@ -1,15 +1,10 @@ from os.path import abspath, dirname, join -from unittest import skipIf from candidates.tests.helpers import TmpMediaRootMixin from candidates.tests.uk_examples import UK2015ExamplesMixin from django.core.files.uploadedfile import SimpleUploadedFile -from django.core.management 
import call_command from django.test import TestCase from official_documents.models import BallotSOPN -from sopn_parsing.helpers.extract_tables import extract_ballot_table -from sopn_parsing.models import CamelotParsedSOPN -from sopn_parsing.tests import should_skip_pdf_tests class TestSOPNHelpers(TmpMediaRootMixin, UK2015ExamplesMixin, TestCase): @@ -27,152 +22,3 @@ def setUp(self): uploaded_file=SimpleUploadedFile("sopn.pdf", sopn_file), source_url="example.com", ) - - @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed") - def test_extract_tables(self): - extract_ballot_table(self.dulwich_post_ballot) - self.assertEqual( - CamelotParsedSOPN.objects.get().as_pandas.to_dict(), - { - "0": { - "0": "Name of \nCandidate", - "1": "ALAGARATNAM \nRathy", - "2": "BARBER \nJames", - "3": "HAYES \nHelen Elizabeth", - "4": "KANUMANSA \nAmadu", - "5": "KOTECHA \nResham", - "6": "LAMBERT \nRobin Andrew \nDavid", - "7": "NALLY \nSteve", - "8": "NIX \nRashid", - }, - "1": { - "0": "Home \nAddress", - "1": "(address in the \nMitcham and Morden \nConstituency)", - "2": "33 Champion Hill, \nLondon, SE5 8BS", - "3": "11 Woodsyre, \nSydenham Hill, \nLondon, SE26 6SS", - "4": "11 Coleridge House, \nBrowning Street, \nLondon, SE17 1DG", - "5": "(address in the \nRuislip, Northwood \nand Pinner \nConstituency)", - "6": "(address in the \nDuwlich and West \nNorwood \nConstituency)", - "7": "(address in the \nVauxhall \nConstituency)", - "8": "66 Guinness Court, \nLondon, SW3 2PQ", - }, - "2": { - "0": "Description \n(if any)", - "1": "UK Independence \nParty (UKIP)", - "2": "Liberal Democrat", - "3": "Labour Party", - "4": "All People`s Party", - "5": "The Conservative \nParty Candidate", - "6": "Independent", - "7": "Trade Unionist \nand Socialist \nCoalition", - "8": "The Green Party", - }, - "3": { - "0": "Name of Assentors \nProposer(+), Seconder(++)", - "1": "Coleman Alice M + \n" - "Potter Keith S ++ \n" - "Potter Stephanie \n" - "Smith Bryan L \n" - "Anderson Beth 
\n" - "Lumba Avita \n" - "Andersen Robert \n" - "Patel Sajal \n" - "Stanbury Linda \n" - "Stanbury James", - "2": "Fitchett Keith + \n" - "Price Jonathan ++ \n" - "Gardner Brigid \n" - "Waddington Simon \n" - "Morland Laura \n" - "Lester Rachel \n" - "Pidgeon Caroline \n" - "Hare David \n" - "Hanton Alastair \n" - "Haylett Alexander", - "3": "Samuel Gaynelle + \n" - "Whaley Stephen P ++ \n" - "Brazell Shadi M \n" - "De Souza Johnny \n" - "Alcock Heather \n" - "Natzler Robert S \n" - "Pearce Michelle E \n" - "Pickering Robert \n" - "Richardson Katherine G \n" - "Pickard Jane", - "4": "King James + \n" - "King Rosemary ++ \n" - "King David \n" - "Davies Yadalieu \n" - "Sesay Mary \n" - "Rahman Layla K \n" - "Rahman Syed A \n" - "Ahmed Jalaluddin \n" - "Rahman Tajwar S \n" - "Rahman Taamid S", - "5": "Davis James G + \n" - "Bradbury David S ++ \n" - "Badman Susan E \n" - "Hill-Archer Roderick C \n" - "Langley Anne C \n" - "Mitchell Andrew M \n" - "Virgo Marjorie J \n" - "Virgo Philip A \n" - "Chathli Lindsay \n" - "Broomhead Robert A", - "6": "Smith Caitlin + \n" - "Parks Jesse ++ \n" - "Connage Kyesha \n" - "Hendry Perihan \n" - "Mounty E J \n" - "Sharif B \n" - "Scott Wellesley \n" - "Harriott S A \n" - "Harriott Clive \n" - "Ojumu Ibi", - "7": "Tullis Andrew C + \n" - "Mason Joshua H ++ \n" - "Parkinson Francine M \n" - "Gait Elizabeth \n" - "Doolan Samantha \n" - "Ubiaro Elizabeth \n" - "Garner Stuart \n" - "Akinjogbin Dolapo \n" - "Walker Donna \n" - "Lang Geoffrey P", - "8": "Atwell E G + \n" - "Rose Lloyd ++ \n" - "O`Shea C \n" - "Gomes Jacqueline \n" - "Wood Thomas \n" - "Rosenfeld David \n" - "Conroy Martin \n" - "Skiadopoulou I \n" - "Rosenfeld Lawrence \n" - "Rosenfeld Emily", - }, - "4": { - "0": "Reason why \nno longer \nnominated*", - "1": "", - "2": "", - "3": "", - "4": "", - "5": "", - "6": "", - "7": "", - "8": "", - }, - }, - ) - - @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed") - def test_extract_command_current(self): - 
self.assertEqual(CamelotParsedSOPN.objects.count(), 0) - call_command("sopn_parsing_extract_tables", current=True) - self.assertEqual(CamelotParsedSOPN.objects.count(), 1) - - def test_extract_command_current_no_current_elections(self): - self.election.current = False - self.election.save() - self.assertEqual(CamelotParsedSOPN.objects.count(), 0) - call_command("sopn_parsing_extract_tables", current=True) - self.assertEqual(CamelotParsedSOPN.objects.count(), 0) diff --git a/ynr/apps/sopn_parsing/tests/test_parse_tables.py b/ynr/apps/sopn_parsing/tests/test_parse_tables.py deleted file mode 100644 index 922c487dd4..0000000000 --- a/ynr/apps/sopn_parsing/tests/test_parse_tables.py +++ /dev/null @@ -1,529 +0,0 @@ -import json -from pathlib import Path -from unittest import skipIf -from unittest.mock import patch - -from bulk_adding.models import RawPeople -from candidates.tests.uk_examples import UK2015ExamplesMixin -from django.core.management import call_command -from django.db import connection -from django.test import TestCase -from official_documents.models import BallotSOPN -from pandas import Index, Series -from parties.models import Party, PartyDescription -from parties.tests.factories import PartyFactory -from parties.tests.fixtures import DefaultPartyFixtures -from sopn_parsing.helpers import parse_tables -from sopn_parsing.models import CamelotParsedSOPN -from sopn_parsing.tests import should_skip_pdf_tests -from sopn_parsing.tests.data.welsh_sopn_data import welsh_sopn_data - -from ynr.apps.sopn_parsing.management.commands.sopn_parsing_parse_tables import ( - Command as ParseTablesCommand, -) - - -class TestSOPNHelpers(DefaultPartyFixtures, UK2015ExamplesMixin, TestCase): - def setUp(self): - PartyFactory(ec_id="PP85", name="UK Independence Party (UKIP)") - with connection.cursor() as cursor: - cursor.execute("CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;") - - @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed") - def 
test_basic_parsing(self): - self.assertFalse(RawPeople.objects.exists()) - doc = BallotSOPN.objects.create( - ballot=self.dulwich_post_ballot, - source_url="example.com", - ) - dataframe = json.dumps( - { - "0": { - "0": "Name of \nCandidate", - "1": "BRADBURY \nAndrew John", - "2": "COLLINS \nDave", - "3": "HARVEY \nPeter John", - "4": "JENNER \nMelanie", - }, - "1": { - "0": "Home Address", - "1": "10 Fowey Close, \nShoreham by Sea, \nWest Sussex, \nBN43 5HE", - "2": "51 Old Fort Road, \nShoreham by Sea, \nBN43 5RL", - "3": "76 Harbour Way, \nShoreham by Sea, \nSussex, \nBN43 5HH", - "4": "9 Flag Square, \nShoreham by Sea, \nWest Sussex, \nBN43 5RZ", - }, - "2": { - "0": "Description (if \nany)", - "1": "Green Party", - "2": "Independent", - "3": "UK Independence \nParty (UKIP)", - "4": "Labour Party", - }, - "3": { - "0": "Name of \nProposer", - "1": "Tiffin Susan J", - "2": "Loader Jocelyn C", - "3": "Hearne James H", - "4": "O`Connor Lavinia", - }, - "4": { - "0": "Reason \nwhy no \nlonger \nnominated\n*", - "1": "", - "2": "", - "3": "", - "4": "", - }, - } - ) - CamelotParsedSOPN.objects.create( - sopn=doc, raw_data=dataframe, status="unparsed" - ) - call_command("sopn_parsing_parse_tables") - self.assertEqual(RawPeople.objects.count(), 1) - raw_people = RawPeople.objects.get() - self.assertEqual( - raw_people.data, - [ - {"name": "Andrew John Bradbury", "party_id": "PP63"}, - {"name": "Dave Collins", "party_id": "ynmp-party:2"}, - {"name": "Peter John Harvey", "party_id": "PP85"}, - {"name": "Melanie Jenner", "party_id": "PP53"}, - ], - ) - - @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed") - def test_welsh_run_sopn(self): - """ - Test that if the ballot is welsh run and previous party affiliations - are included they are parsed - """ - self.assertFalse(RawPeople.objects.exists()) - doc = BallotSOPN.objects.create( - ballot=self.senedd_ballot, - source_url="example.com", - ) - - plaid_cymru, _ = Party.objects.update_or_create( - 
ec_id="PP77", - legacy_slug="party:77", - defaults={ - "name": "Plaid Cymru - The Party of Wales", - "date_registered": "1999-01-14", - }, - ) - - dataframe = json.dumps(welsh_sopn_data) - CamelotParsedSOPN.objects.create( - sopn=doc, raw_data=dataframe, status="unparsed" - ) - call_command("sopn_parsing_parse_tables") - self.assertEqual(RawPeople.objects.count(), 1) - raw_people = RawPeople.objects.get() - self.assertEqual( - raw_people.data, - [ - { - "name": "John Smith", - "party_id": self.conservative_party.ec_id, - "previous_party_affiliations": [self.ld_party.ec_id], - }, - { - "name": "Joe Bloggs", - "party_id": self.labour_party.ec_id, - "previous_party_affiliations": ["ynmp-party:2"], - }, - {"name": "Jon Doe", "party_id": self.ld_party.ec_id}, - { - "name": "Jane Brown", - "party_id": "ynmp-party:2", - "previous_party_affiliations": [plaid_cymru.ec_id], - }, - { - "name": "Judy Johnson", - "party_id": plaid_cymru.ec_id, - "previous_party_affiliations": [self.labour_party.ec_id], - }, - {"name": "Julie Williams", "party_id": "ynmp-party:2"}, - ], - ) - - @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed") - def test_match_complex_descriptions(self): - self.assertFalse(RawPeople.objects.exists()) - doc = BallotSOPN.objects.create( - ballot=self.senedd_ballot, - source_url="example.com", - ) - - plaid_cymru, _ = Party.objects.update_or_create( - ec_id="PP77", - legacy_slug="party:77", - defaults={ - "name": "Plaid Cymru - The Party of Wales", - "date_registered": "1999-01-14", - }, - ) - - dickens_heath, _ = Party.objects.update_or_create( - ec_id="PP1", - legacy_slug="PP!", - defaults={ - "name": "Independent Dickens Heath Residents Action Group", - "date_registered": "1999-01-14", - }, - ) - PartyDescription.objects.create( - party=dickens_heath, - description="Independent Dickens Heath Residents Action Group", - ) - lib_dem, _ = Party.objects.update_or_create( - ec_id="PP100", - legacy_slug="PP100", - defaults={ - "name": "Liberal 
Democrats", - "date_registered": "1999-01-14", - }, - register="GB", - ) - - PartyDescription.objects.create( - party=lib_dem, - description="Liberal Democrat Focus Team | Tîm Ffocws y Democratiaid Rhyddfrydol", - ) - - data_path = ( - Path(__file__).parent / "data/edge_case_description_data.json" - ) - with data_path.open() as f: - CamelotParsedSOPN.objects.create( - sopn=doc, raw_data=f.read(), status="unparsed" - ) - call_command("sopn_parsing_parse_tables") - self.assertEqual(RawPeople.objects.count(), 1) - raw_people = RawPeople.objects.get() - self.assertEqual( - sorted(raw_people.data, key=lambda x: x["name"]), - sorted( - [ - { - "name": "John Smith", - "party_id": self.conservative_party.ec_id, - }, - { - "name": "Joe Bloggs", - "party_id": self.labour_party.ec_id, - }, - { - "name": "Jon Doe", - "party_id": self.ld_party.ec_id, - }, - { - "name": "Jane Brown", - "party_id": "ynmp-party:2", - }, - { - "name": "Judy Johnson", - "party_id": plaid_cymru.ec_id, - }, - {"name": "Julie Williams", "party_id": "ynmp-party:2"}, - ], - key=lambda x: x["name"], - ), - ) - - -class TestParseTablesUnitTests(UK2015ExamplesMixin, TestCase): - def get_two_name_field_cases(self): - # this could be updated with more combinations as we come across them - return [ - { - "name_fields": ["candidate surname", "candidate forename"], - "row": { - "candidate surname": "BAGSHAW", - "candidate forename": "Elaine Sheila", - "home address": "1 Foo Street \n London \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - "ordered_name_fields": [ - "candidate forename", - "candidate surname", - ], - "expected_name": "Elaine Sheila Bagshaw", - }, - { - "name_fields": ["surname", "other names"], - "row": { - "surname": "BAGSHAW", - "other names": "Elaine Sheila", - "home address": "1 Foo Street \nLondon \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - "ordered_name_fields": ["other 
names", "surname"], - "expected_name": "Elaine Sheila Bagshaw", - }, - { - "name_fields": ["last name", "other names"], - "row": { - "last name": "BAGSHAW", - "other names": "Elaine Sheila", - "home address": "1 Foo Street \nLondon \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - "ordered_name_fields": ["other names", "last name"], - "expected_name": "Elaine Sheila Bagshaw", - }, - { - "name_fields": ["candidate forename", "candidate surname"], - "row": { - "candidate forename": "Elaine Sheila", - "candidate surname": "BAGSHAW", - "home address": "1 Foo Street \n London \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - "ordered_name_fields": [ - "candidate forename", - "candidate surname", - ], - "expected_name": "Elaine Sheila Bagshaw", - }, - ] - - def get_single_name_field_cases(self): - return [ - { - "name_fields": ["name of candidate"], - "row": { - "name of candidate": "BAGSHAW Elaine Sheila", - "home address": "1 Foo Street \n London \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - }, - { - "name_fields": ["names of candidate"], - "row": { - "names of candidate": "BAGSHAW Elaine Sheila", - "home address": "1 Foo Street \nLondon \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - }, - { - "name_fields": ["candidate name"], - "row": { - "candidate name": "BAGSHAW Elaine Sheila", - "home address": "1 Foo Street \nLondon \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - }, - { - "name_fields": ["surname"], - "row": { - "surname": "BAGSHAW Elaine Sheila", - "home address": "1 Foo Street \nLondon \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - }, - { - "name_fields": ["candidates surname"], - "row": { - "candidates surname": 
"BAGSHAW Elaine Sheila", - "home address": "1 Foo Street \nLondon \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - }, - { - "name_fields": ["other name"], - "row": { - "other name": "BAGSHAW Elaine Sheila", - "home address": "1 Foo Street \nLondon \nE14 6FW", - "description": "London Liberal \nDemocrats", - "reason why no longer nominated": "", - }, - }, - ] - - def test_get_name_single_field(self): - for case in self.get_single_name_field_cases(): - row = Series(case["row"]) - name_fields = case["name_fields"] - with self.subTest(name_fields=name_fields): - assert len(case["name_fields"]) == 1 - name = parse_tables.get_name(row=row, name_fields=name_fields) - assert name == "Elaine Sheila Bagshaw" - - def test_get_name_two_fields(self): - for case in self.get_two_name_field_cases(): - row = Series(case["row"]) - name_fields = case["name_fields"] - with self.subTest(name_fields=name_fields): - assert len(case["name_fields"]) == 2 - name = parse_tables.get_name(row=row, name_fields=name_fields) - assert name == case["expected_name"] - - def test_get_name_fields_single(self): - for case in self.get_single_name_field_cases(): - row = Index(case["row"]) - with self.subTest(row=row): - name_fields = parse_tables.get_name_fields(row=row) - assert len(name_fields) == 1 - assert name_fields == case["name_fields"] - - def test_get_name_fields_two(self): - for case in self.get_two_name_field_cases(): - row = Index(case["row"]) - with self.subTest(row=row): - name_fields = parse_tables.get_name_fields(row=row) - assert len(name_fields) == 2 - assert name_fields == case["name_fields"] - - def test_get_name_fields_raises_error(self): - row = Index({"foo": "Bar"}) - with self.assertRaises(ValueError): - parse_tables.get_name_fields(row=row) - - def test_order_name_fields(self): - for case in self.get_two_name_field_cases(): - name_fields = case["name_fields"] - with self.subTest(name_fields=name_fields): - result = 
parse_tables.order_name_fields(name_fields) - assert result == case["ordered_name_fields"] - - def test_clean_name_replaces_backticks(self): - name = parse_tables.clean_name("D`SOUZA") - assert "`" not in name - assert "'" in name - - def test_clean_name_replaces_newlines(self): - name = parse_tables.clean_name( - "A Very Long Name That Splits \nOver Lines" - ) - assert "\n" not in name - - def test_clean_name_capitalized_last_and_titalized(self): - name = parse_tables.clean_name("SMITH John") - assert name == "John Smith" - - def test_clean_last_names(self): - name = parse_tables.clean_last_names(["MACDONALD", "John"]) - assert name == "MacDonald" - - def test_clean_name_two_word_surnames(self): - names = [ - ("EDE COOPER \nPalmer", "Palmer Ede Cooper"), - ("VAN DULKEN \nRichard Michael", "Richard Michael Van Dulken"), - ("ARMSTRONG LILLEY \nLynne", "Lynne Armstrong Lilley"), - ( - " D`SOUZA Aaron Anthony Jose \nHasan", - "Aaron Anthony Jose Hasan D'Souza", - ), - ("Michael James Collins", "Michael James Collins"), - (" Michael James Collins ", "Michael James Collins"), - ("DAVE Nitesh Pravin", "Nitesh Pravin Dave"), - ("DAVE\nNitesh Pravin", "Nitesh Pravin Dave"), - ("COOKE Anne-Marie", "Anne-Marie Cooke"), - ("COOKE\nAnne-Marie", "Anne-Marie Cooke"), - ("BROOKES-\nDUNCAN\nKaty", "Katy Brookes-Duncan"), - ("HOUNSOME\nJohn", "John Hounsome"), - ("O`CONNELL \nStephen John", "Stephen John O'Connell"), - ("O`NEAL \nCarol Joy", "Carol Joy O'Neal"), - ("O`REILLY \nTracey Linda \nDiane", "Tracey Linda Diane O'Reilly"), - ("LIAM THOMAS O'ROURKE", "Liam Thomas O'Rourke"), - ("O'CALLAGHAN \nClaire Louise", "Claire Louise O'Callaghan"), - ] - for name in names: - with self.subTest(name=names[0]): - assert parse_tables.clean_name(name[0]) == name[1] - - def test_clean_description_removes_newlines(self): - cleaned_description = parse_tables.clean_description( - "A Long Description That Splits \nOver \\nLines" - ) - assert "\n" not in cleaned_description - assert "\\n" not in 
cleaned_description - - def test_clean_description_replaces_backticks(self): - cleaned_description = parse_tables.clean_description( - "All People`s Party" - ) - assert "`" not in cleaned_description - assert "'" in cleaned_description - assert cleaned_description == "All People's Party" - - def test_guess_previous_party_affiliations_field(self): - sopn = CamelotParsedSOPN(raw_data=json.dumps(welsh_sopn_data)) - data = sopn.as_pandas - data.columns = data.iloc[0] - - cases = [ - (self.dulwich_post_ballot, None), - (self.senedd_ballot, "statement of party membership"), - ] - for case in cases: - with self.subTest(msg=case[0]): - sopn.sopn = BallotSOPN(ballot=case[0]) - result = parse_tables.guess_previous_party_affiliations_field( - data=data, sopn=sopn - ) - assert result == case[1] - - def test_add_previous_party_affiliations(self): - cases = [ - {"party_str": "", "party": None, "expected": {}}, - {"party_str": "Unknown Party", "party": None, "expected": {}}, - { - "party_str": "Labour Party", - "party": self.labour_party, - "expected": { - "previous_party_affiliations": [self.labour_party.ec_id] - }, - }, - ] - for case in cases: - with self.subTest(msg=case["party_str"]), patch.object( - parse_tables, "get_party", return_value=case["party"] - ): - raw_data = {} - sopn = CamelotParsedSOPN() - result = parse_tables.add_previous_party_affiliations( - party_str=case["party_str"], - raw_data=raw_data, - sopn=sopn, - ) - assert result == case["expected"] - - -class TestParseTablesFilterKwargs(TestCase): - def setUp(self): - self.command = ParseTablesCommand() - self.default_filter_kwargs = {} - - def test_when_testing(self): - options = {"testing": True} - result = self.command.build_filter_kwargs(options) - self.assertEqual(result, self.default_filter_kwargs) - - def test_when_using_ballot(self): - options = {"ballot": "local.foo.bar.2021-05-06"} - result = self.command.build_filter_kwargs(options) - self.assertEqual(result, self.default_filter_kwargs) - - def 
test_when_using_reparse(self): - options = {"reparse": True} - result = self.command.build_filter_kwargs(options) - expected = self.default_filter_kwargs.copy() - expected["rawpeople__source_type"] = RawPeople.SOURCE_PARSED_PDF - self.assertEqual(result, expected) - - def test_when_no_options(self): - options = {} - result = self.command.build_filter_kwargs(options) - expected = self.default_filter_kwargs.copy() - self.assertEqual(result, expected) diff --git a/ynr/apps/sopn_parsing/tests/test_pdf_conversion.py b/ynr/apps/sopn_parsing/tests/test_pdf_conversion.py index dceec34d7e..25a6afee1d 100644 --- a/ynr/apps/sopn_parsing/tests/test_pdf_conversion.py +++ b/ynr/apps/sopn_parsing/tests/test_pdf_conversion.py @@ -1,5 +1,3 @@ -from unittest import skipIf - from candidates.tests.uk_examples import UK2015ExamplesMixin from django.core.files.uploadedfile import SimpleUploadedFile from django.test import TestCase @@ -13,12 +11,8 @@ PandocConversionError, convert_docx_to_pdf, ) -from sopn_parsing.tests import should_skip_conversion_tests -@skipIf( - should_skip_conversion_tests(), "Required conversion libs not installed" -) class TestSOPNHelpers(UK2015ExamplesMixin, TestCase): example_docx_filename = EXAMPLE_DOCX_FILENAME example_html_filename = EXAMPLE_HTML_FILENAME