diff --git a/.gitignore b/.gitignore
index d08c2cda36..38c331c5a1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,8 +24,6 @@ test-results
node_modules/
.vscode/
/test-env
-/ynr/apps/sopn_parsing/tests/data/sopn_baseline.json
-/ynr/apps/sopn_parsing/tests/data/sopn_baseline_copy.json
# PyCharm
.idea/
diff --git a/Makefile b/Makefile
deleted file mode 100644
index f8d2865666..0000000000
--- a/Makefile
+++ /dev/null
@@ -1,45 +0,0 @@
-export DJANGO_SETTINGS_MODULE?=ynr.settings.sopn_testing
-
-
-.PHONY: sopn-runserver
-sopn-runserver:
- python manage.py runserver
-
-.PHONY: sopn-shell
-sopn-shell:
- python manage.py shell_plus
-
-.PHONY: migrate-db
-migrate-db:
- python manage.py migrate
-
-.PHONY: test-sopns
-test-sopns: migrate-db
- python manage.py sopn_tooling_compare_raw_people --election-slugs= --ballot= --date 2021-05-06
-
-.PHONY: download-sopns
-download-sopns:
- python manage.py migrate --no-input
- python manage.py sopn_tooling_create_official_documents --election-slugs= --date 2021-05-06
-
-.PHONY: populate-sopn-testing-database
-populate-sopn-testing-database: migrate-db
- python manage.py candidates_import_from_live_site
-
-.PHONY: delete-test-sopns
-delete-test-sopns:
- python manage.py sopn_tooling_clear_existing_objects
- rm -rf ./ynr/media/sopn_testing/
-
-.PHONY: create-baseline-file
-create-baseline-file:
- python manage.py sopn_tooling_write_baseline
-
-.PHONY: copy-baseline-file
-copy-baseline-file:
- cp ynr/apps/sopn_parsing/tests/data/sopn_baseline.json ynr/apps/sopn_parsing/tests/data/sopn_baseline_copy.json
-
-.PHONY: prod-import-sopns
-prod-import-sopns:
- cd deploy; \
- ansible-playbook import_sopns.yml
diff --git a/pyproject.toml b/pyproject.toml
index 4cbbb16b4a..d6d5661d4a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,12 +41,10 @@ dependencies = [
"markdown-it-py==4.0.0",
"nameparser==1.1.2",
"ndg-httpsclient==0.5.1",
-
"openai==1.30.3",
# for compatibility with openai==1.30.3
# TODO: review/remove when we upgrade openai package
"httpx==0.27.0",
-
"Pillow==10.3.0",
"psycopg==3.1.12",
"python-dateutil==2.8.2",
@@ -58,14 +56,13 @@ dependencies = [
"whitenoise==6.5.0",
"sorl-thumbnail-serializer-field",
"slacker2",
-
# SOPN parsing
"pdfminer.six==20201018",
- "camelot-py[cv]==0.8.2",
"pypandoc_binary==1.14",
"PyPDF2==2.12.1",
"amazon-textract-response-parser==1.0.3",
"amazon-textract-helper==0.0.35",
+ "pandas>=3.0.0",
]
[dependency-groups]
diff --git a/uv.lock b/uv.lock
index ca36a3cc96..c92fd1cf73 100644
--- a/uv.lock
+++ b/uv.lock
@@ -4,7 +4,9 @@ requires-python = "==3.12.*"
resolution-markers = [
"sys_platform == 'darwin'",
"platform_machine == 'aarch64' and sys_platform == 'linux'",
- "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')",
+ "sys_platform == 'win32'",
+ "sys_platform == 'emscripten'",
+ "(platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'emscripten' and sys_platform != 'linux' and sys_platform != 'win32')",
]
[[package]]
@@ -323,29 +325,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/bc/47/e35f788047c91110f48703a6254e5c84e33111b3291f7b57a653ca00accf/botocore-1.34.162-py3-none-any.whl", hash = "sha256:2d918b02db88d27a75b48275e6fb2506e9adaaddbec1ffa6a8a0898b34e769be", size = 12468049, upload-time = "2024-08-15T19:25:18.301Z" },
]
-[[package]]
-name = "camelot-py"
-version = "0.8.2"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
- { name = "chardet" },
- { name = "click" },
- { name = "numpy" },
- { name = "openpyxl" },
- { name = "pandas" },
- { name = "pdfminer-six" },
- { name = "pypdf2" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/13/d4/cfd74357cf62d3e4c91439074422710df1147a261f09af72d808bfa40cd2/camelot-py-0.8.2.tar.gz", hash = "sha256:8e1e2a8e59c2dbdce9a6790f7007cd091343b136e29d64a45c8fd09c11360057", size = 38173, upload-time = "2020-07-27T12:28:17.846Z" }
-wheels = [
- { url = "https://files.pythonhosted.org/packages/ed/78/123007d6aff9811bd087ff5dd68e06a9df62b2fdbf7685d74c9d6f247921/camelot_py-0.8.2-py3-none-any.whl", hash = "sha256:0b2e612ad0e11190b84a29937bad5d487b540faea408863f01e410d2b976336e", size = 42794, upload-time = "2020-07-27T12:28:15.966Z" },
-]
-
-[package.optional-dependencies]
-cv = [
- { name = "opencv-python" },
-]
-
[[package]]
name = "cattrs"
version = "25.3.0"
@@ -425,18 +404,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/0a/4c/925909008ed5a988ccbb72dcc897407e5d6d3bd72410d69e051fc0c14647/charset_normalizer-3.4.4-py3-none-any.whl", hash = "sha256:7a32c560861a02ff789ad905a2fe94e3f840803362c84fecf1851cb4cf3dc37f", size = 53402, upload-time = "2025-10-14T04:42:31.76Z" },
]
-[[package]]
-name = "click"
-version = "8.3.1"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
- { name = "colorama", marker = "sys_platform == 'win32'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" }
-wheels = [
- { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" },
-]
-
[[package]]
name = "colorama"
version = "0.4.6"
@@ -843,15 +810,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/28/f0/65101e51dc7c850e7b7581a5d8fa8721a1d7479a0dca6c08386328e19882/editdistance-0.8.1-cp312-cp312-win_amd64.whl", hash = "sha256:09f01ed51746d90178af7dd7ea4ebb41497ef19f53c7f327e864421743dffb0a", size = 79853, upload-time = "2024-02-10T07:44:05.687Z" },
]
-[[package]]
-name = "et-xmlfile"
-version = "2.0.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/d3/38/af70d7ab1ae9d4da450eeec1fa3918940a5fafb9055e934af8d6eb0c2313/et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54", size = 17234, upload-time = "2024-10-25T17:25:40.039Z" }
-wheels = [
- { url = "https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa", size = 18059, upload-time = "2024-10-25T17:25:39.051Z" },
-]
-
[[package]]
name = "executing"
version = "2.2.1"
@@ -1302,21 +1260,21 @@ wheels = [
[[package]]
name = "numpy"
-version = "2.3.5"
+version = "2.4.1"
source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/76/65/21b3bc86aac7b8f2862db1e808f1ea22b028e30a225a34a5ede9bf8678f2/numpy-2.3.5.tar.gz", hash = "sha256:784db1dcdab56bf0517743e746dfb0f885fc68d948aba86eeec2cba234bdf1c0", size = 20584950, upload-time = "2025-11-16T22:52:42.067Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/24/62/ae72ff66c0f1fd959925b4c11f8c2dea61f47f6acaea75a08512cdfe3fed/numpy-2.4.1.tar.gz", hash = "sha256:a1ceafc5042451a858231588a104093474c6a5c57dcc724841f5c888d237d690", size = 20721320, upload-time = "2026-01-10T06:44:59.619Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/44/37/e669fe6cbb2b96c62f6bbedc6a81c0f3b7362f6a59230b23caa673a85721/numpy-2.3.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:74ae7b798248fe62021dbf3c914245ad45d1a6b0cb4a29ecb4b31d0bfbc4cc3e", size = 16733873, upload-time = "2025-11-16T22:49:49.84Z" },
- { url = "https://files.pythonhosted.org/packages/c5/65/df0db6c097892c9380851ab9e44b52d4f7ba576b833996e0080181c0c439/numpy-2.3.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ee3888d9ff7c14604052b2ca5535a30216aa0a58e948cdd3eeb8d3415f638769", size = 12259838, upload-time = "2025-11-16T22:49:52.863Z" },
- { url = "https://files.pythonhosted.org/packages/5b/e1/1ee06e70eb2136797abe847d386e7c0e830b67ad1d43f364dd04fa50d338/numpy-2.3.5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:612a95a17655e213502f60cfb9bf9408efdc9eb1d5f50535cc6eb365d11b42b5", size = 5088378, upload-time = "2025-11-16T22:49:55.055Z" },
- { url = "https://files.pythonhosted.org/packages/6d/9c/1ca85fb86708724275103b81ec4cf1ac1d08f465368acfc8da7ab545bdae/numpy-2.3.5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:3101e5177d114a593d79dd79658650fe28b5a0d8abeb8ce6f437c0e6df5be1a4", size = 6628559, upload-time = "2025-11-16T22:49:57.371Z" },
- { url = "https://files.pythonhosted.org/packages/74/78/fcd41e5a0ce4f3f7b003da85825acddae6d7ecb60cf25194741b036ca7d6/numpy-2.3.5-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b973c57ff8e184109db042c842423ff4f60446239bd585a5131cc47f06f789d", size = 14250702, upload-time = "2025-11-16T22:49:59.632Z" },
- { url = "https://files.pythonhosted.org/packages/b6/23/2a1b231b8ff672b4c450dac27164a8b2ca7d9b7144f9c02d2396518352eb/numpy-2.3.5-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0d8163f43acde9a73c2a33605353a4f1bc4798745a8b1d73183b28e5b435ae28", size = 16606086, upload-time = "2025-11-16T22:50:02.127Z" },
- { url = "https://files.pythonhosted.org/packages/a0/c5/5ad26fbfbe2012e190cc7d5003e4d874b88bb18861d0829edc140a713021/numpy-2.3.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:51c1e14eb1e154ebd80e860722f9e6ed6ec89714ad2db2d3aa33c31d7c12179b", size = 16025985, upload-time = "2025-11-16T22:50:04.536Z" },
- { url = "https://files.pythonhosted.org/packages/d2/fa/dd48e225c46c819288148d9d060b047fd2a6fb1eb37eae25112ee4cb4453/numpy-2.3.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b46b4ec24f7293f23adcd2d146960559aaf8020213de8ad1909dba6c013bf89c", size = 18542976, upload-time = "2025-11-16T22:50:07.557Z" },
- { url = "https://files.pythonhosted.org/packages/05/79/ccbd23a75862d95af03d28b5c6901a1b7da4803181513d52f3b86ed9446e/numpy-2.3.5-cp312-cp312-win32.whl", hash = "sha256:3997b5b3c9a771e157f9aae01dd579ee35ad7109be18db0e85dbdbe1de06e952", size = 6285274, upload-time = "2025-11-16T22:50:10.746Z" },
- { url = "https://files.pythonhosted.org/packages/2d/57/8aeaf160312f7f489dea47ab61e430b5cb051f59a98ae68b7133ce8fa06a/numpy-2.3.5-cp312-cp312-win_amd64.whl", hash = "sha256:86945f2ee6d10cdfd67bcb4069c1662dd711f7e2a4343db5cecec06b87cf31aa", size = 12782922, upload-time = "2025-11-16T22:50:12.811Z" },
- { url = "https://files.pythonhosted.org/packages/78/a6/aae5cc2ca78c45e64b9ef22f089141d661516856cf7c8a54ba434576900d/numpy-2.3.5-cp312-cp312-win_arm64.whl", hash = "sha256:f28620fe26bee16243be2b7b874da327312240a7cdc38b769a697578d2100013", size = 10194667, upload-time = "2025-11-16T22:50:16.16Z" },
+ { url = "https://files.pythonhosted.org/packages/78/7f/ec53e32bf10c813604edf07a3682616bd931d026fcde7b6d13195dfb684a/numpy-2.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d3703409aac693fa82c0aee023a1ae06a6e9d065dba10f5e8e80f642f1e9d0a2", size = 16656888, upload-time = "2026-01-10T06:42:40.913Z" },
+ { url = "https://files.pythonhosted.org/packages/b8/e0/1f9585d7dae8f14864e948fd7fa86c6cb72dee2676ca2748e63b1c5acfe0/numpy-2.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7211b95ca365519d3596a1d8688a95874cc94219d417504d9ecb2df99fa7bfa8", size = 12373956, upload-time = "2026-01-10T06:42:43.091Z" },
+ { url = "https://files.pythonhosted.org/packages/8e/43/9762e88909ff2326f5e7536fa8cb3c49fb03a7d92705f23e6e7f553d9cb3/numpy-2.4.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:5adf01965456a664fc727ed69cc71848f28d063217c63e1a0e200a118d5eec9a", size = 5202567, upload-time = "2026-01-10T06:42:45.107Z" },
+ { url = "https://files.pythonhosted.org/packages/4b/ee/34b7930eb61e79feb4478800a4b95b46566969d837546aa7c034c742ef98/numpy-2.4.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:26f0bcd9c79a00e339565b303badc74d3ea2bd6d52191eeca5f95936cad107d0", size = 6549459, upload-time = "2026-01-10T06:42:48.152Z" },
+ { url = "https://files.pythonhosted.org/packages/79/e3/5f115fae982565771be994867c89bcd8d7208dbfe9469185497d70de5ddf/numpy-2.4.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0093e85df2960d7e4049664b26afc58b03236e967fb942354deef3208857a04c", size = 14404859, upload-time = "2026-01-10T06:42:49.947Z" },
+ { url = "https://files.pythonhosted.org/packages/d9/7d/9c8a781c88933725445a859cac5d01b5871588a15969ee6aeb618ba99eee/numpy-2.4.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7ad270f438cbdd402c364980317fb6b117d9ec5e226fff5b4148dd9aa9fc6e02", size = 16371419, upload-time = "2026-01-10T06:42:52.409Z" },
+ { url = "https://files.pythonhosted.org/packages/a6/d2/8aa084818554543f17cf4162c42f162acbd3bb42688aefdba6628a859f77/numpy-2.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:297c72b1b98100c2e8f873d5d35fb551fce7040ade83d67dd51d38c8d42a2162", size = 16182131, upload-time = "2026-01-10T06:42:54.694Z" },
+ { url = "https://files.pythonhosted.org/packages/60/db/0425216684297c58a8df35f3284ef56ec4a043e6d283f8a59c53562caf1b/numpy-2.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:cf6470d91d34bf669f61d515499859fa7a4c2f7c36434afb70e82df7217933f9", size = 18295342, upload-time = "2026-01-10T06:42:56.991Z" },
+ { url = "https://files.pythonhosted.org/packages/31/4c/14cb9d86240bd8c386c881bafbe43f001284b7cce3bc01623ac9475da163/numpy-2.4.1-cp312-cp312-win32.whl", hash = "sha256:b6bcf39112e956594b3331316d90c90c90fb961e39696bda97b89462f5f3943f", size = 5959015, upload-time = "2026-01-10T06:42:59.631Z" },
+ { url = "https://files.pythonhosted.org/packages/51/cf/52a703dbeb0c65807540d29699fef5fda073434ff61846a564d5c296420f/numpy-2.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:e1a27bb1b2dee45a2a53f5ca6ff2d1a7f135287883a1689e930d44d1ff296c87", size = 12310730, upload-time = "2026-01-10T06:43:01.627Z" },
+ { url = "https://files.pythonhosted.org/packages/69/80/a828b2d0ade5e74a9fe0f4e0a17c30fdc26232ad2bc8c9f8b3197cf7cf18/numpy-2.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:0e6e8f9d9ecf95399982019c01223dc130542960a12edfa8edd1122dfa66a8a8", size = 10312166, upload-time = "2026-01-10T06:43:03.673Z" },
]
[[package]]
@@ -1337,35 +1295,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/8c/e9/ceef41cbc47ee82c3da44e47a60780cb628322ffd311043e8c7522990478/openai-1.30.3-py3-none-any.whl", hash = "sha256:f88119c8a848998be533c71ab8aa832446fa72b7ddbc70917c3f5886dc132051", size = 320635, upload-time = "2024-05-24T16:06:14.491Z" },
]
-[[package]]
-name = "opencv-python"
-version = "4.11.0.86"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
- { name = "numpy" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/17/06/68c27a523103dad5837dc5b87e71285280c4f098c60e4fe8a8db6486ab09/opencv-python-4.11.0.86.tar.gz", hash = "sha256:03d60ccae62304860d232272e4a4fda93c39d595780cb40b161b310244b736a4", size = 95171956, upload-time = "2025-01-16T13:52:24.737Z" }
-wheels = [
- { url = "https://files.pythonhosted.org/packages/05/4d/53b30a2a3ac1f75f65a59eb29cf2ee7207ce64867db47036ad61743d5a23/opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl", hash = "sha256:432f67c223f1dc2824f5e73cdfcd9db0efc8710647d4e813012195dc9122a52a", size = 37326322, upload-time = "2025-01-16T13:52:25.887Z" },
- { url = "https://files.pythonhosted.org/packages/3b/84/0a67490741867eacdfa37bc18df96e08a9d579583b419010d7f3da8ff503/opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_x86_64.whl", hash = "sha256:9d05ef13d23fe97f575153558653e2d6e87103995d54e6a35db3f282fe1f9c66", size = 56723197, upload-time = "2025-01-16T13:55:21.222Z" },
- { url = "https://files.pythonhosted.org/packages/f3/bd/29c126788da65c1fb2b5fb621b7fed0ed5f9122aa22a0868c5e2c15c6d23/opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b92ae2c8852208817e6776ba1ea0d6b1e0a1b5431e971a2a0ddd2a8cc398202", size = 42230439, upload-time = "2025-01-16T13:51:35.822Z" },
- { url = "https://files.pythonhosted.org/packages/2c/8b/90eb44a40476fa0e71e05a0283947cfd74a5d36121a11d926ad6f3193cc4/opencv_python-4.11.0.86-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b02611523803495003bd87362db3e1d2a0454a6a63025dc6658a9830570aa0d", size = 62986597, upload-time = "2025-01-16T13:52:08.836Z" },
- { url = "https://files.pythonhosted.org/packages/fb/d7/1d5941a9dde095468b288d989ff6539dd69cd429dbf1b9e839013d21b6f0/opencv_python-4.11.0.86-cp37-abi3-win32.whl", hash = "sha256:810549cb2a4aedaa84ad9a1c92fbfdfc14090e2749cedf2c1589ad8359aa169b", size = 29384337, upload-time = "2025-01-16T13:52:13.549Z" },
- { url = "https://files.pythonhosted.org/packages/a4/7d/f1c30a92854540bf789e9cd5dde7ef49bbe63f855b85a2e6b3db8135c591/opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:085ad9b77c18853ea66283e98affefe2de8cc4c1f43eda4c100cf9b2721142ec", size = 39488044, upload-time = "2025-01-16T13:52:21.928Z" },
-]
-
-[[package]]
-name = "openpyxl"
-version = "3.1.5"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
- { name = "et-xmlfile" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/3d/f9/88d94a75de065ea32619465d2f77b29a0469500e99012523b91cc4141cd1/openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050", size = 186464, upload-time = "2024-06-28T14:03:44.161Z" }
-wheels = [
- { url = "https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2", size = 250910, upload-time = "2024-06-28T14:03:41.161Z" },
-]
-
[[package]]
name = "packaging"
version = "25.0"
@@ -1377,23 +1306,23 @@ wheels = [
[[package]]
name = "pandas"
-version = "2.3.3"
+version = "3.0.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "numpy" },
{ name = "python-dateutil" },
- { name = "pytz" },
- { name = "tzdata" },
+ { name = "tzdata", marker = "sys_platform == 'emscripten' or sys_platform == 'win32'" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/de/da/b1dc0481ab8d55d0f46e343cfe67d4551a0e14fcee52bd38ca1bd73258d8/pandas-3.0.0.tar.gz", hash = "sha256:0facf7e87d38f721f0af46fe70d97373a37701b1c09f7ed7aeeb292ade5c050f", size = 4633005, upload-time = "2026-01-21T15:52:04.726Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/9c/fb/231d89e8637c808b997d172b18e9d4a4bc7bf31296196c260526055d1ea0/pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53", size = 11597846, upload-time = "2025-09-29T23:19:48.856Z" },
- { url = "https://files.pythonhosted.org/packages/5c/bd/bf8064d9cfa214294356c2d6702b716d3cf3bb24be59287a6a21e24cae6b/pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35", size = 10729618, upload-time = "2025-09-29T23:39:08.659Z" },
- { url = "https://files.pythonhosted.org/packages/57/56/cf2dbe1a3f5271370669475ead12ce77c61726ffd19a35546e31aa8edf4e/pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908", size = 11737212, upload-time = "2025-09-29T23:19:59.765Z" },
- { url = "https://files.pythonhosted.org/packages/e5/63/cd7d615331b328e287d8233ba9fdf191a9c2d11b6af0c7a59cfcec23de68/pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89", size = 12362693, upload-time = "2025-09-29T23:20:14.098Z" },
- { url = "https://files.pythonhosted.org/packages/a6/de/8b1895b107277d52f2b42d3a6806e69cfef0d5cf1d0ba343470b9d8e0a04/pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98", size = 12771002, upload-time = "2025-09-29T23:20:26.76Z" },
- { url = "https://files.pythonhosted.org/packages/87/21/84072af3187a677c5893b170ba2c8fbe450a6ff911234916da889b698220/pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084", size = 13450971, upload-time = "2025-09-29T23:20:41.344Z" },
- { url = "https://files.pythonhosted.org/packages/86/41/585a168330ff063014880a80d744219dbf1dd7a1c706e75ab3425a987384/pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b", size = 10992722, upload-time = "2025-09-29T23:20:54.139Z" },
+ { url = "https://files.pythonhosted.org/packages/0b/38/db33686f4b5fa64d7af40d96361f6a4615b8c6c8f1b3d334eee46ae6160e/pandas-3.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9803b31f5039b3c3b10cc858c5e40054adb4b29b4d81cb2fd789f4121c8efbcd", size = 10334013, upload-time = "2026-01-21T15:50:34.771Z" },
+ { url = "https://files.pythonhosted.org/packages/a5/7b/9254310594e9774906bacdd4e732415e1f86ab7dbb4b377ef9ede58cd8ec/pandas-3.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:14c2a4099cd38a1d18ff108168ea417909b2dea3bd1ebff2ccf28ddb6a74d740", size = 9874154, upload-time = "2026-01-21T15:50:36.67Z" },
+ { url = "https://files.pythonhosted.org/packages/63/d4/726c5a67a13bc66643e66d2e9ff115cead482a44fc56991d0c4014f15aaf/pandas-3.0.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d257699b9a9960e6125686098d5714ac59d05222bef7a5e6af7a7fd87c650801", size = 10384433, upload-time = "2026-01-21T15:50:39.132Z" },
+ { url = "https://files.pythonhosted.org/packages/bf/2e/9211f09bedb04f9832122942de8b051804b31a39cfbad199a819bb88d9f3/pandas-3.0.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:69780c98f286076dcafca38d8b8eee1676adf220199c0a39f0ecbf976b68151a", size = 10864519, upload-time = "2026-01-21T15:50:41.043Z" },
+ { url = "https://files.pythonhosted.org/packages/00/8d/50858522cdc46ac88b9afdc3015e298959a70a08cd21e008a44e9520180c/pandas-3.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4a66384f017240f3858a4c8a7cf21b0591c3ac885cddb7758a589f0f71e87ebb", size = 11394124, upload-time = "2026-01-21T15:50:43.377Z" },
+ { url = "https://files.pythonhosted.org/packages/86/3f/83b2577db02503cd93d8e95b0f794ad9d4be0ba7cb6c8bcdcac964a34a42/pandas-3.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:be8c515c9bc33989d97b89db66ea0cececb0f6e3c2a87fcc8b69443a6923e95f", size = 11920444, upload-time = "2026-01-21T15:50:45.932Z" },
+ { url = "https://files.pythonhosted.org/packages/64/2d/4f8a2f192ed12c90a0aab47f5557ece0e56b0370c49de9454a09de7381b2/pandas-3.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:a453aad8c4f4e9f166436994a33884442ea62aa8b27d007311e87521b97246e1", size = 9730970, upload-time = "2026-01-21T15:50:47.962Z" },
+ { url = "https://files.pythonhosted.org/packages/d4/64/ff571be435cf1e643ca98d0945d76732c0b4e9c37191a89c8550b105eed1/pandas-3.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:da768007b5a33057f6d9053563d6b74dd6d029c337d93c6d0d22a763a5c2ecc0", size = 9041950, upload-time = "2026-01-21T15:50:50.422Z" },
]
[[package]]
@@ -1436,7 +1365,7 @@ name = "pexpect"
version = "4.9.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
- { name = "ptyprocess" },
+ { name = "ptyprocess", marker = "sys_platform != 'win32'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450, upload-time = "2023-11-25T09:07:26.339Z" }
wheels = [
@@ -2252,7 +2181,6 @@ dependencies = [
{ name = "beautifulsoup4" },
{ name = "blessed" },
{ name = "boto3" },
- { name = "camelot-py", extra = ["cv"] },
{ name = "croniter" },
{ name = "django" },
{ name = "django-cors-headers" },
@@ -2284,6 +2212,7 @@ dependencies = [
{ name = "ndg-httpsclient" },
{ name = "nh3" },
{ name = "openai" },
+ { name = "pandas" },
{ name = "pdfminer-six" },
{ name = "pillow" },
{ name = "psycopg" },
@@ -2344,7 +2273,6 @@ requires-dist = [
{ name = "beautifulsoup4", specifier = "==4.12.0" },
{ name = "blessed", specifier = "==1.20.0" },
{ name = "boto3", specifier = "==1.34.105" },
- { name = "camelot-py", extras = ["cv"], specifier = "==0.8.2" },
{ name = "croniter", specifier = "==6.0.0" },
{ name = "django", specifier = "==5.2.9" },
{ name = "django-cors-headers", specifier = "==4.9.0" },
@@ -2376,6 +2304,7 @@ requires-dist = [
{ name = "ndg-httpsclient", specifier = "==0.5.1" },
{ name = "nh3", specifier = "==0.2.21" },
{ name = "openai", specifier = "==1.30.3" },
+ { name = "pandas", specifier = ">=3.0.0" },
{ name = "pdfminer-six", specifier = "==20201018" },
{ name = "pillow", specifier = "==10.3.0" },
{ name = "psycopg", specifier = "==3.1.12" },
diff --git a/ynr/apps/bulk_adding/tests/test_bulk_add.py b/ynr/apps/bulk_adding/tests/test_bulk_add.py
index 5361cda441..e18589768a 100644
--- a/ynr/apps/bulk_adding/tests/test_bulk_add.py
+++ b/ynr/apps/bulk_adding/tests/test_bulk_add.py
@@ -752,79 +752,3 @@ def test_bulk_add_person_removes_spaces_from_name(self):
self.assertContains(resp, "Review candidates")
resp = form.submit()
self.assertContains(resp, "Bart Simpson")
-
- def test_fall_back_to_camelot_if_no_textract(self):
- data = {"name": "Bart", "party_id": "PP52"}
-
- raw_people = RawPeople.objects.create(
- ballot=self.dulwich_post_ballot,
- data=[data],
- source_type=RawPeople.SOURCE_PARSED_PDF,
- )
-
- self.assertEqual(
- raw_people.as_form_kwargs(),
- {
- "initial": [
- {
- "name": "Bart",
- "party": ["PP52", "PP52"],
- "previous_party_affiliations": [],
- "source": "",
- }
- ]
- },
- )
- raw_people.delete()
-
- textract_data = {"name": "Lisa", "party_id": "PP53"}
- raw_people = RawPeople.objects.create(
- ballot=self.dulwich_post_ballot,
- data=[data],
- textract_data=[textract_data],
- source_type=RawPeople.SOURCE_PARSED_PDF,
- )
-
- self.assertEqual(
- raw_people.as_form_kwargs(),
- {
- "initial": [
- {
- "name": "Lisa",
- "party": ["PP53", "PP53"],
- "previous_party_affiliations": [],
- "source": "",
- }
- ]
- },
- )
-
- def test_can_change_parser_in_frontend(self):
- """
- Check that a query param can change the parser we use
- """
- BallotSOPN.objects.create(
- source_url="http://example.com",
- ballot=self.dulwich_post_ballot,
- uploaded_file="sopn.pdf",
- )
- RawPeople.objects.create(
- ballot=self.dulwich_post_ballot,
- data=[{"name": "Bart", "party_id": "PP52"}],
- textract_data=[{"name": "Lisa", "party_id": "PP53"}],
- source_type=RawPeople.SOURCE_PARSED_PDF,
- )
- response = self.app.get(
- "/bulk_adding/sopn/parl.65808.2015-05-07/", user=self.user
- )
- form = response.forms["bulk_add_form"]
- # This should be the Textract data
- self.assertEqual(form.fields["form-0-name"][0].value, "Lisa")
-
- response = self.app.get(
- "/bulk_adding/sopn/parl.65808.2015-05-07/?v1_parser=1",
- user=self.user,
- )
- form = response.forms["bulk_add_form"]
- # This should be the Textract data
- self.assertEqual(form.fields["form-0-name"][0].value, "Bart")
diff --git a/ynr/apps/bulk_adding/views/sopns.py b/ynr/apps/bulk_adding/views/sopns.py
index e0003e9c9e..6564db69a5 100644
--- a/ynr/apps/bulk_adding/views/sopns.py
+++ b/ynr/apps/bulk_adding/views/sopns.py
@@ -123,12 +123,8 @@ def get(self, request, *args, **kwargs):
return super().get(request, *args, **kwargs)
def get_active_parser(self) -> Optional[SOPNParsingBackends]:
- if self.request.GET.get("v1_parser"):
- return SOPNParsingBackends.CAMELOT
if self.ballot.rawpeople.textract_data:
return SOPNParsingBackends.TEXTRACT
- if self.ballot.rawpeople.data:
- return SOPNParsingBackends.CAMELOT
return None
def get_context_data(self, **kwargs):
diff --git a/ynr/apps/elections/templates/elections/includes/_sopn_debug.html b/ynr/apps/elections/templates/elections/includes/_sopn_debug.html
index c61ab936ce..620857b058 100644
--- a/ynr/apps/elections/templates/elections/includes/_sopn_debug.html
+++ b/ynr/apps/elections/templates/elections/includes/_sopn_debug.html
@@ -7,28 +7,12 @@
Parsing Status
- Pages matched: {% if object.sopn.get_pages %}Yes (matched pages: {{ object.sopn.get_pages|join:", " }}
){% else %}No{% endif %}
- - Camelot tables extracted: {% if object.sopn.camelotparsedsopn %}Yes{% else %}No{% endif %}
- Raw Person Data: {% if object.rawpeople %}Yes{% else %}No{% endif %}
- AWS Textract Data: {% if textract_parsed.raw_data %}Yes{% else %}No{% endif %}
- AWS Textract Parsed? {% if textract_parsed.parsed_data %}Yes{% else %}
No{% endif %}
- Camelot raw Data
- {% if object.sopn.camelotparsedsopn.raw_data %}
- {{ object.sopn.camelotparsedsopn.as_pandas.to_dict|pprint }}
- {% else %}
- N/A
- {% endif %}
-
- Camelot table Data
- {% if object.sopn.camelotparsedsopn.data_as_html %}
- {{ object.sopn.camelotparsedsopn.data_as_html|safe }}
- {% else %}
- N/A
- {% endif %}
-
-
{% if textract_parsed and textract_parsed.as_textractor_document %}
AWS extracted table{{ textract_parsed.as_textractor_document.tables|pluralize }}
diff --git a/ynr/apps/official_documents/models.py b/ynr/apps/official_documents/models.py
index 952aedf866..3c283950eb 100644
--- a/ynr/apps/official_documents/models.py
+++ b/ynr/apps/official_documents/models.py
@@ -4,7 +4,6 @@
from typing import List
from candidates.models import Ballot
-from django.conf import settings
from django.core.files.base import ContentFile
from django.core.validators import FileExtensionValidator
from django.db import models
@@ -258,7 +257,6 @@ def parse(self):
"""
- from sopn_parsing.helpers.extract_tables import extract_ballot_table
from sopn_parsing.helpers.textract_helpers import (
NotUsingAWSException,
TextractSOPNHelper,
@@ -274,12 +272,6 @@ def parse(self):
# There's a cron job that should pick up the result and carry on parsing later.
textract_helper.start_detection()
- if getattr(
- settings, "CAMELOT_ENABLED", False
- ) and self.uploaded_file.name.endswith(".pdf"):
- # Camelot
- extract_ballot_table(self.ballot)
-
class BallotSOPNHistory(BaseBallotSOPN):
ballot = models.ForeignKey(
diff --git a/ynr/apps/official_documents/tests/test_upload.py b/ynr/apps/official_documents/tests/test_upload.py
index 03423f9ce7..61f015a280 100644
--- a/ynr/apps/official_documents/tests/test_upload.py
+++ b/ynr/apps/official_documents/tests/test_upload.py
@@ -2,7 +2,6 @@
import textwrap
from os.path import dirname, join, realpath
from pathlib import Path
-from unittest import skipIf
from candidates.models import LoggedAction
from candidates.tests.auth import TestUserMixin
@@ -27,7 +26,6 @@
EXAMPLE_DOCX_FILENAME,
EXAMPLE_HTML_FILENAME,
)
-from sopn_parsing.tests import should_skip_conversion_tests
from webtest import Upload
TEST_MEDIA_ROOT = realpath(
@@ -114,20 +112,8 @@ def test_upload_authorized(self):
with open(self.example_image_filename, "rb") as f:
form["uploaded_file"] = Upload("pilot.jpg", f.read())
- # TODO: Add back in
- # with patch(
- # "official_documents.views.extract_pages_for_ballot"
- # ) as extract_pages, patch(
- # "official_documents.views.extract_ballot_table"
- # ) as extract_tables, patch(
- # "official_documents.views.parse_raw_data_for_ballot"
- # ) as parse_tables:
response = form.submit()
self.assertEqual(response.status_code, 302)
- # TODO: Add back in
- # extract_pages.assert_called_once()
- # extract_tables.assert_called_once()
- # parse_tables.assert_called_once()
ballot_sopns = BallotSOPN.objects.all()
self.assertEqual(ballot_sopns.count(), 1)
@@ -155,9 +141,6 @@ def test_upload_authorized(self):
)
self.assertInHTML("Update SOPN", response.text)
- @skipIf(
- should_skip_conversion_tests(), "Required conversion libs not installed"
- )
def test_docx_upload_form_validation(self):
self.assertFalse(LoggedAction.objects.exists())
response = self.app.get(
@@ -181,26 +164,11 @@ def test_docx_upload_form_validation(self):
with open(self.example_docx_filename, "rb") as f:
form["uploaded_file"] = Upload("pilot.docx", f.read())
- # TODO: add back in
- # with patch(
- # "official_documents.views.extract_pages_for_ballot"
- # ) as extract_pages, patch(
- # "official_documents.views.extract_ballot_table"
- # ) as extract_tables, patch(
- # "official_documents.views.parse_raw_data_for_ballot"
- # ) as parse_tables:
response = form.submit()
self.assertEqual(response.status_code, 302)
- # TODO Add back in
- # extract_pages.assert_called_once()
- # extract_tables.assert_called_once()
- # parse_tables.assert_called_once()
self.assertEqual(BallotSOPN.objects.count(), 1)
self.assertEqual(response.location, self.ballot.get_sopn_url())
- @skipIf(
- should_skip_conversion_tests(), "Required conversion libs not installed"
- )
def test_html_upload_form_validation(self):
self.assertFalse(LoggedAction.objects.exists())
response = self.app.get(
@@ -229,9 +197,6 @@ def test_html_upload_form_validation(self):
response.text,
)
- @skipIf(
- should_skip_conversion_tests(), "Required conversion libs not installed"
- )
def test_jpg_form_validation(self):
self.assertFalse(LoggedAction.objects.exists())
response = self.app.get(
@@ -256,9 +221,6 @@ def test_jpg_form_validation(self):
self.assertEqual(response.status_code, 302)
self.assertEqual(BallotSOPN.objects.count(), 1)
- @skipIf(
- should_skip_conversion_tests(), "Required conversion libs not installed"
- )
def test_update_existing_sopn(self):
self.assertFalse(LoggedAction.objects.exists())
response = self.app.get(
diff --git a/ynr/apps/sopn_parsing/helpers/extract_tables.py b/ynr/apps/sopn_parsing/helpers/extract_tables.py
deleted file mode 100644
index 0b610c7847..0000000000
--- a/ynr/apps/sopn_parsing/helpers/extract_tables.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import json
-
-import pandas as pd
-from sopn_parsing.helpers.text_helpers import NoTextInDocumentError, clean_text
-from sopn_parsing.models import CamelotParsedSOPN
-
-
-def extract_ballot_table(ballot, parse_flavor="lattice"):
- """
- Given a OfficialDocument model, update or create a CamelotParsedSOPN model with the
- contents of the table as a JSON string.
-
- :type ballot: candidates.models.Ballot
-
- """
- import camelot # import here to avoid import error running tests without pdf deps installed
-
- document = ballot.sopn
- try:
- tables = camelot.read_pdf(
- document.uploaded_file.path,
- pages="all",
- flavor=parse_flavor,
- )
- except (NotImplementedError, AttributeError):
- # * NotImplementedError is thrown if the PDF is an image or generally
- # unreadable.
- # * AttributeError is thrown on some PDFs saying they need a password.
- # Assume this is a bug in camelot, and ignore these PDFs
- raise NoTextInDocumentError()
-
- # Tables can span pages, camelot assumes they're different tables, so we
- # need to join them back together
- table_list = []
- for table in tables:
- table_list.append(table)
- table_list.sort(key=lambda t: (t.page, t.order))
-
- if not table_list:
- return None
-
- table_data = table_list.pop(0).df
-
- for table in table_list:
- # It's possible to have the "situation of poll" document on the SOPN
- # Ignore any table that contains "polling station" (SOPNs tables don't)
- table = table.df
- first_row = table.iloc[0].to_string()
-
- if "polling station" in clean_text(first_row):
- break
- # Append the continuation table to the first one in the document.
- # ignore_index is needed so the e.g table 2 row 1 doesn't replace
- # table 1 row 1
- table_data = pd.concat([table_data, table], ignore_index=True)
-
- if not table_data.empty:
- parsed, _ = CamelotParsedSOPN.objects.update_or_create(
- sopn=document,
- defaults={"raw_data": json.dumps(table_data.to_dict())},
- )
- return parsed
- return None
diff --git a/ynr/apps/sopn_parsing/helpers/parse_tables.py b/ynr/apps/sopn_parsing/helpers/parse_tables.py
index 247d0e7e31..e8152b0816 100644
--- a/ynr/apps/sopn_parsing/helpers/parse_tables.py
+++ b/ynr/apps/sopn_parsing/helpers/parse_tables.py
@@ -476,20 +476,12 @@ def parse_dataframe(ballot: Ballot, df: DataFrame):
def parse_raw_data(ballot: Ballot, reparse=False):
"""
- Given a Ballot, go and get the Camelot and the AWS Textract dataframes
+ Given a Ballot, go and get the AWS Textract dataframes
and process them
"""
- camelot_model = getattr(ballot.sopn, "camelotparsedsopn", None)
- camelot_data = {}
textract_model = getattr(ballot.sopn, "awstextractparsedsopn", None)
textract_data = {}
- if (
- camelot_model
- and camelot_model.raw_data_type == "pandas"
- and (reparse or not camelot_model.parsed_data)
- ):
- camelot_data = parse_dataframe(ballot, camelot_model.as_pandas)
if (
textract_model
and textract_model.raw_data
@@ -500,7 +492,7 @@ def parse_raw_data(ballot: Ballot, reparse=False):
textract_model.parse_raw_data()
textract_data = parse_dataframe(ballot, textract_model.as_pandas)
- if camelot_data or textract_data:
+ if textract_data:
# Check there isn't a rawpeople object from another (better) source
rawpeople_qs = RawPeople.objects.filter(ballot=ballot).exclude(
source_type=RawPeople.SOURCE_PARSED_PDF
@@ -510,7 +502,7 @@ def parse_raw_data(ballot: Ballot, reparse=False):
RawPeople.objects.update_or_create(
ballot=ballot,
defaults={
- "data": camelot_data or "",
+ "data": "",
"textract_data": textract_data or "",
"source": "Parsed from {}".format(
ballot.sopn.source_url
@@ -525,17 +517,10 @@ def parse_raw_data(ballot: Ballot, reparse=False):
return
# We've done the parsing, so let's still save the result
storage = DefaultStorage()
- storage.save(
- f"raw_people/camelot_{ballot.ballot_paper_id}.json",
- ContentFile(json.dumps(camelot_data, indent=4).encode("utf8")),
- )
storage.save(
f"raw_people/textract_{ballot.ballot_paper_id}.json",
ContentFile(json.dumps(textract_data, indent=4).encode("utf8")),
)
- if camelot_model:
- ballot.sopn.camelotparsedsopn.status = "parsed"
- ballot.sopn.camelotparsedsopn.save()
if textract_model:
ballot.sopn.awstextractparsedsopn.status = "parsed"
ballot.sopn.awstextractparsedsopn.save()
diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_extract_tables.py b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_extract_tables.py
deleted file mode 100644
index 3a4e091290..0000000000
--- a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_extract_tables.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from sopn_parsing.helpers.command_helpers import BaseSOPNParsingCommand
-from sopn_parsing.helpers.extract_tables import extract_ballot_table
-from sopn_parsing.helpers.text_helpers import NoTextInDocumentError
-
-
-class Command(BaseSOPNParsingCommand):
- help = """
- Parse tables out of PDFs in to CamelotParsedSOPN models for later parsing.
- """
-
- def handle(self, *args, **options):
- qs = self.get_queryset(options)
- filter_kwargs = {}
- if not options["ballot"] and not options["testing"]:
- if not options["reparse"]:
- filter_kwargs["sopn__camelotparsedsopn"] = None
-
- qs = qs.filter(**filter_kwargs)
- for ballot in qs:
- try:
- extract_ballot_table(ballot)
- except NoTextInDocumentError:
- self.stdout.write(
- f"{ballot} raised a NoTextInDocumentError trying to extract tables"
- )
- except ValueError:
- self.stdout.write(
- f"{ballot} raised a ValueError trying extract tables"
- )
diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_parse_tables.py b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_parse_tables.py
deleted file mode 100644
index 26448b697f..0000000000
--- a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_parse_tables.py
+++ /dev/null
@@ -1,67 +0,0 @@
-from bulk_adding.models import RawPeople
-from sopn_parsing.helpers.command_helpers import BaseSOPNParsingCommand
-from sopn_parsing.helpers.parse_tables import parse_raw_data_for_ballot
-
-
-class Command(BaseSOPNParsingCommand):
- help = """
- Convert the raw extracted tables on the CamelotParsedSOPN model to a parsed
- RawPeople model, and set the status as parsed.
-
- """
-
- def build_filter_kwargs(self, options):
- """
- Build kwargs used to filter the BallotQuerySet that is parsed
- - Always skip any ballots where we do not have a CamelotParsedSOPN to try to
- extract candidates from
- - When test flag is used, dont make any changes
- - When parsing a single ballot, dont make any changes
- - When reparsing, only use ballots where we have previously created a
- RawPeople object from a CamelotParsedSOPN
- - Otherwise filter by unparsed CamelotParsedSOPN objects
- """
- # Always skip any ballots where we do not have a CamelotParsedSOPN to try to
- # extract candidates from
- filter_kwargs = {}
- if options.get("testing"):
- return filter_kwargs
-
- if options.get("ballot"):
- return filter_kwargs
-
- if options.get("reparse"):
- filter_kwargs[
- "rawpeople__source_type"
- ] = RawPeople.SOURCE_PARSED_PDF
- return filter_kwargs
-
- return filter_kwargs
-
- def handle(self, *args, **options):
- # filters that we never change with args. These two would raise
- # ValueErrors in the parse_raw_data_for_ballot function
- base_qs = self.get_queryset(options)
- filter_kwargs = self.build_filter_kwargs(options)
-
- qs = base_qs.filter(**filter_kwargs)
- qs = qs.filter(
- candidates_locked=False, # Never parse a locked ballot
- suggestedpostlock=None, # Never parse a ballot with lock suggestions
- )
-
- if not qs.exists():
- msg = ["No ballots to parse found."]
-
- if options.get("ballot"):
- msg.append(
- "This ballot might be locked or have lock suggestions"
- )
-
- self.stderr.write("\n".join(msg))
-
- for ballot in qs:
- try:
- parse_raw_data_for_ballot(ballot, options["reparse"])
- except ValueError as e:
- print(str(e))
diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_process_unparsed.py b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_process_unparsed.py
index 41db0e1f57..7b38b54b6a 100644
--- a/ynr/apps/sopn_parsing/management/commands/sopn_parsing_process_unparsed.py
+++ b/ynr/apps/sopn_parsing/management/commands/sopn_parsing_process_unparsed.py
@@ -1,4 +1,3 @@
-from django.conf import settings
from django.core.management.base import BaseCommand
from sopn_parsing.helpers.parse_tables import parse_raw_data_for_ballot
from sopn_parsing.helpers.textract_helpers import (
@@ -8,7 +7,6 @@
from sopn_parsing.models import (
AWSTextractParsedSOPN,
AWSTextractParsedSOPNStatus,
- CamelotParsedSOPN,
)
@@ -21,22 +19,16 @@ class Command(BaseCommand):
This script picks up where `parse` left off. It manages two cases:
- # Camelot
-
- We expect to have made a `CamelotParsedSOPN` with `raw_data` populated. This will only have
- happened if the file is a PDF readable by Camelot.
-
- We need to parse the `raw_data` into `parsed_data` and then make a `RawData` object for bulk adding.
-
# AWS Textract
- We should have made a `AWSTextractParsedSOPN` with `job_id` populated. Textract is async,
- so the initial `parse` just submits the data to AWS and gets a job_id.
+ We should have made an `AWSTextractParsedSOPN` with `job_id` populated.
+ Textract is async, so the initial `parse` just submits the data to AWS and
+ gets a job_id.
We need to check if the job ID has finished and pull in the data to `raw_data`.
- We're then in the same state as the Camelot method above, we need to parse the `raw_data` into
- `parsed_data` and makr a `RawData` object for bulk adding.
+ We need to parse the `raw_data` into `parsed_data` and make a `RawData`
+ object for bulk adding.
"""
def handle(self, *args, **options):
@@ -45,15 +37,6 @@ def handle(self, *args, **options):
"sopn__ballot__candidates_locked": False,
}
- if getattr(settings, "CAMELOT_ENABLED", False):
- # Camelot first
- qs = (
- CamelotParsedSOPN.objects.filter(parsed_data=None)
- .exclude(raw_data="")
- .filter(**current_ballot_kwargs)
- )
- self.parse_tables_for_qs(qs)
-
# Textract
qs = AWSTextractParsedSOPN.objects.exclude(
status__in=[
diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_clear_existing_objects.py b/ynr/apps/sopn_parsing/management/commands/sopn_tooling_clear_existing_objects.py
deleted file mode 100644
index cb68ffdf02..0000000000
--- a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_clear_existing_objects.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from bulk_adding.models import RawPeople
-from django.conf import settings
-from django.core.management.base import BaseCommand
-from official_documents.models import OfficialDocument
-
-
-class Command(BaseCommand):
- """
- Used to quickly delete existing objects used when testing SOPN
- parsing so that you can start fresh for example, when you want
- to start testing a new set of SOPNs.
- """
-
- def print_deleted(self, deleted_dict):
- for object, count in deleted_dict.items():
- self.stdout.write(f"Deleted {count} {object}")
-
- def handle(self, *args, **options):
- if settings.SETTINGS_MODULE != "ynr.settings.sopn_testing":
- raise ValueError(
- "You are trying to run this command outside of SOPN testing environment"
- )
-
- deleted_dict = {}
- deleted_dict.update(OfficialDocument.objects.all().delete()[1])
- deleted_dict.update(RawPeople.objects.all().delete()[1])
- self.print_deleted(deleted_dict)
diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_compare_raw_people.py b/ynr/apps/sopn_parsing/management/commands/sopn_tooling_compare_raw_people.py
deleted file mode 100644
index dbe5eb913a..0000000000
--- a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_compare_raw_people.py
+++ /dev/null
@@ -1,237 +0,0 @@
-import json
-import os
-from collections import Counter
-
-from bulk_adding.models import RawPeople
-from candidates.models import Ballot
-from django.core.management import call_command
-from official_documents.models import OfficialDocument
-from popolo.models import Membership
-from sopn_parsing.helpers.command_helpers import BaseSOPNParsingCommand
-from sopn_parsing.models import CamelotParsedSOPN
-
-
-class Command(BaseSOPNParsingCommand):
- CORRECT_EXACTLY = "correct_exactly"
- NUM_CORRECT_MISSING_PARTIES = "num_correct_some_parties_missing"
- NUM_INCORRECT = "num_incorrect"
- ZERO_CANDIDATES = "zero_candidates"
-
- def add_arguments(self, parser):
- super().add_arguments(parser)
- parser.add_argument("--loud", action="store_true", default=False)
-
- def handle(self, *args, **options):
- """
- - Check we have a baseline file to compare with
- - Prepare some OfficialDocuments
- - Re-parse the documents
- - Loop through the created RawPeople objects, comparing to our baseline
- to make sure that we are parsing at least as many people as before
- - If no asserts failed, use the data to write a new baseline file
- """
-
- self.loud = options.pop("loud")
-
- self.candidates_results = {
- "correct_exactly": [],
- "num_correct_some_parties_missing": [],
- "num_incorrect": [],
- "zero_candidates": [],
- }
-
- raw_people_file = "ynr/apps/sopn_parsing/tests/data/sopn_baseline.json"
- if not os.path.isfile(raw_people_file):
- call_command("sopn_tooling_write_baseline")
- self.stdout.write("Baseline file didn't exist so one was created")
-
- options.update({"testing": True})
-
- OfficialDocument.objects.update(relevant_pages="")
- call_command("sopn_parsing_extract_page_numbers", *args, **options)
- CamelotParsedSOPN.objects.all().delete()
- call_command("sopn_parsing_extract_tables", *args, **options)
- RawPeople.objects.all().delete()
- call_command("sopn_parsing_parse_tables", *args, **options)
-
- with open(raw_people_file) as file:
- old_raw_people = json.loads(file.read())
-
- self.new_raw_people = {}
- for ballot in Ballot.objects.exclude(officialdocument__isnull=True):
- ballot_data = old_raw_people.get(ballot.ballot_paper_id, {})
-
- self.compare_relevant_pages(ballot=ballot, ballot_data=ballot_data)
-
- self.compare_raw_people(ballot=ballot, ballot_data=ballot_data)
-
- # display some overall totals
- self.stdout.write(
- "Old total 'people' parsed WAS {old}\n"
- "New total 'people' parsed IS {new}".format(
- old=self.count_people_parsed(old_raw_people),
- new=self.count_people_parsed(self.new_raw_people),
- )
- )
-
- old_raw_people_obj_count = len(
- {k: v for k, v in old_raw_people.items() if v["raw_people"]}
- )
- new_raw_people_obj_count = RawPeople.objects.count()
- style = self.style.SUCCESS
- if new_raw_people_obj_count < old_raw_people_obj_count:
- style = self.style.ERROR
- self.stdout.write(
- style(
- f"Old RawPeople count: {old_raw_people_obj_count}\n"
- f"New total RawPeople count: {new_raw_people_obj_count}"
- )
- )
-
- for result, ballots in self.candidates_results.items():
- total = len(ballots)
- self.stdout.write(f"{total} ballots parsed {result}")
- # Write a new baseline
- call_command("sopn_tooling_write_baseline")
-
- def compare_relevant_pages(self, ballot, ballot_data):
- old_relevant_pages = ballot_data.get("relevant_pages", "")
- new_relevant_pages = ballot.sopn.relevant_pages
-
- if old_relevant_pages != new_relevant_pages:
- self.stdout.write(
- self.style.WARNING(
- f"RELEVANT PAGES CHANGED FROM {old_relevant_pages} to {new_relevant_pages} for {ballot.ballot_paper_id}"
- )
- )
-
- def compare_raw_people(self, ballot, ballot_data):
- try:
- raw_people = ballot.rawpeople.data
- except RawPeople.DoesNotExist:
- raw_people = []
-
- old_raw_people_for_ballot = ballot_data.get("raw_people", [])
- old_count = len(old_raw_people_for_ballot)
- new_count = len(raw_people)
- if new_count < old_count:
- self.stderr.write(
- f"Uh oh, parsed people for {ballot.ballot_paper_id} decreased from {old_count} to {new_count}. Stopping."
- )
-
- if new_count > old_count:
- self.stdout.write(
- f"{ballot.ballot_paper_id} increased from {old_count} to {new_count} parsed people.\n"
- f"Check the SOPN at https://candidates.democracyclub.org.uk{ballot.get_sopn_url()}."
- )
- for person in raw_people:
- if person not in old_raw_people_for_ballot:
- self.stdout.write(self.style.SUCCESS(person))
-
- # when people parsed have changed e.g. different name/different party print it for further checking
- changed_people = [
- person
- for person in old_raw_people_for_ballot
- if person not in raw_people
- ]
- if changed_people:
- self.stdout.write(
- self.style.WARNING(
- f"Parsed data changed for {ballot.ballot_paper_id}\n"
- f"New raw people data:\n"
- f"{raw_people}\n"
- "Missing people:"
- )
- )
- for person in changed_people:
- self.stderr.write(str(person))
-
- self.new_raw_people[ballot.ballot_paper_id] = {"raw_people": raw_people}
-
- self.parties_correct(ballot, raw_people)
-
- def count_people_parsed(self, raw_people_data):
- """
- Returns the total number of "people" that were parsed.
- NB that just because something was parsed, it doesnt mean that it was
- accurately parsed. Therefore this total is best used to look for large
- changes that should then be checked in detail.
- """
- return sum(
- [len(data["raw_people"]) for data in raw_people_data.values()]
- )
-
- def parties_correct(self, ballot, raw_people_for_ballot):
- candidates = Membership.objects.filter(ballot=ballot)
- if not candidates:
- self.stdout.write(
- self.style.WARNING(
- f"We dont have candidates for {ballot.ballot_paper_id}. Try updating with the live site first?"
- )
- )
-
- if not raw_people_for_ballot:
- self.candidates_results[self.ZERO_CANDIDATES].append(
- ballot.ballot_paper_id
- )
- return None
-
- num_candidates_correct = candidates.count() == len(
- raw_people_for_ballot
- )
-
- if self.loud:
- if num_candidates_correct:
- self.stdout.write(
- self.style.SUCCESS(
- f"Correct number of people parsed as expected for {ballot.ballot_paper_id}"
- )
- )
- else:
- self.stdout.write(
- self.style.ERROR(
- f"Incorrect number of people parsed for {ballot.ballot_paper_id}"
- )
- )
-
- parsed = sorted(
- [person["party_id"] for person in raw_people_for_ballot]
- )
- expected = list(
- candidates.values_list("party__ec_id", flat=True).order_by(
- "party__ec_id"
- )
- )
-
- if parsed == expected:
- return self.candidates_results[self.CORRECT_EXACTLY].append(
- ballot.ballot_paper_id
- )
-
- # count number of each missing party ID as there could be more than one
- # missing candidate for a party e.g. 1 missing Green, 2 missing independents
- parsed = Counter(parsed)
- expected = Counter(expected)
- missing = expected - parsed
- if missing:
- total = sum(missing.values())
- self.stderr.write(
- f"{total} MISSING parties for {ballot.ballot_paper_id} (party_id:num_missing)\n{missing}"
- )
- else:
- # sometimes we incorrectly parse extra people - often independents
- # due to an empty row
- extras = parsed - expected
- total = sum(extras.values())
- self.stderr.write(
- f"{total} EXTRA parties for {ballot.ballot_paper_id}\n{extras}"
- )
-
- if num_candidates_correct:
- return self.candidates_results[
- self.NUM_CORRECT_MISSING_PARTIES
- ].append(ballot.ballot_paper_id)
-
- return self.candidates_results[self.NUM_INCORRECT].append(
- ballot.ballot_paper_id
- )
diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_create_official_documents.py b/ynr/apps/sopn_parsing/management/commands/sopn_tooling_create_official_documents.py
deleted file mode 100644
index e7c3f3e1b2..0000000000
--- a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_create_official_documents.py
+++ /dev/null
@@ -1,124 +0,0 @@
-import requests
-from candidates.models import Ballot
-from django.conf import settings
-from django.core.files.base import ContentFile
-from django.core.management.base import BaseCommand
-from elections.models import Election
-from official_documents.models import OfficialDocument
-
-
-class Command(BaseCommand):
- """This command uses the ballots endpoint to loop over each
- ballot and store each sopn pdf (uploaded_file) locally"""
-
- def add_arguments(self, parser):
- parser.add_argument(
- "--date",
- "-d",
- action="store",
- help="Election date in ISO format, defaults to 2021-05-06",
- default="2021-05-06",
- type=str,
- )
- parser.add_argument(
- "--site_url",
- "-u",
- action="store",
- help="URL of site to download from",
- default="https://candidates.democracyclub.org.uk/",
- type=str,
- )
- parser.add_argument(
- "--election-count",
- "-c",
- action="store",
- help="URL of site to download from",
- default=50,
- type=int,
- )
- parser.add_argument(
- "--election-slugs", "-s", action="store", required=False
- )
-
- def handle(self, *args, **options):
- site_url = options.get("site_url")
- election_date = options.get("date")
- election_count = options.get("election_count")
-
- if options["election_slugs"]:
- election_slugs = options["election_slugs"].split(",")
- else:
- election_slugs = Election.objects.filter(
- election_date=election_date
- ).values_list("slug", flat=True)[:election_count]
-
- for slug in election_slugs:
- url = f"{site_url}api/next/ballots/?has_sopn=1&page_size=200&election_id={slug}&auth_token={settings.YNR_API_KEY}"
- self.create_official_documents(url=url)
-
- def create_official_documents(self, url):
- data = requests.get(url=url).json()
- try:
- next_page = data["next"]
- except KeyError:
- next_page = None
- if "results" in data:
- for ballot_data in data["results"]:
- ballot = Ballot.objects.get(
- ballot_paper_id=ballot_data["ballot_paper_id"]
- )
- sopn_data = ballot_data["sopn"]
-
- # if we already have the SOPN no need to recreate
- if ballot.officialdocument_set.filter(
- source_url=sopn_data["source_url"]
- ).exists():
- self.stdout.write(
- f"SOPN already exists for {ballot.ballot_paper_id}"
- )
- continue
-
- # check if we already have an OfficialDocument with this source
- # downloaded
- official_document = OfficialDocument.objects.filter(
- source_url=sopn_data["source_url"]
- ).first()
- if official_document:
- # if so we dont need to redownload the file, we can create a new
- # object for this ballot with the same file
- self.stdout.write(
- f"Found SOPN for source {sopn_data['source_url']}"
- )
- OfficialDocument.objects.create(
- ballot=ballot,
- source_url=sopn_data["source_url"],
- uploaded_file=official_document.uploaded_file,
- document_type=OfficialDocument.NOMINATION_PAPER,
- )
- continue
-
- # otherwise we dont have this file stored already, so download it as
- # part of creating the OfficialDocument
- self.stdout.write(
- f"Downloading SOPN from {sopn_data['uploaded_file']}"
- )
- file_response = requests.get(sopn_data["uploaded_file"])
- file_object = ContentFile(content=file_response.content)
- official_document = OfficialDocument(
- ballot=ballot,
- source_url=sopn_data["source_url"],
- document_type=OfficialDocument.NOMINATION_PAPER,
- )
- file_extension = sopn_data["uploaded_file"].split(".")[-1]
- filename = f"{ballot.ballot_paper_id}.{file_extension}"
- official_document.uploaded_file.save(
- name=filename, content=file_object
- )
- else:
- self.stdout.write("No results found")
-
- # this should only be the case where the election object has > 200
- # ballots e.g. parliamentary elections
- if next_page:
- return self.create_official_documents(url=next_page)
- return None
diff --git a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_write_baseline.py b/ynr/apps/sopn_parsing/management/commands/sopn_tooling_write_baseline.py
deleted file mode 100644
index 07ae9309cd..0000000000
--- a/ynr/apps/sopn_parsing/management/commands/sopn_tooling_write_baseline.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import json
-import os
-
-from bulk_adding.models import RawPeople
-from candidates.models import Ballot
-from django.core.management.base import BaseCommand
-from django.db.models import Q
-
-
-class Command(BaseCommand):
- """
- Creates a JSON file to represent ballots that have an Officialdocument.
- Only include ballots where:
- - The source of the RawPeople is from parsing a PDF
- - No RawPeople were created from the OfficialDocument. This is so that we
- will know if we make make improvements that mean more RawPeople are parsed
- from an OfficialDocument
- """
-
- def add_arguments(self, parser):
- parser.add_argument(
- "--data",
- action="store",
- help="Dictionary of raw people to write as a baseline",
- )
-
- def handle(self, *args, **options):
- json_data = options["data"] or {}
-
- if not json_data:
- qs = Ballot.objects.exclude(officialdocument__isnull=True).filter(
- Q(rawpeople__source_type=RawPeople.SOURCE_PARSED_PDF)
- | Q(rawpeople__isnull=True)
- )
- for ballot in qs:
- raw_people = getattr(ballot, "rawpeople", [])
- try:
- raw_people = ballot.rawpeople.data
- except RawPeople.DoesNotExist:
- raw_people = []
-
- json_data[ballot.ballot_paper_id] = {
- "raw_people": raw_people,
- "relevant_pages": ballot.sopn.relevant_pages,
- }
-
- file_path = os.path.join(
- os.getcwd(), "ynr/apps/sopn_parsing/tests/data/sopn_baseline.json"
- )
- with open(file_path, "w") as f:
- f.write(json.dumps(json_data))
diff --git a/ynr/apps/sopn_parsing/tests/__init__.py b/ynr/apps/sopn_parsing/tests/__init__.py
index 0c1d689775..934f393dcf 100644
--- a/ynr/apps/sopn_parsing/tests/__init__.py
+++ b/ynr/apps/sopn_parsing/tests/__init__.py
@@ -5,12 +5,3 @@ def should_skip_pdf_tests():
return False
except ImportError:
return True
-
-
-def should_skip_conversion_tests():
- try:
- import pypandoc # noqa
-
- return False
- except ImportError:
- return True
diff --git a/ynr/apps/sopn_parsing/tests/test_extract_tables.py b/ynr/apps/sopn_parsing/tests/test_extract_tables.py
index 21a03dfb63..bda0d99fd3 100644
--- a/ynr/apps/sopn_parsing/tests/test_extract_tables.py
+++ b/ynr/apps/sopn_parsing/tests/test_extract_tables.py
@@ -1,15 +1,10 @@
from os.path import abspath, dirname, join
-from unittest import skipIf
from candidates.tests.helpers import TmpMediaRootMixin
from candidates.tests.uk_examples import UK2015ExamplesMixin
from django.core.files.uploadedfile import SimpleUploadedFile
-from django.core.management import call_command
from django.test import TestCase
from official_documents.models import BallotSOPN
-from sopn_parsing.helpers.extract_tables import extract_ballot_table
-from sopn_parsing.models import CamelotParsedSOPN
-from sopn_parsing.tests import should_skip_pdf_tests
class TestSOPNHelpers(TmpMediaRootMixin, UK2015ExamplesMixin, TestCase):
@@ -27,152 +22,3 @@ def setUp(self):
uploaded_file=SimpleUploadedFile("sopn.pdf", sopn_file),
source_url="example.com",
)
-
- @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed")
- def test_extract_tables(self):
- extract_ballot_table(self.dulwich_post_ballot)
- self.assertEqual(
- CamelotParsedSOPN.objects.get().as_pandas.to_dict(),
- {
- "0": {
- "0": "Name of \nCandidate",
- "1": "ALAGARATNAM \nRathy",
- "2": "BARBER \nJames",
- "3": "HAYES \nHelen Elizabeth",
- "4": "KANUMANSA \nAmadu",
- "5": "KOTECHA \nResham",
- "6": "LAMBERT \nRobin Andrew \nDavid",
- "7": "NALLY \nSteve",
- "8": "NIX \nRashid",
- },
- "1": {
- "0": "Home \nAddress",
- "1": "(address in the \nMitcham and Morden \nConstituency)",
- "2": "33 Champion Hill, \nLondon, SE5 8BS",
- "3": "11 Woodsyre, \nSydenham Hill, \nLondon, SE26 6SS",
- "4": "11 Coleridge House, \nBrowning Street, \nLondon, SE17 1DG",
- "5": "(address in the \nRuislip, Northwood \nand Pinner \nConstituency)",
- "6": "(address in the \nDuwlich and West \nNorwood \nConstituency)",
- "7": "(address in the \nVauxhall \nConstituency)",
- "8": "66 Guinness Court, \nLondon, SW3 2PQ",
- },
- "2": {
- "0": "Description \n(if any)",
- "1": "UK Independence \nParty (UKIP)",
- "2": "Liberal Democrat",
- "3": "Labour Party",
- "4": "All People`s Party",
- "5": "The Conservative \nParty Candidate",
- "6": "Independent",
- "7": "Trade Unionist \nand Socialist \nCoalition",
- "8": "The Green Party",
- },
- "3": {
- "0": "Name of Assentors \nProposer(+), Seconder(++)",
- "1": "Coleman Alice M + \n"
- "Potter Keith S ++ \n"
- "Potter Stephanie \n"
- "Smith Bryan L \n"
- "Anderson Beth \n"
- "Lumba Avita \n"
- "Andersen Robert \n"
- "Patel Sajal \n"
- "Stanbury Linda \n"
- "Stanbury James",
- "2": "Fitchett Keith + \n"
- "Price Jonathan ++ \n"
- "Gardner Brigid \n"
- "Waddington Simon \n"
- "Morland Laura \n"
- "Lester Rachel \n"
- "Pidgeon Caroline \n"
- "Hare David \n"
- "Hanton Alastair \n"
- "Haylett Alexander",
- "3": "Samuel Gaynelle + \n"
- "Whaley Stephen P ++ \n"
- "Brazell Shadi M \n"
- "De Souza Johnny \n"
- "Alcock Heather \n"
- "Natzler Robert S \n"
- "Pearce Michelle E \n"
- "Pickering Robert \n"
- "Richardson Katherine G \n"
- "Pickard Jane",
- "4": "King James + \n"
- "King Rosemary ++ \n"
- "King David \n"
- "Davies Yadalieu \n"
- "Sesay Mary \n"
- "Rahman Layla K \n"
- "Rahman Syed A \n"
- "Ahmed Jalaluddin \n"
- "Rahman Tajwar S \n"
- "Rahman Taamid S",
- "5": "Davis James G + \n"
- "Bradbury David S ++ \n"
- "Badman Susan E \n"
- "Hill-Archer Roderick C \n"
- "Langley Anne C \n"
- "Mitchell Andrew M \n"
- "Virgo Marjorie J \n"
- "Virgo Philip A \n"
- "Chathli Lindsay \n"
- "Broomhead Robert A",
- "6": "Smith Caitlin + \n"
- "Parks Jesse ++ \n"
- "Connage Kyesha \n"
- "Hendry Perihan \n"
- "Mounty E J \n"
- "Sharif B \n"
- "Scott Wellesley \n"
- "Harriott S A \n"
- "Harriott Clive \n"
- "Ojumu Ibi",
- "7": "Tullis Andrew C + \n"
- "Mason Joshua H ++ \n"
- "Parkinson Francine M \n"
- "Gait Elizabeth \n"
- "Doolan Samantha \n"
- "Ubiaro Elizabeth \n"
- "Garner Stuart \n"
- "Akinjogbin Dolapo \n"
- "Walker Donna \n"
- "Lang Geoffrey P",
- "8": "Atwell E G + \n"
- "Rose Lloyd ++ \n"
- "O`Shea C \n"
- "Gomes Jacqueline \n"
- "Wood Thomas \n"
- "Rosenfeld David \n"
- "Conroy Martin \n"
- "Skiadopoulou I \n"
- "Rosenfeld Lawrence \n"
- "Rosenfeld Emily",
- },
- "4": {
- "0": "Reason why \nno longer \nnominated*",
- "1": "",
- "2": "",
- "3": "",
- "4": "",
- "5": "",
- "6": "",
- "7": "",
- "8": "",
- },
- },
- )
-
- @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed")
- def test_extract_command_current(self):
- self.assertEqual(CamelotParsedSOPN.objects.count(), 0)
- call_command("sopn_parsing_extract_tables", current=True)
- self.assertEqual(CamelotParsedSOPN.objects.count(), 1)
-
- def test_extract_command_current_no_current_elections(self):
- self.election.current = False
- self.election.save()
- self.assertEqual(CamelotParsedSOPN.objects.count(), 0)
- call_command("sopn_parsing_extract_tables", current=True)
- self.assertEqual(CamelotParsedSOPN.objects.count(), 0)
diff --git a/ynr/apps/sopn_parsing/tests/test_parse_tables.py b/ynr/apps/sopn_parsing/tests/test_parse_tables.py
deleted file mode 100644
index 922c487dd4..0000000000
--- a/ynr/apps/sopn_parsing/tests/test_parse_tables.py
+++ /dev/null
@@ -1,529 +0,0 @@
-import json
-from pathlib import Path
-from unittest import skipIf
-from unittest.mock import patch
-
-from bulk_adding.models import RawPeople
-from candidates.tests.uk_examples import UK2015ExamplesMixin
-from django.core.management import call_command
-from django.db import connection
-from django.test import TestCase
-from official_documents.models import BallotSOPN
-from pandas import Index, Series
-from parties.models import Party, PartyDescription
-from parties.tests.factories import PartyFactory
-from parties.tests.fixtures import DefaultPartyFixtures
-from sopn_parsing.helpers import parse_tables
-from sopn_parsing.models import CamelotParsedSOPN
-from sopn_parsing.tests import should_skip_pdf_tests
-from sopn_parsing.tests.data.welsh_sopn_data import welsh_sopn_data
-
-from ynr.apps.sopn_parsing.management.commands.sopn_parsing_parse_tables import (
- Command as ParseTablesCommand,
-)
-
-
-class TestSOPNHelpers(DefaultPartyFixtures, UK2015ExamplesMixin, TestCase):
- def setUp(self):
- PartyFactory(ec_id="PP85", name="UK Independence Party (UKIP)")
- with connection.cursor() as cursor:
- cursor.execute("CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;")
-
- @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed")
- def test_basic_parsing(self):
- self.assertFalse(RawPeople.objects.exists())
- doc = BallotSOPN.objects.create(
- ballot=self.dulwich_post_ballot,
- source_url="example.com",
- )
- dataframe = json.dumps(
- {
- "0": {
- "0": "Name of \nCandidate",
- "1": "BRADBURY \nAndrew John",
- "2": "COLLINS \nDave",
- "3": "HARVEY \nPeter John",
- "4": "JENNER \nMelanie",
- },
- "1": {
- "0": "Home Address",
- "1": "10 Fowey Close, \nShoreham by Sea, \nWest Sussex, \nBN43 5HE",
- "2": "51 Old Fort Road, \nShoreham by Sea, \nBN43 5RL",
- "3": "76 Harbour Way, \nShoreham by Sea, \nSussex, \nBN43 5HH",
- "4": "9 Flag Square, \nShoreham by Sea, \nWest Sussex, \nBN43 5RZ",
- },
- "2": {
- "0": "Description (if \nany)",
- "1": "Green Party",
- "2": "Independent",
- "3": "UK Independence \nParty (UKIP)",
- "4": "Labour Party",
- },
- "3": {
- "0": "Name of \nProposer",
- "1": "Tiffin Susan J",
- "2": "Loader Jocelyn C",
- "3": "Hearne James H",
- "4": "O`Connor Lavinia",
- },
- "4": {
- "0": "Reason \nwhy no \nlonger \nnominated\n*",
- "1": "",
- "2": "",
- "3": "",
- "4": "",
- },
- }
- )
- CamelotParsedSOPN.objects.create(
- sopn=doc, raw_data=dataframe, status="unparsed"
- )
- call_command("sopn_parsing_parse_tables")
- self.assertEqual(RawPeople.objects.count(), 1)
- raw_people = RawPeople.objects.get()
- self.assertEqual(
- raw_people.data,
- [
- {"name": "Andrew John Bradbury", "party_id": "PP63"},
- {"name": "Dave Collins", "party_id": "ynmp-party:2"},
- {"name": "Peter John Harvey", "party_id": "PP85"},
- {"name": "Melanie Jenner", "party_id": "PP53"},
- ],
- )
-
- @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed")
- def test_welsh_run_sopn(self):
- """
- Test that if the ballot is welsh run and previous party affiliations
- are included they are parsed
- """
- self.assertFalse(RawPeople.objects.exists())
- doc = BallotSOPN.objects.create(
- ballot=self.senedd_ballot,
- source_url="example.com",
- )
-
- plaid_cymru, _ = Party.objects.update_or_create(
- ec_id="PP77",
- legacy_slug="party:77",
- defaults={
- "name": "Plaid Cymru - The Party of Wales",
- "date_registered": "1999-01-14",
- },
- )
-
- dataframe = json.dumps(welsh_sopn_data)
- CamelotParsedSOPN.objects.create(
- sopn=doc, raw_data=dataframe, status="unparsed"
- )
- call_command("sopn_parsing_parse_tables")
- self.assertEqual(RawPeople.objects.count(), 1)
- raw_people = RawPeople.objects.get()
- self.assertEqual(
- raw_people.data,
- [
- {
- "name": "John Smith",
- "party_id": self.conservative_party.ec_id,
- "previous_party_affiliations": [self.ld_party.ec_id],
- },
- {
- "name": "Joe Bloggs",
- "party_id": self.labour_party.ec_id,
- "previous_party_affiliations": ["ynmp-party:2"],
- },
- {"name": "Jon Doe", "party_id": self.ld_party.ec_id},
- {
- "name": "Jane Brown",
- "party_id": "ynmp-party:2",
- "previous_party_affiliations": [plaid_cymru.ec_id],
- },
- {
- "name": "Judy Johnson",
- "party_id": plaid_cymru.ec_id,
- "previous_party_affiliations": [self.labour_party.ec_id],
- },
- {"name": "Julie Williams", "party_id": "ynmp-party:2"},
- ],
- )
-
- @skipIf(should_skip_pdf_tests(), "Required PDF libs not installed")
- def test_match_complex_descriptions(self):
- self.assertFalse(RawPeople.objects.exists())
- doc = BallotSOPN.objects.create(
- ballot=self.senedd_ballot,
- source_url="example.com",
- )
-
- plaid_cymru, _ = Party.objects.update_or_create(
- ec_id="PP77",
- legacy_slug="party:77",
- defaults={
- "name": "Plaid Cymru - The Party of Wales",
- "date_registered": "1999-01-14",
- },
- )
-
- dickens_heath, _ = Party.objects.update_or_create(
- ec_id="PP1",
- legacy_slug="PP!",
- defaults={
- "name": "Independent Dickens Heath Residents Action Group",
- "date_registered": "1999-01-14",
- },
- )
- PartyDescription.objects.create(
- party=dickens_heath,
- description="Independent Dickens Heath Residents Action Group",
- )
- lib_dem, _ = Party.objects.update_or_create(
- ec_id="PP100",
- legacy_slug="PP100",
- defaults={
- "name": "Liberal Democrats",
- "date_registered": "1999-01-14",
- },
- register="GB",
- )
-
- PartyDescription.objects.create(
- party=lib_dem,
- description="Liberal Democrat Focus Team | Tîm Ffocws y Democratiaid Rhyddfrydol",
- )
-
- data_path = (
- Path(__file__).parent / "data/edge_case_description_data.json"
- )
- with data_path.open() as f:
- CamelotParsedSOPN.objects.create(
- sopn=doc, raw_data=f.read(), status="unparsed"
- )
- call_command("sopn_parsing_parse_tables")
- self.assertEqual(RawPeople.objects.count(), 1)
- raw_people = RawPeople.objects.get()
- self.assertEqual(
- sorted(raw_people.data, key=lambda x: x["name"]),
- sorted(
- [
- {
- "name": "John Smith",
- "party_id": self.conservative_party.ec_id,
- },
- {
- "name": "Joe Bloggs",
- "party_id": self.labour_party.ec_id,
- },
- {
- "name": "Jon Doe",
- "party_id": self.ld_party.ec_id,
- },
- {
- "name": "Jane Brown",
- "party_id": "ynmp-party:2",
- },
- {
- "name": "Judy Johnson",
- "party_id": plaid_cymru.ec_id,
- },
- {"name": "Julie Williams", "party_id": "ynmp-party:2"},
- ],
- key=lambda x: x["name"],
- ),
- )
-
-
-class TestParseTablesUnitTests(UK2015ExamplesMixin, TestCase):
- def get_two_name_field_cases(self):
- # this could be updated with more combinations as we come across them
- return [
- {
- "name_fields": ["candidate surname", "candidate forename"],
- "row": {
- "candidate surname": "BAGSHAW",
- "candidate forename": "Elaine Sheila",
- "home address": "1 Foo Street \n London \nE14 6FW",
- "description": "London Liberal \nDemocrats",
- "reason why no longer nominated": "",
- },
- "ordered_name_fields": [
- "candidate forename",
- "candidate surname",
- ],
- "expected_name": "Elaine Sheila Bagshaw",
- },
- {
- "name_fields": ["surname", "other names"],
- "row": {
- "surname": "BAGSHAW",
- "other names": "Elaine Sheila",
- "home address": "1 Foo Street \nLondon \nE14 6FW",
- "description": "London Liberal \nDemocrats",
- "reason why no longer nominated": "",
- },
- "ordered_name_fields": ["other names", "surname"],
- "expected_name": "Elaine Sheila Bagshaw",
- },
- {
- "name_fields": ["last name", "other names"],
- "row": {
- "last name": "BAGSHAW",
- "other names": "Elaine Sheila",
- "home address": "1 Foo Street \nLondon \nE14 6FW",
- "description": "London Liberal \nDemocrats",
- "reason why no longer nominated": "",
- },
- "ordered_name_fields": ["other names", "last name"],
- "expected_name": "Elaine Sheila Bagshaw",
- },
- {
- "name_fields": ["candidate forename", "candidate surname"],
- "row": {
- "candidate forename": "Elaine Sheila",
- "candidate surname": "BAGSHAW",
- "home address": "1 Foo Street \n London \nE14 6FW",
- "description": "London Liberal \nDemocrats",
- "reason why no longer nominated": "",
- },
- "ordered_name_fields": [
- "candidate forename",
- "candidate surname",
- ],
- "expected_name": "Elaine Sheila Bagshaw",
- },
- ]
-
- def get_single_name_field_cases(self):
- return [
- {
- "name_fields": ["name of candidate"],
- "row": {
- "name of candidate": "BAGSHAW Elaine Sheila",
- "home address": "1 Foo Street \n London \nE14 6FW",
- "description": "London Liberal \nDemocrats",
- "reason why no longer nominated": "",
- },
- },
- {
- "name_fields": ["names of candidate"],
- "row": {
- "names of candidate": "BAGSHAW Elaine Sheila",
- "home address": "1 Foo Street \nLondon \nE14 6FW",
- "description": "London Liberal \nDemocrats",
- "reason why no longer nominated": "",
- },
- },
- {
- "name_fields": ["candidate name"],
- "row": {
- "candidate name": "BAGSHAW Elaine Sheila",
- "home address": "1 Foo Street \nLondon \nE14 6FW",
- "description": "London Liberal \nDemocrats",
- "reason why no longer nominated": "",
- },
- },
- {
- "name_fields": ["surname"],
- "row": {
- "surname": "BAGSHAW Elaine Sheila",
- "home address": "1 Foo Street \nLondon \nE14 6FW",
- "description": "London Liberal \nDemocrats",
- "reason why no longer nominated": "",
- },
- },
- {
- "name_fields": ["candidates surname"],
- "row": {
- "candidates surname": "BAGSHAW Elaine Sheila",
- "home address": "1 Foo Street \nLondon \nE14 6FW",
- "description": "London Liberal \nDemocrats",
- "reason why no longer nominated": "",
- },
- },
- {
- "name_fields": ["other name"],
- "row": {
- "other name": "BAGSHAW Elaine Sheila",
- "home address": "1 Foo Street \nLondon \nE14 6FW",
- "description": "London Liberal \nDemocrats",
- "reason why no longer nominated": "",
- },
- },
- ]
-
- def test_get_name_single_field(self):
- for case in self.get_single_name_field_cases():
- row = Series(case["row"])
- name_fields = case["name_fields"]
- with self.subTest(name_fields=name_fields):
- assert len(case["name_fields"]) == 1
- name = parse_tables.get_name(row=row, name_fields=name_fields)
- assert name == "Elaine Sheila Bagshaw"
-
- def test_get_name_two_fields(self):
- for case in self.get_two_name_field_cases():
- row = Series(case["row"])
- name_fields = case["name_fields"]
- with self.subTest(name_fields=name_fields):
- assert len(case["name_fields"]) == 2
- name = parse_tables.get_name(row=row, name_fields=name_fields)
- assert name == case["expected_name"]
-
- def test_get_name_fields_single(self):
- for case in self.get_single_name_field_cases():
- row = Index(case["row"])
- with self.subTest(row=row):
- name_fields = parse_tables.get_name_fields(row=row)
- assert len(name_fields) == 1
- assert name_fields == case["name_fields"]
-
- def test_get_name_fields_two(self):
- for case in self.get_two_name_field_cases():
- row = Index(case["row"])
- with self.subTest(row=row):
- name_fields = parse_tables.get_name_fields(row=row)
- assert len(name_fields) == 2
- assert name_fields == case["name_fields"]
-
- def test_get_name_fields_raises_error(self):
- row = Index({"foo": "Bar"})
- with self.assertRaises(ValueError):
- parse_tables.get_name_fields(row=row)
-
- def test_order_name_fields(self):
- for case in self.get_two_name_field_cases():
- name_fields = case["name_fields"]
- with self.subTest(name_fields=name_fields):
- result = parse_tables.order_name_fields(name_fields)
- assert result == case["ordered_name_fields"]
-
- def test_clean_name_replaces_backticks(self):
- name = parse_tables.clean_name("D`SOUZA")
- assert "`" not in name
- assert "'" in name
-
- def test_clean_name_replaces_newlines(self):
- name = parse_tables.clean_name(
- "A Very Long Name That Splits \nOver Lines"
- )
- assert "\n" not in name
-
- def test_clean_name_capitalized_last_and_titalized(self):
- name = parse_tables.clean_name("SMITH John")
- assert name == "John Smith"
-
- def test_clean_last_names(self):
- name = parse_tables.clean_last_names(["MACDONALD", "John"])
- assert name == "MacDonald"
-
- def test_clean_name_two_word_surnames(self):
- names = [
- ("EDE COOPER \nPalmer", "Palmer Ede Cooper"),
- ("VAN DULKEN \nRichard Michael", "Richard Michael Van Dulken"),
- ("ARMSTRONG LILLEY \nLynne", "Lynne Armstrong Lilley"),
- (
- " D`SOUZA Aaron Anthony Jose \nHasan",
- "Aaron Anthony Jose Hasan D'Souza",
- ),
- ("Michael James Collins", "Michael James Collins"),
- (" Michael James Collins ", "Michael James Collins"),
- ("DAVE Nitesh Pravin", "Nitesh Pravin Dave"),
- ("DAVE\nNitesh Pravin", "Nitesh Pravin Dave"),
- ("COOKE Anne-Marie", "Anne-Marie Cooke"),
- ("COOKE\nAnne-Marie", "Anne-Marie Cooke"),
- ("BROOKES-\nDUNCAN\nKaty", "Katy Brookes-Duncan"),
- ("HOUNSOME\nJohn", "John Hounsome"),
- ("O`CONNELL \nStephen John", "Stephen John O'Connell"),
- ("O`NEAL \nCarol Joy", "Carol Joy O'Neal"),
- ("O`REILLY \nTracey Linda \nDiane", "Tracey Linda Diane O'Reilly"),
- ("LIAM THOMAS O'ROURKE", "Liam Thomas O'Rourke"),
- ("O'CALLAGHAN \nClaire Louise", "Claire Louise O'Callaghan"),
- ]
- for name in names:
- with self.subTest(name=names[0]):
- assert parse_tables.clean_name(name[0]) == name[1]
-
- def test_clean_description_removes_newlines(self):
- cleaned_description = parse_tables.clean_description(
- "A Long Description That Splits \nOver \\nLines"
- )
- assert "\n" not in cleaned_description
- assert "\\n" not in cleaned_description
-
- def test_clean_description_replaces_backticks(self):
- cleaned_description = parse_tables.clean_description(
- "All People`s Party"
- )
- assert "`" not in cleaned_description
- assert "'" in cleaned_description
- assert cleaned_description == "All People's Party"
-
- def test_guess_previous_party_affiliations_field(self):
- sopn = CamelotParsedSOPN(raw_data=json.dumps(welsh_sopn_data))
- data = sopn.as_pandas
- data.columns = data.iloc[0]
-
- cases = [
- (self.dulwich_post_ballot, None),
- (self.senedd_ballot, "statement of party membership"),
- ]
- for case in cases:
- with self.subTest(msg=case[0]):
- sopn.sopn = BallotSOPN(ballot=case[0])
- result = parse_tables.guess_previous_party_affiliations_field(
- data=data, sopn=sopn
- )
- assert result == case[1]
-
- def test_add_previous_party_affiliations(self):
- cases = [
- {"party_str": "", "party": None, "expected": {}},
- {"party_str": "Unknown Party", "party": None, "expected": {}},
- {
- "party_str": "Labour Party",
- "party": self.labour_party,
- "expected": {
- "previous_party_affiliations": [self.labour_party.ec_id]
- },
- },
- ]
- for case in cases:
- with self.subTest(msg=case["party_str"]), patch.object(
- parse_tables, "get_party", return_value=case["party"]
- ):
- raw_data = {}
- sopn = CamelotParsedSOPN()
- result = parse_tables.add_previous_party_affiliations(
- party_str=case["party_str"],
- raw_data=raw_data,
- sopn=sopn,
- )
- assert result == case["expected"]
-
-
-class TestParseTablesFilterKwargs(TestCase):
- def setUp(self):
- self.command = ParseTablesCommand()
- self.default_filter_kwargs = {}
-
- def test_when_testing(self):
- options = {"testing": True}
- result = self.command.build_filter_kwargs(options)
- self.assertEqual(result, self.default_filter_kwargs)
-
- def test_when_using_ballot(self):
- options = {"ballot": "local.foo.bar.2021-05-06"}
- result = self.command.build_filter_kwargs(options)
- self.assertEqual(result, self.default_filter_kwargs)
-
- def test_when_using_reparse(self):
- options = {"reparse": True}
- result = self.command.build_filter_kwargs(options)
- expected = self.default_filter_kwargs.copy()
- expected["rawpeople__source_type"] = RawPeople.SOURCE_PARSED_PDF
- self.assertEqual(result, expected)
-
- def test_when_no_options(self):
- options = {}
- result = self.command.build_filter_kwargs(options)
- expected = self.default_filter_kwargs.copy()
- self.assertEqual(result, expected)
diff --git a/ynr/apps/sopn_parsing/tests/test_pdf_conversion.py b/ynr/apps/sopn_parsing/tests/test_pdf_conversion.py
index dceec34d7e..25a6afee1d 100644
--- a/ynr/apps/sopn_parsing/tests/test_pdf_conversion.py
+++ b/ynr/apps/sopn_parsing/tests/test_pdf_conversion.py
@@ -1,5 +1,3 @@
-from unittest import skipIf
-
from candidates.tests.uk_examples import UK2015ExamplesMixin
from django.core.files.uploadedfile import SimpleUploadedFile
from django.test import TestCase
@@ -13,12 +11,8 @@
PandocConversionError,
convert_docx_to_pdf,
)
-from sopn_parsing.tests import should_skip_conversion_tests
-@skipIf(
- should_skip_conversion_tests(), "Required conversion libs not installed"
-)
class TestSOPNHelpers(UK2015ExamplesMixin, TestCase):
example_docx_filename = EXAMPLE_DOCX_FILENAME
example_html_filename = EXAMPLE_HTML_FILENAME