Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ test-results
node_modules/
.vscode/
/test-env
/ynr/apps/sopn_parsing/tests/data/sopn_baseline.json
/ynr/apps/sopn_parsing/tests/data/sopn_baseline_copy.json
# PyCharm
.idea/

Expand Down
45 changes: 0 additions & 45 deletions Makefile

This file was deleted.

5 changes: 1 addition & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,10 @@ dependencies = [
"markdown-it-py==4.0.0",
"nameparser==1.1.2",
"ndg-httpsclient==0.5.1",

"openai==1.30.3",
# for compatibility with openai==1.30.3
# TODO: review/remove when we upgrade openai package
"httpx==0.27.0",

"Pillow==10.3.0",
"psycopg==3.1.12",
"python-dateutil==2.8.2",
Expand All @@ -58,14 +56,13 @@ dependencies = [
"whitenoise==6.5.0",
"sorl-thumbnail-serializer-field",
"slacker2",

# SOPN parsing
"pdfminer.six==20201018",
"camelot-py[cv]==0.8.2",
"pypandoc_binary==1.14",
"PyPDF2==2.12.1",
"amazon-textract-response-parser==1.0.3",
"amazon-textract-helper==0.0.35",
"pandas>=3.0.0",
]

[dependency-groups]
Expand Down
131 changes: 30 additions & 101 deletions uv.lock

Large diffs are not rendered by default.

76 changes: 0 additions & 76 deletions ynr/apps/bulk_adding/tests/test_bulk_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -752,79 +752,3 @@ def test_bulk_add_person_removes_spaces_from_name(self):
self.assertContains(resp, "Review candidates")
resp = form.submit()
self.assertContains(resp, "Bart Simpson")

def test_fall_back_to_camelot_if_no_textract(self):
    """
    Check which parsed rows ``RawPeople.as_form_kwargs`` hands to the form

    With only ``data`` stored, that row is expected in the form kwargs.
    Once ``textract_data`` is stored alongside it, the Textract row is
    expected instead.
    """
    camelot_row = {"name": "Bart", "party_id": "PP52"}
    textract_row = {"name": "Lisa", "party_id": "PP53"}

    def expected_kwargs(name, party_id):
        # The expected form kwargs carry the party ID twice in the
        # "party" list
        return {
            "initial": [
                {
                    "name": name,
                    "party": [party_id, party_id],
                    "previous_party_affiliations": [],
                    "source": "",
                }
            ]
        }

    # Case 1: no Textract output, so the `data` row is what we expect back
    camelot_only = RawPeople.objects.create(
        ballot=self.dulwich_post_ballot,
        data=[camelot_row],
        source_type=RawPeople.SOURCE_PARSED_PDF,
    )
    self.assertEqual(
        camelot_only.as_form_kwargs(), expected_kwargs("Bart", "PP52")
    )
    # Clear the first row before creating another for the same ballot
    # (presumably a ballot can only have one RawPeople -- TODO confirm)
    camelot_only.delete()

    # Case 2: both sources present, so the Textract row should win
    both_sources = RawPeople.objects.create(
        ballot=self.dulwich_post_ballot,
        data=[camelot_row],
        textract_data=[textract_row],
        source_type=RawPeople.SOURCE_PARSED_PDF,
    )
    self.assertEqual(
        both_sources.as_form_kwargs(), expected_kwargs("Lisa", "PP53")
    )

def test_can_change_parser_in_frontend(self):
    """
    Check that the ``v1_parser`` query param changes which parsed data
    pre-fills the bulk add form
    """
    ballot = self.dulwich_post_ballot
    BallotSOPN.objects.create(
        source_url="http://example.com",
        ballot=ballot,
        uploaded_file="sopn.pdf",
    )
    # Store a different name per source so the form reveals which was used
    RawPeople.objects.create(
        ballot=ballot,
        data=[{"name": "Bart", "party_id": "PP52"}],
        textract_data=[{"name": "Lisa", "party_id": "PP53"}],
        source_type=RawPeople.SOURCE_PARSED_PDF,
    )

    def first_name_on_form(url):
        # Fetch the page as the test user and read the first row's name
        page = self.app.get(url, user=self.user)
        bulk_add_form = page.forms["bulk_add_form"]
        return bulk_add_form.fields["form-0-name"][0].value

    # Without the query param this should be the Textract data
    self.assertEqual(
        first_name_on_form("/bulk_adding/sopn/parl.65808.2015-05-07/"),
        "Lisa",
    )

    # With ?v1_parser=1 this should be the `data` (Camelot) row instead
    self.assertEqual(
        first_name_on_form(
            "/bulk_adding/sopn/parl.65808.2015-05-07/?v1_parser=1"
        ),
        "Bart",
    )
4 changes: 0 additions & 4 deletions ynr/apps/bulk_adding/views/sopns.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,12 +123,8 @@ def get(self, request, *args, **kwargs):
return super().get(request, *args, **kwargs)

def get_active_parser(self) -> Optional[SOPNParsingBackends]:
    """
    Return the SOPN parsing backend selected for this request

    A truthy ``v1_parser`` query param selects Camelot outright. Otherwise
    Textract is chosen when the ballot's raw people have ``textract_data``,
    then Camelot when they have ``data``. Returns ``None`` when neither is
    populated.
    """
    force_v1_parser = self.request.GET.get("v1_parser")
    if force_v1_parser:
        return SOPNParsingBackends.CAMELOT

    # Only read rawpeople once the query-param override has been ruled out
    raw_people = self.ballot.rawpeople
    if raw_people.textract_data:
        return SOPNParsingBackends.TEXTRACT
    if raw_people.data:
        return SOPNParsingBackends.CAMELOT
    return None

def get_context_data(self, **kwargs):
Expand Down
16 changes: 0 additions & 16 deletions ynr/apps/elections/templates/elections/includes/_sopn_debug.html
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,12 @@ <h3>Parsing Status</h3>
<ul>
<li>Pages matched: {% if object.sopn.get_pages %}Yes (matched pages: {{ object.sopn.get_pages|join:", " }}
){% else %}No{% endif %}</li>
<li>Camelot tables extracted: {% if object.sopn.camelotparsedsopn %}Yes{% else %}No{% endif %}</li>
<li>Raw Person Data: {% if object.rawpeople %}Yes{% else %}No{% endif %}</li>
<li>AWS Textract Data: {% if textract_parsed.raw_data %}Yes{% else %}No{% endif %}</li>
<li>AWS Textract Parsed? {% if textract_parsed.parsed_data %}Yes{% else %}
No{% endif %}</li>
</ul>

<h3>Camelot raw Data</h3>
{% if object.sopn.camelotparsedsopn.raw_data %}
<pre>{{ object.sopn.camelotparsedsopn.as_pandas.to_dict|pprint }}</pre>
{% else %}
N/A
{% endif %}

<h3>Camelot table Data</h3>
{% if object.sopn.camelotparsedsopn.data_as_html %}
{{ object.sopn.camelotparsedsopn.data_as_html|safe }}
{% else %}
N/A
{% endif %}
<br/>


{% if textract_parsed and textract_parsed.as_textractor_document %}
<h3>AWS extracted table{{ textract_parsed.as_textractor_document.tables|pluralize }}</h3>
Expand Down
8 changes: 0 additions & 8 deletions ynr/apps/official_documents/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from typing import List

from candidates.models import Ballot
from django.conf import settings
from django.core.files.base import ContentFile
from django.core.validators import FileExtensionValidator
from django.db import models
Expand Down Expand Up @@ -258,7 +257,6 @@ def parse(self):

"""

from sopn_parsing.helpers.extract_tables import extract_ballot_table
from sopn_parsing.helpers.textract_helpers import (
NotUsingAWSException,
TextractSOPNHelper,
Expand All @@ -274,12 +272,6 @@ def parse(self):
# There's a cron job that should pick up the result and carry on parsing later.
textract_helper.start_detection()

if getattr(
settings, "CAMELOT_ENABLED", False
) and self.uploaded_file.name.endswith(".pdf"):
# Camelot
extract_ballot_table(self.ballot)


class BallotSOPNHistory(BaseBallotSOPN):
ballot = models.ForeignKey(
Expand Down
38 changes: 0 additions & 38 deletions ynr/apps/official_documents/tests/test_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import textwrap
from os.path import dirname, join, realpath
from pathlib import Path
from unittest import skipIf

from candidates.models import LoggedAction
from candidates.tests.auth import TestUserMixin
Expand All @@ -27,7 +26,6 @@
EXAMPLE_DOCX_FILENAME,
EXAMPLE_HTML_FILENAME,
)
from sopn_parsing.tests import should_skip_conversion_tests
from webtest import Upload

TEST_MEDIA_ROOT = realpath(
Expand Down Expand Up @@ -114,20 +112,8 @@ def test_upload_authorized(self):
with open(self.example_image_filename, "rb") as f:
form["uploaded_file"] = Upload("pilot.jpg", f.read())

# TODO: Add back in
# with patch(
# "official_documents.views.extract_pages_for_ballot"
# ) as extract_pages, patch(
# "official_documents.views.extract_ballot_table"
# ) as extract_tables, patch(
# "official_documents.views.parse_raw_data_for_ballot"
# ) as parse_tables:
response = form.submit()
self.assertEqual(response.status_code, 302)
# TODO: Add back in
# extract_pages.assert_called_once()
# extract_tables.assert_called_once()
# parse_tables.assert_called_once()

ballot_sopns = BallotSOPN.objects.all()
self.assertEqual(ballot_sopns.count(), 1)
Expand Down Expand Up @@ -155,9 +141,6 @@ def test_upload_authorized(self):
)
self.assertInHTML("Update SOPN", response.text)

@skipIf(
should_skip_conversion_tests(), "Required conversion libs not installed"
)
def test_docx_upload_form_validation(self):
self.assertFalse(LoggedAction.objects.exists())
response = self.app.get(
Expand All @@ -181,26 +164,11 @@ def test_docx_upload_form_validation(self):
with open(self.example_docx_filename, "rb") as f:
form["uploaded_file"] = Upload("pilot.docx", f.read())

# TODO: add back in
# with patch(
# "official_documents.views.extract_pages_for_ballot"
# ) as extract_pages, patch(
# "official_documents.views.extract_ballot_table"
# ) as extract_tables, patch(
# "official_documents.views.parse_raw_data_for_ballot"
# ) as parse_tables:
response = form.submit()
self.assertEqual(response.status_code, 302)
# TODO Add back in
# extract_pages.assert_called_once()
# extract_tables.assert_called_once()
# parse_tables.assert_called_once()
self.assertEqual(BallotSOPN.objects.count(), 1)
self.assertEqual(response.location, self.ballot.get_sopn_url())

@skipIf(
should_skip_conversion_tests(), "Required conversion libs not installed"
)
def test_html_upload_form_validation(self):
self.assertFalse(LoggedAction.objects.exists())
response = self.app.get(
Expand Down Expand Up @@ -229,9 +197,6 @@ def test_html_upload_form_validation(self):
response.text,
)

@skipIf(
should_skip_conversion_tests(), "Required conversion libs not installed"
)
def test_jpg_form_validation(self):
self.assertFalse(LoggedAction.objects.exists())
response = self.app.get(
Expand All @@ -256,9 +221,6 @@ def test_jpg_form_validation(self):
self.assertEqual(response.status_code, 302)
self.assertEqual(BallotSOPN.objects.count(), 1)

@skipIf(
should_skip_conversion_tests(), "Required conversion libs not installed"
)
def test_update_existing_sopn(self):
self.assertFalse(LoggedAction.objects.exists())
response = self.app.get(
Expand Down
63 changes: 0 additions & 63 deletions ynr/apps/sopn_parsing/helpers/extract_tables.py

This file was deleted.

Loading