Remove camelot #2624

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open

symroe wants to merge 5 commits into master from remove-camelot

.gitignore

-Original file line number
+Diff line change
@@ Expand Up / @@ -24,8 +24,6 @@ test-results @@
     node_modules/
     .vscode/
     /test-env
-    /ynr/apps/sopn_parsing/tests/data/sopn_baseline.json
-    /ynr/apps/sopn_parsing/tests/data/sopn_baseline_copy.json
     # PyCharm
     .idea/
@@ Expand Down @@

Makefile

This file was deleted.

pyproject.toml

-Original file line number
+Diff line change
@@ Expand Up / @@ -41,12 +41,10 @@ dependencies = [ @@
         "markdown-it-py==4.0.0",
         "nameparser==1.1.2",
         "ndg-httpsclient==0.5.1",
         "openai==1.30.3",
         # for compatibility with openai==1.30.3
         # TODO: review/remove when we upgrade openai package
         "httpx==0.27.0",
         "Pillow==10.3.0",
         "psycopg==3.1.12",
         "python-dateutil==2.8.2",
@@ Expand All / @@ -58,14 +56,13 @@ dependencies = [ @@
         "whitenoise==6.5.0",
         "sorl-thumbnail-serializer-field",
         "slacker2",
         # SOPN parsing
         "pdfminer.six==20201018",
-        "camelot-py[cv]==0.8.2",
         "pypandoc_binary==1.14",
         "PyPDF2==2.12.1",
         "amazon-textract-response-parser==1.0.3",
         "amazon-textract-helper==0.0.35",
+        "pandas>=3.0.0",
     ]
     [dependency-groups]
@@ Expand Down @@

uv.lock

Large diffs are not rendered by default.

ynr/apps/bulk_adding/tests/test_bulk_add.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -752,79 +752,3 @@ def test_bulk_add_person_removes_spaces_from_name(self): @@
             self.assertContains(resp, "Review candidates")
             resp = form.submit()
             self.assertContains(resp, "Bart Simpson")
-        def test_fall_back_to_camelot_if_no_textract(self):
-            data = {"name": "Bart", "party_id": "PP52"}
-            raw_people = RawPeople.objects.create(
-                ballot=self.dulwich_post_ballot,
-                data=[data],
-                source_type=RawPeople.SOURCE_PARSED_PDF,
-            )
-            self.assertEqual(
-                raw_people.as_form_kwargs(),
-                {
-                    "initial": [
-                        {
-                            "name": "Bart",
-                            "party": ["PP52", "PP52"],
-                            "previous_party_affiliations": [],
-                            "source": "",
-                        }
-                    ]
-                },
-            )
-            raw_people.delete()
-            textract_data = {"name": "Lisa", "party_id": "PP53"}
-            raw_people = RawPeople.objects.create(
-                ballot=self.dulwich_post_ballot,
-                data=[data],
-                textract_data=[textract_data],
-                source_type=RawPeople.SOURCE_PARSED_PDF,
-            )
-            self.assertEqual(
-                raw_people.as_form_kwargs(),
-                {
-                    "initial": [
-                        {
-                            "name": "Lisa",
-                            "party": ["PP53", "PP53"],
-                            "previous_party_affiliations": [],
-                            "source": "",
-                        }
-                    ]
-                },
-            )
-        def test_can_change_parser_in_frontend(self):
-            """
-            Check that a query param can change the parser we use
-            """
-            BallotSOPN.objects.create(
-                source_url="http://example.com",
-                ballot=self.dulwich_post_ballot,
-                uploaded_file="sopn.pdf",
-            )
-            RawPeople.objects.create(
-                ballot=self.dulwich_post_ballot,
-                data=[{"name": "Bart", "party_id": "PP52"}],
-                textract_data=[{"name": "Lisa", "party_id": "PP53"}],
-                source_type=RawPeople.SOURCE_PARSED_PDF,
-            )
-            response = self.app.get(
-                "/bulk_adding/sopn/parl.65808.2015-05-07/", user=self.user
-            )
-            form = response.forms["bulk_add_form"]
-            # This should be the Textract data
-            self.assertEqual(form.fields["form-0-name"][0].value, "Lisa")
-            response = self.app.get(
-                "/bulk_adding/sopn/parl.65808.2015-05-07/?v1_parser=1",
-                user=self.user,
-            )
-            form = response.forms["bulk_add_form"]
-            # This should be the Textract data
-            self.assertEqual(form.fields["form-0-name"][0].value, "Bart")

ynr/apps/bulk_adding/views/sopns.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -123,12 +123,8 @@ def get(self, request, *args, **kwargs): @@
             return super().get(request, *args, **kwargs)
         def get_active_parser(self) -> Optional[SOPNParsingBackends]:
-            if self.request.GET.get("v1_parser"):
-                return SOPNParsingBackends.CAMELOT
             if self.ballot.rawpeople.textract_data:
                 return SOPNParsingBackends.TEXTRACT
-            if self.ballot.rawpeople.data:
-                return SOPNParsingBackends.CAMELOT
             return None
         def get_context_data(self, **kwargs):
@@ Expand Down @@

ynr/apps/elections/templates/elections/includes/_sopn_debug.html

-Original file line number
+Diff line change
@@ Expand Up / @@ -7,28 +7,12 @@ <h3>Parsing Status</h3> @@
             <ul>
                 <li>Pages matched: {% if object.sopn.get_pages %}Yes (matched pages: {{ object.sopn.get_pages|join:", " }}
                     ){% else %}No{% endif %}</li>
-                <li>Camelot tables extracted: {% if object.sopn.camelotparsedsopn %}Yes{% else %}No{% endif %}</li>
                 <li>Raw Person Data: {% if object.rawpeople %}Yes{% else %}No{% endif %}</li>
                 <li>AWS Textract Data: {% if textract_parsed.raw_data %}Yes{% else %}No{% endif %}</li>
                 <li>AWS Textract Parsed? {% if textract_parsed.parsed_data %}Yes{% else %}
                     No{% endif %}</li>
             </ul>
-            <h3>Camelot raw Data</h3>
-            {% if object.sopn.camelotparsedsopn.raw_data %}
-                <pre>{{ object.sopn.camelotparsedsopn.as_pandas.to_dict|pprint }}</pre>
-            {% else %}
-                N/A
-            {% endif %}
-            <h3>Camelot table Data</h3>
-            {% if object.sopn.camelotparsedsopn.data_as_html %}
-                {{ object.sopn.camelotparsedsopn.data_as_html|safe }}
-            {% else %}
-                N/A
-            {% endif %}
-            <br/>
             {% if textract_parsed and textract_parsed.as_textractor_document %}
                 <h3>AWS extracted table{{ textract_parsed.as_textractor_document.tables|pluralize }}</h3>
@@ Expand Down @@

ynr/apps/official_documents/models.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -4,7 +4,6 @@ @@
     from typing import List
     from candidates.models import Ballot
-    from django.conf import settings
     from django.core.files.base import ContentFile
     from django.core.validators import FileExtensionValidator
     from django.db import models
@@ Expand Down Expand Up / @@ -258,7 +257,6 @@ def parse(self): @@
             """
-            from sopn_parsing.helpers.extract_tables import extract_ballot_table
             from sopn_parsing.helpers.textract_helpers import (
                 NotUsingAWSException,
                 TextractSOPNHelper,
@@ Expand All / @@ -274,12 +272,6 @@ def parse(self): @@
                 # There's a cron job that should pick up the result and carry on parsing later.
                 textract_helper.start_detection()
-            if getattr(
-                settings, "CAMELOT_ENABLED", False
-            ) and self.uploaded_file.name.endswith(".pdf"):
-                # Camelot
-                extract_ballot_table(self.ballot)
     class BallotSOPNHistory(BaseBallotSOPN):
         ballot = models.ForeignKey(
@@ Expand Down @@

ynr/apps/official_documents/tests/test_upload.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -2,7 +2,6 @@ @@
     import textwrap
     from os.path import dirname, join, realpath
     from pathlib import Path
-    from unittest import skipIf
     from candidates.models import LoggedAction
     from candidates.tests.auth import TestUserMixin
@@ Expand All / @@ -27,7 +26,6 @@ @@
         EXAMPLE_DOCX_FILENAME,
         EXAMPLE_HTML_FILENAME,
     )
-    from sopn_parsing.tests import should_skip_conversion_tests
     from webtest import Upload
     TEST_MEDIA_ROOT = realpath(
@@ Expand Down Expand Up / @@ -114,20 +112,8 @@ def test_upload_authorized(self): @@
             with open(self.example_image_filename, "rb") as f:
                 form["uploaded_file"] = Upload("pilot.jpg", f.read())
-            # TODO: Add back in
-            # with patch(
-            #     "official_documents.views.extract_pages_for_ballot"
-            # ) as extract_pages, patch(
-            #     "official_documents.views.extract_ballot_table"
-            # ) as extract_tables, patch(
-            #     "official_documents.views.parse_raw_data_for_ballot"
-            # ) as parse_tables:
             response = form.submit()
             self.assertEqual(response.status_code, 302)
-            # TODO: Add back in
-            # extract_pages.assert_called_once()
-            # extract_tables.assert_called_once()
-            # parse_tables.assert_called_once()
             ballot_sopns = BallotSOPN.objects.all()
             self.assertEqual(ballot_sopns.count(), 1)
@@ Expand Down Expand Up / @@ -155,9 +141,6 @@ def test_upload_authorized(self): @@
             )
             self.assertInHTML("Update SOPN", response.text)
-        @skipIf(
-            should_skip_conversion_tests(), "Required conversion libs not installed"
-        )
         def test_docx_upload_form_validation(self):
             self.assertFalse(LoggedAction.objects.exists())
             response = self.app.get(
@@ Expand All / @@ -181,26 +164,11 @@ def test_docx_upload_form_validation(self): @@
             with open(self.example_docx_filename, "rb") as f:
                 form["uploaded_file"] = Upload("pilot.docx", f.read())
-            # TODO: add back in
-            # with patch(
-            #     "official_documents.views.extract_pages_for_ballot"
-            # ) as extract_pages, patch(
-            #     "official_documents.views.extract_ballot_table"
-            # ) as extract_tables, patch(
-            #     "official_documents.views.parse_raw_data_for_ballot"
-            # ) as parse_tables:
             response = form.submit()
             self.assertEqual(response.status_code, 302)
-            # TODO Add back in
-            # extract_pages.assert_called_once()
-            # extract_tables.assert_called_once()
-            # parse_tables.assert_called_once()
             self.assertEqual(BallotSOPN.objects.count(), 1)
             self.assertEqual(response.location, self.ballot.get_sopn_url())
-        @skipIf(
-            should_skip_conversion_tests(), "Required conversion libs not installed"
-        )
         def test_html_upload_form_validation(self):
             self.assertFalse(LoggedAction.objects.exists())
             response = self.app.get(
@@ Expand Down Expand Up / @@ -229,9 +197,6 @@ def test_html_upload_form_validation(self): @@
                 response.text,
             )
-        @skipIf(
-            should_skip_conversion_tests(), "Required conversion libs not installed"
-        )
         def test_jpg_form_validation(self):
             self.assertFalse(LoggedAction.objects.exists())
             response = self.app.get(
@@ Expand All / @@ -256,9 +221,6 @@ def test_jpg_form_validation(self): @@
             self.assertEqual(response.status_code, 302)
             self.assertEqual(BallotSOPN.objects.count(), 1)
-        @skipIf(
-            should_skip_conversion_tests(), "Required conversion libs not installed"
-        )
         def test_update_existing_sopn(self):
             self.assertFalse(LoggedAction.objects.exists())
             response = self.app.get(
@@ Expand Down @@

ynr/apps/sopn_parsing/helpers/extract_tables.py

This file was deleted.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Remove camelot #2624

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Remove camelot #2624

Are you sure you want to change the base?

Uh oh!

Remove camelot #2624

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Uh oh!

Uh oh!

Uh oh!