Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion modules/invenio-indexer/invenio_indexer/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from elasticsearch.helpers import bulk, streaming_bulk
from flask import current_app
from invenio_records.api import Record
from invenio_pidstore.errors import PIDDoesNotExistError
from invenio_search import current_search_client
from kombu import Producer as KombuProducer
from kombu.compat import Consumer
Expand Down Expand Up @@ -499,7 +500,10 @@ def _actionsiter(self, message_iterator, with_deleted=False):
message.ack()
except NoResultFound as ne:
message.reject()
current_app.logger.error(f"id:{payload.get('id', 'unknown')}, type:{type(ne).__name__}, message:{str(ne)}\n{traceback.format_exc()}")
current_app.logger.error(f"id:{payload.get('id', 'unknown')}, type:{type(ne).__name__}, message:record does not exists\n{traceback.format_exc()}")
except PIDDoesNotExistError as pe:
message.reject()
current_app.logger.error(f"id:{payload.get('id', 'unknown')}, type:{type(pe).__name__}, message:pid does not exists\n{traceback.format_exc()}")
except SQLAlchemyError as se:
db.session.rollback()
current_app.logger.error(f"id:{payload.get('id', 'unknown')}, type:{type(se).__name__}, message:{str(se)}\n{traceback.format_exc()}")
Expand Down
6 changes: 3 additions & 3 deletions modules/weko-deposit/weko_deposit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -1234,7 +1234,8 @@ def get_content_files(self):
current_app.config["WEKO_DEPOSIT_FILESIZE_LIMIT"]
)
inf = chardet.detect(data)
data = data.decode(inf["encoding"], errors="replace")
encoding = inf.get("encoding") or "utf-8"
data = data.decode(encoding, errors="replace")
else:
file_instance = file.obj.file
file_info = {
Expand All @@ -1245,8 +1246,7 @@ def get_content_files(self):
reading_targets[lst["filename"]] = file_info
attachment["content"] = data
except FileNotFoundError as se:
current_app.logger.error(f"FileNotFoundError: {se}")
current_app.logger.error(f"file.obj: {file.obj}")
current_app.logger.error(f"FileNotFoundError: {se}, {file.obj.key}")

content.update({"attachment": attachment})
contents.append(content)
Expand Down
4 changes: 0 additions & 4 deletions modules/weko-deposit/weko_deposit/receivers.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,8 @@ def append_file_content(sender, json={}, record=None, index=None, **kwargs):
current_app.logger.info('FINISHED reindex record: {0}'.format(
im['control_number']))
except NoResultFound as e:
current_app.logger.error('Indexing error: record does not exists: {0}'.format(
record.id))
raise e
except PIDDoesNotExistError as e:
current_app.logger.error('Indexing error: pid does not exists: {0}'.format(
record.id))
raise e
except Exception as e:
raise e
33 changes: 23 additions & 10 deletions modules/weko-deposit/weko_deposit/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import pypdfium2
import os
import subprocess
import gc

def update_pdf_contents_es(record_ids):
"""register the contents of the record PDF file in elasticsearch
Expand Down Expand Up @@ -59,21 +60,33 @@ def extract_text_from_pdf(filepath, max_size):
reader = pypdfium2.PdfDocument(filepath)
texts = []
total_bytes = 0
for page in reader:
text = page.get_textpage().get_text_range()
encoded = text.encode('utf-8', errors='replace')
if total_bytes + len(encoded) > max_size:
remain = max_size - total_bytes
texts.append(encoded[:remain].decode('utf-8', errors='ignore'))
break
else:
texts.append(text)
total_bytes += len(encoded)
for i in range(len(reader)):
page = reader.get_page(i)
textpage=None
try:
textpage = page.get_textpage()
text = textpage.get_text_range()

encoded = text.encode('utf-8', errors='replace')
if total_bytes + len(encoded) > max_size:
remain = max_size - total_bytes
texts.append(encoded[:remain].decode('utf-8', errors='ignore'))
break
else:
texts.append(text)
total_bytes += len(encoded)
finally:
if textpage is not None:
textpage.close()
page.close()
data = "".join(texts)
data = "".join(data.splitlines())
finally:
if reader is not None:
reader.close()

del reader
gc.collect()

return data

Expand Down
Loading