Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 46 additions & 36 deletions src/ocrd/cli/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
:nested: full
"""
from os import rmdir, unlink
from os.path import dirname, relpath, normpath, exists, join, isabs, isdir
from os.path import abspath, dirname, relpath, normpath, exists, join, isabs, isdir
from pathlib import Path
from json import loads, dumps
import sys
Expand Down Expand Up @@ -47,7 +47,7 @@
def workspace(self):
return Workspace(
self.resolver,
directory=self.directory,
self.directory,
mets_basename=self.mets_basename,
automatic_backup=self.automatic_backup,
mets_server_url=self.mets_server_url,
Expand Down Expand Up @@ -85,7 +85,7 @@
initLogging()
ctx.obj = WorkspaceCtx(
directory,
mets_url=mets,
mets,
mets_basename=mets_basename,
mets_server_url=mets_server_url,
automatic_backup=backup
Expand Down Expand Up @@ -254,22 +254,26 @@
log.debug("Adding '%s'", fname)
local_filename = None
if not (fname.startswith('http://') or fname.startswith('https://')):
if not fname.startswith(ctx.directory):
if not isabs(fname) and exists(join(ctx.directory, fname)):
assert isabs(ctx.directory)
# try to resolve relative to workspace or CWD
if not isabs(fname):
if exists(join(ctx.directory, fname)):
fname = join(ctx.directory, fname)
else:
log.debug("File '%s' is not in workspace, copying", fname)
try:
fname = ctx.resolver.download_to_directory(ctx.directory, fname, subdir=file_grp)
except FileNotFoundError:
if check_file_exists:
log.error("File '%s' does not exist, halt execution!" % fname)
sys.exit(1)
if check_file_exists and not exists(fname):
log.error("File '%s' does not exist, halt execution!" % fname)
sys.exit(1)
fname = abspath(fname)
if fname.startswith(ctx.directory):
if check_file_exists and not exists(fname):
log.error("File '%s' does not exist, halt execution!" % fname)
sys.exit(1)
fname = relpath(fname, ctx.directory)
else:
log.debug("File '%s' is not in workspace, copying", fname)
try:
fname = ctx.resolver.download_to_directory(ctx.directory, fname, subdir=file_grp)
except FileNotFoundError:
if check_file_exists:
log.error("File '%s' does not exist, halt execution!" % fname)
sys.exit(1)
local_filename = fname

if not page_id:
Expand Down Expand Up @@ -299,8 +303,8 @@
@click.option('-g', '--page-id', help="physical page ID of the file", required=False)
@click.option('-i', '--file-id', help="ID of the file. If not provided, derive from fileGrp and filename", required=False)
@click.option('-u', '--url', help="Remote URL of the file", required=False)
@click.option('-l', '--local-filename', help="Local filesystem path in the workspace directory "
"(copied from source file if different)", required=False)
@click.option('-l', '--local-filename', help="Relative filesystem path in the workspace directory "
"(copied from source path if different)", required=False)
@click.option('-G', '--file-grp', help="File group USE of the file", required=True)
@click.option('-n', '--dry-run', help="Don't actually do anything to the METS or filesystem, just preview",
default=False, is_flag=True)
Expand All @@ -312,7 +316,7 @@
@click.option('-s', '--skip', help="Skip files not matching --regex (instead of failing)", default=False, is_flag=True)
@click.argument('file_glob', nargs=-1, required=True)
@pass_workspace
def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_filename, file_grp, dry_run,

Check warning on line 319 in src/ocrd/cli/workspace.py

View workflow job for this annotation

GitHub Actions / build (3.12, ubuntu-22.04)

C901

'workspace_cli_bulk_add' is too complex (29)

Check warning on line 319 in src/ocrd/cli/workspace.py

View workflow job for this annotation

GitHub Actions / build (3.11, ubuntu-22.04)

C901

'workspace_cli_bulk_add' is too complex (29)

Check warning on line 319 in src/ocrd/cli/workspace.py

View workflow job for this annotation

GitHub Actions / build (3.9, ubuntu-22.04)

C901

'workspace_cli_bulk_add' is too complex (29)

Check warning on line 319 in src/ocrd/cli/workspace.py

View workflow job for this annotation

GitHub Actions / build (3.10, macos-latest)

C901

'workspace_cli_bulk_add' is too complex (29)

Check warning on line 319 in src/ocrd/cli/workspace.py

View workflow job for this annotation

GitHub Actions / build (3.10, ubuntu-22.04)

C901

'workspace_cli_bulk_add' is too complex (29)

Check warning on line 319 in src/ocrd/cli/workspace.py

View workflow job for this annotation

GitHub Actions / build (3.9, macos-latest)

C901

'workspace_cli_bulk_add' is too complex (29)
file_glob, src_path_option, ignore, force, skip):
"""
Add files in bulk to an OCR-D workspace.
Expand Down Expand Up @@ -414,38 +418,45 @@
raise ValueError(f"OcrdFile attribute '{param_name}' unset ({file_dict})")
for group_name in group_dict:
file_dict[param_name] = file_dict[param_name].replace('{{ %s }}' % group_name, group_dict[group_name])
if '{{ ' in file_dict[param_name]:
log.warning("Probably failed to match a capture group in %s: '%s'", param_name, file_dict[param_name])

# Where to copy from
if src_path_option:
src_path = src_path_option
for group_name in group_dict:
src_path = src_path.replace('{{ %s }}' % group_name, group_dict[group_name])
srcpath = Path(src_path)
src_path_option = src_path_option.replace('{{ %s }}' % group_name, group_dict[group_name])
if '{{ ' in src_path_option:
log.warning("Probably failed to match a capture group in source path: '%s'", src_path_option)
src_path = Path(src_path_option)
else:
srcpath = file_path
src_path = file_path

# derive --file-id from filename if not --file-id not explicitly set
if not file_id:
id_field = srcpath.stem if file_path != srcpath else file_path.stem
id_field = src_path.stem if file_path != src_path else file_path.stem
file_dict['file_id'] = safe_filename('%s_%s' % (file_dict['file_grp'], id_field))
if not mimetype:
try:
file_dict['mimetype'] = EXT_TO_MIME[srcpath.suffix]
file_dict['mimetype'] = EXT_TO_MIME[src_path.suffix]
except KeyError:
log.error("Cannot guess MIME type from extension '%s' for '%s'. "
"Set --mimetype explicitly" % (srcpath.suffix, srcpath))
"Set --mimetype explicitly" % (src_path.suffix, src_path))

# copy files if src != url
src_path = src_path.resolve().absolute()
if local_filename_is_src:
file_dict['local_filename'] = srcpath
else:
destpath = Path(workspace.directory, file_dict['local_filename'])
if srcpath != destpath and not destpath.exists():
log.info("cp '%s' '%s'", srcpath, destpath)
if not dry_run:
if not destpath.parent.is_dir():
destpath.parent.mkdir()
destpath.write_bytes(srcpath.read_bytes())
if str(src_path).startswith(workspace.directory):
file_dict['local_filename'] = src_path.relative_to(workspace.directory)
else:
file_dict['local_filename'] = src_path.name # will be copied-in below

dest_path = Path(workspace.directory, file_dict['local_filename'])
if src_path != dest_path and not dest_path.exists():
log.info("cp '%s' '%s'", src_path, dest_path)
if not dry_run:
if not dest_path.parent.is_dir():
dest_path.parent.mkdir()
dest_path.write_bytes(src_path.read_bytes())

# Add to workspace (or not)
fileGrp = file_dict.pop('file_grp')
Expand All @@ -465,8 +476,7 @@
@workspace_cli.command('find')
@mets_find_options
@click.option('-k', '--output-field', help="Output field. Repeat for multiple fields, will be joined with tab",
default=['local_filename'],
show_default=True,
default=['local_filename'], show_default=True,
multiple=True,
type=click.Choice([
'url',
Expand Down Expand Up @@ -869,7 +879,7 @@
the same semantics as in ``ocrd workspace find``, see ``ocrd workspace find --help``
for an explanation.
"""
mets_path = Path(mets_path)
mets_path = Path(mets_path).absolute()
if filegrp_mapping:
filegrp_mapping = loads(filegrp_mapping)
assert not ctx.mets_server_url, \
Expand Down
5 changes: 3 additions & 2 deletions src/ocrd/mets_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,8 @@ def __init__(self, workspace, url):
def create_process(mets_server_url: str, ws_dir_path: str, log_file: str) -> int:
sub_process = Popen(
args=["ocrd", "workspace", "-U", f"{mets_server_url}", "-d", f"{ws_dir_path}", "server", "start"],
stdout=open(file=log_file, mode="w"), stderr=open(file=log_file, mode="a"), cwd=ws_dir_path,
stdout=open(file=log_file, mode="w"), stderr=open(file=log_file, mode="a"),
#cwd=ws_dir_path, # rs: if relative, this will cause wrong path resolution in the subprocess
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point! Potentially setting the current working directory to workspace directory path was the only way to make it work without changing/fixing the Workspace class itself or the workspace CLI. However, that had other side effects leading to a different bug.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, could well be. The workspace CLI was obviously not tested enough for the case WS ≠ WD.

In fixing add and bulk-add, I hope I caught that. (But should add tests also.)

What still remains are the long-standing issues with clone (#1149) and find --download (#809).

shell=False, universal_newlines=True, start_new_session=True
)
# Wait for the mets server to start
Expand Down Expand Up @@ -446,7 +447,7 @@ def shutdown(self):
Path(self.url).unlink()

def startup(self):
self.log.info("Configuring the METS Server")
self.log.info("Configuring the METS Server for %s", self.workspace)

workspace = self.workspace

Expand Down
2 changes: 1 addition & 1 deletion src/ocrd_network/processing_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,7 @@ async def validate_and_forward_job_to_worker(self, processor_name: str, data: PY
self.cache_processing_requests.update_request_counter(workspace_key=workspace_key, by_value=0)

# This check is done to return early in case a workspace_id is provided
# but the abs mets path cannot be queried from the DB
# but the mets path cannot be queried from the DB
request_mets_path = await validate_and_return_mets_path(self.log, data)

page_ids = expand_page_ids(data.page_id)
Expand Down
3 changes: 2 additions & 1 deletion tests/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import logging
import io
import collections
from typing import Tuple
from unittest import TestCase as VanillaTestCase, skip, main as unittests_main
import pytest
from ocrd_utils import disableLogging, initLogging, getLogger
Expand Down Expand Up @@ -36,7 +37,7 @@ class CapturingTestCase(TestCase):
def _setup_pytest_capfd(self, capfd):
self.capfd = capfd

def invoke_cli(self, cli, args):
def invoke_cli(self, cli, args) -> Tuple[int, str, str]:
"""
Substitution for click.CliRunner.invooke that works together nicely
with unittests/pytest capturing stdout/stderr.
Expand Down
22 changes: 18 additions & 4 deletions tests/cli/test_workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ def test_add_nonexisting_checked(self):
'--mimetype', mimetype,
'does-not-exist.xml'])
self.assertEqual(exit_code, 1)
self.assertIn("File 'does-not-exist.xml' does not exist, halt execution!", err)
self.assertIn("does-not-exist.xml' does not exist, halt execution!", err)

def test_add_519(self):
"""
Expand Down Expand Up @@ -455,6 +455,20 @@ def test_mets_directory_http(self):
with self.assertRaisesRegex(ValueError, r"--mets is an http\(s\) URL but no --directory was given"):
self.invoke_cli(workspace_cli, ['-m', 'https://foo.bar/bla', 'init'])

def test_bulk_add_bad_regex(self):
with pushd_popd(tempdir=True) as wsdir:
self.resolver.workspace_from_nothing(directory=wsdir)
exit_code, _, err = self.invoke_cli(workspace_cli, [
'bulk-add',
'--regex', r'^(not closed',
'-g', 'foo',
'-G', 'foo',
'*'
])
assert int(exit_code) > 0, 'should fail'
assert 'Invalid regex' in err


def test_bulk_add0(self):
NO_FILES=100
with TemporaryDirectory() as srcdir:
Expand Down Expand Up @@ -501,9 +515,9 @@ def test_bulk_add_missing_param(self):
'-m', '{{ mimetype }}',
'-u', "{{ url }}",
'a b c d e f', '1 2 3 4 5 6'])
print('out', out)
print('err', err)
assert 0
# print('out', out)
# print('err', err)
# assert 0

def test_bulk_add_gen_id(self):
with pushd_popd(tempdir=True) as wsdir:
Expand Down
Loading