Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ocrd/ocrd/processor/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from .base import (
Processor,
)
from .helpers import (
run_cli,
run_processor,
generate_processor_help
Expand Down
171 changes: 9 additions & 162 deletions ocrd/ocrd/processor/base.py
Original file line number Diff line number Diff line change
@@ -1,171 +1,18 @@
"""
Processor base class and helper functions
"""

# Public names exported by a star-import of this module. Every entry must
# name something actually available in the module, otherwise the star-import
# fails with AttributeError.
__all__ = ['Processor', 'generate_processor_help', 'run_cli', 'run_processor']

import os
import json
from click import wrap_text
from time import time
import subprocess
from ocrd_utils import getLogger, VERSION as OCRD_VERSION, MIMETYPE_PAGE
from ocrd_validators import ParameterValidator

log = getLogger('ocrd.processor')

def _get_workspace(workspace=None, resolver=None, mets_url=None, working_dir=None):
if workspace is None:
if resolver is None:
raise Exception("Need to pass a resolver to create a workspace")
if mets_url is None:
raise Exception("Need to pass mets_url to create a workspace")
workspace = resolver.workspace_from_url(mets_url, dst_dir=working_dir)
return workspace

def run_processor(
        processorClass,
        ocrd_tool=None,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        log_level=None, # TODO actually use this!
        input_file_grp=None,
        output_file_grp=None,
        parameter=None,
        parameter_override=None,
        working_dir=None,
): # pylint: disable=too-many-locals
    """
    Instantiate a processor on a workspace, run it and record it in the METS.

    The workspace is the one passed in or, failing that, one created by
    ``resolver`` from ``mets_url``. Once ``process()`` has returned, the
    elapsed wall-clock time is logged to the ``ocrd.process.profile`` logger,
    an agent entry describing the processor is added to the workspace's METS
    and the METS is saved.

    Args:
        processorClass: callable invoked as ``processorClass(workspace, ...)``
        ocrd_tool: forwarded to the processor constructor
        mets_url: used to create the workspace when ``workspace`` is ``None``
        resolver: used to create the workspace when ``workspace`` is ``None``
        workspace: existing workspace to run on
        page_id: forwarded to the processor constructor
        log_level: accepted but not used by this function
        input_file_grp: forwarded to the processor constructor
        output_file_grp: forwarded to the processor constructor
        parameter: forwarded to the processor constructor and JSON-serialised
            for the profiling log line
        parameter_override: accepted but not used by this function
        working_dir: destination directory when a workspace has to be created

    Returns:
        The processor instance after ``process()`` has run.
    """
    workspace = _get_workspace(workspace, resolver, mets_url, working_dir)
    log.debug("Running processor %s", processorClass)
    constructor_kwargs = dict(
        ocrd_tool=ocrd_tool,
        page_id=page_id,
        input_file_grp=input_file_grp,
        output_file_grp=output_file_grp,
        parameter=parameter,
    )
    processor = processorClass(workspace, **constructor_kwargs)

    # From here on use the tool description the processor itself reports
    ocrd_tool = processor.ocrd_tool
    name = '%s v%s' % (ocrd_tool['executable'], processor.version)
    otherrole = ocrd_tool['steps'][0]
    profile_logger = getLogger('ocrd.process.profile')
    log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole)

    # Time only the actual processing, not the set-up above
    started = time()
    processor.process()
    elapsed = time() - started

    profile_values = (
        ocrd_tool['executable'],
        elapsed,
        input_file_grp or '',
        output_file_grp or '',
        json.dumps(parameter) if parameter else {},
    )
    profile_logger.info("Executing processor '%s' took %fs [--input-file-grp='%s' --output-file-grp='%s' --parameter='%s']" % profile_values)

    agent = dict(
        name=name,
        _type='OTHER',
        othertype='SOFTWARE',
        role='OTHER',
        otherrole=otherrole,
    )
    workspace.mets.add_agent(**agent)
    workspace.save_mets()
    return processor

def run_cli(
        executable,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        overwrite=None,
        log_level=None,
        input_file_grp=None,
        output_file_grp=None,
        parameter=None,
        working_dir=None,
):
    """
    Run a processor's command line executable on a workspace as a subprocess.

    The workspace is the one passed in or, failing that, one created by
    ``resolver`` from ``mets_url``. Its directory is passed to the executable
    as ``--working-dir``; every other option is only added to the command
    line when a value was supplied.

    Args:
        executable: program to call (first element of the argument list)
        mets_url: value for ``--mets``; also used to create the workspace
            when ``workspace`` is ``None``
        resolver: used to create the workspace when ``workspace`` is ``None``
        workspace: existing workspace to run on
        page_id: value for ``--page-id``
        overwrite: if truthy, ``--overwrite`` is passed
        log_level: value for ``--log-level``
        input_file_grp: value for ``--input-file-grp``
        output_file_grp: value for ``--output-file-grp``
        parameter: value for ``--parameter``
        working_dir: destination directory when a workspace has to be created

    Returns:
        The value returned by ``subprocess.call`` (the exit status).
    """
    workspace = _get_workspace(workspace, resolver, mets_url, working_dir)
    args = [executable, '--working-dir', workspace.directory]
    # A caller may pass a ready-made workspace without a METS URL. Putting
    # None into the argument list would make both the join below and
    # subprocess.call fail with TypeError, so leave the option out instead.
    # NOTE(review): presumably the executable then applies its own --mets
    # default relative to --working-dir -- confirm against the CLI spec.
    if mets_url is not None:
        args += ['--mets', mets_url]
    valued_options = (
        ('--log-level', log_level),
        ('--page-id', page_id),
        ('--input-file-grp', input_file_grp),
        ('--output-file-grp', output_file_grp),
        ('--parameter', parameter),
    )
    for flag, value in valued_options:
        if value:
            args += [flag, value]
    if overwrite:
        args += ['--overwrite']
    log.debug("Running subprocess '%s'", ' '.join(args))
    return subprocess.call(args)

def generate_processor_help(ocrd_tool):
    """
    Render the command line help text of a processor.

    Args:
        ocrd_tool (dict): tool description; ``executable`` and ``description``
            are required, ``parameters``, ``input_file_grp`` and
            ``output_file_grp`` are optional. Each parameter entry needs
            ``type`` and ``description`` and may carry ``required``,
            ``default`` and ``enum``.

    Returns:
        str: usage line, description, the fixed option table, one entry per
        parameter (or ``NONE``) and the default file group wiring.
    """
    def _wrapped(text):
        # Same wrapping settings for every piece of parameter documentation
        return wrap_text(text, initial_indent=' '*3,
                         subsequent_indent=' '*4,
                         width=72, preserve_paragraphs=True)

    parameters = ocrd_tool.get('parameters')
    if not parameters:
        parameter_help = ' NONE\n'
    else:
        pieces = []
        for key, spec in parameters.items():
            kind = spec['type']
            # A required parameter is flagged as such; otherwise show the
            # JSON-encoded default if there is one
            if 'required' in spec and spec['required']:
                suffix = ' - REQUIRED'
            elif 'default' in spec:
                suffix = ' - %s' % json.dumps(spec['default'])
            else:
                suffix = ''
            pieces.append(_wrapped('"%s" [%s%s]' % (key, kind, suffix)))
            pieces.append('\n ' + _wrapped(spec['description']))
            if 'enum' in spec:
                pieces.append('\n ' + _wrapped('Possible values: %s' % json.dumps(spec['enum'])))
            pieces.append("\n")
        parameter_help = ''.join(pieces)

    template = '''
Usage: %s [OPTIONS]

%s

Options:
-I, --input-file-grp USE File group(s) used as input
-O, --output-file-grp USE File group(s) used as output
-g, --page-id ID Physical page ID(s) to process
--overwrite Remove existing output pages/images
(with --page-id, remove only those)
-p, --parameter JSON-PATH Parameters, either verbatim JSON string
or JSON file path
-P, --param-override KEY VAL Override a single JSON object key-value pair,
taking precedence over --parameter
-m, --mets URL-PATH URL or file path of METS to process
-w, --working-dir PATH Working directory of local workspace
-l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
Log level
-J, --dump-json Dump tool description as JSON and exit
-h, --help This help message
-V, --version Show version

Parameters:
%s
Default Wiring:
%s -> %s

'''
    return template % (
        ocrd_tool['executable'],
        ocrd_tool['description'],
        parameter_help,
        ocrd_tool.get('input_file_grp', 'NONE'),
        ocrd_tool.get('output_file_grp', 'NONE'),
    )
# XXX imports must remain for backwards-compatibility
from .helpers import run_cli, run_processor, generate_processor_help # pylint: disable=unused-import

log = getLogger('ocrd.processor')

class Processor():
"""
Expand Down
178 changes: 178 additions & 0 deletions ocrd/ocrd/processor/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
"""
Helper methods for running and documenting processors
"""
from time import time
import json
import subprocess

from click import wrap_text
from ocrd_utils import getLogger

__all__ = [
'generate_processor_help',
'run_cli',
'run_processor'
]

log = getLogger('ocrd.processor')

def _get_workspace(workspace=None, resolver=None, mets_url=None, working_dir=None):
if workspace is None:
if resolver is None:
raise Exception("Need to pass a resolver to create a workspace")
if mets_url is None:
raise Exception("Need to pass mets_url to create a workspace")
workspace = resolver.workspace_from_url(mets_url, dst_dir=working_dir)
return workspace

def run_processor(
        processorClass,
        ocrd_tool=None,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        log_level=None, # TODO actually use this!
        input_file_grp=None,
        output_file_grp=None,
        parameter=None,
        parameter_override=None,
        working_dir=None,
): # pylint: disable=too-many-locals
    """
    Instantiate a processor on a workspace, run it and record it in the METS.

    The workspace is the one passed in or, failing that, one created by
    ``resolver`` from ``mets_url``. Once ``process()`` has returned, the
    elapsed wall-clock time is logged to the ``ocrd.process.profile`` logger,
    an agent entry describing the processor is added to the workspace's METS
    and the METS is saved.

    Args:
        processorClass: callable invoked as ``processorClass(workspace, ...)``
        ocrd_tool: forwarded to the processor constructor
        mets_url: used to create the workspace when ``workspace`` is ``None``
        resolver: used to create the workspace when ``workspace`` is ``None``
        workspace: existing workspace to run on
        page_id: forwarded to the processor constructor
        log_level: accepted but not used by this function
        input_file_grp: forwarded to the processor constructor
        output_file_grp: forwarded to the processor constructor
        parameter: forwarded to the processor constructor and JSON-serialised
            for the profiling log line
        parameter_override: accepted but not used by this function
        working_dir: destination directory when a workspace has to be created

    Returns:
        The processor instance after ``process()`` has run.
    """
    workspace = _get_workspace(workspace, resolver, mets_url, working_dir)
    log.debug("Running processor %s", processorClass)
    constructor_kwargs = dict(
        ocrd_tool=ocrd_tool,
        page_id=page_id,
        input_file_grp=input_file_grp,
        output_file_grp=output_file_grp,
        parameter=parameter,
    )
    processor = processorClass(workspace, **constructor_kwargs)

    # From here on use the tool description the processor itself reports
    ocrd_tool = processor.ocrd_tool
    name = '%s v%s' % (ocrd_tool['executable'], processor.version)
    otherrole = ocrd_tool['steps'][0]
    profile_logger = getLogger('ocrd.process.profile')
    log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole)

    # Time only the actual processing, not the set-up above
    started = time()
    processor.process()
    elapsed = time() - started

    profile_values = (
        ocrd_tool['executable'],
        elapsed,
        input_file_grp or '',
        output_file_grp or '',
        json.dumps(parameter) if parameter else {},
    )
    profile_logger.info("Executing processor '%s' took %fs [--input-file-grp='%s' --output-file-grp='%s' --parameter='%s']" % profile_values)

    agent = dict(
        name=name,
        _type='OTHER',
        othertype='SOFTWARE',
        role='OTHER',
        otherrole=otherrole,
    )
    workspace.mets.add_agent(**agent)
    workspace.save_mets()
    return processor

def run_cli(
        executable,
        mets_url=None,
        resolver=None,
        workspace=None,
        page_id=None,
        overwrite=None,
        log_level=None,
        input_file_grp=None,
        output_file_grp=None,
        parameter=None,
        working_dir=None,
):
    """
    Run a processor's command line executable on a workspace as a subprocess.

    The workspace is the one passed in or, failing that, one created by
    ``resolver`` from ``mets_url``. Its directory is passed to the executable
    as ``--working-dir``; every other option is only added to the command
    line when a value was supplied.

    Args:
        executable: program to call (first element of the argument list)
        mets_url: value for ``--mets``; also used to create the workspace
            when ``workspace`` is ``None``
        resolver: used to create the workspace when ``workspace`` is ``None``
        workspace: existing workspace to run on
        page_id: value for ``--page-id``
        overwrite: if truthy, ``--overwrite`` is passed
        log_level: value for ``--log-level``
        input_file_grp: value for ``--input-file-grp``
        output_file_grp: value for ``--output-file-grp``
        parameter: value for ``--parameter``
        working_dir: destination directory when a workspace has to be created

    Returns:
        The value returned by ``subprocess.call`` (the exit status).
    """
    workspace = _get_workspace(workspace, resolver, mets_url, working_dir)
    args = [executable, '--working-dir', workspace.directory]
    # A caller may pass a ready-made workspace without a METS URL. Putting
    # None into the argument list would make both the join below and
    # subprocess.call fail with TypeError, so leave the option out instead.
    # NOTE(review): presumably the executable then applies its own --mets
    # default relative to --working-dir -- confirm against the CLI spec.
    if mets_url is not None:
        args += ['--mets', mets_url]
    valued_options = (
        ('--log-level', log_level),
        ('--page-id', page_id),
        ('--input-file-grp', input_file_grp),
        ('--output-file-grp', output_file_grp),
        ('--parameter', parameter),
    )
    for flag, value in valued_options:
        if value:
            args += [flag, value]
    if overwrite:
        args += ['--overwrite']
    log.debug("Running subprocess '%s'", ' '.join(args))
    return subprocess.call(args)

def generate_processor_help(ocrd_tool):
    """
    Render the command line help text of a processor.

    Args:
        ocrd_tool (dict): tool description; ``executable`` and ``description``
            are required, ``parameters``, ``input_file_grp`` and
            ``output_file_grp`` are optional. Each parameter entry needs
            ``type`` and ``description`` and may carry ``required``,
            ``default`` and ``enum``.

    Returns:
        str: usage line, description, the fixed option table, one entry per
        parameter (or ``NONE``) and the default file group wiring.
    """
    def _wrapped(text):
        # Same wrapping settings for every piece of parameter documentation
        return wrap_text(text, initial_indent=' '*3,
                         subsequent_indent=' '*4,
                         width=72, preserve_paragraphs=True)

    parameters = ocrd_tool.get('parameters')
    if not parameters:
        parameter_help = ' NONE\n'
    else:
        pieces = []
        for key, spec in parameters.items():
            kind = spec['type']
            # A required parameter is flagged as such; otherwise show the
            # JSON-encoded default if there is one
            if 'required' in spec and spec['required']:
                suffix = ' - REQUIRED'
            elif 'default' in spec:
                suffix = ' - %s' % json.dumps(spec['default'])
            else:
                suffix = ''
            pieces.append(_wrapped('"%s" [%s%s]' % (key, kind, suffix)))
            pieces.append('\n ' + _wrapped(spec['description']))
            if 'enum' in spec:
                pieces.append('\n ' + _wrapped('Possible values: %s' % json.dumps(spec['enum'])))
            pieces.append("\n")
        parameter_help = ''.join(pieces)

    template = '''
Usage: %s [OPTIONS]

%s

Options:
-I, --input-file-grp USE File group(s) used as input
-O, --output-file-grp USE File group(s) used as output
-g, --page-id ID Physical page ID(s) to process
--overwrite Remove existing output pages/images
(with --page-id, remove only those)
-p, --parameter JSON-PATH Parameters, either verbatim JSON string
or JSON file path
-P, --param-override KEY VAL Override a single JSON object key-value pair,
taking precedence over --parameter
-m, --mets URL-PATH URL or file path of METS to process
-w, --working-dir PATH Working directory of local workspace
-l, --log-level [OFF|ERROR|WARN|INFO|DEBUG|TRACE]
Log level
-J, --dump-json Dump tool description as JSON and exit
-h, --help This help message
-V, --version Show version

Parameters:
%s
Default Wiring:
%s -> %s

'''
    return template % (
        ocrd_tool['executable'],
        ocrd_tool['description'],
        parameter_help,
        ocrd_tool.get('input_file_grp', 'NONE'),
        ocrd_tool.get('output_file_grp', 'NONE'),
    )



7 changes: 6 additions & 1 deletion ocrd/ocrd/task_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
from collections import Counter

from ocrd_utils import getLogger, parse_json_string_or_file, set_json_key_value_overrides
# from collections import Counter
from ocrd.processor.base import run_cli
from ocrd.resolver import Resolver
from ocrd_validators import ParameterValidator, WorkspaceValidator, ValidationReport
from ocrd_validators import ParameterValidator, WorkspaceValidator
from ocrd_models import ValidationReport

class ProcessorTask():

Expand Down Expand Up @@ -85,6 +87,9 @@ def __str__(self):
if self.parameters:
ret += " -p '%s'" % json.dumps(self.parameters)
return ret
from ocrd_validators import WorkspaceValidator
from ocrd_utils import getLogger
from ocrd_models import ValidationReport

def validate_tasks(tasks, workspace, page_id=None, overwrite=False):
report = ValidationReport()
Expand Down
1 change: 1 addition & 0 deletions ocrd_models/ocrd_models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
from .ocrd_file import OcrdFile
from .ocrd_mets import OcrdMets
from .ocrd_xml_base import OcrdXmlDocument
from .report import ValidationReport
Loading