From d02d78f66ed0af4e3f6de06117da17ab72f20608 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 29 Jan 2019 11:19:54 +0100 Subject: [PATCH 01/17] add 1st draft line GT/training specs --- README.md | 10 +++ gt-profile.json | 1 + gt-profile.yml | 106 +++++++++++++++++++++++ gt-spec.md | 196 +++++++++++++++++++++++++++++++++++++++++++ single-line.json | 1 + single-line.yml | 33 ++++++++ training-schema.json | 1 + training-schema.yml | 44 ++++++++++ 8 files changed, 392 insertions(+) create mode 100644 gt-profile.json create mode 100644 gt-profile.yml create mode 100644 gt-spec.md create mode 100644 single-line.json create mode 100644 single-line.yml create mode 100644 training-schema.json create mode 100644 training-schema.yml diff --git a/README.md b/README.md index bb592e2..816c1b5 100644 --- a/README.md +++ b/README.md @@ -3,3 +3,13 @@ # Specification of the technical architecture, interface definitions and data exchange format(s) See [https://ocr-d.github.io/](https://ocr-d.github.io/). 
+ +## Line Ground Truth + +* [Spec](./gt-spec.md) +* [BagIt profile](./gt-profile.yml) + +## Engine training + +* [Spec](./training-spec.md) +* [JSON schema](./training-schema.yml) diff --git a/gt-profile.json b/gt-profile.json new file mode 100644 index 0000000..d0b7247 --- /dev/null +++ b/gt-profile.json @@ -0,0 +1 @@ +{"BagIt-Profile-Info":{"BagIt-Profile-Identifier":"https://ocr-d.github.io/gt-profile.json","BagIt-Profile-Version":"1.2.0","Source-Organization":"OCR-D","External-Description":"BagIt profile for OCR line Ground Truth","Contact-Name":"Konstantin Baierer","Contact-Email":"konstantin.baierer@sbb.spk-berlin.de","Version":0.1},"Bag-Info":{"Bagging-Date":{"required":false},"Source-Organization":{"required":false},"Gt-Transcription-Extension":{"required":false,"default":".gt.txt"},"Gt-Transcription-Media-Type":{"required":false,"default":"text/plain"},"Gt-Transcription-Directory":{"required":false,"default":"text"},"Gt-Transcription-Normalization":{"required":false,"default":"NFKC","values":["NFD","NFKD","NFC","NFKC"]},"Gt-Color-Image-Extension":{"required":false,"default":".color.png"},"Gt-Color-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Color-Image-Directory":{"required":false,"default":"img"},"Gt-Grayscale-Image-Extension":{"required":false,"default":".nrm.png"},"Gt-Grayscale-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Grayscale-Image-Directory":{"required":false,"default":"grayscale"},"Gt-Bitonal-Image-Extension":{"required":false,"default":".bin.png"},"Gt-Bitonal-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Bitonal-Image-Directory":{"required":false,"default":"bin"},"Gt-Line-Metadata-Extension":{"required":false,"default":".json"},"Gt-Line-Metadata-Media-Type":{"required":false,"default":"application/json","values":["application/json","text/vnd.yaml"
]},"Gt-Line-Metadata-Directory":{"required":false,"default":"meta"},"Gt-Directory":{"required":false,"default":"ground-truth"},"Gt-Directory-Structure":{"required":false,"default":"flat","values":["flat","flat-nested","subfolders","subfolders-nested"]}},"Manifests-Required":["sha512"],"Tag-Manifests-Required":[],"Tag-Files-Required":[],"Tag-Files-Allowed":["README.md"],"Allow-Fetch.txt":false,"Serialization":"allowed","Accept-Serialization":"application/zip","Accept-BagIt-Version":["1.0"]} \ No newline at end of file diff --git a/gt-profile.yml b/gt-profile.yml new file mode 100644 index 0000000..11d509d --- /dev/null +++ b/gt-profile.yml @@ -0,0 +1,106 @@ +BagIt-Profile-Info: + BagIt-Profile-Identifier: https://ocr-d.github.io/gt-profile.json + BagIt-Profile-Version: '1.2.0' + Source-Organization: OCR-D + External-Description: BagIt profile for OCR line Ground Truth + Contact-Name: Konstantin Baierer + Contact-Email: konstantin.baierer@sbb.spk-berlin.de + Version: 0.1 +Bag-Info: + Bagging-Date: + required: false + Source-Organization: + required: false + Gt-Transcription-Extension: + required: false + default: '.gt.txt' + Gt-Transcription-Media-Type: + required: false + default: 'text/plain' + Gt-Transcription-Directory: + required: false + default: 'text' + Gt-Transcription-Normalization: + required: false + default: 'NFKC' + values: + - NFD + - NFKD + - NFC + - NFKC + Gt-Color-Image-Extension: + required: false + default: '.color.png' + Gt-Color-Image-Media-Type: + required: false + default: 'image/png' + values: + - 'image/png' + - 'image/tiff' + - 'image/jpeg' + Gt-Color-Image-Directory: + required: false + default: 'img' + Gt-Grayscale-Image-Extension: + required: false + default: '.nrm.png' + Gt-Grayscale-Image-Media-Type: + required: false + default: 'image/png' + values: + - 'image/png' + - 'image/tiff' + - 'image/jpeg' + Gt-Grayscale-Image-Directory: + required: false + default: 'grayscale' + Gt-Bitonal-Image-Extension: + required: false + default: 
'.bin.png' + Gt-Bitonal-Image-Media-Type: + required: false + default: 'image/png' + values: + - 'image/png' + - 'image/tiff' + - 'image/jpeg' + Gt-Bitonal-Image-Directory: + required: false + default: 'bin' + Gt-Line-Metadata-Extension: + required: false + default: '.json' + Gt-Line-Metadata-Media-Type: + required: false + default: 'application/json' + values: + - 'application/json' + - 'text/vnd.yaml' + Gt-Line-Metadata-Directory: + required: false + default: 'meta' + Gt-Directory: + required: false + default: 'ground-truth' + Gt-Directory-Structure: + required: false + default: 'flat' + values: + # img and transcription in the Gt-Directory + - 'flat' + # img and transcription in the same dir below Gt-Directory + - 'flat-nested' + # img and transcription in subfolders Gt-Bitonal-Image-Directory and Gt-Transcription-Directory of Gt-Directory + - 'subfolders' + # img and transcription in subfolders Gt-Bitonal-Image-Directory and Gt-Transcription-Directory in the same dir below Gt-Directory + - 'subfolders-nested' +Manifests-Required: ['sha512'] +Tag-Manifests-Required: [] +Tag-Files-Required: [] +Tag-Files-Allowed: + - README.md +Allow-Fetch.txt: false +Serialization: allowed +Accept-Serialization: application/zip +Accept-BagIt-Version: + - '1.0' diff --git a/gt-spec.md b/gt-spec.md new file mode 100644 index 0000000..207ba98 --- /dev/null +++ b/gt-spec.md @@ -0,0 +1,196 @@ +# linegt + +> An exchange format for line-based ground truth for OCR + + +* [Rationale](#rationale) +* [BagIt](#bagit) +* [BagIt profile](#bagit-profile) + * [Gt-Transcription-Extension](#gt-transcription-extension) + * [Gt-Transcription-Media-Type](#gt-transcription-media-type) + * [Gt-Transcription-Directory](#gt-transcription-directory) + * [Gt-Transcription-Normalization](#gt-transcription-normalization) + * [Gt-Grayscale-Image-Extension](#gt-grayscale-image-extension) + * [Gt-Grayscale-Image-Media-Type](#gt-grayscale-image-media-type) + * 
[Gt-Grayscale-Image-Directory](#gt-grayscale-image-directory) + * [Gt-Color-Image-Extension](#gt-color-image-extension) + * [Gt-Color-Image-Media-Type](#gt-color-image-media-type) + * [Gt-Color-Image-Directory](#gt-color-image-directory) + * [Gt-Bitonal-Image-Extension](#gt-bitonal-image-extension) + * [Gt-Bitonal-Image-Media-Type](#gt-bitonal-image-media-type) + * [Gt-Bitonal-Image-Directory](#gt-bitonal-image-directory) + * [Gt-Line-Metadata-Extension](#gt-line-metadata-extension) + * [Gt-Line-Metadata-Media-Type](#gt-line-metadata-media-type) + * [Gt-Line-Metadata-Directory](#gt-line-metadata-directory) + * [Gt-Directory](#gt-directory) + * [Gt-Directory-Structure](#gt-directory-structure) +* [Line metadata](#line-metadata) + + + +## Rationale + +Recent OCR (optical character recognition) engines are not actually +character-based anymore but on neural networks that operate on lines. These +engines can be trained with images of text lines and their transcription +("ground truth"), plus engine-specific configurations. + +This format defines a standardized format to bundle such ground truth, based on +the BagIt conventions. + +## BagIt + +A `linegt` bag must be a valid BagIt bag: + +* Root folder must contain a file `bagit.txt` +* Root folder must contain a file `bag-info.txt` with metadata about the bag +* All payload files must be under a folder `/data` +* Every file in `/data` along with its `<algo>` checksum must be listed in a + file `manifest-<algo>.txt` + +## BagIt profile + +In addition to the requirements of BagIt, an `ocr_linegt` bag must adhere to +the `ocr_linegt` BagIt profile. + +### Gt-Transcription-Extension + +Extension of the transcription files. Default: `.gt.txt`. + +### Gt-Transcription-Media-Type + +Media type of the transcription files. Default: `text/plain`. + +### Gt-Transcription-Directory + +Name of the subfolder containing transcriptions if [`Gt-Directory-Structure`] is `subfolders` or `subfolders-nested`. Default: `text`. 
+ +### Gt-Transcription-Normalization + +Unicode normalization level. One of `NFC`, `NFKC`, `NFD` or `NFKC`. Default: `NFKC`. + +![Illustration unicode normalization](http://unicode.org/reports/tr15/images/UAX15-NormFig6.jpg) + +### Gt-Grayscale-Image-Extension + +Extension of the grayscale image files. Default: `.png`. + +### Gt-Grayscale-Image-Media-Type + +Media type of the grayscale image files. Default: `image/png`. + +### Gt-Grayscale-Image-Directory + +Name of the subfolder containing grayscale images if [`Gt-Directory-Structure`] is `subfolders` or `subfolders-nested`. Default: `grayscale`. + +### Gt-Color-Image-Extension + +Extension of the color image files. Default: `.png`. + +### Gt-Color-Image-Media-Type + +Media type of the color image files. Default: `image/png`. + +### Gt-Color-Image-Directory + +Name of the subfolder containing color images if [`Gt-Directory-Structure`] is `subfolders` or `subfolders-nested`. Default: `img`. + +### Gt-Bitonal-Image-Extension + +Extension of the bitonal image files. Default: `.png`. + +### Gt-Bitonal-Image-Media-Type + +Media type of the bitonal image files. Default: `image/png`. + +### Gt-Bitonal-Image-Directory + +Name of the subfolder containing bitonal images if [`Gt-Directory-Structure`] is `subfolders` or `subfolders-nested`. Default: `bin`. + +### Gt-Line-Metadata-Extension + +Extension of the [line metadata] files. Default: `.json`. + +### Gt-Line-Metadata-Media-Type + +Media type of the [line metadata] files. Default: `application/json`. + +### Gt-Line-Metadata-Directory + +Name of the subfolder containing [line metadata] if [`Gt-Directory-Structure`] is `subfolders` or `subfolders-nested`. Default: `meta`. + +### Gt-Directory + +Directory below `/data` containing the ground truth. Default: `ground-truth`. + +### Gt-Directory-Structure + +Directory structure. 
One of + + - `flat`: img and transcription in the [`Gt-Directory`] + - `flat-nested`: img and transcription in the same dir below [`Gt-Directory`] + - `subfolders`: img and transcription in subfolders [`Gt-Bitonal-Image-Directory`] and [`Gt-Transcription-Directory`] of [`Gt-Directory`] + - `subfolders-nested`: img and transcription in subfolders [`Gt-Bitonal-Image-Directory`] and [`Gt-Transcription-Directory`] in the same dir below Gt-Directory + +## Line metadata + +In addition to the bag-wide metadata defined by the [BagIt profile], metadata +can be saved per line to preserve the provenance of every single line. + +Line metadata can be encoded in JSON or YAML (depending on +[`Gt-Line-Metadata-Extension`] and [`Gt-Line-Metadata-Media-Type`]). + +Line metadata mustt adhere to this JSON schema: + + +```yaml +description: Schema for provenance of single lines +type: object +required: + - coords + - imageUrl +properties: + coords: + description: Coordinates as array of x-y-pairs + type: array + items: + type: array + length: 2 + items: + type: number + pageUrl: + description: URL of the page (resp. URL the PAGE-XML file) + type: string + imageUrl: + description: URL of the image (resp. 
the `pg:imageFilename` in the PAGE-XML file) + type: string + bagUrl: + description: URL of the bag that contains the page + type: string + metsUrl: + description: URL of the METS document that contains the page + type: string + lineId: + description: ID of the line within the PAGE-XML doc + type: string + xpath: + description: XPath to the line if no `fileId` was provided + type: string +``` + + + + +[`Gt-Directory`]: #gt-directory +[`Gt-Bitonal-Image-Directory`]: #gt-bitonal-image-directory +[`Gt-Transcription-Directory`]: #gt-transcription-directory +[`Gt-Directory-Structure`]: #gt-directory-structure +[`Gt-Line-Metadata-Directory`]: #gt-line-metadata-directory +[`Gt-Line-Metadata-Extension`]: #gt-line-metadata-extension +[`Gt-Line-Metadata-Media-Type`]: #gt-line-metadata-media-type +[BagIt Profile]: #bagit-profile +[line metadata]: #line-metadata diff --git a/single-line.json b/single-line.json new file mode 100644 index 0000000..6183440 --- /dev/null +++ b/single-line.json @@ -0,0 +1 @@ +{"description":"Schema for provenance of single lines","type":"object","required":["coords","imageUrl"],"properties":{"coords":{"description":"Coordinates as array of x-y-pairs","type":"array","items":{"type":"array","length":2,"items":{"type":"number"}}},"pageUrl":{"description":"URL of the page (resp. URL the PAGE-XML file)","type":"string"},"imageUrl":{"description":"URL of the image (resp. 
the `pg:imageFilename` in the PAGE-XML file)","type":"string"},"bagUrl":{"description":"URL of the bag that contains the page","type":"string"},"metsUrl":{"description":"URL of the METS document that contains the page","type":"string"},"lineId":{"description":"ID of the line within the PAGE-XML doc","type":"string"},"xpath":{"description":"XPath to the line if no `fileId` was provided","type":"string"}}} \ No newline at end of file diff --git a/single-line.yml b/single-line.yml new file mode 100644 index 0000000..930e8ac --- /dev/null +++ b/single-line.yml @@ -0,0 +1,33 @@ +description: Schema for provenance of single lines +type: object +required: + - coords + - imageUrl +properties: + coords: + description: Coordinates as array of x-y-pairs + type: array + items: + type: array + length: 2 + items: + type: number + pageUrl: + description: URL of the page (resp. URL the PAGE-XML file) + type: string + imageUrl: + description: URL of the image (resp. the `pg:imageFilename` in the PAGE-XML file) + type: string + bagUrl: + description: URL of the bag that contains the page + type: string + metsUrl: + description: URL of the METS document that contains the page + type: string + lineId: + description: ID of the line within the PAGE-XML doc + type: string + xpath: + description: XPath to the line if no `fileId` was provided + type: string + diff --git a/training-schema.json b/training-schema.json new file mode 100644 index 0000000..b51c711 --- /dev/null +++ b/training-schema.json @@ -0,0 +1 @@ +{"$id":"https://ocr-d.github.io/schemas/v1/training-schema.json","type":"object","required":["engineName","engineVersion","groundTruthBag","outputModelFormat"],"properties":{"engineName":{"type":"string","enum":["ocropus","kraken","tesseract","calamari"]},"engineVersion":{"type":"string"},"engineArguments":{"description":"Command line arguments passed to the CLI training tool","type":"array","default":[]},"groundTruthBag":{"description":"A bag of line ground truth adhering to 
https://ocr-d.github.io/gt-profile.json","type":"string"},"groundTruthGlob":{"description":"Wildcard for matching only a subset of the ground truth files. Make sure to exclude extensions and end in '*'.","type":"string","default":"*"},"outputModelFormat":{"description":"The output format of the model. Note that individual engines only support a single one or a subset of formats.","enum":["application/vnd.ocrd.pronn","application/vnd.ocrd.clstm","application/vnd.ocrd.coreml","application/vnd.ocrd.pyrnn","application/vnd.ocrd.tf+zip"]},"evalRatio":{"description":"Ratio of evaluation vs. training data to divide up ground truth","type":"number","default":0.9},"randomSeed":{"description":"Seed for the random number generator shuffling the ground truth before dividing it into evaluation vs. training data","type":"integer","default":0}}} \ No newline at end of file diff --git a/training-schema.yml b/training-schema.yml new file mode 100644 index 0000000..69382bb --- /dev/null +++ b/training-schema.yml @@ -0,0 +1,44 @@ +$id: https://ocr-d.github.io/schemas/v1/training-schema.json +type: object +required: + - engineName + - engineVersion + - groundTruthBag + - outputModelFormat +properties: + engineName: + type: string + enum: + - ocropus + - kraken + - tesseract + - calamari + engineVersion: + type: string + engineArguments: + description: Command line arguments passed to the CLI training tool + type: array + default: [] + groundTruthBag: + description: A bag of line ground truth adhering to https://ocr-d.github.io/gt-profile.json + type: string + groundTruthGlob: + description: Wildcard for matching only a subset of the ground truth files. Make sure to exclude extensions and end in '*'. + type: string + default: '*' + outputModelFormat: + description: The output format of the model. Note that individual engines only support a single one or a subset of formats. 
+ enum: + - application/vnd.ocrd.pronn # kraken < 1.0 + - application/vnd.ocrd.clstm # ocropy-lpred, clstm, kraken<1.0 + - application/vnd.ocrd.coreml # kraken >= 1.0 + - application/vnd.ocrd.pyrnn # ocropy-rpred + - application/vnd.ocrd.tf+zip # calamari, zipped tensorflow data + evalRatio: + description: Ratio of evaluation vs. training data to divide up ground truth + type: number + default: 0.9 + randomSeed: + description: Seed for the random number generator shuffling the ground truth before dividing it into evaluation vs. training data + type: integer + default: 0 From 73dc4c47f4df9cccc11d5b221fe326596a80e701 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 31 Jan 2019 09:16:21 +0100 Subject: [PATCH 02/17] make transcription normalization required but allow 'non-normalized' as value --- gt-profile.json | 2 +- gt-profile.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gt-profile.json b/gt-profile.json index d0b7247..69e2f26 100644 --- a/gt-profile.json +++ b/gt-profile.json @@ -1 +1 @@ -{"BagIt-Profile-Info":{"BagIt-Profile-Identifier":"https://ocr-d.github.io/gt-profile.json","BagIt-Profile-Version":"1.2.0","Source-Organization":"OCR-D","External-Description":"BagIt profile for OCR line Ground Truth","Contact-Name":"Konstantin 
Baierer","Contact-Email":"konstantin.baierer@sbb.spk-berlin.de","Version":0.1},"Bag-Info":{"Bagging-Date":{"required":false},"Source-Organization":{"required":false},"Gt-Transcription-Extension":{"required":false,"default":".gt.txt"},"Gt-Transcription-Media-Type":{"required":false,"default":"text/plain"},"Gt-Transcription-Directory":{"required":false,"default":"text"},"Gt-Transcription-Normalization":{"required":false,"default":"NFKC","values":["NFD","NFKD","NFC","NFKC"]},"Gt-Color-Image-Extension":{"required":false,"default":".color.png"},"Gt-Color-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Color-Image-Directory":{"required":false,"default":"img"},"Gt-Grayscale-Image-Extension":{"required":false,"default":".nrm.png"},"Gt-Grayscale-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Grayscale-Image-Directory":{"required":false,"default":"grayscale"},"Gt-Bitonal-Image-Extension":{"required":false,"default":".bin.png"},"Gt-Bitonal-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Bitonal-Image-Directory":{"required":false,"default":"bin"},"Gt-Line-Metadata-Extension":{"required":false,"default":".json"},"Gt-Line-Metadata-Media-Type":{"required":false,"default":"application/json","values":["application/json","text/vnd.yaml"]},"Gt-Line-Metadata-Directory":{"required":false,"default":"meta"},"Gt-Directory":{"required":false,"default":"ground-truth"},"Gt-Directory-Structure":{"required":false,"default":"flat","values":["flat","flat-nested","subfolders","subfolders-nested"]}},"Manifests-Required":["sha512"],"Tag-Manifests-Required":[],"Tag-Files-Required":[],"Tag-Files-Allowed":["README.md"],"Allow-Fetch.txt":false,"Serialization":"allowed","Accept-Serialization":"application/zip","Accept-BagIt-Version":["1.0"]} \ No newline at end of file 
+{"BagIt-Profile-Info":{"BagIt-Profile-Identifier":"https://ocr-d.github.io/gt-profile.json","BagIt-Profile-Version":"1.2.0","Source-Organization":"OCR-D","External-Description":"BagIt profile for OCR line Ground Truth","Contact-Name":"Konstantin Baierer","Contact-Email":"konstantin.baierer@sbb.spk-berlin.de","Version":0.1},"Bag-Info":{"Bagging-Date":{"required":false},"Source-Organization":{"required":false},"Gt-Transcription-Extension":{"required":false,"default":".gt.txt"},"Gt-Transcription-Media-Type":{"required":false,"default":"text/plain"},"Gt-Transcription-Directory":{"required":false,"default":"text"},"Gt-Transcription-Normalization":{"required":true,"values":["NFD","NFKD","NFC","NFKC","non-normalized"]},"Gt-Color-Image-Extension":{"required":false,"default":".color.png"},"Gt-Color-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Color-Image-Directory":{"required":false,"default":"img"},"Gt-Grayscale-Image-Extension":{"required":false,"default":".nrm.png"},"Gt-Grayscale-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Grayscale-Image-Directory":{"required":false,"default":"grayscale"},"Gt-Bitonal-Image-Extension":{"required":false,"default":".bin.png"},"Gt-Bitonal-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Bitonal-Image-Directory":{"required":false,"default":"bin"},"Gt-Line-Metadata-Extension":{"required":false,"default":".json"},"Gt-Line-Metadata-Media-Type":{"required":false,"default":"application/json","values":["application/json","text/vnd.yaml"]},"Gt-Line-Metadata-Directory":{"required":false,"default":"meta"},"Gt-Directory":{"required":false,"default":"ground-truth"},"Gt-Directory-Structure":{"required":false,"default":"flat","values":["flat","flat-nested","subfolders","subfolders-nested"]}},"Manifests-Required":["sha512"],"Tag-Manifests-Required":[],"Tag-File
s-Required":[],"Tag-Files-Allowed":["README.md"],"Allow-Fetch.txt":false,"Serialization":"allowed","Accept-Serialization":"application/zip","Accept-BagIt-Version":["1.0"]} \ No newline at end of file diff --git a/gt-profile.yml b/gt-profile.yml index 11d509d..b0e3dc5 100644 --- a/gt-profile.yml +++ b/gt-profile.yml @@ -21,13 +21,13 @@ Bag-Info: required: false default: 'text' Gt-Transcription-Normalization: - required: false - default: 'NFKC' + required: true values: - NFD - NFKD - NFC - NFKC + - non-normalized Gt-Color-Image-Extension: required: false default: '.color.png' From ab7b6e0bbf5ac41873ac2ed20c99da100eec8f3c Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 31 Jan 2019 09:20:05 +0100 Subject: [PATCH 03/17] clarify that all transcriptions must be unicode/utf8 --- gt-spec.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/gt-spec.md b/gt-spec.md index 207ba98..5d13e41 100644 --- a/gt-spec.md +++ b/gt-spec.md @@ -67,7 +67,12 @@ Name of the subfolder containing transcriptions if [`Gt-Directory-Structure`] is ### Gt-Transcription-Normalization -Unicode normalization level. One of `NFC`, `NFKC`, `NFD` or `NFKC`. Default: `NFKC`. +**Required** + +All transcriptions MUST be UTF-8 encoded Unicode. This property defines the +unicode normalization level. + +One of `NFC`, `NFKC`, `NFD`, `NFKD` or `non-normalized`. ![Illustration unicode normalization](http://unicode.org/reports/tr15/images/UAX15-NormFig6.jpg) From 00fa0aa47954aacf004636cc460387204a779158 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 31 Jan 2019 09:20:55 +0100 Subject: [PATCH 04/17] fix image extensions in gt-spec to fit gt-profile --- gt-spec.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gt-spec.md b/gt-spec.md index 5d13e41..1c8e439 100644 --- a/gt-spec.md +++ b/gt-spec.md @@ -78,7 +78,7 @@ One of `NFC`, `NFKC`, `NFD`, `NFKD` or `non-normalized`. 
### Gt-Grayscale-Image-Extension -Extension of the grayscale image files. Default: `.png`. +Extension of the grayscale image files. Default: `.nrm.png`. ### Gt-Grayscale-Image-Media-Type @@ -90,7 +90,7 @@ Name of the subfolder containing grayscale images if [`Gt-Directory-Structure`] ### Gt-Color-Image-Extension -Extension of the color image files. Default: `.png`. +Extension of the color image files. Default: `.color.png`. ### Gt-Color-Image-Media-Type @@ -102,7 +102,7 @@ Name of the subfolder containing color images if [`Gt-Directory-Structure`] is ` ### Gt-Bitonal-Image-Extension -Extension of the bitonal image files. Default: `.png`. +Extension of the bitonal image files. Default: `.bin.png`. ### Gt-Bitonal-Image-Media-Type From d0dcab03db10076aac5c94bd81ee1c48a8fab384 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kay-Michael=20W=C3=BCrzner?= Date: Thu, 31 Jan 2019 09:25:55 +0100 Subject: [PATCH 05/17] typos --- gt-spec.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gt-spec.md b/gt-spec.md index 1c8e439..8f2a0be 100644 --- a/gt-spec.md +++ b/gt-spec.md @@ -31,7 +31,7 @@ ## Rationale Recent OCR (optical character recognition) engines are not actually -character-based anymore but on neural networks that operate on lines. These +character-based anymore but use neural networks that operate on lines. These engines can be trained with images of text lines and their transcription ("ground truth"), plus engine-specific configurations. @@ -145,7 +145,7 @@ can be saved per line to preserve the provenance of every single line. Line metadata can be encoded in JSON or YAML (depending on [`Gt-Line-Metadata-Extension`] and [`Gt-Line-Metadata-Media-Type`]). 
-Line metadata mustt adhere to this JSON schema: +Line metadata MUST adhere to this JSON schema: ```yaml From b91c9a734315b3958616f03aa348a5e38ff2bcff Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 31 Jan 2019 09:28:29 +0100 Subject: [PATCH 06/17] forbid jpeg for bitonal line images --- gt-profile.json | 2 +- gt-profile.yml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/gt-profile.json b/gt-profile.json index 69e2f26..b19b4b4 100644 --- a/gt-profile.json +++ b/gt-profile.json @@ -1 +1 @@ -{"BagIt-Profile-Info":{"BagIt-Profile-Identifier":"https://ocr-d.github.io/gt-profile.json","BagIt-Profile-Version":"1.2.0","Source-Organization":"OCR-D","External-Description":"BagIt profile for OCR line Ground Truth","Contact-Name":"Konstantin Baierer","Contact-Email":"konstantin.baierer@sbb.spk-berlin.de","Version":0.1},"Bag-Info":{"Bagging-Date":{"required":false},"Source-Organization":{"required":false},"Gt-Transcription-Extension":{"required":false,"default":".gt.txt"},"Gt-Transcription-Media-Type":{"required":false,"default":"text/plain"},"Gt-Transcription-Directory":{"required":false,"default":"text"},"Gt-Transcription-Normalization":{"required":true,"values":["NFD","NFKD","NFC","NFKC","non-normalized"]},"Gt-Color-Image-Extension":{"required":false,"default":".color.png"},"Gt-Color-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Color-Image-Directory":{"required":false,"default":"img"},"Gt-Grayscale-Image-Extension":{"required":false,"default":".nrm.png"},"Gt-Grayscale-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Grayscale-Image-Directory":{"required":false,"default":"grayscale"},"Gt-Bitonal-Image-Extension":{"required":false,"default":".bin.png"},"Gt-Bitonal-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Bitonal-Image-Directory":{"required":false,
"default":"bin"},"Gt-Line-Metadata-Extension":{"required":false,"default":".json"},"Gt-Line-Metadata-Media-Type":{"required":false,"default":"application/json","values":["application/json","text/vnd.yaml"]},"Gt-Line-Metadata-Directory":{"required":false,"default":"meta"},"Gt-Directory":{"required":false,"default":"ground-truth"},"Gt-Directory-Structure":{"required":false,"default":"flat","values":["flat","flat-nested","subfolders","subfolders-nested"]}},"Manifests-Required":["sha512"],"Tag-Manifests-Required":[],"Tag-Files-Required":[],"Tag-Files-Allowed":["README.md"],"Allow-Fetch.txt":false,"Serialization":"allowed","Accept-Serialization":"application/zip","Accept-BagIt-Version":["1.0"]} \ No newline at end of file +{"BagIt-Profile-Info":{"BagIt-Profile-Identifier":"https://ocr-d.github.io/gt-profile.json","BagIt-Profile-Version":"1.2.0","Source-Organization":"OCR-D","External-Description":"BagIt profile for OCR line Ground Truth","Contact-Name":"Konstantin Baierer","Contact-Email":"konstantin.baierer@sbb.spk-berlin.de","Version":0.1},"Bag-Info":{"Bagging-Date":{"required":false},"Source-Organization":{"required":false},"Gt-Transcription-Extension":{"required":false,"default":".gt.txt"},"Gt-Transcription-Media-Type":{"required":false,"default":"text/plain"},"Gt-Transcription-Directory":{"required":false,"default":"text"},"Gt-Transcription-Normalization":{"required":true,"values":["NFD","NFKD","NFC","NFKC","non-normalized"]},"Gt-Color-Image-Extension":{"required":false,"default":".color.png"},"Gt-Color-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Color-Image-Directory":{"required":false,"default":"img"},"Gt-Grayscale-Image-Extension":{"required":false,"default":".nrm.png"},"Gt-Grayscale-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Grayscale-Image-Directory":{"required":false,"default":"grayscale"},"Gt-Bitonal-Image-Extension":{"requi
red":false,"default":".bin.png"},"Gt-Bitonal-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff"]},"Gt-Bitonal-Image-Directory":{"required":false,"default":"bin"},"Gt-Line-Metadata-Extension":{"required":false,"default":".json"},"Gt-Line-Metadata-Media-Type":{"required":false,"default":"application/json","values":["application/json","text/vnd.yaml"]},"Gt-Line-Metadata-Directory":{"required":false,"default":"meta"},"Gt-Directory":{"required":false,"default":"ground-truth"},"Gt-Directory-Structure":{"required":false,"default":"flat","values":["flat","flat-nested","subfolders","subfolders-nested"]}},"Manifests-Required":["sha512"],"Tag-Manifests-Required":[],"Tag-Files-Required":[],"Tag-Files-Allowed":["README.md"],"Allow-Fetch.txt":false,"Serialization":"allowed","Accept-Serialization":"application/zip","Accept-BagIt-Version":["1.0"]} \ No newline at end of file diff --git a/gt-profile.yml b/gt-profile.yml index b0e3dc5..c843e38 100644 --- a/gt-profile.yml +++ b/gt-profile.yml @@ -63,7 +63,6 @@ Bag-Info: values: - 'image/png' - 'image/tiff' - - 'image/jpeg' Gt-Bitonal-Image-Directory: required: false default: 'bin' From 9300b2504b59031e3bccda8dea3eecda61629379 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 31 Jan 2019 09:34:30 +0100 Subject: [PATCH 07/17] preliminary media type for tesseract >= 4 models --- training-schema.json | 2 +- training-schema.yml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/training-schema.json b/training-schema.json index b51c711..487b6d2 100644 --- a/training-schema.json +++ b/training-schema.json @@ -1 +1 @@ -{"$id":"https://ocr-d.github.io/schemas/v1/training-schema.json","type":"object","required":["engineName","engineVersion","groundTruthBag","outputModelFormat"],"properties":{"engineName":{"type":"string","enum":["ocropus","kraken","tesseract","calamari"]},"engineVersion":{"type":"string"},"engineArguments":{"description":"Command line arguments passed to the 
CLI training tool","type":"array","default":[]},"groundTruthBag":{"description":"A bag of line ground truth adhering to https://ocr-d.github.io/gt-profile.json","type":"string"},"groundTruthGlob":{"description":"Wildcard for matching only a subset of the ground truth files. Make sure to exclude extensions and end in '*'.","type":"string","default":"*"},"outputModelFormat":{"description":"The output format of the model. Note that individual engines only support a single one or a subset of formats.","enum":["application/vnd.ocrd.pronn","application/vnd.ocrd.clstm","application/vnd.ocrd.coreml","application/vnd.ocrd.pyrnn","application/vnd.ocrd.tf+zip"]},"evalRatio":{"description":"Ratio of evaluation vs. training data to divide up ground truth","type":"number","default":0.9},"randomSeed":{"description":"Seed for the random number generator shuffling the ground truth before dividing it into evaluation vs. training data","type":"integer","default":0}}} \ No newline at end of file +{"$id":"https://ocr-d.github.io/schemas/v1/training-schema.json","type":"object","required":["engineName","engineVersion","groundTruthBag","outputModelFormat"],"properties":{"engineName":{"type":"string","enum":["ocropus","kraken","tesseract","calamari"]},"engineVersion":{"type":"string"},"engineArguments":{"description":"Command line arguments passed to the CLI training tool","type":"array","default":[]},"groundTruthBag":{"description":"A bag of line ground truth adhering to https://ocr-d.github.io/gt-profile.json","type":"string"},"groundTruthGlob":{"description":"Wildcard for matching only a subset of the ground truth files. Make sure to exclude extensions and end in '*'.","type":"string","default":"*"},"outputModelFormat":{"description":"The output format of the model. 
Note that individual engines only support a single one or a subset of formats.","enum":["application/vnd.ocrd.pronn","application/vnd.ocrd.clstm","application/vnd.ocrd.coreml","application/vnd.ocrd.pyrnn","application/vnd.ocrd.tf+zip","application/vnd.ocrd.tesseract4"]},"evalRatio":{"description":"Ratio of evaluation vs. training data to divide up ground truth","type":"number","default":0.9},"randomSeed":{"description":"Seed for the random number generator shuffling the ground truth before dividing it into evaluation vs. training data","type":"integer","default":0}}} \ No newline at end of file diff --git a/training-schema.yml b/training-schema.yml index 69382bb..ecff0e2 100644 --- a/training-schema.yml +++ b/training-schema.yml @@ -34,6 +34,7 @@ properties: - application/vnd.ocrd.coreml # kraken >= 1.0 - application/vnd.ocrd.pyrnn # ocropy-rpred - application/vnd.ocrd.tf+zip # calamari, zipped tensorflow data + - application/vnd.ocrd.tesseract4 # tesseract >= 4.0.beta1 evalRatio: description: Ratio of evaluation vs. 
training data to divide up ground truth type: number From 2ae7d5708718fbc4a5abef21872c8528aed20e13 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 31 Jan 2019 10:52:19 +0100 Subject: [PATCH 08/17] single-line metadata: add teiUrl --- gt-spec.md | 3 +++ single-line.json | 2 +- single-line.yml | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/gt-spec.md b/gt-spec.md index 8f2a0be..6033551 100644 --- a/gt-spec.md +++ b/gt-spec.md @@ -178,6 +178,9 @@ properties: lineId: description: ID of the line within the PAGE-XML doc type: string + teiUrl: + description: URL of the TEI document that contains the page + type: string xpath: description: XPath to the line if no `fileId` was provided type: string diff --git a/single-line.json b/single-line.json index 6183440..9868536 100644 --- a/single-line.json +++ b/single-line.json @@ -1 +1 @@ -{"description":"Schema for provenance of single lines","type":"object","required":["coords","imageUrl"],"properties":{"coords":{"description":"Coordinates as array of x-y-pairs","type":"array","items":{"type":"array","length":2,"items":{"type":"number"}}},"pageUrl":{"description":"URL of the page (resp. URL the PAGE-XML file)","type":"string"},"imageUrl":{"description":"URL of the image (resp. 
the `pg:imageFilename` in the PAGE-XML file)","type":"string"},"bagUrl":{"description":"URL of the bag that contains the page","type":"string"},"metsUrl":{"description":"URL of the METS document that contains the page","type":"string"},"lineId":{"description":"ID of the line within the PAGE-XML doc","type":"string"},"xpath":{"description":"XPath to the line if no `fileId` was provided","type":"string"}}} \ No newline at end of file +{"description":"Schema for provenance of single lines","type":"object","required":["coords","imageUrl"],"properties":{"coords":{"description":"Coordinates as array of x-y-pairs","type":"array","items":{"type":"array","length":2,"items":{"type":"number"}}},"pageUrl":{"description":"URL of the page (resp. URL the PAGE-XML file)","type":"string"},"imageUrl":{"description":"URL of the image (resp. the `pg:imageFilename` in the PAGE-XML file)","type":"string"},"bagUrl":{"description":"URL of the bag that contains the page","type":"string"},"metsUrl":{"description":"URL of the METS document that contains the page","type":"string"},"lineId":{"description":"ID of the line within the PAGE-XML doc","type":"string"},"teiUrl":{"description":"URL of the TEI document that contains the page","type":"string"},"xpath":{"description":"XPath to the line if no `fileId` was provided","type":"string"}}} \ No newline at end of file diff --git a/single-line.yml b/single-line.yml index 930e8ac..2339bd0 100644 --- a/single-line.yml +++ b/single-line.yml @@ -27,6 +27,9 @@ properties: lineId: description: ID of the line within the PAGE-XML doc type: string + teiUrl: + description: URL of the TEI document that contains the page + type: string xpath: description: XPath to the line if no `fileId` was provided type: string From b3485923f56d00376e4b5aa070bf2efad4e1490f Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 31 Jan 2019 11:27:24 +0100 Subject: [PATCH 09/17] make coords in single-line schema optional --- gt-spec.md | 1 - single-line.json | 2 +- 
single-line.yml | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/gt-spec.md b/gt-spec.md index 6033551..cc85730 100644 --- a/gt-spec.md +++ b/gt-spec.md @@ -152,7 +152,6 @@ Line metadata MUST adhere to this JSON schema: description: Schema for provenance of single lines type: object required: - - coords - imageUrl properties: coords: diff --git a/single-line.json b/single-line.json index 9868536..503683a 100644 --- a/single-line.json +++ b/single-line.json @@ -1 +1 @@ -{"description":"Schema for provenance of single lines","type":"object","required":["coords","imageUrl"],"properties":{"coords":{"description":"Coordinates as array of x-y-pairs","type":"array","items":{"type":"array","length":2,"items":{"type":"number"}}},"pageUrl":{"description":"URL of the page (resp. URL the PAGE-XML file)","type":"string"},"imageUrl":{"description":"URL of the image (resp. the `pg:imageFilename` in the PAGE-XML file)","type":"string"},"bagUrl":{"description":"URL of the bag that contains the page","type":"string"},"metsUrl":{"description":"URL of the METS document that contains the page","type":"string"},"lineId":{"description":"ID of the line within the PAGE-XML doc","type":"string"},"teiUrl":{"description":"URL of the TEI document that contains the page","type":"string"},"xpath":{"description":"XPath to the line if no `fileId` was provided","type":"string"}}} \ No newline at end of file +{"description":"Schema for provenance of single lines","type":"object","required":["imageUrl"],"properties":{"coords":{"description":"Coordinates as array of x-y-pairs","type":"array","items":{"type":"array","length":2,"items":{"type":"number"}}},"pageUrl":{"description":"URL of the page (resp. URL the PAGE-XML file)","type":"string"},"imageUrl":{"description":"URL of the image (resp. 
the `pg:imageFilename` in the PAGE-XML file)","type":"string"},"bagUrl":{"description":"URL of the bag that contains the page","type":"string"},"metsUrl":{"description":"URL of the METS document that contains the page","type":"string"},"lineId":{"description":"ID of the line within the PAGE-XML doc","type":"string"},"teiUrl":{"description":"URL of the TEI document that contains the page","type":"string"},"xpath":{"description":"XPath to the line if no `fileId` was provided","type":"string"}}} \ No newline at end of file diff --git a/single-line.yml b/single-line.yml index 2339bd0..f5a8712 100644 --- a/single-line.yml +++ b/single-line.yml @@ -1,7 +1,6 @@ description: Schema for provenance of single lines type: object required: - - coords - imageUrl properties: coords: From 5b3d6df8a2f1aa92077f625275775f61d9a4a4d8 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 31 Jan 2019 11:55:57 +0100 Subject: [PATCH 10/17] allow build.sh files in bags --- gt-profile.json | 2 +- gt-profile.yml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/gt-profile.json b/gt-profile.json index b19b4b4..4667227 100644 --- a/gt-profile.json +++ b/gt-profile.json @@ -1 +1 @@ -{"BagIt-Profile-Info":{"BagIt-Profile-Identifier":"https://ocr-d.github.io/gt-profile.json","BagIt-Profile-Version":"1.2.0","Source-Organization":"OCR-D","External-Description":"BagIt profile for OCR line Ground Truth","Contact-Name":"Konstantin 
Baierer","Contact-Email":"konstantin.baierer@sbb.spk-berlin.de","Version":0.1},"Bag-Info":{"Bagging-Date":{"required":false},"Source-Organization":{"required":false},"Gt-Transcription-Extension":{"required":false,"default":".gt.txt"},"Gt-Transcription-Media-Type":{"required":false,"default":"text/plain"},"Gt-Transcription-Directory":{"required":false,"default":"text"},"Gt-Transcription-Normalization":{"required":true,"values":["NFD","NFKD","NFC","NFKC","non-normalized"]},"Gt-Color-Image-Extension":{"required":false,"default":".color.png"},"Gt-Color-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Color-Image-Directory":{"required":false,"default":"img"},"Gt-Grayscale-Image-Extension":{"required":false,"default":".nrm.png"},"Gt-Grayscale-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Grayscale-Image-Directory":{"required":false,"default":"grayscale"},"Gt-Bitonal-Image-Extension":{"required":false,"default":".bin.png"},"Gt-Bitonal-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff"]},"Gt-Bitonal-Image-Directory":{"required":false,"default":"bin"},"Gt-Line-Metadata-Extension":{"required":false,"default":".json"},"Gt-Line-Metadata-Media-Type":{"required":false,"default":"application/json","values":["application/json","text/vnd.yaml"]},"Gt-Line-Metadata-Directory":{"required":false,"default":"meta"},"Gt-Directory":{"required":false,"default":"ground-truth"},"Gt-Directory-Structure":{"required":false,"default":"flat","values":["flat","flat-nested","subfolders","subfolders-nested"]}},"Manifests-Required":["sha512"],"Tag-Manifests-Required":[],"Tag-Files-Required":[],"Tag-Files-Allowed":["README.md"],"Allow-Fetch.txt":false,"Serialization":"allowed","Accept-Serialization":"application/zip","Accept-BagIt-Version":["1.0"]} \ No newline at end of file 
+{"BagIt-Profile-Info":{"BagIt-Profile-Identifier":"https://ocr-d.github.io/gt-profile.json","BagIt-Profile-Version":"1.2.0","Source-Organization":"OCR-D","External-Description":"BagIt profile for OCR line Ground Truth","Contact-Name":"Konstantin Baierer","Contact-Email":"konstantin.baierer@sbb.spk-berlin.de","Version":0.1},"Bag-Info":{"Bagging-Date":{"required":false},"Source-Organization":{"required":false},"Gt-Transcription-Extension":{"required":false,"default":".gt.txt"},"Gt-Transcription-Media-Type":{"required":false,"default":"text/plain"},"Gt-Transcription-Directory":{"required":false,"default":"text"},"Gt-Transcription-Normalization":{"required":true,"values":["NFD","NFKD","NFC","NFKC","non-normalized"]},"Gt-Color-Image-Extension":{"required":false,"default":".color.png"},"Gt-Color-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Color-Image-Directory":{"required":false,"default":"img"},"Gt-Grayscale-Image-Extension":{"required":false,"default":".nrm.png"},"Gt-Grayscale-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Grayscale-Image-Directory":{"required":false,"default":"grayscale"},"Gt-Bitonal-Image-Extension":{"required":false,"default":".bin.png"},"Gt-Bitonal-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff"]},"Gt-Bitonal-Image-Directory":{"required":false,"default":"bin"},"Gt-Line-Metadata-Extension":{"required":false,"default":".json"},"Gt-Line-Metadata-Media-Type":{"required":false,"default":"application/json","values":["application/json","text/vnd.yaml"]},"Gt-Line-Metadata-Directory":{"required":false,"default":"meta"},"Gt-Directory":{"required":false,"default":"ground-truth"},"Gt-Directory-Structure":{"required":false,"default":"flat","values":["flat","flat-nested","subfolders","subfolders-nested"]}},"Manifests-Required":["sha512"],"Tag-Manifests-Required":[],"Tag-Files-Required":[
],"Tag-Files-Allowed":["README.md","build.sh"],"Allow-Fetch.txt":false,"Serialization":"allowed","Accept-Serialization":"application/zip","Accept-BagIt-Version":["1.0"]} \ No newline at end of file diff --git a/gt-profile.yml b/gt-profile.yml index c843e38..f9830c6 100644 --- a/gt-profile.yml +++ b/gt-profile.yml @@ -98,6 +98,7 @@ Tag-Manifests-Required: [] Tag-Files-Required: [] Tag-Files-Allowed: - README.md + - build.sh Allow-Fetch.txt: false Serialization: allowed Accept-Serialization: application/zip From c8bdc85b85ab6eef25a3bf49cfb20871c7d2792d Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 4 Feb 2019 13:04:52 +0100 Subject: [PATCH 11/17] recognition schema for evaluating training progress --- model-validation-schema.json | 1 + model-validation-schema.yml | 39 ++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 model-validation-schema.json create mode 100644 model-validation-schema.yml diff --git a/model-validation-schema.json b/model-validation-schema.json new file mode 100644 index 0000000..e96dc17 --- /dev/null +++ b/model-validation-schema.json @@ -0,0 +1 @@ +{"$id":"https://ocr-d.github.io/schemas/v1/recognition-schema.json","type":"object","required":["engineName","engineVersion","groundTruthBag","model","outputFormat"],"properties":{"engineName":{"type":"string","enum":["ocropus","kraken","tesseract","calamari"]},"engineVersion":{"type":"string"},"engineArguments":{"description":"Command line arguments passed to the CLI recognition tool","type":"array","default":[]},"groundTruthBag":{"description":"A bag of line ground truth adhering to https://ocr-d.github.io/gt-profile.json","type":"string"},"model":{"description":"URL/path to model to use","type":"string"},"segmentation":{"description":"URL/path to line segmentation JSON"},"outputFormat":{"description":"Output 
format","type":"string","default":"text/plain","enum":["text/plain","text/vnd.hocr+html","application/vnd.prima.page+xml","application/alto+xml"]}}} \ No newline at end of file diff --git a/model-validation-schema.yml b/model-validation-schema.yml new file mode 100644 index 0000000..ebda840 --- /dev/null +++ b/model-validation-schema.yml @@ -0,0 +1,39 @@ +$id: https://ocr-d.github.io/schemas/v1/recognition-schema.json +type: object +required: + - engineName + - engineVersion + - groundTruthBag + - model + - outputFormat +properties: + engineName: + type: string + enum: + - ocropus + - kraken + - tesseract + - calamari + engineVersion: + type: string + engineArguments: + description: Command line arguments passed to the CLI recognition tool + type: array + default: [] + groundTruthBag: + description: A bag of line ground truth adhering to https://ocr-d.github.io/gt-profile.json + type: string + model: + description: URL/path to model to use + type: string + segmentation: + description: URL/path to line segmentation JSON + outputFormat: + description: Output format + type: string + default: text/plain + enum: + - text/plain + - text/vnd.hocr+html + - application/vnd.prima.page+xml + - application/alto+xml From b706d82bcb39be8c789ef528df5d527886bd166a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 5 Feb 2019 10:51:47 +0100 Subject: [PATCH 12/17] model-validation-schema: measures --- model-validation-schema.json | 2 +- model-validation-schema.yml | 23 ++++++++++++----------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/model-validation-schema.json b/model-validation-schema.json index e96dc17..36247f1 100644 --- a/model-validation-schema.json +++ b/model-validation-schema.json @@ -1 +1 @@ 
-{"$id":"https://ocr-d.github.io/schemas/v1/recognition-schema.json","type":"object","required":["engineName","engineVersion","groundTruthBag","model","outputFormat"],"properties":{"engineName":{"type":"string","enum":["ocropus","kraken","tesseract","calamari"]},"engineVersion":{"type":"string"},"engineArguments":{"description":"Command line arguments passed to the CLI recognition tool","type":"array","default":[]},"groundTruthBag":{"description":"A bag of line ground truth adhering to https://ocr-d.github.io/gt-profile.json","type":"string"},"model":{"description":"URL/path to model to use","type":"string"},"segmentation":{"description":"URL/path to line segmentation JSON"},"outputFormat":{"description":"Output format","type":"string","default":"text/plain","enum":["text/plain","text/vnd.hocr+html","application/vnd.prima.page+xml","application/alto+xml"]}}} \ No newline at end of file +{"$id":"https://ocr-d.github.io/schemas/v1/recognition-schema.json","type":"object","required":["engineName","engineVersion","groundTruthBag","model","outputFormat"],"properties":{"engineName":{"type":"string","enum":["ocropus","kraken","tesseract","calamari"]},"engineVersion":{"type":"string"},"engineArguments":{"description":"Command line arguments passed to the CLI recognition tool","type":"array","default":[]},"groundTruthBag":{"description":"A bag of line ground truth adhering to https://ocr-d.github.io/gt-profile.json","type":"string"},"model":{"description":"URL/path to model to use","type":"string"},"measures":{"description":"which evaluation measures to produce","type":"array","items":{"type":"string","enum":["cer-per-line","cer-total","ler","wer-per-line","wer-total","confusion-matrix"]}}}} \ No newline at end of file diff --git a/model-validation-schema.yml b/model-validation-schema.yml index ebda840..94d05d3 100644 --- a/model-validation-schema.yml +++ b/model-validation-schema.yml @@ -26,14 +26,15 @@ properties: model: description: URL/path to model to use type: string 
- segmentation: - description: URL/path to line segmentation JSON - outputFormat: - description: Output format - type: string - default: text/plain - enum: - - text/plain - - text/vnd.hocr+html - - application/vnd.prima.page+xml - - application/alto+xml + measures: + description: which evaluation measures to produce + type: array + items: + type: string + enum: + - cer-per-line + - cer-total + - ler + - wer-per-line + - wer-total + - confusion-matrix From c9aeb4a9efa99fcc3e84615c6961bb5baa994a22 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 5 Feb 2019 11:50:04 +0100 Subject: [PATCH 13/17] rename model-validation -> model-evaluation --- model-validation-schema.json => model-evaluation-schema.json | 0 model-validation-schema.yml => model-evaluation-schema.yml | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename model-validation-schema.json => model-evaluation-schema.json (100%) rename model-validation-schema.yml => model-evaluation-schema.yml (100%) diff --git a/model-validation-schema.json b/model-evaluation-schema.json similarity index 100% rename from model-validation-schema.json rename to model-evaluation-schema.json diff --git a/model-validation-schema.yml b/model-evaluation-schema.yml similarity index 100% rename from model-validation-schema.yml rename to model-evaluation-schema.yml From c38a7a6801b0a3a4102fbc57d4602bc84309f16f Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 5 Feb 2019 12:45:52 +0100 Subject: [PATCH 14/17] Gt-Prediction-* for linegt profile --- gt-profile.json | 2 +- gt-profile.yml | 9 +++++++++ gt-spec.md | 15 +++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/gt-profile.json b/gt-profile.json index 4667227..004e732 100644 --- a/gt-profile.json +++ b/gt-profile.json @@ -1 +1 @@ -{"BagIt-Profile-Info":{"BagIt-Profile-Identifier":"https://ocr-d.github.io/gt-profile.json","BagIt-Profile-Version":"1.2.0","Source-Organization":"OCR-D","External-Description":"BagIt profile for OCR line 
Ground Truth","Contact-Name":"Konstantin Baierer","Contact-Email":"konstantin.baierer@sbb.spk-berlin.de","Version":0.1},"Bag-Info":{"Bagging-Date":{"required":false},"Source-Organization":{"required":false},"Gt-Transcription-Extension":{"required":false,"default":".gt.txt"},"Gt-Transcription-Media-Type":{"required":false,"default":"text/plain"},"Gt-Transcription-Directory":{"required":false,"default":"text"},"Gt-Transcription-Normalization":{"required":true,"values":["NFD","NFKD","NFC","NFKC","non-normalized"]},"Gt-Color-Image-Extension":{"required":false,"default":".color.png"},"Gt-Color-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Color-Image-Directory":{"required":false,"default":"img"},"Gt-Grayscale-Image-Extension":{"required":false,"default":".nrm.png"},"Gt-Grayscale-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Grayscale-Image-Directory":{"required":false,"default":"grayscale"},"Gt-Bitonal-Image-Extension":{"required":false,"default":".bin.png"},"Gt-Bitonal-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff"]},"Gt-Bitonal-Image-Directory":{"required":false,"default":"bin"},"Gt-Line-Metadata-Extension":{"required":false,"default":".json"},"Gt-Line-Metadata-Media-Type":{"required":false,"default":"application/json","values":["application/json","text/vnd.yaml"]},"Gt-Line-Metadata-Directory":{"required":false,"default":"meta"},"Gt-Directory":{"required":false,"default":"ground-truth"},"Gt-Directory-Structure":{"required":false,"default":"flat","values":["flat","flat-nested","subfolders","subfolders-nested"]}},"Manifests-Required":["sha512"],"Tag-Manifests-Required":[],"Tag-Files-Required":[],"Tag-Files-Allowed":["README.md","build.sh"],"Allow-Fetch.txt":false,"Serialization":"allowed","Accept-Serialization":"application/zip","Accept-BagIt-Version":["1.0"]} \ No newline at end of file 
+{"BagIt-Profile-Info":{"BagIt-Profile-Identifier":"https://ocr-d.github.io/gt-profile.json","BagIt-Profile-Version":"1.2.0","Source-Organization":"OCR-D","External-Description":"BagIt profile for OCR line Ground Truth","Contact-Name":"Konstantin Baierer","Contact-Email":"konstantin.baierer@sbb.spk-berlin.de","Version":0.1},"Bag-Info":{"Bagging-Date":{"required":false},"Source-Organization":{"required":false},"Gt-Transcription-Extension":{"required":false,"default":".gt.txt"},"Gt-Transcription-Media-Type":{"required":false,"default":"text/plain"},"Gt-Prediction-Directory":{"required":false,"default":"pred"},"Gt-Prediction-Extension":{"required":false,"default":".pred.txt"},"Gt-Prediction-Media-Type":{"required":false,"default":"text/plain"},"Gt-Transcription-Directory":{"required":false,"default":"text"},"Gt-Transcription-Normalization":{"required":true,"values":["NFD","NFKD","NFC","NFKC","non-normalized"]},"Gt-Color-Image-Extension":{"required":false,"default":".color.png"},"Gt-Color-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Color-Image-Directory":{"required":false,"default":"img"},"Gt-Grayscale-Image-Extension":{"required":false,"default":".nrm.png"},"Gt-Grayscale-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff","image/jpeg"]},"Gt-Grayscale-Image-Directory":{"required":false,"default":"grayscale"},"Gt-Bitonal-Image-Extension":{"required":false,"default":".bin.png"},"Gt-Bitonal-Image-Media-Type":{"required":false,"default":"image/png","values":["image/png","image/tiff"]},"Gt-Bitonal-Image-Directory":{"required":false,"default":"bin"},"Gt-Line-Metadata-Extension":{"required":false,"default":".json"},"Gt-Line-Metadata-Media-Type":{"required":false,"default":"application/json","values":["application/json","text/vnd.yaml"]},"Gt-Line-Metadata-Directory":{"required":false,"default":"meta"},"Gt-Directory":{"required":false,"default":"ground-truth"},"Gt-Directo
ry-Structure":{"required":false,"default":"flat","values":["flat","flat-nested","subfolders","subfolders-nested"]}},"Manifests-Required":["sha512"],"Tag-Manifests-Required":[],"Tag-Files-Required":[],"Tag-Files-Allowed":["README.md","build.sh"],"Allow-Fetch.txt":false,"Serialization":"allowed","Accept-Serialization":"application/zip","Accept-BagIt-Version":["1.0"]} \ No newline at end of file diff --git a/gt-profile.yml b/gt-profile.yml index f9830c6..4457e5a 100644 --- a/gt-profile.yml +++ b/gt-profile.yml @@ -17,6 +17,15 @@ Bag-Info: Gt-Transcription-Media-Type: required: false default: 'text/plain' + Gt-Prediction-Directory: + required: false + default: 'pred' + Gt-Prediction-Extension: + required: false + default: '.pred.txt' + Gt-Prediction-Media-Type: + required: false + default: 'text/plain' Gt-Transcription-Directory: required: false default: 'text' diff --git a/gt-spec.md b/gt-spec.md index cc85730..d718b41 100644 --- a/gt-spec.md +++ b/gt-spec.md @@ -10,6 +10,9 @@ * [Gt-Transcription-Media-Type](#gt-transcription-media-type) * [Gt-Transcription-Directory](#gt-transcription-directory) * [Gt-Transcription-Normalization](#gt-transcription-normalization) + * [Gt-Prediction-Extension](#gt-prediction-extension) + * [Gt-Prediction-Media-Type](#gt-prediction-media-type) + * [Gt-Prediction-Directory](#gt-prediction-directory) * [Gt-Grayscale-Image-Extension](#gt-grayscale-image-extension) * [Gt-Grayscale-Image-Media-Type](#gt-grayscale-image-media-type) * [Gt-Grayscale-Image-Directory](#gt-grayscale-image-directory) @@ -76,6 +79,18 @@ One of `NFC`, `NFKC`, `NFD` or `NFKC` or `non-normalized`. ![Illustration unicode normalization](http://unicode.org/reports/tr15/images/UAX15-NormFig6.jpg) +### Gt-Prediction-Extension + +Extension of the prediction files. Used for evaluation. Default: `.pred.txt`. + +### Gt-Prediction-Media-Type + +Media type of the prediction files. Default: `text/plain`. 
+ +### Gt-Prediction-Directory + +Name of the subfolder containing predictions if [`Gt-Directory-Structure`] is `subfolders` or `subfolders-nested`. Default: `pred`. + ### Gt-Grayscale-Image-Extension Extension of the grayscale image files. Default: `.nrm.png`. From 86761a6c3b3e79cf5e2eaeb80e3fcea61c47ac17 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 5 Feb 2019 13:01:24 +0100 Subject: [PATCH 15/17] differentiate trainerArgs and recognizerArgs --- training-schema.json | 2 +- training-schema.yml | 20 +++++++++++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/training-schema.json b/training-schema.json index 487b6d2..0536d25 100644 --- a/training-schema.json +++ b/training-schema.json @@ -1 +1 @@ -{"$id":"https://ocr-d.github.io/schemas/v1/training-schema.json","type":"object","required":["engineName","engineVersion","groundTruthBag","outputModelFormat"],"properties":{"engineName":{"type":"string","enum":["ocropus","kraken","tesseract","calamari"]},"engineVersion":{"type":"string"},"engineArguments":{"description":"Command line arguments passed to the CLI training tool","type":"array","default":[]},"groundTruthBag":{"description":"A bag of line ground truth adhering to https://ocr-d.github.io/gt-profile.json","type":"string"},"groundTruthGlob":{"description":"Wildcard for matching only a subset of the ground truth files. Make sure to exclude extensions and end in '*'.","type":"string","default":"*"},"outputModelFormat":{"description":"The output format of the model. Note that individual engines only support a single one or a subset of formats.","enum":["application/vnd.ocrd.pronn","application/vnd.ocrd.clstm","application/vnd.ocrd.coreml","application/vnd.ocrd.pyrnn","application/vnd.ocrd.tf+zip","application/vnd.ocrd.tesseract4"]},"evalRatio":{"description":"Ratio of evaluation vs. 
training data to divide up ground truth","type":"number","default":0.9},"randomSeed":{"description":"Seed for the random number generator shuffling the ground truth before dividing it into evaluation vs. training data","type":"integer","default":0}}} \ No newline at end of file +{"$id":"https://ocr-d.github.io/schemas/v1/training-schema.json","type":"object","required":["engineName","engineVersion","groundTruthBag","outputModelFormat"],"properties":{"engineName":{"type":"string","enum":["ocropus","kraken","tesseract","calamari"]},"engineVersion":{"type":"string"},"trainerArgs":{"description":"Command line arguments passed to the CLI training tool","type":"array","default":[]},"recognizerArgs":{"description":"Command line arguments passed to the CLI recognition tool","type":"array","default":[]},"groundTruthBag":{"description":"A bag of line ground truth adhering to https://ocr-d.github.io/gt-profile.json","type":"string"},"groundTruthGlob":{"description":"Wildcard for matching only a subset of the ground truth files for training. Make sure to exclude extensions and end in '*'.","type":"string","default":"*"},"outputModelFormat":{"description":"The output format of the model. Note that individual engines only support a single one or a subset of formats.","enum":["application/vnd.ocrd.pronn","application/vnd.ocrd.clstm","application/vnd.ocrd.coreml","application/vnd.ocrd.pyrnn","application/vnd.ocrd.tf+zip","application/vnd.ocrd.tesseract4"]},"evalRatio":{"description":"Ratio of evaluation vs. training data to divide up ground truth","type":"number","default":0.9},"randomSeed":{"description":"Seed for the random number generator shuffling the ground truth before dividing it into evaluation vs. 
training data","type":"integer","default":0}}} \ No newline at end of file diff --git a/training-schema.yml b/training-schema.yml index ecff0e2..5ff7b57 100644 --- a/training-schema.yml +++ b/training-schema.yml @@ -15,17 +15,31 @@ properties: - calamari engineVersion: type: string - engineArguments: + trainerArgs: description: Command line arguments passed to the CLI training tool type: array default: [] + recognizerArgs: + description: Command line arguments passed to the CLI recognition tool + type: array + default: [] groundTruthBag: description: A bag of line ground truth adhering to https://ocr-d.github.io/gt-profile.json type: string groundTruthGlob: - description: Wildcard for matching only a subset of the ground truth files. Make sure to exclude extensions and end in '*'. - type: string + description: Wildcard for matching only a subset of the ground truth files for training. Make sure to exclude extensions and end in '*'. + type: 'string' default: '*' + # type: array + # default: ['*'] + # items: + # type: string + # validationGlob: + # description: Wildcard for matching only a subset of the ground truth files for validation. Make sure to exclude extensions and end in '*'. + # type: array + # default: ['*'] + # items: + # type: string outputModelFormat: description: The output format of the model. Note that individual engines only support a single one or a subset of formats. 
enum: From 6a9e00b7b85cd2d811a0f3607057cc4d1ace790f Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 5 Feb 2019 13:03:52 +0100 Subject: [PATCH 16/17] groundTruthGlob -> trainingGlob, evaluation -> validation, +evaluationGlob --- training-schema.json | 2 +- training-schema.yml | 22 ++++++++++++---------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/training-schema.json b/training-schema.json index 0536d25..a93e497 100644 --- a/training-schema.json +++ b/training-schema.json @@ -1 +1 @@ -{"$id":"https://ocr-d.github.io/schemas/v1/training-schema.json","type":"object","required":["engineName","engineVersion","groundTruthBag","outputModelFormat"],"properties":{"engineName":{"type":"string","enum":["ocropus","kraken","tesseract","calamari"]},"engineVersion":{"type":"string"},"trainerArgs":{"description":"Command line arguments passed to the CLI training tool","type":"array","default":[]},"recognizerArgs":{"description":"Command line arguments passed to the CLI recognition tool","type":"array","default":[]},"groundTruthBag":{"description":"A bag of line ground truth adhering to https://ocr-d.github.io/gt-profile.json","type":"string"},"groundTruthGlob":{"description":"Wildcard for matching only a subset of the ground truth files for training. Make sure to exclude extensions and end in '*'.","type":"string","default":"*"},"outputModelFormat":{"description":"The output format of the model. Note that individual engines only support a single one or a subset of formats.","enum":["application/vnd.ocrd.pronn","application/vnd.ocrd.clstm","application/vnd.ocrd.coreml","application/vnd.ocrd.pyrnn","application/vnd.ocrd.tf+zip","application/vnd.ocrd.tesseract4"]},"evalRatio":{"description":"Ratio of evaluation vs. training data to divide up ground truth","type":"number","default":0.9},"randomSeed":{"description":"Seed for the random number generator shuffling the ground truth before dividing it into evaluation vs. 
training data","type":"integer","default":0}}} \ No newline at end of file +{"$id":"https://ocr-d.github.io/schemas/v1/training-schema.json","type":"object","required":["engineName","engineVersion","groundTruthBag","outputModelFormat"],"properties":{"engineName":{"type":"string","enum":["ocropus","kraken","tesseract","calamari"]},"engineVersion":{"type":"string"},"trainerArgs":{"description":"Command line arguments passed to the CLI training tool","type":"array","default":[]},"recognizerArgs":{"description":"Command line arguments passed to the CLI recognition tool","type":"array","default":[]},"groundTruthBag":{"description":"A bag of line ground truth adhering to https://ocr-d.github.io/gt-profile.json","type":"string"},"trainingGlob":{"description":"Wildcard for matching only a subset of the ground truth files for training. Make sure to exclude extensions and end in '*'.","type":"string","default":"*"},"evaluationGlob":{"description":"Wildcard for matching only a subset of the ground truth files for evaluation. Make sure to exclude extensions and end in '*'.","type":"string","default":"*"},"outputModelFormat":{"description":"The output format of the model. Note that individual engines only support a single one or a subset of formats.","enum":["application/vnd.ocrd.pronn","application/vnd.ocrd.clstm","application/vnd.ocrd.coreml","application/vnd.ocrd.pyrnn","application/vnd.ocrd.tf+zip","application/vnd.ocrd.tesseract4"]},"validationRatio":{"description":"Ratio of training vs. validation data to divide up ground truth","type":"number","default":0.9},"randomSeed":{"description":"Seed for the random number generator shuffling the ground truth before dividing it into training vs. 
validation data","type":"integer","default":0}}} \ No newline at end of file diff --git a/training-schema.yml b/training-schema.yml index 5ff7b57..490e5ca 100644 --- a/training-schema.yml +++ b/training-schema.yml @@ -26,7 +26,7 @@ properties: groundTruthBag: description: A bag of line ground truth adhering to https://ocr-d.github.io/gt-profile.json type: string - groundTruthGlob: + trainingGlob: description: Wildcard for matching only a subset of the ground truth files for training. Make sure to exclude extensions and end in '*'. type: 'string' default: '*' @@ -34,12 +34,14 @@ properties: # default: ['*'] # items: # type: string - # validationGlob: - # description: Wildcard for matching only a subset of the ground truth files for validation. Make sure to exclude extensions and end in '*'. - # type: array - # default: ['*'] - # items: - # type: string + evaluationGlob: + description: Wildcard for matching only a subset of the ground truth files for evaluation. Make sure to exclude extensions and end in '*'. + type: 'string' + default: '*' + # type: array + # default: ['*'] + # items: + # type: string outputModelFormat: description: The output format of the model. Note that individual engines only support a single one or a subset of formats. enum: @@ -49,11 +51,11 @@ properties: - application/vnd.ocrd.pyrnn # ocropy-rpred - application/vnd.ocrd.tf+zip # calamari, zipped tensorflow data - application/vnd.ocrd.tesseract4 # tesseract >= 4.0.beta1 - evalRatio: - description: Ratio of evaluation vs. training data to divide up ground truth + validationRatio: + description: Ratio of training vs. validation data to divide up ground truth type: number default: 0.9 randomSeed: - description: Seed for the random number generator shuffling the ground truth before dividing it into evaluation vs. training data + description: Seed for the random number generator shuffling the ground truth before dividing it into training vs. 
validation data type: integer default: 0 From 6827085d051e945062203b82ef921e54025cfbda Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 19 Feb 2019 15:18:19 +0100 Subject: [PATCH 17/17] engineArguments -> recognizerArguments --- model-evaluation-schema.json | 2 +- model-evaluation-schema.yml | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/model-evaluation-schema.json b/model-evaluation-schema.json index 36247f1..c2039c4 100644 --- a/model-evaluation-schema.json +++ b/model-evaluation-schema.json @@ -1 +1 @@ -{"$id":"https://ocr-d.github.io/schemas/v1/recognition-schema.json","type":"object","required":["engineName","engineVersion","groundTruthBag","model","outputFormat"],"properties":{"engineName":{"type":"string","enum":["ocropus","kraken","tesseract","calamari"]},"engineVersion":{"type":"string"},"engineArguments":{"description":"Command line arguments passed to the CLI recognition tool","type":"array","default":[]},"groundTruthBag":{"description":"A bag of line ground truth adhering to https://ocr-d.github.io/gt-profile.json","type":"string"},"model":{"description":"URL/path to model to use","type":"string"},"measures":{"description":"which evaluation measures to produce","type":"array","items":{"type":"string","enum":["cer-per-line","cer-total","ler","wer-per-line","wer-total","confusion-matrix"]}}}} \ No newline at end of file +{"$id":"https://ocr-d.github.io/schemas/v1/model-evaluation-schema.json","type":"object","required":["engineName","engineVersion","groundTruthBag","model"],"properties":{"engineName":{"type":"string","enum":["ocropus","kraken","tesseract","calamari"]},"engineVersion":{"type":"string"},"recognizerArguments":{"description":"Command line arguments passed to the CLI recognition tool","type":"array","default":[]},"groundTruthBag":{"description":"A bag of line ground truth adhering to https://ocr-d.github.io/gt-profile.json","type":"string"},"model":{"description":"URL/path to model to 
use","type":"string"},"measures":{"description":"which evaluation measures to produce","type":"array","items":{"type":"string","enum":["cer-per-line","cer-total","ler","wer-per-line","wer-total","confusion-matrix"]}}}} \ No newline at end of file diff --git a/model-evaluation-schema.yml b/model-evaluation-schema.yml index 94d05d3..e2a1821 100644 --- a/model-evaluation-schema.yml +++ b/model-evaluation-schema.yml @@ -1,11 +1,10 @@ -$id: https://ocr-d.github.io/schemas/v1/recognition-schema.json +$id: https://ocr-d.github.io/schemas/v1/model-evaluation-schema.json type: object required: - engineName - engineVersion - groundTruthBag - model - - outputFormat properties: engineName: type: string @@ -16,7 +15,7 @@ properties: - calamari engineVersion: type: string - engineArguments: + recognizerArguments: description: Command line arguments passed to the CLI recognition tool type: array default: []