From a43802a6f02765b2a7c2bd82047bd82c5398aae1 Mon Sep 17 00:00:00 2001 From: Otto Brinkhaus Date: Tue, 23 May 2023 16:37:07 +0200 Subject: [PATCH 1/3] fix table parser example --- examples/automatic_parsing_for_tables.ipynb | 92 +++++++++++++++------ 1 file changed, 66 insertions(+), 26 deletions(-) diff --git a/examples/automatic_parsing_for_tables.ipynb b/examples/automatic_parsing_for_tables.ipynb index 582141bc..72ad7525 100644 --- a/examples/automatic_parsing_for_tables.ipynb +++ b/examples/automatic_parsing_for_tables.ipynb @@ -20,22 +20,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "+--------+--------------------------+------------------------------+\n", - "| Data | Row Categories | Column Categories |\n", - "+--------+--------------------------+------------------------------+\n", - "| 1100 | ['Inorganic', 'BiFeO3'] | ['Temperatures', 'Tc/K'] |\n", - "| 643 | ['Inorganic', 'BiFeO3'] | ['Temperatures', 'Tn/K'] |\n", - "| | ['Inorganic', 'BiFeO3'] | ['Magnetic moment', 'B [T]'] |\n", - "| 257 | ['Inorganic', ' LaCrO3'] | ['Temperatures', 'Tc/K'] |\n", - "| 150 | ['Inorganic', ' LaCrO3'] | ['Temperatures', 'Tn/K'] |\n", - "| 0.1 mT | ['Inorganic', ' LaCrO3'] | ['Magnetic moment', 'B [T]'] |\n", - "| | ['Organic', 'LaCrO2'] | ['Temperatures', 'Tc/K'] |\n", - "| 10 | ['Organic', 'LaCrO2'] | ['Temperatures', 'Tn/K'] |\n", - "| 500 | ['Organic', 'LaCrO2'] | ['Magnetic moment', 'B [T]'] |\n", - "| | ['Inorganic', 'Gd'] | ['Temperatures', 'Tc/K'] |\n", - "| 294 | ['Inorganic', 'Gd'] | ['Temperatures', 'Tn/K'] |\n", - "| 659 T | ['Inorganic', 'Gd'] | ['Magnetic moment', 'B [T]'] |\n", - "+--------+--------------------------+------------------------------+\n" + "+---------+--------------------------+------------------------------+\n", + "| Data | Row Categories | Column Categories |\n", + "+---------+--------------------------+------------------------------+\n", + "| 1100 | ['Inorganic', 'BiFeO3'] | ['Temperatures', 'Tc/K'] |\n", + "| 643 | ['Inorganic', 'BiFeO3'] | ['Temperatures', 'Tn/K'] |\n", + "| NoValue | ['Inorganic', 'BiFeO3'] | ['Magnetic moment', 'B [T]'] |\n", + "| 257 | ['Inorganic', ' LaCrO3'] | ['Temperatures', 'Tc/K'] |\n", + "| 150 | ['Inorganic', ' LaCrO3'] | ['Temperatures', 'Tn/K'] |\n", + "| 0.1 mT | ['Inorganic', ' LaCrO3'] | ['Magnetic moment', 'B [T]'] |\n", + "| NoValue | ['Organic', 'LaCrO2'] | ['Temperatures', 'Tc/K'] |\n", + "| 10 | ['Organic', 'LaCrO2'] | ['Temperatures', 'Tn/K'] |\n", + "| 500 | ['Organic', 'LaCrO2'] | ['Magnetic moment', 'B [T]'] |\n", + "| NoValue | ['Inorganic', 'Gd'] | ['Temperatures', 'Tc/K'] |\n", + "| 294 | ['Inorganic', 'Gd'] | ['Temperatures', 'Tn/K'] |\n", + "| 659 T | ['Inorganic', 'Gd'] | ['Magnetic moment', 'B [T]'] |\n", + "+---------+--------------------------+------------------------------+\n" ] }, { @@ -53,7 +53,7 @@ "from chemdataextractor.doc.table import Table\n", "from chemdataextractor.doc import Caption\n", "\n", - "path = \"./example_tables/table_example_tkt_2.csv\"\n", + "path = \"./data/table_example.csv\"\n", "table = Table(caption=Caption(\"\"),table_data=path)\n", "\n", "print(table.tde_table)\n", @@ -104,15 +104,25 @@ "name": "stdout", "output_type": "stream", "text": [ + "Initialising AllenNLP model ✔ \n", + "{'Compound': {'names': ['BiFeO3']}}\n", + "{'Compound': {'names': ['LaCrO3']}}\n", + "{'Compound': {'names': ['LaCrO2']}}\n", + "{'Compound': {'names': ['Gd']}}\n", "{'CurieTemperature': {'raw_value': '1100', 'raw_units': 'K', 'value': [1100.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['BiFeO3']}}}}\n", + "{'CurieTemperature': {'raw_value': '643', 'value': [643.0], 'compound': {'Compound': {'names': ['BiFeO3']}}}}\n", "{'CurieTemperature': {'raw_value': '257', 'raw_units': 'K', 'value': [257.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['LaCrO3']}}}}\n", + "{'CurieTemperature': {'raw_value': '150', 'value': [150.0], 'compound': {'Compound': {'names': ['LaCrO3']}}}}\n", + "{'CurieTemperature': {'raw_value': '0.1', 'value': [0.1], 'compound': {'Compound': {'names': ['LaCrO3']}}}}\n", "{'CurieTemperature': {'raw_value': '10', 'raw_units': 'K', 'value': [10.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['LaCrO2']}}}}\n", - "{'CurieTemperature': {'raw_value': '294', 'raw_units': 'K', 'value': [294.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['Gd']}}}}\n" + "{'CurieTemperature': {'raw_value': '500', 'raw_units': 'K', 'value': [500.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['LaCrO2']}}}}\n", + "{'CurieTemperature': {'raw_value': '294', 'raw_units': 'K', 'value': [294.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['Gd']}}}}\n", + "{'CurieTemperature': {'raw_value': '659', 'raw_units': 'K', 'value': [659.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['Gd']}}}}\n" ] } ], "source": [ - "table.models = [CurieTemperature]\n", + "table = Table(caption=Caption(\"\"),table_data=path, models=[CurieTemperature])\n", "for record in table.records:\n", " print(record.serialize())" ] @@ -137,9 +147,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'CurieTemperature': {'raw_value': '1100', 'raw_units': 'K', 'value': [1100.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['BiFeO3']}}, 'label': 'Inorganic'}}\n", - "{'CurieTemperature': {'raw_value': '257', 'raw_units': 'K', 'value': [257.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['LaCrO3']}}, 'label': 'Inorganic'}}\n", - "{'CurieTemperature': {'raw_value': '10', 'raw_units': 'K', 'value': [10.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['LaCrO2']}}}}\n" + "{'Compound': {'names': ['BiFeO3']}}\n", + "{'Compound': {'names': ['LaCrO3']}}\n", + "{'Compound': {'names': ['LaCrO2']}}\n", + "{'Compound': {'names': ['Gd']}}\n", + "{'CurieTemperature': {'raw_value': '1100', 'raw_units': 'K', 'value': [1100.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['BiFeO3']}}}}\n", + "{'CurieTemperature': {'raw_value': '643', 'value': [643.0], 'compound': {'Compound': {'names': ['BiFeO3']}}}}\n", + "{'CurieTemperature': {'raw_value': '257', 'raw_units': 'K', 'value': [257.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['LaCrO3']}}}}\n", + "{'CurieTemperature': {'raw_value': '150', 'value': [150.0], 'compound': {'Compound': {'names': ['LaCrO3']}}}}\n", + "{'CurieTemperature': {'raw_value': '0.1', 'value': [0.1], 'compound': {'Compound': {'names': ['LaCrO3']}}}}\n", + "{'CurieTemperature': {'raw_value': '10', 'raw_units': 'K', 'value': [10.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['LaCrO2']}}}}\n", + "{'CurieTemperature': {'raw_value': '500', 'raw_units': 'K', 'value': [500.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['LaCrO2']}}}}\n", + "{'CurieTemperature': {'raw_value': '294', 'raw_units': 'K', 'value': [294.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['Gd']}}}}\n", + "{'CurieTemperature': {'raw_value': '659', 'raw_units': 'K', 'value': [659.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['Gd']}}}}\n" ] } ], @@ -157,9 +177,29 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'Compound': {'names': ['BiFeO3']}}\n", + "{'Compound': {'names': ['LaCrO3']}}\n", + "{'Compound': {'names': ['LaCrO2']}}\n", + "{'Compound': {'names': ['Gd']}}\n", + "{'CurieTemperature': {'raw_value': '1100', 'raw_units': 'K', 'value': [1100.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['BiFeO3']}}}}\n", + "{'CurieTemperature': {'raw_value': '643', 'value': [643.0], 'compound': {'Compound': {'names': ['BiFeO3']}}}}\n", + "{'CurieTemperature': {'raw_value': '257', 'raw_units': 'K', 'value': [257.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['LaCrO3']}}}}\n", + "{'CurieTemperature': {'raw_value': '150', 'value': [150.0], 'compound': {'Compound': {'names': ['LaCrO3']}}}}\n", + "{'CurieTemperature': {'raw_value': '0.1', 'value': [0.1], 'compound': {'Compound': {'names': ['LaCrO3']}}}}\n", + "{'CurieTemperature': {'raw_value': '10', 'raw_units': 'K', 'value': [10.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['LaCrO2']}}}}\n", + "{'CurieTemperature': {'raw_value': '500', 'raw_units': 'K', 'value': [500.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['LaCrO2']}}}}\n", + "{'CurieTemperature': {'raw_value': '294', 'raw_units': 'K', 'value': [294.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['Gd']}}}}\n", + "{'CurieTemperature': {'raw_value': '659', 'raw_units': 'K', 'value': [659.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['Gd']}}}}\n" + ] + } + ], "source": [ "class CurieTemperature(TemperatureModel):\n", " StringType(parse_expression=I('TC'), required=True, contextual=True, updatable=True)\n", @@ -182,7 +222,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -196,7 +236,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.8.16" } }, "nbformat": 4, From ed3ceffdaf0a297b11fc2517d2342cda4624fb01 Mon Sep 17 00:00:00 2001 From: Otto Brinkhaus Date: Tue, 23 May 2023 16:37:41 +0200 Subject: [PATCH 2/3] add missing import statements in documentation --- docs/introduction/finding_records.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/introduction/finding_records.rst b/docs/introduction/finding_records.rst index 3727593e..db086549 100644 --- a/docs/introduction/finding_records.rst +++ b/docs/introduction/finding_records.rst @@ -8,6 +8,7 @@ and then merges data together from every element in the document to produce a si Consider this simple document as an example:: >>> from chemdataextractor.doc import Document, Heading, Paragraph + >>> from chemdataextractor.model import Compound, MeltingPoint >>> doc = Document( Heading('5,10,15,20-Tetra(4-carboxyphenyl)porphyrin (3).'), Paragraph('m.p. 90°C.'), From e62d53c781f1358955d59b30341990349ff75c34 Mon Sep 17 00:00:00 2001 From: Otto Brinkhaus Date: Tue, 23 May 2023 16:57:34 +0200 Subject: [PATCH 3/3] fix all examples in table parser notebook --- examples/automatic_parsing_for_tables.ipynb | 33 ++++++++------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/examples/automatic_parsing_for_tables.ipynb b/examples/automatic_parsing_for_tables.ipynb index 72ad7525..824cf291 100644 --- a/examples/automatic_parsing_for_tables.ipynb +++ b/examples/automatic_parsing_for_tables.ipynb @@ -140,7 +140,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -151,15 +151,15 @@ "{'Compound': {'names': ['LaCrO3']}}\n", "{'Compound': {'names': ['LaCrO2']}}\n", "{'Compound': {'names': ['Gd']}}\n", - "{'CurieTemperature': {'raw_value': '1100', 'raw_units': 'K', 'value': [1100.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['BiFeO3']}}}}\n", - "{'CurieTemperature': {'raw_value': '643', 'value': [643.0], 'compound': {'Compound': {'names': ['BiFeO3']}}}}\n", - "{'CurieTemperature': {'raw_value': '257', 'raw_units': 'K', 'value': [257.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['LaCrO3']}}}}\n", - "{'CurieTemperature': {'raw_value': '150', 'value': [150.0], 'compound': {'Compound': {'names': ['LaCrO3']}}}}\n", - "{'CurieTemperature': {'raw_value': '0.1', 'value': [0.1], 'compound': {'Compound': {'names': ['LaCrO3']}}}}\n", + "{'CurieTemperature': {'raw_value': '1100', 'raw_units': 'K', 'value': [1100.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['BiFeO3']}}, 'label': 'Inorganic'}}\n", + "{'CurieTemperature': {'raw_value': '643', 'value': [643.0], 'compound': {'Compound': {'names': ['BiFeO3']}}, 'label': 'Inorganic'}}\n", + "{'CurieTemperature': {'raw_value': '257', 'raw_units': 'K', 'value': [257.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['LaCrO3']}}, 'label': 'Inorganic'}}\n", + "{'CurieTemperature': {'raw_value': '150', 'value': [150.0], 'compound': {'Compound': {'names': ['LaCrO3']}}, 'label': 'Inorganic'}}\n", + "{'CurieTemperature': {'raw_value': '0.1', 'value': [0.1], 'compound': {'Compound': {'names': ['LaCrO3']}}, 'label': 'Inorganic'}}\n", "{'CurieTemperature': {'raw_value': '10', 'raw_units': 'K', 'value': [10.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['LaCrO2']}}}}\n", "{'CurieTemperature': {'raw_value': '500', 'raw_units': 'K', 'value': [500.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['LaCrO2']}}}}\n", - "{'CurieTemperature': {'raw_value': '294', 'raw_units': 'K', 'value': [294.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['Gd']}}}}\n", - "{'CurieTemperature': {'raw_value': '659', 'raw_units': 'K', 'value': [659.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['Gd']}}}}\n" + "{'CurieTemperature': {'raw_value': '294', 'raw_units': 'K', 'value': [294.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['Gd']}}, 'label': 'Inorganic'}}\n", + "{'CurieTemperature': {'raw_value': '659', 'raw_units': 'K', 'value': [659.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['Gd']}}, 'label': 'Inorganic'}}\n" ] } ], @@ -170,14 +170,14 @@ " compound = ModelType(Compound, required=True, contextual=True)\n", " label = StringType(parse_expression=I('inorganic'))\n", " \n", - "table.models = [CurieTemperature]\n", + "table = Table(caption=Caption(\"\"),table_data=path, models=[CurieTemperature])\n", "for record in table.records:\n", " print(record.serialize())" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -187,16 +187,7 @@ "{'Compound': {'names': ['BiFeO3']}}\n", "{'Compound': {'names': ['LaCrO3']}}\n", "{'Compound': {'names': ['LaCrO2']}}\n", - "{'Compound': {'names': ['Gd']}}\n", - "{'CurieTemperature': {'raw_value': '1100', 'raw_units': 'K', 'value': [1100.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['BiFeO3']}}}}\n", - "{'CurieTemperature': {'raw_value': '643', 'value': [643.0], 'compound': {'Compound': {'names': ['BiFeO3']}}}}\n", - "{'CurieTemperature': {'raw_value': '257', 'raw_units': 'K', 'value': [257.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['LaCrO3']}}}}\n", - "{'CurieTemperature': {'raw_value': '150', 'value': [150.0], 'compound': {'Compound': {'names': ['LaCrO3']}}}}\n", - "{'CurieTemperature': {'raw_value': '0.1', 'value': [0.1], 'compound': {'Compound': {'names': ['LaCrO3']}}}}\n", - "{'CurieTemperature': {'raw_value': '10', 'raw_units': 'K', 'value': [10.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['LaCrO2']}}}}\n", - "{'CurieTemperature': {'raw_value': '500', 'raw_units': 'K', 'value': [500.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['LaCrO2']}}}}\n", - "{'CurieTemperature': {'raw_value': '294', 'raw_units': 'K', 'value': [294.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['Gd']}}}}\n", - "{'CurieTemperature': {'raw_value': '659', 'raw_units': 'K', 'value': [659.0], 'units': 'Kelvin^(1.0)', 'specifier': 'Tc', 'compound': {'Compound': {'names': ['Gd']}}}}\n" + "{'Compound': {'names': ['Gd']}}\n" ] } ], @@ -207,7 +198,7 @@ " compound = ModelType(Compound, required=True, contextual=True)\n", " label = StringType(parse_expression=I('something else'), required=True)\n", " \n", - "table.models = [CurieTemperature]\n", + "table = Table(caption=Caption(\"\"),table_data=path, models=[CurieTemperature])\n", "for record in table.records:\n", " print(record.serialize())" ]