diff --git a/chemdataextractor/common/REG_EXP.py b/chemdataextractor/common/REG_EXP.py new file mode 100644 index 0000000..8e833b3 --- /dev/null +++ b/chemdataextractor/common/REG_EXP.py @@ -0,0 +1,4 @@ + +# Common regular expressions +# TODO: add br. s inside of the multiplicity +MULTIPLICITY = '^(br\.)?(br.|s|S|d|D|t|T|q|Q|quint|sept|m|M|dd|ddd|dt|td|tt|br|bs|sb|h|ABq|broad|singlet|doublet|triplet|qua(rtet)?|quintet|septet|multiplet|multiple|peaks)$' diff --git a/chemdataextractor/common/__init__.py b/chemdataextractor/common/__init__.py new file mode 100644 index 0000000..982dcd4 --- /dev/null +++ b/chemdataextractor/common/__init__.py @@ -0,0 +1 @@ +from .REG_EXP import * diff --git a/chemdataextractor/doc/document.py b/chemdataextractor/doc/document.py index 3d72760..24511cd 100644 --- a/chemdataextractor/doc/document.py +++ b/chemdataextractor/doc/document.py @@ -185,7 +185,7 @@ def records(self): sent_record = first_sent_records[0] if sent_record.labels or (sent_record.names and len(sent_record.names[0]) > len(el.sentences[0].text) / 2): head_def_record = sent_record - head_def_record_i = i + head_def_record_i = i - 1 # fix error related with cem that contains nmr that sometimes doesn't detect it well for record in el.records: # Keep track of the most recent record with labels @@ -215,10 +215,11 @@ def records(self): continue else: # print(record.serialize()) + # TODO: check the names and labels, not the whole record # We have property values but no names or labels... 
try merge those from previous if isinstance(el, Paragraph) and (head_def_record or last_product_record or last_id_record or title_record): # head_def_record from heading takes priority if the heading directly precedes the paragraph ( NOPE: or the last_id_record has no name) - if head_def_record_i and head_def_record_i + 1 == i: # or (last_id_record and not last_id_record.names)): + if last_id_record and not last_id_record.names and head_def_record_i is not None and head_def_record_i + 1 == i: # or (last_id_record and not last_id_record.names)): if head_def_record: record.names = head_def_record.names record.labels = head_def_record.labels @@ -272,6 +273,13 @@ def records(self): record.names.append(name) # Merge records with any shared name/label + # TODO: merging labels into a single record because of an 'and' is not a good idea (this must be done in other part of the code) + temp_record = [] + for record in records: + if len(record.labels) <= 1: + temp_record.append(record) + + records.models = temp_record len_l = len(records) i = 0 while i < (len_l - 1): diff --git a/chemdataextractor/doc/text.py b/chemdataextractor/doc/text.py index f99c762..a2c3ff1 100644 --- a/chemdataextractor/doc/text.py +++ b/chemdataextractor/doc/text.py @@ -26,7 +26,9 @@ from ..parse.mp import MpParser from ..parse.tg import TgParser from ..parse.nmr import NmrParser +from ..parse.doi import DoiParser from ..parse.uvvis import UvvisParser +from ..parse.hrms import HRMSParser from ..nlp.lexicon import ChemLexicon from ..nlp.cem import CemTagger, IGNORE_PREFIX, IGNORE_SUFFIX, SPECIALS, SPLITS from ..nlp.abbrev import ChemAbbreviationDetector @@ -266,7 +268,8 @@ def _repr_html_(self): class Paragraph(Text): - parsers = [CompoundParser(), ChemicalLabelParser(), NmrParser(), IrParser(), UvvisParser(), MpParser(), TgParser(), ContextParser()] + parsers = [CompoundParser(), ChemicalLabelParser(), NmrParser(), IrParser(), UvvisParser(), MpParser(), TgParser(), + ContextParser(), DoiParser(), 
HRMSParser()] def _repr_html_(self): return '
' + self.text + '
' @@ -510,6 +513,7 @@ def records(self): tagged_tokens = [(CONTROL_RE.sub('', token), tag) for token, tag in self.tagged_tokens] for parser in self.parsers: for record in parser.parse(tagged_tokens): + # print(record) p = record.serialize() if not p: # TODO: Potential performance issues? continue diff --git a/chemdataextractor/model.py b/chemdataextractor/model.py index f54666a..faf464a 100644 --- a/chemdataextractor/model.py +++ b/chemdataextractor/model.py @@ -22,12 +22,10 @@ from .utils import python_2_unicode_compatible - log = logging.getLogger(__name__) class BaseType(six.with_metaclass(ABCMeta)): - # This is assigned by ModelMeta to match the attribute on the Model name = None @@ -90,7 +88,6 @@ def process(self, value): class ModelType(BaseType): - def __init__(self, model, **kwargs): self.model_class = model self.model_name = self.model_class.__name__ @@ -102,7 +99,6 @@ def serialize(self, value, primitive=False): class ListType(BaseType): - def __init__(self, field, default=None, **kwargs): super(ListType, self).__init__(**kwargs) self.field = field @@ -376,6 +372,11 @@ class NmrSpectrum(BaseModel): peaks = ListType(ModelType(NmrPeak)) +class HRMS(BaseModel): + """High Resolution Mass Spectrometry""" + chemical_structure = StringType() + + class MeltingPoint(BaseModel): """A melting point measurement.""" value = StringType() @@ -394,6 +395,7 @@ class GlassTransition(BaseModel): concentration = StringType(contextual=True) concentration_units = StringType(contextual=True) + class QuantumYield(BaseModel): """A quantum yield measurement.""" value = StringType() @@ -439,6 +441,8 @@ class Compound(BaseModel): names = ListType(StringType()) labels = ListType(StringType()) roles = ListType(StringType()) + doi = ListType(StringType()) + hrms = ListType(ModelType(HRMS)) nmr_spectra = ListType(ModelType(NmrSpectrum)) ir_spectra = ListType(ModelType(IrSpectrum)) uvvis_spectra = ListType(ModelType(UvvisSpectrum)) @@ -502,8 +506,8 @@ def is_unidentified(self): def 
is_id_only(self): """Return True if identifier information only.""" for key, value in self.items(): - if key not in {'names', 'labels', 'roles'} and value: + if key not in {'names', 'labels', 'roles', 'doi'} and value: return False - if self.names or self.labels: + if self.names or self.labels or self.doi: return True return False diff --git a/chemdataextractor/nlp/tokenize.py b/chemdataextractor/nlp/tokenize.py index 90f52a0..435221b 100644 --- a/chemdataextractor/nlp/tokenize.py +++ b/chemdataextractor/nlp/tokenize.py @@ -14,12 +14,11 @@ from abc import ABCMeta, abstractmethod import logging import re - import six from ..text import bracket_level, GREEK from ..data import load_model - +from ..common import REG_EXP log = logging.getLogger(__name__) @@ -447,6 +446,8 @@ class ChemWordTokenizer(WordTokenizer): NO_SPLIT_PREFIX_ENDING = re.compile('(^\(.*\)|^[\d,\'"“”„‟‘’‚‛`´′″‴‵‶‷⁗Α-Ωα-ω]+|ano|ato|azo|boc|bromo|cbz|chloro|eno|fluoro|fmoc|ido|ino|io|iodo|mercapto|nitro|ono|oso|oxalo|oxo|oxy|phospho|telluro|tms|yl|ylen|ylene|yliden|ylidene|ylidyn|ylidyne)$', re.U) #: Don't split on hyphen if prefix or suffix match this regular expression NO_SPLIT_CHEM = re.compile('([\-α-ω]|\d+,\d+|\d+[A-Z]|^d\d\d?$|acetic|acetyl|acid|acyl|anol|azo|benz|bromo|carb|cbz|chlor|cyclo|ethan|ethyl|fluoro|fmoc|gluc|hydro|idyl|indol|iene|ione|iodo|mercapto|n,n|nitro|noic|o,o|oxalo|oxo|oxy|oyl|onyl|phen|phth|phospho|pyrid|telluro|tetra|tms|ylen|yli|zole|alpha|beta|gamma|delta|epsilon|theta|kappa|lambda|sigma|omega)', re.U | re.I) + + INSIDE_PEAK = re.compile(REG_EXP.MULTIPLICITY + '|^(M?Hz|\d+\.\d+)$') #: Don't split on hyphen if the prefix is one of these sequences NO_SPLIT_PREFIX = { 'e', 'a', 'u', 'x', 'agro', 'ante', 'anti', 'arch', 'be', 'bi', 'bio', 'co', 'counter', 'cross', 'cyber', @@ -657,6 +658,9 @@ def _subspan(self, s, span, nextspan): # Split around colon unless it looks like we're in a chemical name if not (before and after and after[0].isdigit() and 
before.rstrip('′\'')[-1:].isdigit() and '-' in after) and not (self.NO_SPLIT_CHEM.search(before) and self.NO_SPLIT_CHEM.search(after)): return self._split_span(span, i, 1) + elif char == ',': + if not (self.NO_SPLIT_CHEM.search(before) and self.NO_SPLIT_CHEM.search(after)) and (self.INSIDE_PEAK.search(before) or self.INSIDE_PEAK.search(after)): + return self._split_span(span, i, 1) elif char in {'x', '+', '−'}: # Split around x, +, − (\u2212 minus) between two numbers or at start followed by numbers if (i == 0 or self._is_number(before)) and self._is_number(after): diff --git a/chemdataextractor/parse/actions.py b/chemdataextractor/parse/actions.py index 1bc09b2..61915ec 100644 --- a/chemdataextractor/parse/actions.py +++ b/chemdataextractor/parse/actions.py @@ -19,7 +19,6 @@ from ..text import HYPHENS - log = logging.getLogger(__name__) @@ -30,7 +29,7 @@ def flatten(tokens, start, result): return result -def join(tokens, start, result): +def join(tokens, start, result, separator=' '): """Join tokens into a single string with spaces between.""" texts = [] if len(result) > 0: @@ -38,7 +37,11 @@ def join(tokens, start, result): for child in e.iter(): if child.text is not None: texts.append(child.text) - return [E(result[0].tag, ' '.join(texts))] + return [E(result[0].tag, separator.join(texts))] + + +def join_comma(tokens, start, result): + return join(tokens, start, result, separator=',') def merge(tokens, start, result): diff --git a/chemdataextractor/parse/cem.py b/chemdataextractor/parse/cem.py index eefaf61..0e3a143 100644 --- a/chemdataextractor/parse/cem.py +++ b/chemdataextractor/parse/cem.py @@ -53,7 +53,7 @@ label_blacklist = R('^(31P|[12]H|[23]D|15N|14C|[4567890]\d+)$') -prefixed_label = R('^(cis|trans)-((d-)?(\d{1,2}[A-Za-z]{0,2}[′″‴‶‷⁗]?)(-d)?|[LS]\d\d?)$') +prefixed_label = R('^(cis|trans|[A-Za-z]{,3})-((d-)?(\d{1,2}[A-Za-z]{0,2}[′″‴‶‷⁗]?)(-d)?|[LS]\d\d?)$') #: Chemical label. Very permissive - must be used in context to avoid false positives. 
strict_chemical_label = Not(label_blacklist) + (alphanumeric | roman_numeral | letter_number | prefixed_label)('label') @@ -124,7 +124,8 @@ I('acetone-d6') | I('d6-acetone') | I('chloroform-d') | I('d-chloroform') | I('methanol-d4') | I('d4-methanol') | I('pyridine-d5') | I('d5-pyridine') | I('DMSO-d6') | I('d6-DMSO') | I('dimethylsulfoxide-d6') | W('C7D8') | I('d6-dimethylsulfoxide') | W('MeOH-d4') | W('d4-MeOH') | I('DMSO') | I('benzene-d6') | I('d6-benzene') | - I('1,1,2,2-tetrachloroethane-d2') | I('tetrachloroethane-d2') | I('d2-tetrachloroethane') + I('1,1,2,2-tetrachloroethane-d2') | I('tetrachloroethane-d2') | I('d2-tetrachloroethane') | I('MeOD-d4') | + I('d4-MeOD') ) diff --git a/chemdataextractor/parse/doi.py b/chemdataextractor/parse/doi.py new file mode 100644 index 0000000..cbf41dc --- /dev/null +++ b/chemdataextractor/parse/doi.py @@ -0,0 +1,30 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from .base import BaseParser +from .elements import W, R, Optional +from ..model import Compound +from .actions import merge + + +doi = ((R('[Dd][Oo][Ii]') + Optional(W(':'))).hide() + + R('10[.][0-9]{4,}(?:[.][0-9]+)*') + + W('/') + + R('(?:(?!["&\'<>])\S)+')).add_action(merge)('doi') + + +class DoiParser(BaseParser): + """""" + root = doi + + def __init__(self): + pass + + def interpret(self, result, start, end): + c = Compound( + doi=result.xpath('./text()') + ) + + yield c diff --git a/chemdataextractor/parse/elements.py b/chemdataextractor/parse/elements.py index 1761d41..0b7d1e7 100644 --- a/chemdataextractor/parse/elements.py +++ b/chemdataextractor/parse/elements.py @@ -257,7 +257,7 @@ def _parse_tokens(self, tokens, i, actions=True): class Regex(BaseParserElement): """Match token text with regular expression.""" - def __init__(self, pattern, flags=0, group=None): + def __init__(self, pattern, flags=0, group=None, min_size=None, max_size=None): 
super(Regex, self).__init__() if isinstance(pattern, six.string_types): self.regex = re.compile(pattern, flags) @@ -266,9 +266,16 @@ def __init__(self, pattern, flags=0, group=None): self.regex = pattern self.pattern = pattern.pattern self.group = group + self.min_size = 0 if min_size is None else min_size + self.max_size = float('inf') if max_size is None else max_size def _parse_tokens(self, tokens, i, actions=True): token_text = tokens[i][0] + token_size = len(token_text) + + if not (self.min_size <= token_size < self.max_size): + raise ParseException(tokens, i, 'Expected %s, got %s' % (self.pattern, token_text), self) + result = self.regex.search(token_text) if result: text = tokens[i][0] if self.group is None else result.group(self.group) diff --git a/chemdataextractor/parse/hrms.py b/chemdataextractor/parse/hrms.py new file mode 100644 index 0000000..780c3ba --- /dev/null +++ b/chemdataextractor/parse/hrms.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import re + +from .base import BaseParser +from .elements import OneOrMore, R, Optional, ZeroOrMore, Not +from ..model import Compound, HRMS +from ..utils import first +from .actions import merge + +not_separator = '[^\.;,]$' +separator = '[\.;,]' +chem_sign = '[\+\-‐‑⁃‒–—―−\-⁻]' +number = R('^\d+(\.\d+)?$') +chemical_name = R('^(([A-Z][a-z]?\d*|\((?:[^()]*(?:\(.*\))?[^()]*)+\)\d+)+' + chem_sign + '?)$', min_size=5) +# obtained from https://stackoverflow.com/questions/23602175/regex-for-parsing-chemical-formulas +chemical_structure_start = (R('(calcd|calculated)' + separator + '?', flags=re.IGNORECASE) | R('^for' + separator + '?', flags=re.IGNORECASE)) +chemical_structure = (ZeroOrMore(chemical_structure_start + R(not_separator)).hide() + (chemical_name('chemical_structure')) + Optional(R(separator)).hide()) +# compound = (R('^\[') + ZeroOrMore(R('\.+')) + 
R('\]')).add_action(merge)('compound') + +# theoretical = (Optional(W('calcd') + W('for')).hide() + number('mass') + compound)('theoretical') +# experimental = (Optional(W('found')).hide() + number('mass'))('experimental') +exceptions = ((number | R(chem_sign + '$') | R(u'((^found|^\d+)' + separator + '?)$', flags=re.IGNORECASE)) + Optional(R(separator))).hide() + +hrms = (R('^.*H.*R.*M.*S.*$').hide() + ZeroOrMore(chemical_structure | exceptions | R(not_separator).hide()))('hrms') + + +class HRMSParser(BaseParser): + """""" + root = hrms + + def __init__(self): + pass + + def interpret(self, result, start, end): + h = HRMS( + chemical_structure=first(result.xpath('./chemical_structure/text()')) + ) + c = Compound() + c.hrms.append(h) + + yield c diff --git a/chemdataextractor/parse/nmr.py b/chemdataextractor/parse/nmr.py index d1a093a..f585df6 100644 --- a/chemdataextractor/parse/nmr.py +++ b/chemdataextractor/parse/nmr.py @@ -19,11 +19,12 @@ from ..model import Compound, NmrSpectrum, NmrPeak from ..utils import first -from .actions import join, merge, strip_stop, fix_whitespace +from .actions import join, merge, strip_stop, fix_whitespace, join_comma from .base import BaseParser from .common import cc, equals from .cem import chemical_name, nmr_solvent from .elements import W, I, T, R, Optional, ZeroOrMore, SkipTo, OneOrMore, Not, Group +from ..common import REG_EXP log = logging.getLogger(__name__) @@ -80,11 +81,14 @@ def strip_delta(tokens, start, result): shift_error = (Optional(R('^[\-–−‒]$')) + R('^δ?[\+\-–−‒]?\d+(\.+\d+)?,\d+(\.+\d+)?\.?$'))('shift').add_action(merge) shift = (shift_range | shift_value | shift_error).add_action(strip_stop).add_action(strip_delta) -split = R('^(br?)?(s|S|d|D|t|T|q|Q|quint|sept|m|M|dd|ddd|dt|td|tt|br|bs|sb|h|ABq|broad|singlet|doublet|triplet|qua(rtet)?|quintet|septet|multiplet|multiple|peaks)$') +split = R(REG_EXP.MULTIPLICITY) multiplicity = (OneOrMore(split) + Optional(W('of') + split))('multiplicity').add_action(join) 
-coupling_value = (number + ZeroOrMore(R('^[,;&]$') + number + Not(W('H'))))('value').add_action(join) -coupling = ((R('^\d?J([HCNPFD\d,]*|cis|trans)$') + Optional(R('^[\-–−‒]$') + R('^[HCNPF\d]$')) + Optional('=')).hide() + coupling_value + Optional(W('Hz')('units')) + ZeroOrMore(R('^[,;&]$').hide() + coupling_value + W('Hz')('units')))('coupling') +coupling_separator = '^([,;&]|and)$' +coupling_signature = R('^\d?J([HCNPFD\d,]*|cis|trans)$') + Optional(R('^[\-–−‒]$') + R('^[HCNPF\d]$')) + Optional('=') +coupling_value = (number + ZeroOrMore((Optional(W('Hz')) + R(coupling_separator) + Optional(coupling_signature)).hide() + number + Not(W('H'))))('value').add_action(join_comma) +coupling = (coupling_signature.hide() + coupling_value + Optional(W('Hz')('units')) + ZeroOrMore(R( + coupling_separator).hide() + coupling_value + W('Hz')('units')))('coupling') number = (R('^\d+(\.\d+)?[HCNPF]\.?$') | (R('^\d+(\.\d+)?$') + R('^[HCNPF]\.?$')))('number').add_action(merge) @@ -143,4 +147,4 @@ def interpret(self, result, start, end): n.peaks.append(nmr_peak) c.nmr_spectra.append(n) - yield c + yield c \ No newline at end of file diff --git a/tests/test_parse_cem.py b/tests/test_parse_cem.py index 3c268b1..b44f253 100644 --- a/tests/test_parse_cem.py +++ b/tests/test_parse_cem.py @@ -170,6 +170,11 @@ def test_to_yield_phrase(self): ] self.do_parse(s, expected) + def test_label_start(self): + s = '1,3,5-Tricyano-2,4,6-tris(2-dimethylaminovinyl)benzene (Leu-07)' + expected = ['