diff --git a/chemdataextractor/common/REG_EXP.py b/chemdataextractor/common/REG_EXP.py new file mode 100644 index 0000000..8e833b3 --- /dev/null +++ b/chemdataextractor/common/REG_EXP.py @@ -0,0 +1,4 @@ + +# Common regular expressions +# TODO: add br. s inside of the multiplicity +MULTIPLICITY = '^(br\.)?(br.|s|S|d|D|t|T|q|Q|quint|sept|m|M|dd|ddd|dt|td|tt|br|bs|sb|h|ABq|broad|singlet|doublet|triplet|qua(rtet)?|quintet|septet|multiplet|multiple|peaks)$' diff --git a/chemdataextractor/common/__init__.py b/chemdataextractor/common/__init__.py new file mode 100644 index 0000000..982dcd4 --- /dev/null +++ b/chemdataextractor/common/__init__.py @@ -0,0 +1 @@ +from .REG_EXP import * diff --git a/chemdataextractor/doc/document.py b/chemdataextractor/doc/document.py index 3d72760..24511cd 100644 --- a/chemdataextractor/doc/document.py +++ b/chemdataextractor/doc/document.py @@ -185,7 +185,7 @@ def records(self): sent_record = first_sent_records[0] if sent_record.labels or (sent_record.names and len(sent_record.names[0]) > len(el.sentences[0].text) / 2): head_def_record = sent_record - head_def_record_i = i + head_def_record_i = i - 1 # fix error related with cem that contains nmr that sometimes doesn't detect it well for record in el.records: # Keep track of the most recent record with labels @@ -215,10 +215,11 @@ def records(self): continue else: # print(record.serialize()) + # TODO: check the names and labels, not the whole record # We have property values but no names or labels... 
try merge those from previous if isinstance(el, Paragraph) and (head_def_record or last_product_record or last_id_record or title_record): # head_def_record from heading takes priority if the heading directly precedes the paragraph ( NOPE: or the last_id_record has no name) - if head_def_record_i and head_def_record_i + 1 == i: # or (last_id_record and not last_id_record.names)): + if last_id_record and not last_id_record.names and head_def_record_i is not None and head_def_record_i + 1 == i: # or (last_id_record and not last_id_record.names)): if head_def_record: record.names = head_def_record.names record.labels = head_def_record.labels @@ -272,6 +273,13 @@ def records(self): record.names.append(name) # Merge records with any shared name/label + # TODO: merging labels into a single record because of an 'and' is not a good idea (this must be done in other part of the code) + temp_record = [] + for record in records: + if len(record.labels) <= 1: + temp_record.append(record) + + records.models = temp_record len_l = len(records) i = 0 while i < (len_l - 1): diff --git a/chemdataextractor/doc/text.py b/chemdataextractor/doc/text.py index f99c762..a2c3ff1 100644 --- a/chemdataextractor/doc/text.py +++ b/chemdataextractor/doc/text.py @@ -26,7 +26,9 @@ from ..parse.mp import MpParser from ..parse.tg import TgParser from ..parse.nmr import NmrParser +from ..parse.doi import DoiParser from ..parse.uvvis import UvvisParser +from ..parse.hrms import HRMSParser from ..nlp.lexicon import ChemLexicon from ..nlp.cem import CemTagger, IGNORE_PREFIX, IGNORE_SUFFIX, SPECIALS, SPLITS from ..nlp.abbrev import ChemAbbreviationDetector @@ -266,7 +268,8 @@ def _repr_html_(self): class Paragraph(Text): - parsers = [CompoundParser(), ChemicalLabelParser(), NmrParser(), IrParser(), UvvisParser(), MpParser(), TgParser(), ContextParser()] + parsers = [CompoundParser(), ChemicalLabelParser(), NmrParser(), IrParser(), UvvisParser(), MpParser(), TgParser(), + ContextParser(), DoiParser(), 
HRMSParser()] def _repr_html_(self): return '

' + self.text + '

' @@ -510,6 +513,7 @@ def records(self): tagged_tokens = [(CONTROL_RE.sub('', token), tag) for token, tag in self.tagged_tokens] for parser in self.parsers: for record in parser.parse(tagged_tokens): + # print(record) p = record.serialize() if not p: # TODO: Potential performance issues? continue diff --git a/chemdataextractor/model.py b/chemdataextractor/model.py index f54666a..faf464a 100644 --- a/chemdataextractor/model.py +++ b/chemdataextractor/model.py @@ -22,12 +22,10 @@ from .utils import python_2_unicode_compatible - log = logging.getLogger(__name__) class BaseType(six.with_metaclass(ABCMeta)): - # This is assigned by ModelMeta to match the attribute on the Model name = None @@ -90,7 +88,6 @@ def process(self, value): class ModelType(BaseType): - def __init__(self, model, **kwargs): self.model_class = model self.model_name = self.model_class.__name__ @@ -102,7 +99,6 @@ def serialize(self, value, primitive=False): class ListType(BaseType): - def __init__(self, field, default=None, **kwargs): super(ListType, self).__init__(**kwargs) self.field = field @@ -376,6 +372,11 @@ class NmrSpectrum(BaseModel): peaks = ListType(ModelType(NmrPeak)) +class HRMS(BaseModel): + """High Resolution Mass Spectrometry""" + chemical_structure = StringType() + + class MeltingPoint(BaseModel): """A melting point measurement.""" value = StringType() @@ -394,6 +395,7 @@ class GlassTransition(BaseModel): concentration = StringType(contextual=True) concentration_units = StringType(contextual=True) + class QuantumYield(BaseModel): """A quantum yield measurement.""" value = StringType() @@ -439,6 +441,8 @@ class Compound(BaseModel): names = ListType(StringType()) labels = ListType(StringType()) roles = ListType(StringType()) + doi = ListType(StringType()) + hrms = ListType(ModelType(HRMS)) nmr_spectra = ListType(ModelType(NmrSpectrum)) ir_spectra = ListType(ModelType(IrSpectrum)) uvvis_spectra = ListType(ModelType(UvvisSpectrum)) @@ -502,8 +506,8 @@ def is_unidentified(self): def 
is_id_only(self): """Return True if identifier information only.""" for key, value in self.items(): - if key not in {'names', 'labels', 'roles'} and value: + if key not in {'names', 'labels', 'roles', 'doi'} and value: return False - if self.names or self.labels: + if self.names or self.labels or self.doi: return True return False diff --git a/chemdataextractor/nlp/tokenize.py b/chemdataextractor/nlp/tokenize.py index 90f52a0..435221b 100644 --- a/chemdataextractor/nlp/tokenize.py +++ b/chemdataextractor/nlp/tokenize.py @@ -14,12 +14,11 @@ from abc import ABCMeta, abstractmethod import logging import re - import six from ..text import bracket_level, GREEK from ..data import load_model - +from ..common import REG_EXP log = logging.getLogger(__name__) @@ -447,6 +446,8 @@ class ChemWordTokenizer(WordTokenizer): NO_SPLIT_PREFIX_ENDING = re.compile('(^\(.*\)|^[\d,\'"“”„‟‘’‚‛`´′″‴‵‶‷⁗Α-Ωα-ω]+|ano|ato|azo|boc|bromo|cbz|chloro|eno|fluoro|fmoc|ido|ino|io|iodo|mercapto|nitro|ono|oso|oxalo|oxo|oxy|phospho|telluro|tms|yl|ylen|ylene|yliden|ylidene|ylidyn|ylidyne)$', re.U) #: Don't split on hyphen if prefix or suffix match this regular expression NO_SPLIT_CHEM = re.compile('([\-α-ω]|\d+,\d+|\d+[A-Z]|^d\d\d?$|acetic|acetyl|acid|acyl|anol|azo|benz|bromo|carb|cbz|chlor|cyclo|ethan|ethyl|fluoro|fmoc|gluc|hydro|idyl|indol|iene|ione|iodo|mercapto|n,n|nitro|noic|o,o|oxalo|oxo|oxy|oyl|onyl|phen|phth|phospho|pyrid|telluro|tetra|tms|ylen|yli|zole|alpha|beta|gamma|delta|epsilon|theta|kappa|lambda|sigma|omega)', re.U | re.I) + + INSIDE_PEAK = re.compile(REG_EXP.MULTIPLICITY + '|^(M?Hz|\d+\.\d+)$') #: Don't split on hyphen if the prefix is one of these sequences NO_SPLIT_PREFIX = { 'e', 'a', 'u', 'x', 'agro', 'ante', 'anti', 'arch', 'be', 'bi', 'bio', 'co', 'counter', 'cross', 'cyber', @@ -657,6 +658,9 @@ def _subspan(self, s, span, nextspan): # Split around colon unless it looks like we're in a chemical name if not (before and after and after[0].isdigit() and 
before.rstrip('′\'')[-1:].isdigit() and '-' in after) and not (self.NO_SPLIT_CHEM.search(before) and self.NO_SPLIT_CHEM.search(after)): return self._split_span(span, i, 1) + elif char == ',': + if not (self.NO_SPLIT_CHEM.search(before) and self.NO_SPLIT_CHEM.search(after)) and (self.INSIDE_PEAK.search(before) or self.INSIDE_PEAK.search(after)): + return self._split_span(span, i, 1) elif char in {'x', '+', '−'}: # Split around x, +, − (\u2212 minus) between two numbers or at start followed by numbers if (i == 0 or self._is_number(before)) and self._is_number(after): diff --git a/chemdataextractor/parse/actions.py b/chemdataextractor/parse/actions.py index 1bc09b2..61915ec 100644 --- a/chemdataextractor/parse/actions.py +++ b/chemdataextractor/parse/actions.py @@ -19,7 +19,6 @@ from ..text import HYPHENS - log = logging.getLogger(__name__) @@ -30,7 +29,7 @@ def flatten(tokens, start, result): return result -def join(tokens, start, result): +def join(tokens, start, result, separator=' '): """Join tokens into a single string with spaces between.""" texts = [] if len(result) > 0: @@ -38,7 +37,11 @@ def join(tokens, start, result): for child in e.iter(): if child.text is not None: texts.append(child.text) - return [E(result[0].tag, ' '.join(texts))] + return [E(result[0].tag, separator.join(texts))] + + +def join_comma(tokens, start, result): + return join(tokens, start, result, separator=',') def merge(tokens, start, result): diff --git a/chemdataextractor/parse/cem.py b/chemdataextractor/parse/cem.py index eefaf61..0e3a143 100644 --- a/chemdataextractor/parse/cem.py +++ b/chemdataextractor/parse/cem.py @@ -53,7 +53,7 @@ label_blacklist = R('^(31P|[12]H|[23]D|15N|14C|[4567890]\d+)$') -prefixed_label = R('^(cis|trans)-((d-)?(\d{1,2}[A-Za-z]{0,2}[′″‴‶‷⁗]?)(-d)?|[LS]\d\d?)$') +prefixed_label = R('^(cis|trans|[A-Za-z]{,3})-((d-)?(\d{1,2}[A-Za-z]{0,2}[′″‴‶‷⁗]?)(-d)?|[LS]\d\d?)$') #: Chemical label. Very permissive - must be used in context to avoid false positives. 
strict_chemical_label = Not(label_blacklist) + (alphanumeric | roman_numeral | letter_number | prefixed_label)('label') @@ -124,7 +124,8 @@ I('acetone-d6') | I('d6-acetone') | I('chloroform-d') | I('d-chloroform') | I('methanol-d4') | I('d4-methanol') | I('pyridine-d5') | I('d5-pyridine') | I('DMSO-d6') | I('d6-DMSO') | I('dimethylsulfoxide-d6') | W('C7D8') | I('d6-dimethylsulfoxide') | W('MeOH-d4') | W('d4-MeOH') | I('DMSO') | I('benzene-d6') | I('d6-benzene') | - I('1,1,2,2-tetrachloroethane-d2') | I('tetrachloroethane-d2') | I('d2-tetrachloroethane') + I('1,1,2,2-tetrachloroethane-d2') | I('tetrachloroethane-d2') | I('d2-tetrachloroethane') | I('MeOD-d4') | + I('d4-MeOD') ) diff --git a/chemdataextractor/parse/doi.py b/chemdataextractor/parse/doi.py new file mode 100644 index 0000000..cbf41dc --- /dev/null +++ b/chemdataextractor/parse/doi.py @@ -0,0 +1,30 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from .base import BaseParser +from .elements import W, R, Optional +from ..model import Compound +from .actions import merge + + +doi = ((R('[Dd][Oo][Ii]') + Optional(W(':'))).hide() + + R('10[.][0-9]{4,}(?:[.][0-9]+)*') + + W('/') + + R('(?:(?!["&\'<>])\S)+')).add_action(merge)('doi') + + +class DoiParser(BaseParser): + """""" + root = doi + + def __init__(self): + pass + + def interpret(self, result, start, end): + c = Compound( + doi=result.xpath('./text()') + ) + + yield c diff --git a/chemdataextractor/parse/elements.py b/chemdataextractor/parse/elements.py index 1761d41..0b7d1e7 100644 --- a/chemdataextractor/parse/elements.py +++ b/chemdataextractor/parse/elements.py @@ -257,7 +257,7 @@ def _parse_tokens(self, tokens, i, actions=True): class Regex(BaseParserElement): """Match token text with regular expression.""" - def __init__(self, pattern, flags=0, group=None): + def __init__(self, pattern, flags=0, group=None, min_size=None, max_size=None): 
super(Regex, self).__init__() if isinstance(pattern, six.string_types): self.regex = re.compile(pattern, flags) @@ -266,9 +266,16 @@ def __init__(self, pattern, flags=0, group=None): self.regex = pattern self.pattern = pattern.pattern self.group = group + self.min_size = 0 if min_size is None else min_size + self.max_size = float('inf') if max_size is None else min_size def _parse_tokens(self, tokens, i, actions=True): token_text = tokens[i][0] + token_size = len(token_text) + + if not (self.min_size <= token_size < self.max_size): + raise ParseException(tokens, i, 'Expected %s, got %s' % (self.pattern, token_text), self) + result = self.regex.search(token_text) if result: text = tokens[i][0] if self.group is None else result.group(self.group) diff --git a/chemdataextractor/parse/hrms.py b/chemdataextractor/parse/hrms.py new file mode 100644 index 0000000..780c3ba --- /dev/null +++ b/chemdataextractor/parse/hrms.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import re + +from .base import BaseParser +from .elements import OneOrMore, R, Optional, ZeroOrMore, Not +from ..model import Compound, HRMS +from ..utils import first +from .actions import merge + +not_separator = '[^\.;,]$' +separator = '[\.;,]' +chem_sign = '[\+\-‐‑⁃‒–—―−-⁻]' +number = R('^\d+(\.\d+)?$') +chemical_name = R('^(([A-Z][a-z]?\d*|\((?:[^()]*(?:\(.*\))?[^()]*)+\)\d+)+' + chem_sign + '?)$', min_size=5) +# obtained from https://stackoverflow.com/questions/23602175/regex-for-parsing-chemical-formulas +chemical_structure_start = (R('(calcd|calculated)' + separator + '?', flags=re.IGNORECASE) | R('^for' + separator + '?', flags=re.IGNORECASE)) +chemical_structure = (ZeroOrMore(chemical_structure_start + R(not_separator)).hide() + (chemical_name('chemical_structure')) + Optional(R(separator)).hide()) +# compound = (R('^\[') + ZeroOrMore(R('\.+')) + 
R('\]')).add_action(merge)('compound') + +# theoretical = (Optional(W('calcd') + W('for')).hide() + number('mass') + compound)('theoretical') +# experimental = (Optional(W('found')).hide() + number('mass'))('experimental') +exceptions = ((number | R(chem_sign + '$') | R(u'((^found|^\d+)' + separator + '?)$', flags=re.IGNORECASE)) + Optional(R(separator))).hide() + +hrms = (R('^.*H.*R.*M.*S.*$').hide() + ZeroOrMore(chemical_structure | exceptions | R(not_separator).hide()))('hrms') + + +class HRMSParser(BaseParser): + """""" + root = hrms + + def __init__(self): + pass + + def interpret(self, result, start, end): + h = HRMS( + chemical_structure=first(result.xpath('./chemical_structure/text()')) + ) + c = Compound() + c.hrms.append(h) + + yield c diff --git a/chemdataextractor/parse/nmr.py b/chemdataextractor/parse/nmr.py index d1a093a..f585df6 100644 --- a/chemdataextractor/parse/nmr.py +++ b/chemdataextractor/parse/nmr.py @@ -19,11 +19,12 @@ from ..model import Compound, NmrSpectrum, NmrPeak from ..utils import first -from .actions import join, merge, strip_stop, fix_whitespace +from .actions import join, merge, strip_stop, fix_whitespace, join_comma from .base import BaseParser from .common import cc, equals from .cem import chemical_name, nmr_solvent from .elements import W, I, T, R, Optional, ZeroOrMore, SkipTo, OneOrMore, Not, Group +from ..common import REG_EXP log = logging.getLogger(__name__) @@ -80,11 +81,14 @@ def strip_delta(tokens, start, result): shift_error = (Optional(R('^[\-–−‒]$')) + R('^δ?[\+\-–−‒]?\d+(\.+\d+)?,\d+(\.+\d+)?\.?$'))('shift').add_action(merge) shift = (shift_range | shift_value | shift_error).add_action(strip_stop).add_action(strip_delta) -split = R('^(br?)?(s|S|d|D|t|T|q|Q|quint|sept|m|M|dd|ddd|dt|td|tt|br|bs|sb|h|ABq|broad|singlet|doublet|triplet|qua(rtet)?|quintet|septet|multiplet|multiple|peaks)$') +split = R(REG_EXP.MULTIPLICITY) multiplicity = (OneOrMore(split) + Optional(W('of') + split))('multiplicity').add_action(join) 
-coupling_value = (number + ZeroOrMore(R('^[,;&]$') + number + Not(W('H'))))('value').add_action(join)
-coupling = ((R('^\d?J([HCNPFD\d,]*|cis|trans)$') + Optional(R('^[\-–−‒]$') + R('^[HCNPF\d]$')) + Optional('=')).hide() + coupling_value + Optional(W('Hz')('units')) + ZeroOrMore(R('^[,;&]$').hide() + coupling_value + W('Hz')('units')))('coupling')
+coupling_separator = '^(?:[,;&]|and)$'
+coupling_signature = R('^\d?J([HCNPFD\d,]*|cis|trans)$') + Optional(R('^[\-–−‒]$') + R('^[HCNPF\d]$')) + Optional('=')
+coupling_value = (number + ZeroOrMore((Optional(W('Hz')) + R(coupling_separator) + Optional(coupling_signature)).hide() + number + Not(W('H'))))('value').add_action(join_comma)
+coupling = (coupling_signature.hide() + coupling_value + Optional(W('Hz')('units')) + ZeroOrMore(R(
+    coupling_separator).hide() + coupling_value + W('Hz')('units')))('coupling')
 
 number = (R('^\d+(\.\d+)?[HCNPF]\.?$') | (R('^\d+(\.\d+)?$') + R('^[HCNPF]\.?$')))('number').add_action(merge)
 
diff --git a/tests/test_parse_cem.py b/tests/test_parse_cem.py
index 3c268b1..b44f253 100644
--- a/tests/test_parse_cem.py
+++ b/tests/test_parse_cem.py
@@ -170,6 +170,11 @@ def test_to_yield_phrase(self):
         ]
         self.do_parse(s, expected)
 
+    def test_label_start(self):
+        s = '1,3,5-Tricyano-2,4,6-tris(2-dimethylaminovinyl)benzene (Leu-07)'
+        expected = ['1,3,5-Tricyano-2,4,6-tris(2-dimethylaminovinyl)benzene']
+        self.do_parse(s, expected)
+
 
 class TestParseCemHeading(unittest.TestCase):
 
@@ -403,6 +408,11 @@ def test_consecutive_headings2(self):
             Paragraph('The product had a melting point of 70-75° C. 
and has structural formula VII.') ) results = [r.serialize() for r in d.records] + print(results) + print([ + {'names': [u'5-Bromo-6-pentadecyl-2-hydroxybenzoic acid', u'DBAA'], 'roles': ['product']}, + {'melting_points': [{'units': u'\xb0C.', 'value': u'70-75'}], 'labels': [u'VII'], 'roles': [u'formula']} + ]) self.assertEqual(results, [ {'labels': [u'VII'], 'roles': [u'formula']}, {'melting_points': [{'units': u'\xb0C.', 'value': u'70-75'}], diff --git a/tests/test_parse_doi.py b/tests/test_parse_doi.py new file mode 100644 index 0000000..5faec64 --- /dev/null +++ b/tests/test_parse_doi.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- +""" +test_parse_doi +~~~~~~~~~~~~~~ + +Test DOI parser. + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +import logging +import unittest + +from lxml import etree + +from chemdataextractor.doc.text import Sentence +from chemdataextractor.parse.doi import doi + +logging.basicConfig(level=logging.DEBUG) +log = logging.getLogger(__name__) + + +class TestParseDOI(unittest.TestCase): + maxDiff = None + + def do_parse(self, input, expected): + s = Sentence(input) + log.debug(s) + log.debug(s.tagged_tokens) + result = next(doi.scan(s.tagged_tokens))[0] + log.debug(etree.tostring(result, pretty_print=True, encoding='unicode')) + self.assertEqual(expected, etree.tostring(result, encoding='unicode')) + + def test_doi1(self): + tests = [ + 'DOI:10.1021/jo101758t', + 'doi:10.3390/molecules201219848\n hello world', + 'Molecules 2015, 20(12), 22272-22285; doi:10.3390/molecules201219846' + ] + values = [ + '10.1021/jo101758t', + '10.3390/molecules201219848', + '10.3390/molecules201219846' + ] + for test, expected in zip(tests, values): + self.do_parse(test, expected) diff --git a/tests/test_parse_hrms.py b/tests/test_parse_hrms.py new file mode 100644 index 0000000..7a40a26 --- /dev/null +++ b/tests/test_parse_hrms.py @@ -0,0 +1,95 @@ +# 
-*- coding: utf-8 -*-
+"""
+test_parse_hrms
+~~~~~~~~~~~~~~~
+
+Test HRMS parser.
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+import logging
+import unittest
+
+from lxml import etree
+
+from chemdataextractor.doc.text import Sentence
+from chemdataextractor.parse.hrms import hrms
+
+logging.basicConfig(level=logging.DEBUG)
+log = logging.getLogger(__name__)
+
+
+class TestParseHRMS(unittest.TestCase):
+    maxDiff = None
+
+    def do_parse(self, input, expected):
+        s = Sentence(input)
+        log.debug(s)
+        log.debug(s.tagged_tokens)
+        result = next(hrms.scan(s.tagged_tokens))[0]
+        log.debug(etree.tostring(result, pretty_print=True, encoding='unicode'))
+        self.assertEqual(expected, etree.tostring(result, encoding='unicode'))
+
+    def test_hrms1(self):
+        s = 'HRMS (ESI) calcd for C34H28N4OP 539.1995 [M + H]+, found 539.1997.'
+        output = 'C34H28N4OP'
+        self.do_parse(s, output)
+
+    def test_hrms2(self):
+        s = 'HRMS: 184.0767 [M + Na]+.'
+        output = ''
+        self.do_parse(s, output)
+
+    def test_hrms3(self):
+        s = 'HRMS-ESI (m/z): calcd. for C42H52NO9 [M + NH4]+ 714.3637, found 714.3633.'
+        output = 'C42H52NO9'
+        self.do_parse(s, output)
+
+    def test_hrms4(self):
+        s = 'MALDI-HRMS (matrix: HCCA) Calculated for C32H48N4O6: [M + H]+ m/z 585.3607, Found 585.3636.'
+        output = 'C32H48N4O6'
+        self.do_parse(s, output)
+
+    def test_hrms5(self):
+        s = 'HRMS (m/z): 827.6005 [M+Na]+ (calcd. for C48H84O9Na: 827.6013). '
+        output = 'C48H84O9Na'
+        self.do_parse(s, output)
+
+    def test_hrms6(self):
+        s = 'HRMS [M−H]+ m/z calcd. for C24H32N9+ 446.2781, found 446.2775.'
+        output = 'C24H32N9+'
+        self.do_parse(s, output)
+
+    def test_hrms7(self):
+        s = 'DCI-HRMS: m/z 289.0916 [M+H]+; (Calcd for C12H16O8, 288.0845)'
+        output = 'C12H16O8'
+        self.do_parse(s, output)
+
+    def test_hrms8(self):
+        s = 'ES-HRMS: m/z 115.0393 [M−H]−; (Calcd for C5H7O3, 116.0473).'
+ output = 'C5H7O3' + self.do_parse(s, output) + + def test_hrms9(self): + s = 'HRMS (ESI) calcd for C27H24N4P 435.1733 [M + H]+, found 435.1738.' + output = 'C27H24N4P' + self.do_parse(s, output) + + def test_hrms10(self): + s = 'HRMS (ESI): [M − H]−, found 344.8591. C11H5Br2O3− requires 344.8585.' + output = 'C11H5Br2O3−' + self.do_parse(s, output) + + def test_hrms11(self): + s = 'HRMS (ESI): calcd. for C13H11BrO3Na+ [M + Na]+ 316.9789, found 316.9785.' + output = 'C13H11BrO3Na+' + self.do_parse(s, output) + + def test_hrms12(self): + s = 'HR-ESI-MS [M − H]− m/z: 447.0854, Calcd. for C21H21O9P (M − H) 447.0923.' + output = 'C21H21O9P' + self.do_parse(s, output) diff --git a/tests/test_parse_nmr.py b/tests/test_parse_nmr.py index 83d561d..7d7f8cd 100644 --- a/tests/test_parse_nmr.py +++ b/tests/test_parse_nmr.py @@ -73,7 +73,7 @@ def test_nmr6(self): '4.15 (1H, br d, J = 11.2 Hz, H4′′′), 4.05 (1H, t, J = 11.2 Hz, H3b′′′), 3.88 (1H, J = 14.3, 6.8 Hz, H2), ' \ '3.86 (3H, s, OCH38), 3.69 (3H, s, OCH34′), 3.64 (3H, s, COOCH32), 3.49 (3H, br s, H5′′′ and H6′′′), ' \ '3.43-3.47 (1H, overlapped, H3a′′′), 3.45 (3H, s, OCH32′′′).' 
- expected = '1HCDCl3 with 0.05 % v / v TMS400MHz7.102Hd8.9HzH2\u2032H6\u20327.03-7.073HmH3\u2032\u2032H4\u2032\u2032H5\u2032\u20326.83-6.852HmH2\u2032\u2032H6\u2032\u20326.662Hd8.9HzH3\u2032H5\u20326.421Hd1.8HzH56.261Hd1.7HzH75.181HsH1\u2032\u2032\u20325.011Hd6.6HzH14.521HsH2\u2032\u2032\u20324.271Hd14.2HzH34.151Hbr d11.2HzH4\u2032\u2032\u20324.051Ht11.2HzH3b\u2032\u2032\u20323.881H14.3 , 6.8HzH23.863HsOCH383.693HsOCH34\u20323.643HsCOOCH323.493Hbr sH5\u2032\u2032\u2032H6\u2032\u2032\u20323.43-3.471HoverlappedH3a\u2032\u2032\u20323.453HsOCH32\u2032\u2032\u2032' + expected = '1HCDCl3 with 0.05 % v / v TMS400MHz7.102Hd8.9HzH2\u2032H6\u20327.03-7.073HmH3\u2032\u2032H4\u2032\u2032H5\u2032\u20326.83-6.852HmH2\u2032\u2032H6\u2032\u20326.662Hd8.9HzH3\u2032H5\u20326.421Hd1.8HzH56.261Hd1.7HzH75.181HsH1\u2032\u2032\u20325.011Hd6.6HzH14.521HsH2\u2032\u2032\u20324.271Hd14.2HzH34.151Hbr d11.2HzH4\u2032\u2032\u20324.051Ht11.2HzH3b\u2032\u2032\u20323.881H14.3,6.8HzH23.863HsOCH383.693HsOCH34\u20323.643HsCOOCH323.493Hbr sH5\u2032\u2032\u2032H6\u2032\u2032\u20323.43-3.471HoverlappedH3a\u2032\u2032\u20323.453HsOCH32\u2032\u2032\u2032' self.do_parse(s, expected) def test_nmr7(self): @@ -91,7 +91,7 @@ def test_nmr8(self): """d of d multiplicity.""" s = '1H NMR (D2O, 300 MHz): δ 1.61 (s, 6H), 1.68 (s, 3H), 1.72 (s, 3H), 2.17-1.99 (m, 8H), ' \ '4.45 (d of d, 2H, JH,H = 6Hz, JP,H = 6Hz), 5.23-5.15 (m, 2H), 5.46 (t, 1H, J = 6Hz).' - expected = '1HD2O300MHz1.61s6H1.68s3H1.72s3H2.17-1.99m8H4.45d of d2H6Hz6Hz5.23-5.15m2H5.46t1H6Hz' + expected = '1HD2O300MHz1.61s6H1.68s3H1.72s3H2.17-1.99m8H4.45d of d2H6,6Hz5.23-5.15m2H5.46t1H6Hz' self.do_parse(s, expected) def test_nmr9(self): @@ -124,6 +124,23 @@ def test_nmr13(self): expected = '1HCDCl30.16-0.51t12H1.75-2.21m8H5.16s2H7.00-8.21' self.do_parse(s, expected) + def test_nmr14(self): + """""" + s = '1H-NMR (400 MHz,DMSO-d6), δ (ppm):10.63 (br. s), 9.73 (s,1H), 4.39 (t, J = 3.6 Hz,4H), 2.36 (t, J = 3.6,2H), 2.15 (dd, J = 1.2 and J = 3.6 Hz, 1H).' 
+ expected = '1H400MHzDMSO-d610.63br. s9.73s1H4.39t3.6Hz4H2.36t3.62H2.15dd1.2,3.6Hz1H' + self.do_parse(s, expected) + + def test_nmr15(self): + """""" + s = '1H-NMR (600 MHz, MeOD-d4) δ: 7.57 (d, J = 9.4 Hz, 1H), 7.54 (d, J = 9.4 Hz, 1H), 7.38 (d, J = 8.2 Hz, 1H), 7.24 (s, 1H), 7.18 (d, J = 1.0 Hz, 1H), 7.13 (d, J = 7.8 Hz, 1H), 7.08 (d, J = 8.2 Hz, 1H), 6.81 (d, J = 8.1 Hz, 1H), 6.68 (d, J = 15.8 Hz, 1H), 6.61 (d, J = 15.8 Hz, 1H), 3.89 (d, J = 7.4 Hz, 6H);' + expected = '1H600MHzMeOD-d47.57d9.4Hz1H7.54d9.4Hz1H7.38d8.2Hz1H7.24s1H7.18d1.0Hz1H7.13d7.8Hz1H7.08d8.2Hz1H6.81d8.1Hz1H6.68d15.8Hz1H6.61d15.8Hz1H3.89d7.4Hz6H' + self.do_parse(s, expected) + + def test_nmr16(self): + """""" + s = '31P-NMR (243 MHz, MeOD-d4) δ:−4.31 (1P)' + expected = '31P243MHzMeOD-d4\u22124.311P' + self.do_parse(s, expected) if __name__ == '__main__': unittest.main()