I am trying to create a custom parser to extract the boiling points from the following texts, so that the text between "boiling point" and "of" is optional.
Paragraph(u'The boiling point limit of 2,4,6-trinitrotoluene is 240 °C') # <- text 1
Paragraph(u'The boiling point of 2,4,6-trinitrotoluene is 240 °C') # <- text 2
But this fails for text2 when there is no text between "boiling point" and "of".
I am not sure whether this is related to the way the code is written.
Full code is given below.
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType
class BoilingPoint(BaseModel):
prefix = StringType()
value = StringType()
units = StringType()
Compound.boiling_points = ListType(ModelType(BoilingPoint))
import re
from chemdataextractor.parse import R, I, W, Optional, merge
prefix = (I('boiling') + I('point') + Optional(R('^\S*$')).hide() + I('of') +\
R(r'\S+') +(I('is')|I('was')).hide() )(u'prefix').add_action(join)
units = (W(u'°') + Optional(R(u'^[CFK]\.?$')))(u'units').add_action(merge)
value = R(u'^\d+(\.\d+)?$')(u'value')
bp = (prefix + value + units)(u'bp')
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first
class BpParser(BaseParser):
root = bp
def interpret(self, result, start, end):
compound = Compound(
boiling_points=[
BoilingPoint(
prefix=first(result.xpath('./prefix/text()')),
value=first(result.xpath('./value/text()')),
units=first(result.xpath('./units/text()'))
)
]
)
yield compound
Paragraph.parsers = [BpParser()]
d = Document(
Heading(u'Synthesis of (3a)'),
# Paragraph(u'The boiling point limit of 2,4,6-trinitrotoluene is 240 °C') # <- text 1
Paragraph(u'The boiling point of 2,4,6-trinitrotoluene is 240 °C') # <- text 2
)
d.records.serialize()
I am trying to create a custom parser to extract the boiling points from the following texts, so that the text between "boiling point" and "of" is optional.
I try to use the following prefix,
prefix = (I('boiling') + I('point') + Optional(R('^\S*$')).hide() + I('of') +\ R(r'\S+') +(I('is')|I('was')).hide() )(u'prefix').add_action(join)
But this fails for text2 when there is no text between "boiling point" and "of".
I am not sure whether this is related to the way the code is written.
Full code is given below.