CambridgeMolecularEngineering / chemdataextractor2

ChemDataExtractor Version 2.0
Other
121 stars 28 forks source link

Question about create a new unit #59

Closed loilisxka closed 2 months ago

loilisxka commented 3 months ago

I'm trying to add new units of pressure. My code looks like this:

from chemdataextractor.model import QuantityModel, StringType
from chemdataextractor.model.units import Unit
from chemdataextractor.model.units import Dimension
from chemdataextractor.parse import W, I, R, Optional, Any, OneOrMore, Not, ZeroOrMore
from chemdataextractor.parse import merge, join

class Stress(Dimension):
    """
    Dimension subclass for stress.
    """
    pass

class StressModel(QuantityModel):
    """
    Model for stress.
    """
    dimensions = Stress()

class StressUnit(Unit):
    """
    Base class for units with dimensions of stress.
    The standard value for temperature is defined to be a Pa
    """

    def __init__(self, magnitude=0.0, powers=None):
        super(StressUnit, self).__init__(Stress(), magnitude, powers)

class Pa(StressUnit):
    '''
    class of Pa
    '''
    def convert_value_to_standard(self, value):
        return value

    def convert_value_from_standard(self, value):
        return value

    def convert_error_to_standard(self, error):
        return error

    def convert_error_from_standard(self, error):
        return error

class MPa(StressUnit):
    '''
    class of MPa
    '''
    def convert_value_to_standard(self, value):
        return value * 1000000

    def convert_value_from_standard(self, value):
        return value / 1000000.

    def convert_error_to_standard(self, error):
        return error

    def convert_error_from_standard(self, error):
        return error

class kPa(StressUnit):
    '''
    class of KPa
    '''
    def convert_value_to_standard(self, value):
        return value * 1000

    def convert_value_from_standard(self, value):
        return value / 1000.

    def convert_error_to_standard(self, error):
        return error

    def convert_error_from_standard(self, error):
        return error

class hPa(StressUnit):
    '''
    class of hPa
    '''
    def convert_value_to_standard(self, value):
        return value * 100

    def convert_value_from_standard(self, value):
        return value / 100.

    def convert_error_to_standard(self, error):
        return error

    def convert_error_from_standard(self, error):
        return error

class Bar(StressUnit):
    '''
    class of Bar
    '''
    def convert_value_to_standard(self, value):
        return value * 100000

    def convert_value_from_standard(self, value):
        return value / 100000.

    def convert_error_to_standard(self, error):
        return error

    def convert_error_from_standard(self, error):
        return error

class mBar(StressUnit):
    '''
    class of Bar
    '''
    def convert_value_to_standard(self, value):
        return value * 100

    def convert_value_from_standard(self, value):
        return value / 100.

    def convert_error_to_standard(self, error):
        return error

    def convert_error_from_standard(self, error):
        return error

class Atm(StressUnit):
    '''
    class of Bar
    '''
    def convert_value_to_standard(self, value):
        return value * 101325

    def convert_value_from_standard(self, value):
        return value / 101325.

    def convert_error_to_standard(self, error):
        return error

    def convert_error_from_standard(self, error):
        return error

class mmHg(StressUnit):
    '''
    class of mmHg
    '''
    def convert_value_to_standard(self, value):
        return value * 133.322

    def convert_value_from_standard(self, value):
        return value / 133.322

    def convert_error_to_standard(self, error):
        return error

    def convert_error_from_standard(self, error):
        return error

units_dict = {R('(Pa|(P|p)ascal(s)?)\.?', group=0): Pa,
              R('(MPa|(M|m)egapascal(s)?)\.?', group=0): MPa,
              R('(kPa|(K|k)ilopascal(s)?)\.?', group=0): kPa,
              R('(hPa|(H|h)ectopascal(s)?)\.?', group=0): hPa,
              R('((B|b)ar)\.?', group=0): Bar,
              R('(mBar|(M|m)illibar(s)?)\.?', group=0): mBar,
              R('((A|a)tm|(A|a)tmosphere(s)?|atm(ospheric)? pressure(s)?|)\.?', group=0): Atm,
              R('(mmHg)\.?', group=0): mmHg}

Stress.units_dict = units_dict
Stress.standard_units = Pa()

But when I run the following test case, the code does not have any output.

from stress import StressModel
from chemdataextractor import Document
from chemdataextractor.model import BaseModel, Compound, StringType, ModelType

class stress(StressModel):
    specifier = StringType(parse_expression=I('pressure'))
    compound = ModelType(Compound, required=False, contextual=False)

doc = Document('The experiment was conducted at a controlled pressure of 1 atm.')

doc.models = [stress]
print(doc.records.serialize())

What is the question? I look forward to your reply.

Dingyun-Huang commented 2 months ago

Hi there,

I think the reg-ex that you have written are not doing what you expect. You can do basic checks as the following. It is not only picking up 'atm', but also empty strings. So the thing to do first here is to fix the regular expressions in unit_dict.

Python 3.7.16 (default, Jan 17 2023, 16:06:28) [MSC v.1916 64 bit (AMD64)] :: Anaconda, Inc. on win32
Type "help", "copyright", "credits" or "license" for more information.
>>> import re
>>> rule = re.compile('((A|a)tm|(A|a)tmosphere(s)?|atm(ospheric)? pressure(s)?|)\.?')
>>> rule.findall('The experiment was conducted at a controlled pressure of 1 atm.')
[('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('', '', '', '', '', ''), ('atm', 'a', '', '', '', ''), ('', '', '', '', '', '')]
>>> rule.search('The experiment was conducted at a controlled pressure of 1 atm.')
<re.Match object; span=(0, 0), match=''>
loilisxka commented 2 months ago

Thanks a lot, this is useful for me.