frictionlessdata / frictionless-py

Data management framework for Python that provides functionality to describe, extract, validate, and transform tabular data
https://framework.frictionlessdata.io
MIT License

Improved SPSS plugin implementation #410

Closed: roll closed this issue 3 years ago

roll commented 6 years ago

Overview

With tableschema-spss we run into a few things that prevent using it as a first-class citizen in tableschema/datapackage integrations.

So for now we can't use it in a standard integration scenario with tableschema/datapackage like:

from tableschema import Table

# `engine` is a SQLAlchemy engine; `project`/`dataset` are BigQuery identifiers
table = Table('data.csv', schema='schema.json')
table.save('data', storage='sql', engine=engine)
table.save('data', storage='bigquery', project=project, dataset=dataset)
table.save('data', storage='pandas')
# Will fail if the schema doesn't have spss:format properties
# table.save('data', storage='spss', base_path='dir/path')

Not sure there's anything we can do about it right now because it works great for other things (the SPSS-based pilot etc.), so I'm just leaving this here for the future.
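
For reference, here is a minimal sketch of what it would take to make the commented-out call work today, assuming the schema supplies an spss:format property for every field (the field names and format strings below are illustrative, not taken from the plugin's docs):

from tableschema import Table

# Hypothetical descriptor: each field declares the SPSS display format the
# plugin expects via `spss:format` (F8 = integer, A10 = 10-char string,
# F8.2 = number with two decimals)
schema = {
    'fields': [
        {'name': 'id', 'type': 'integer', 'spss:format': 'F8'},
        {'name': 'name', 'type': 'string', 'spss:format': 'A10'},
        {'name': 'rating', 'type': 'number', 'spss:format': 'F8.2'},
    ],
}

table = Table('data.csv', schema=schema)
table.save('data', storage='spss', base_path='dir/path')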


Our storage plugins for SQL/BigQuery/Pandas are able to pass this test suite (with some differences in the reflected schema; this is just an example). Below it is adapted to the SPSS storage and uses a simple `cast` function to cast resource data (a sketch of that helper is included before the tests):

import pytest
import tableschema

# Assuming the plugin's import path (as published in tableschema-spss)
from tableschema_spss import Storage

# Resources

ARTICLES = {
    'schema': {
        'fields': [
            {'name': 'id', 'type': 'integer', 'constraints': {'required': True}},
            {'name': 'parent', 'type': 'integer'},
            {'name': 'name', 'type': 'string'},
            {'name': 'current', 'type': 'boolean'},
            {'name': 'rating', 'type': 'number'},
        ],
        # 'primaryKey': 'id',
        # 'foreignKeys': [
            # {'fields': 'parent', 'reference': {'resource': '', 'fields': 'id'}},
        # ],
    },
    'data': [
        ['1', '', 'Taxes', 'True', '9.5'],
        ['2', '1', '中国人', 'False', '7'],
    ],
}
COMMENTS = {
    'schema': {
        'fields': [
            {'name': 'entry_id', 'type': 'integer', 'constraints': {'required': True}},
            {'name': 'comment', 'type': 'string'},
            {'name': 'note', 'type': 'any'},
        ],
        # 'primaryKey': 'entry_id',
        # 'foreignKeys': [
            # {'fields': 'entry_id', 'reference': {'resource': 'articles', 'fields': 'id'}},
        # ],
    },
    'data': [
        ['1', 'good', 'note1'],
        ['2', 'bad', 'note2'],
    ],
}
TEMPORAL = {
    'schema': {
        'fields': [
            {'name': 'date', 'type': 'date'},
            {'name': 'date_year', 'type': 'date', 'format': '%Y'},
            {'name': 'datetime', 'type': 'datetime'},
            {'name': 'duration', 'type': 'duration'},
            {'name': 'time', 'type': 'time'},
            {'name': 'year', 'type': 'year'},
            {'name': 'yearmonth', 'type': 'yearmonth'},
        ],
    },
    'data': [
        ['2015-01-01', '2015', '2015-01-01T03:00:00Z', 'P1Y1M', '03:00:00', '2015', '2015-01'],
        ['2015-12-31', '2015', '2015-12-31T15:45:33Z', 'P2Y2M', '15:45:33', '2015', '2015-01'],
    ],
}
LOCATION = {
    'schema': {
        'fields': [
            {'name': 'location', 'type': 'geojson'},
            {'name': 'geopoint', 'type': 'geopoint'},
        ],
    },
    'data': [
        ['{"type": "Point","coordinates":[33.33,33.33]}', '30,75'],
        ['{"type": "Point","coordinates":[50.00,50.00]}', '90,45'],
    ],
}
COMPOUND = {
    'schema': {
        'fields': [
            {'name': 'stats', 'type': 'object'},
            {'name': 'persons', 'type': 'array'},
        ],
    },
    'data': [
        ['{"chars":560}', '["Mike", "John"]'],
        ['{"chars":970}', '["Paul", "Alex"]'],
    ],
}
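
# Helpers

# A minimal sketch of the `cast` helper used in the asserts below (its
# implementation isn't shown in this issue): deep-copy a resource and cast
# every raw string cell with tableschema, skipping any field whose type is
# listed in `skip`
from copy import deepcopy

def cast(resource, skip=()):
    resource = deepcopy(resource)
    schema = tableschema.Schema(resource['schema'])
    for row in resource['data']:
        for index, field in enumerate(schema.fields):
            if field.type not in skip:
                row[index] = field.cast_value(row[index])
    return resource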

# Tests

def test_storage(tmpdir):

    # Create storage
    storage = Storage(base_path=str(tmpdir))

    # Delete buckets
    storage.delete()

    # Create buckets
    storage.create(['articles', 'comments'], [ARTICLES['schema'], COMMENTS['schema']])
    storage.create('comments', COMMENTS['schema'], force=True)
    storage.create('temporal', TEMPORAL['schema'])
    storage.create('location', LOCATION['schema'])
    storage.create('compound', COMPOUND['schema'])

    # Write data
    storage.write('articles', ARTICLES['data'])
    storage.write('comments', COMMENTS['data'])
    storage.write('temporal', TEMPORAL['data'])
    storage.write('location', LOCATION['data'])
    storage.write('compound', COMPOUND['data'])

    # Create a new storage instance so the following checks rely on reflection only
    storage = Storage(base_path=str(tmpdir))

    # Create an already existing bucket (should fail)
    with pytest.raises(tableschema.exceptions.StorageError):
        storage.create('articles.sav', ARTICLES['schema'])

    # Assert buckets
    assert storage.buckets == ['articles', 'compound', 'location', 'temporal', 'comments']

    # Assert schemas
    assert storage.describe('articles') == ARTICLES['schema']
    assert storage.describe('comments') == {
        'fields': [
            {'name': 'entry_id', 'type': 'integer', 'constraints': {'required': True}},
            {'name': 'comment', 'type': 'string'},
            {'name': 'note', 'type': 'string'}, # type downgrade
        ],
    }
    assert storage.describe('temporal') == {
        'fields': [
            {'name': 'date', 'type': 'date'},
            {'name': 'date_year', 'type': 'date'}, # format removal
            {'name': 'datetime', 'type': 'datetime'},
            {'name': 'duration', 'type': 'string'}, # type fallback
            {'name': 'time', 'type': 'time'},
            {'name': 'year', 'type': 'integer'}, # type downgrade
            {'name': 'yearmonth', 'type': 'string'}, # type fallback
        ],
    }
    assert storage.describe('location') == {
        'fields': [
            {'name': 'location', 'type': 'object'}, # type downgrade
            {'name': 'geopoint', 'type': 'string'}, # type fallback
        ],
    }
    assert storage.describe('compound') == {
        'fields': [
            {'name': 'stats', 'type': 'object'},
            {'name': 'persons', 'type': 'string'}, # type fallback
        ],
    }

    # Assert data
    assert storage.read('articles') == cast(ARTICLES)['data']
    assert storage.read('comments') == cast(COMMENTS)['data']
    assert storage.read('temporal') == cast(TEMPORAL, skip=['duration', 'yearmonth'])['data']
    assert storage.read('location') == cast(LOCATION, skip=['geopoint'])['data']
    assert storage.read('compound') == cast(COMPOUND, skip=['array'])['data']

    # Assert data with forced schema
    storage.describe('compound.sav', COMPOUND['schema'])
    assert storage.read('compound.sav') == cast(COMPOUND)['data']

    # Delete a non-existent bucket (should fail)
    with pytest.raises(tableschema.exceptions.StorageError):
        storage.delete('non_existent')

    # Delete buckets
    storage.delete()
roll commented 3 years ago

MERGED into https://github.com/frictionlessdata/frictionless-py/issues/419