frictionlessdata / frictionless-py

Data management framework for Python that provides functionality to describe, extract, validate, and transform tabular data
https://framework.frictionlessdata.io
MIT License

Improved SPSS plugin implementation #410

Closed: roll closed this issue 3 years ago

roll commented 6 years ago

Overview

With tableschema-spss we run into a few things that prevent using it as a first-class citizen in tableschema/datapackage integrations.

So for now we can't use it in a standard integration scenario with tableschema/datapackage like:

from tableschema import Table

# `engine` is a SQLAlchemy engine; `project`/`dataset` are BigQuery identifiers
table = Table('data.csv', schema='schema.json')
table.save('data', storage='sql', engine=engine)
table.save('data', storage='bigquery', project=project, dataset=dataset)
table.save('data', storage='pandas')
# Will fail if the schema doesn't have spss:format properties
# table.save('data', storage='spss', base_path='dir/path')

Not sure there's anything we can do about it right now because it works great for other things (the SPSS-based pilot etc.), so I'm just leaving this here for the future.
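
For reference, here is a minimal sketch of what it would take to make the commented-out call work today, assuming the schema supplies an spss:format property for every field (the field names and format strings below are illustrative, not taken from the plugin's docs):

from tableschema import Table

# Hypothetical descriptor: each field declares the SPSS display format the
# plugin expects via `spss:format` (F8 = integer, A10 = 10-char string,
# F8.2 = number with two decimals)
schema = {
    'fields': [
        {'name': 'id', 'type': 'integer', 'spss:format': 'F8'},
        {'name': 'name', 'type': 'string', 'spss:format': 'A10'},
        {'name': 'rating', 'type': 'number', 'spss:format': 'F8.2'},
    ],
}

table = Table('data.csv', schema=schema)
table.save('data', storage='spss', base_path='dir/path')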


Our storage plugins for SQL/BigQuery/Pandas are able to pass this test suite (with some differences in the reflected schema; this is just an example). Below it is adapted to the SPSS storage and uses a simple `cast` function to cast resource data (a sketch of that helper is included before the tests):

import pytest
import tableschema

# Assuming the plugin's import path (as published in tableschema-spss)
from tableschema_spss import Storage

# Resources

ARTICLES = {
    'schema': {
        'fields': [
            {'name': 'id', 'type': 'integer', 'constraints': {'required': True}},
            {'name': 'parent', 'type': 'integer'},
            {'name': 'name', 'type': 'string'},
            {'name': 'current', 'type': 'boolean'},
            {'name': 'rating', 'type': 'number'},
        ],
        # 'primaryKey': 'id',
        # 'foreignKeys': [
            # {'fields': 'parent', 'reference': {'resource': '', 'fields': 'id'}},
        # ],
    },
    'data': [
        ['1', '', 'Taxes', 'True', '9.5'],
        ['2', '1', '中国人', 'False', '7'],
    ],
}
COMMENTS = {
    'schema': {
        'fields': [
            {'name': 'entry_id', 'type': 'integer', 'constraints': {'required': True}},
            {'name': 'comment', 'type': 'string'},
            {'name': 'note', 'type': 'any'},
        ],
        # 'primaryKey': 'entry_id',
        # 'foreignKeys': [
            # {'fields': 'entry_id', 'reference': {'resource': 'articles', 'fields': 'id'}},
        # ],
    },
    'data': [
        ['1', 'good', 'note1'],
        ['2', 'bad', 'note2'],
    ],
}
TEMPORAL = {
    'schema': {
        'fields': [
            {'name': 'date', 'type': 'date'},
            {'name': 'date_year', 'type': 'date', 'format': '%Y'},
            {'name': 'datetime', 'type': 'datetime'},
            {'name': 'duration', 'type': 'duration'},
            {'name': 'time', 'type': 'time'},
            {'name': 'year', 'type': 'year'},
            {'name': 'yearmonth', 'type': 'yearmonth'},
        ],
    },
    'data': [
        ['2015-01-01', '2015', '2015-01-01T03:00:00Z', 'P1Y1M', '03:00:00', '2015', '2015-01'],
        ['2015-12-31', '2015', '2015-12-31T15:45:33Z', 'P2Y2M', '15:45:33', '2015', '2015-01'],
    ],
}
LOCATION = {
    'schema': {
        'fields': [
            {'name': 'location', 'type': 'geojson'},
            {'name': 'geopoint', 'type': 'geopoint'},
        ],
    },
    'data': [
        ['{"type": "Point","coordinates":[33.33,33.33]}', '30,75'],
        ['{"type": "Point","coordinates":[50.00,50.00]}', '90,45'],
    ],
}
COMPOUND = {
    'schema': {
        'fields': [
            {'name': 'stats', 'type': 'object'},
            {'name': 'persons', 'type': 'array'},
        ],
    },
    'data': [
        ['{"chars":560}', '["Mike", "John"]'],
        ['{"chars":970}', '["Paul", "Alex"]'],
    ],
}
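
# Helpers

# A minimal sketch of the `cast` helper used in the asserts below (its
# implementation isn't shown in this issue): deep-copy a resource and cast
# every raw string cell with tableschema, skipping any field whose type is
# listed in `skip`
from copy import deepcopy

def cast(resource, skip=()):
    resource = deepcopy(resource)
    schema = tableschema.Schema(resource['schema'])
    for row in resource['data']:
        for index, field in enumerate(schema.fields):
            if field.type not in skip:
                row[index] = field.cast_value(row[index])
    return resource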

# Tests

def test_storage(tmpdir):

    # Create storage
    storage = Storage(base_path=str(tmpdir))

    # Delete buckets
    storage.delete()

    # Create buckets
    storage.create(['articles', 'comments'], [ARTICLES['schema'], COMMENTS['schema']])
    storage.create('comments', COMMENTS['schema'], force=True)
    storage.create('temporal', TEMPORAL['schema'])
    storage.create('location', LOCATION['schema'])
    storage.create('compound', COMPOUND['schema'])

    # Write data
    storage.write('articles', ARTICLES['data'])
    storage.write('comments', COMMENTS['data'])
    storage.write('temporal', TEMPORAL['data'])
    storage.write('location', LOCATION['data'])
    storage.write('compound', COMPOUND['data'])

    # Create a new storage instance so the following checks rely on reflection only
    storage = Storage(base_path=str(tmpdir))

    # Create an already existing bucket (should fail)
    with pytest.raises(tableschema.exceptions.StorageError):
        storage.create('articles.sav', ARTICLES['schema'])

    # Assert buckets
    assert storage.buckets == ['articles', 'compound', 'location', 'temporal', 'comments']

    # Assert schemas
    assert storage.describe('articles') == ARTICLES['schema']
    assert storage.describe('comments') == {
        'fields': [
            {'name': 'entry_id', 'type': 'integer', 'constraints': {'required': True}},
            {'name': 'comment', 'type': 'string'},
            {'name': 'note', 'type': 'string'}, # type downgrade
        ],
    }
    assert storage.describe('temporal') == {
        'fields': [
            {'name': 'date', 'type': 'date'},
            {'name': 'date_year', 'type': 'date'}, # format removal
            {'name': 'datetime', 'type': 'datetime'},
            {'name': 'duration', 'type': 'string'}, # type fallback
            {'name': 'time', 'type': 'time'},
            {'name': 'year', 'type': 'integer'}, # type downgrade
            {'name': 'yearmonth', 'type': 'string'}, # type fallback
        ],
    }
    assert storage.describe('location') == {
        'fields': [
            {'name': 'location', 'type': 'object'}, # type downgrade
            {'name': 'geopoint', 'type': 'string'}, # type fallback
        ],
    }
    assert storage.describe('compound') == {
        'fields': [
            {'name': 'stats', 'type': 'object'},
            {'name': 'persons', 'type': 'string'}, # type fallback
        ],
    }

    # Assert data
    assert storage.read('articles') == cast(ARTICLES)['data']
    assert storage.read('comments') == cast(COMMENTS)['data']
    assert storage.read('temporal') == cast(TEMPORAL, skip=['duration', 'yearmonth'])['data']
    assert storage.read('location') == cast(LOCATION, skip=['geopoint'])['data']
    assert storage.read('compound') == cast(COMPOUND, skip=['array'])['data']

    # Assert data with forced schema
    storage.describe('compound.sav', COMPOUND['schema'])
    assert storage.read('compound.sav') == cast(COMPOUND)['data']

    # Delete a non-existent bucket (should fail)
    with pytest.raises(tableschema.exceptions.StorageError):
        storage.delete('non_existent')

    # Delete buckets
    storage.delete()
roll commented 3 years ago

MERGED into https://github.com/frictionlessdata/frictionless-py/issues/419