ianepreston / stats_can

Get Statistics Canada data into python (mostly pandas)
GNU General Public License v3.0
62 stars 24 forks source link

calling table_to_df from MS Fabric needing pytables workaround. #579

Open moeeljawadoph opened 7 hours ago

moeeljawadoph commented 7 hours ago

Hello, I am trying to call the function table_to_df:

import json
import pandas as pd
import numpy as np
from stats_can import StatsCan
sc = StatsCan()

def get_statcan_dict(url_cube):
    """Issue a GET request for ``url_cube`` and return the body parsed as JSON.

    Parameters
    ----------
    url_cube : str
        Address to request.

    Returns
    -------
    The object produced by ``json.loads`` on the response text.

    Notes
    -----
    Relies on the ``requests`` name already being available in the scope
    where this function runs.
    """
    session = requests.Session()
    # The session is only needed for the single request; close it right after.
    with session:
        response = session.get(url_cube)
    return json.loads(response.text)
#
def get_statcan_metadata(js_dict,product_id_path):
    """Turn an iterable of metadata records into one DataFrame with a ``url`` column.

    Parameters
    ----------
    js_dict : iterable of dict
        Downloaded metadata records; each one must contain a ``productId`` key.
    product_id_path : str
        Base path prepended to ``<productId>-eng.zip`` to form the download URL.

    Returns
    -------
    pandas.DataFrame
        One row per record (each row keeps index label 0), with the record
        keys as columns plus the added ``url`` column.
    """
    def build_url(product_id):
        # Download location of the zipped table for one product id.
        return f'{product_id_path}/{product_id}-eng.zip'

    # Each record becomes a single-row frame: keys as the index, then transposed.
    single_row_frames = [
        pd.DataFrame.from_dict(record, orient='index').T for record in js_dict
    ]

    # Stack the single-row frames into the final metadata table.
    metadata = pd.concat(single_row_frames)

    metadata['url'] = metadata['productId'].apply(build_url)
    return metadata

# URL handed to get_statcan_dict to fetch the JSON payload.
url_cube = "https://www150.statcan.gc.ca/t1/wds/rest/getAllCubesList"
# Base path that get_statcan_metadata joins with each productId to build a zip URL.
product_id_path = "https://www150.statcan.gc.ca/n1/tbl/csv"

# Download and parse the payload, then build the metadata DataFrame (adds a 'url' column).
stcan_dict = get_statcan_dict(url_cube)
df = get_statcan_metadata(stcan_dict,product_id_path)
# Load table 10100001 through the StatsCan client created above; this is the
# call that raises the ImportError shown in the traceback that follows.
data=sc.table_to_df('10100001') 

But I am getting the following error: ImportError Traceback (most recent call last) File ~/cluster-env/clonedenv/lib/python3.10/site-packages/pandas/compat/_optional.py:142, in import_optional_dependency(name, extra, errors, min_version) 141 try: --> 142 module = importlib.import_module(name) 143 except ImportError:

File ~/cluster-env/clonedenv/lib/python3.10/importlib/__init__.py:126, in import_module(name, package) 125 level += 1 --> 126 return _bootstrap._gcd_import(name[level:], package, level)

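File <frozen importlib._bootstrap>:1050, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1027, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1006, in _find_and_load_unlocked(name, import_)

File <frozen importlib._bootstrap>:688, in _load_unlocked(spec)

File <frozen importlib._bootstrap_external>:883, in exec_module(self, module)

File <frozen importlib._bootstrap>:241, in _call_with_frames_removed(f, *args, **kwds)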

File ~/cluster-env/clonedenv/lib/python3.10/site-packages/tables/__init__.py:51 50 # Necessary imports to get versions stored on the cython extension ---> 51 from .utilsextension import get_hdf5_version as _get_hdf5_version 53 from ._version import __version__

File ~/cluster-env/clonedenv/lib/python3.10/site-packages/tables/utilsextension.pyx:25, in init tables.utilsextension()

File ~/cluster-env/clonedenv/lib/python3.10/site-packages/tables/description.py:9 7 import numpy as np ----> 9 from . import atom 10 from .path import check_name_validity

File ~/cluster-env/clonedenv/lib/python3.10/site-packages/tables/atom.py:5 4 import inspect ----> 5 from typing_extensions import Any, Callable, dataclass_transform, Dict, NoReturn, Union 6 import warnings

ImportError: cannot import name 'Callable' from 'typing_extensions' (/home/trusted-service-user/cluster-env/clonedenv/lib/python3.10/site-packages/typing_extensions.py)

During handling of the above exception, another exception occurred:

ImportError Traceback (most recent call last) Cell In[19], line 1 ----> 1 data=sc.table_to_df('10100001')

File ~/cluster-env/clonedenv/lib/python3.10/site-packages/stats_can/api_class.py:66, in StatsCan.table_to_df(self, table) 48 def table_to_df(self, table): 49 """Read a table to a dataframe. 50 51 Parameters (...) 64 call StatsCan.update_tables(), optionally passing just the table number of interest 65 """ ---> 66 return sc.table_to_df(table=table, path=self.data_folder, h5file="stats_can.h5")

File ~/cluster-env/clonedenv/lib/python3.10/site-packages/stats_can/sc.py:613, in table_to_df(table, path, h5file) 608 warn( 609 "This function will be deprecated in the v3 release. Please see the docs for details.", 610 FutureWarning, 611 ) 612 if h5file: --> 613 df = table_from_h5(table=table, h5file=h5file, path=path) 614 else: 615 df = zip_table_to_dataframe(table=table, path=path)

File ~/cluster-env/clonedenv/lib/python3.10/site-packages/stats_can/sc.py:334, in table_from_h5(table, h5file, path) 332 h5 = path / h5file 333 try: --> 334 with pd.HDFStore(h5, "r") as store: 335 df = pd.read_hdf(store, key=table) 336 except (KeyError, OSError):

File ~/cluster-env/clonedenv/lib/python3.10/site-packages/pandas/io/pytables.py:560, in HDFStore.__init__(self, path, mode, complevel, complib, fletcher32, **kwargs) 557 if "format" in kwargs: 558 raise ValueError("format is not a defined argument for HDFStore") --> 560 tables = import_optional_dependency("tables") 562 if complib is not None and complib not in tables.filters.all_complibs: 563 raise ValueError( 564 f"complib only supports {tables.filters.all_complibs} compression." 565 )

File ~/cluster-env/clonedenv/lib/python3.10/site-packages/pandas/compat/_optional.py:145, in import_optional_dependency(name, extra, errors, min_version) 143 except ImportError: 144 if errors == "raise": --> 145 raise ImportError(msg) 146 return None 148 # Handle submodules: if we have submodule, grab parent module from sys.modules

ImportError: Missing optional dependency 'pytables'. Use pip or conda to install pytables. PyTables has some requirements that I am unable to fulfil in MS Fabric. Are there any workarounds or solutions?

ianepreston commented 2 hours ago

I'm working on a v3 of this library that removes a lot of those dependencies for exactly this reason. It hasn't been published yet but you could pip install referencing the repo and pointing at the v3 branch. Let me know if that works.