microsoft / semantic-link-labs

Early access to new features for Microsoft Fabric's Semantic Link.

Export Vertipaq analyzer result to Lakehouse table #42

Closed muhssamy closed 1 month ago

muhssamy commented 2 months ago

I am trying to export the Vertipaq Analyzer report to a Lakehouse table using the following command:

labs.vertipaq_analyzer(dataset=dataset_name, workspace=Workspace_name, export='table')

This code works, and I am able to see the result and export it to a zip file if needed, but when I request the export to a table it fails with the following error:

ValueError: There's no item with the ID '1234' in workspace 'ws'

Note: I changed the ID and the workspace name in the error message shown here.

The same happens with BPA if I request export to a Lakehouse table.

m-kovalsky commented 2 months ago

I see. This is because the lakehouse attached to the notebook is in a different workspace than the one in which the semantic model resides. I'll make a fix for this, since the saved delta table is always written to the default lakehouse attached to the notebook.
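
A minimal sketch of the behavior described above, assuming the internal helper save_as_delta_table (visible in the tracebacks later in this thread) keeps the signature shown there; the dataframe and table name are illustrative only:

import pandas as pd
from sempy_labs._helper_functions import save_as_delta_table

# Illustrative stand-in for one of the Vertipaq Analyzer result dataframes.
df = pd.DataFrame({"Table_Name": ["Sales"], "Row_Count": [1000]})

# With no explicit lakehouse/workspace, the delta table is written to the
# default lakehouse attached to the notebook, even when the semantic model
# lives in a different workspace.
save_as_delta_table(
    dataframe=df,
    delta_table_name="vertipaqanalyzer_tables",  # hypothetical table name
    write_mode="append",
    merge_schema=True,
)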

muhssamy commented 2 months ago

Thanks for your help. I am also facing this with BPA. Will this fix cover both?

m-kovalsky commented 2 months ago

I will make the same fix for the run_model_bpa function.
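
For reference, the matching BPA call, assuming run_model_bpa exposes an export parameter that writes its results to a lakehouse delta table (the parameter shape here is an assumption, not confirmed in this thread):

import sempy_labs as labs

# Assumption: export=True writes the BPA results to a delta table in the
# default lakehouse attached to the notebook, mirroring
# vertipaq_analyzer(export='table').
labs.run_model_bpa(
    dataset=dataset_name,
    workspace=Workspace_name,
    export=True,
)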



m-kovalsky commented 1 month ago

This is resolved in 0.7.0.
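
For anyone following along, picking up the fix should just be a package upgrade in the notebook (assuming the package is installed from PyPI under its usual name), followed by a session restart:

# Upgrade to the release containing the fix, then restart the Python kernel.
%pip install semantic-link-labs==0.7.0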

muhssamy commented 1 month ago

I am getting this issue today (screenshot): https://github.com/user-attachments/assets/a7b21b41-9933-4524-b98d-b10fc9d8a5f2

m-kovalsky commented 1 month ago

Please share the full error and code



muhssamy commented 1 month ago
import sempy_labs as labs
from sempy_labs import lakehouse as lake
from sempy_labs import directlake
import sempy.fabric as fabric
import pandas as pd
from sempy_labs.tom import connect_semantic_model

Workspace_name = 'ws_retail_sm'  # dev
dataset_name = 'Retail Semantic Model'  # retail

labs.vertipaq_analyzer(dataset=dataset_name, workspace=Workspace_name, export='table')

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[25], line 1
----> 1 labs.vertipaq_analyzer(dataset = dataset_name , workspace = Workspace_name , export= 'table')

File ~/cluster-env/clonedenv/lib/python3.10/site-packages/sempy/_utils/_log.py:273, in mds_log.<locals>.get_wrapper.<locals>.log_decorator_wrapper(*args, **kwargs)
    270     raise
    272 try:
--> 273     result = func(*args, **kwargs)
    275     # The invocation for get_message_dict moves after the function
    276     # so it can access the state after the method call
    277     message.update(extractor.get_completion_message_dict(result, arg_dict))

File ~/cluster-env/clonedenv/lib/python3.10/site-packages/sempy_labs/_vertipaq.py:452, in vertipaq_analyzer(dataset, workspace, export, read_stats_from_data, **kwargs)
    447 lakehouse = resolve_lakehouse_name(
    448     lakehouse_id=lakehouse_id, workspace=lake_workspace
    449 )
    450 lakeTName = "vertipaq_analyzer_model"
--> 452 lakeT = get_lakehouse_tables(lakehouse=lakehouse, workspace=lake_workspace)
    453 lakeT_filt = lakeT[lakeT["Table Name"] == lakeTName]
    455 query = f"SELECT MAX(RunId) FROM {lakehouse}.{lakeTName}"

File ~/cluster-env/clonedenv/lib/python3.10/site-packages/sempy/_utils/_log.py:273, in mds_log.<locals>.get_wrapper.<locals>.log_decorator_wrapper(*args, **kwargs)
    270     raise
    272 try:
--> 273     result = func(*args, **kwargs)
    275     # The invocation for get_message_dict moves after the function
    276     # so it can access the state after the method call
    277     message.update(extractor.get_completion_message_dict(result, arg_dict))

File ~/cluster-env/clonedenv/lib/python3.10/site-packages/sempy_labs/lakehouse/_get_lakehouse_tables.py:111, in get_lakehouse_tables(lakehouse, workspace, extended, count_rows, export)
    102 new_data = {
    103     "Workspace Name": workspace,
    104     "Lakehouse Name": lakehouse,
   (...)
    108     "Location": i.get("location"),
    109 }
    110 dfs.append(pd.DataFrame(new_data, index=[0]))
--> 111 df = pd.concat(dfs, ignore_index=True)
    113 if extended:
    114     sku_value = get_sku_size(workspace)

File ~/cluster-env/clonedenv/lib/python3.10/site-packages/pandas/core/reshape/concat.py:382, in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
    379 elif copy and using_copy_on_write():
    380     copy = False
--> 382 op = _Concatenator(
    383     objs,
    384     axis=axis,
    385     ignore_index=ignore_index,
    386     join=join,
    387     keys=keys,
    388     levels=levels,
    389     names=names,
    390     verify_integrity=verify_integrity,
    391     copy=copy,
    392     sort=sort,
    393 )
    395 return op.get_result()

File ~/cluster-env/clonedenv/lib/python3.10/site-packages/pandas/core/reshape/concat.py:445, in _Concatenator.__init__(self, objs, axis, join, keys, levels, names, ignore_index, verify_integrity, copy, sort)
    442 self.verify_integrity = verify_integrity
    443 self.copy = copy
--> 445 objs, keys = self._clean_keys_and_objs(objs, keys)
    447 # figure out what our result ndim is going to be
    448 ndims = self._get_ndims(objs)

File ~/cluster-env/clonedenv/lib/python3.10/site-packages/pandas/core/reshape/concat.py:507, in _Concatenator._clean_keys_and_objs(self, objs, keys)
    504 objs_list = list(objs)
    506 if len(objs_list) == 0:
--> 507     raise ValueError("No objects to concatenate")
    509 if keys is None:
    510     objs_list = list(com.not_none(*objs_list))

ValueError: No objects to concatenate
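
For context, the ValueError at the bottom of this traceback is pandas refusing to concatenate an empty list: get_lakehouse_tables found no tables, so the dfs list it builds stayed empty before reaching pd.concat. A minimal reproduction of just that failure mode:

import pandas as pd

dfs = []  # no tables in the lakehouse, so nothing was appended to the list
pd.concat(dfs, ignore_index=True)  # raises ValueError: No objects to concatenate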

muhssamy commented 1 month ago

The dataset is in a workspace called ws_retail_sm, and my notebook is in a different workspace with its own lakehouse attached.

muhssamy commented 4 weeks ago

@m-kovalsky I can confirm that this error was due to the Lakehouse being new and not containing any tables. Once I created some tables in this Lakehouse and re-ran the code, the table was created, but I got a new error:


⌛ Saving Vertipaq Analyzer to delta tables in the lakehouse...

🟢 The dataframe has been saved as the 'vertipaqanalyzer_columns' table in the 'lkh_sm_tracking' lakehouse within the 'NahdiTest_WS' workspace.
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[13], line 1
----> 1 labs.vertipaq_analyzer(dataset = dataset_name , workspace = Workspace_name , export= 'table')

File ~/cluster-env/clonedenv/lib/python3.10/site-packages/sempy/_utils/_log.py:273, in mds_log.<locals>.get_wrapper.<locals>.log_decorator_wrapper(*args, **kwargs)
    270     raise
    272 try:
--> 273     result = func(*args, **kwargs)
    275     # The invocation for get_message_dict moves after the function
    276     # so it can access the state after the method call
    277     message.update(extractor.get_completion_message_dict(result, arg_dict))

File ~/cluster-env/clonedenv/lib/python3.10/site-packages/sempy_labs/_vertipaq.py:511, in vertipaq_analyzer(dataset, workspace, export, read_stats_from_data, **kwargs)
    508         df.columns = df.columns.str.replace(" ", "_")
    510         delta_table_name = f"VertipaqAnalyzer_{obj}".lower()
--> 511         save_as_delta_table(
    512             dataframe=df,
    513             delta_table_name=delta_table_name,
    514             write_mode="append",
    515             merge_schema=True,
    516         )
    518 # Export vertipaq to zip file within the lakehouse
    519 if export == "zip":

File ~/cluster-env/clonedenv/lib/python3.10/site-packages/sempy_labs/_helper_functions.py:453, in save_as_delta_table(dataframe, delta_table_name, write_mode, merge_schema, lakehouse, workspace)
    450 dataframe.columns = dataframe.columns.str.replace(" ", "_")
    452 spark = SparkSession.builder.getOrCreate()
--> 453 spark_df = spark.createDataFrame(dataframe)
    455 filePath = create_abfss_path(
    456     lakehouse_id=lakehouse_id,
    457     lakehouse_workspace_id=workspace_id,
    458     delta_table_name=delta_table_name,
    459 )
    461 if merge_schema:

File /opt/spark/python/lib/pyspark.zip/pyspark/sql/session.py:1273, in SparkSession.createDataFrame(self, data, schema, samplingRatio, verifySchema)
   1269     data = pd.DataFrame(data, columns=column_names)
   1271 if has_pandas and isinstance(data, pd.DataFrame):
   1272     # Create a DataFrame from pandas DataFrame.
-> 1273     return super(SparkSession, self).createDataFrame(  # type: ignore[call-overload]
   1274         data, schema, samplingRatio, verifySchema
   1275     )
   1276 return self._create_dataframe(
   1277     data, schema, samplingRatio, verifySchema  # type: ignore[arg-type]
   1278 )

File /opt/spark/python/lib/pyspark.zip/pyspark/sql/pandas/conversion.py:440, in SparkConversionMixin.createDataFrame(self, data, schema, samplingRatio, verifySchema)
    438             raise
    439 converted_data = self._convert_from_pandas(data, schema, timezone)
--> 440 return self._create_dataframe(converted_data, schema, samplingRatio, verifySchema)

File /opt/spark/python/lib/pyspark.zip/pyspark/sql/session.py:1318, in SparkSession._create_dataframe(self, data, schema, samplingRatio, verifySchema)
   1316     rdd, struct = self._createFromRDD(data.map(prepare), schema, samplingRatio)
   1317 else:
-> 1318     rdd, struct = self._createFromLocal(map(prepare, data), schema)
   1319 assert self._jvm is not None
   1320 jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())

File /opt/spark/python/lib/pyspark.zip/pyspark/sql/session.py:962, in SparkSession._createFromLocal(self, data, schema)
    959     data = list(data)
    961 if schema is None or isinstance(schema, (list, tuple)):
--> 962     struct = self._inferSchemaFromList(data, names=schema)
    963     converter = _create_converter(struct)
    964     tupled_data: Iterable[Tuple] = map(converter, data)

File /opt/spark/python/lib/pyspark.zip/pyspark/sql/session.py:850, in SparkSession._inferSchemaFromList(self, data, names)
    836 schema = reduce(
    837     _merge_type,
    838     (
   (...)
    847     ),
    848 )
    849 if _has_nulltype(schema):
--> 850     raise ValueError("Some of types cannot be determined after inferring")
    851 return schema

ValueError: Some of types cannot be determined after inferring
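
For context, this last error comes from Spark's schema inference rather than from semantic-link-labs itself: on the non-Arrow fallback path shown in the traceback, spark.createDataFrame cannot infer a type for a pandas column whose values are all None/NaN, which is presumably the case for one of the Vertipaq Analyzer dataframes here. A minimal reproduction under that assumption:

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# Force the non-Arrow conversion path that the traceback above fell back to.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")

# A column containing only None has no inferable Spark type.
df = pd.DataFrame({"Table_Name": ["Sales"], "Parent": [None]})
spark.createDataFrame(df)  # ValueError: Some of types cannot be determined after inferring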