m-kovalsky / fabric_cat_tools

Supercharge your Microsoft Fabric development with the fabric_cat_tools library
MIT License
100 stars 14 forks source link

fct.get_lakehouse_tables - HTTPError: HTTP Error 404: Not Found #4

Closed GilbertQue closed 2 months ago

GilbertQue commented 2 months ago

Hi there,

I am using version 0.3.0, and when I try to run the notebook function below

```python
import fabric_cat_tools as fct

df_tableSizes = fct.get_lakehouse_tables( extended = True)
```

I then get the error below. This was working on Thursday.

`HTTPError Traceback (most recent call last) Cell In[10], line 10 4 import fabric_cat_tools as fct 6 # Get Table Details 7 # Reference: https://github.com/m-kovalsky/fabric_cat_tools?tab=readme-ov-file#get_lakehouse_tables 8 9 # Get the Lakehouse Table sizes into Pandas Dataframe ---> 10 df_tableSizes = fct.get_lakehouse_tables( 11 extended = True) 13 # ## Remove Invalid Characters from Column Names 14 df_tableSizes.columns = df_tableSizes.columns.str.replace(r'(.[)|(].)', '', regex=True)

File /nfs4/pyenv-e2048f8d-14e9-4483-9558-0f3a2c89bfda/lib/python3.10/site-packages/fabric_cat_tools/GetLakehouseTables.py:92, in get_lakehouse_tables(lakehouse, workspace, extended, count_rows) 90 else: 91 sku_value = get_sku_size(workspace) ---> 92 guardrail = get_directlake_guardrails_for_sku(sku_value) 94 spark = SparkSession.builder.getOrCreate() 96 intColumns = ['Files', 'Row Groups', 'Table Size']

File /nfs4/pyenv-e2048f8d-14e9-4483-9558-0f3a2c89bfda/lib/python3.10/site-packages/fabric_cat_tools/Guardrails.py:71, in get_directlake_guardrails_for_sku(sku_size) 56 def get_directlake_guardrails_for_sku(sku_size): 58 """ 59
60 This function obtains guardrails for a given SKU size. (...) 68 This function returns a pandas dataframe showing the guardrails for the SKU size. 69 """ ---> 71 df = get_direct_lake_guardrails() 72 filtered_df = df[df['Fabric/Power BI SKUs'] == sku_size] 74 return filtered_df

File /nfs4/pyenv-e2048f8d-14e9-4483-9558-0f3a2c89bfda/lib/python3.10/site-packages/fabric_cat_tools/Guardrails.py:22, in get_direct_lake_guardrails() 7 """ 8 9 This function shows the Direct Lake guardrails based on Microsoft documentation. (...) 17 This function returns a pandas dataframe showing the guardrails by SKU. 18 """ 20 url = 'https://learn.microsoft.com/power-bi/enterprise/directlake-overview' ---> 22 tables = pd.read_html(url) 23 df = tables[0] 24 df['Fabric/Power BI SKUs'] = df['Fabric/Power BI SKUs'].str.split('/')

File ~/cluster-env/trident_env/lib/python3.10/site-packages/pandas/util/_decorators.py:331, in deprecate_nonkeyword_arguments..decorate..wrapper(*args, *kwargs) 325 if len(args) > num_allow_args: 326 warnings.warn( 327 msg.format(arguments=_format_argument_list(allow_args)), 328 FutureWarning, 329 stacklevel=find_stack_level(), 330 ) --> 331 return func(args, **kwargs)

File ~/cluster-env/trident_env/lib/python3.10/site-packages/pandas/io/html.py:1205, in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only, extract_links) 1201 validate_header_arg(header) 1203 io = stringify_path(io) -> 1205 return _parse( 1206 flavor=flavor, 1207 io=io, 1208 match=match, 1209 header=header, 1210 index_col=index_col, 1211 skiprows=skiprows, 1212 parse_dates=parse_dates, 1213 thousands=thousands, 1214 attrs=attrs, 1215 encoding=encoding, 1216 decimal=decimal, 1217 converters=converters, 1218 na_values=na_values, 1219 keep_default_na=keep_default_na, 1220 displayed_only=displayed_only, 1221 extract_links=extract_links, 1222 )

File ~/cluster-env/trident_env/lib/python3.10/site-packages/pandas/io/html.py:986, in _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs) 983 p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links) 985 try: --> 986 tables = p.parse_tables() 987 except ValueError as caught: 988 # if io is an io-like object, check if it's seekable 989 # and try to rewind it before trying the next parser 990 if hasattr(io, "seekable") and io.seekable():

File ~/cluster-env/trident_env/lib/python3.10/site-packages/pandas/io/html.py:262, in _HtmlFrameParser.parse_tables(self) 254 def parse_tables(self): 255 """ 256 Parse and return all tables from the DOM. 257 (...) 260 list of parsed (header, body, footer) tuples from tables. 261 """ --> 262 tables = self._parse_tables(self._build_doc(), self.match, self.attrs) 263 return (self._parse_thead_tbody_tfoot(table) for table in tables)

File ~/cluster-env/trident_env/lib/python3.10/site-packages/pandas/io/html.py:821, in _LxmlFrameParser._build_doc(self) 819 pass 820 else: --> 821 raise e 822 else: 823 if not hasattr(r, "text_content"):

File ~/cluster-env/trident_env/lib/python3.10/site-packages/pandas/io/html.py:802, in _LxmlFrameParser._build_doc(self) 800 try: 801 if is_url(self.io): --> 802 with urlopen(self.io) as f: 803 r = parse(f, parser=parser) 804 else: 805 # try to parse the input in the simplest way

File ~/cluster-env/trident_env/lib/python3.10/site-packages/pandas/io/common.py:265, in urlopen(*args, *kwargs) 259 """ 260 Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of 261 the stdlib. 262 """ 263 import urllib.request --> 265 return urllib.request.urlopen(args, **kwargs)

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:216, in urlopen(url, data, timeout, cafile, capath, cadefault, context) 214 else: 215 opener = _opener --> 216 return opener.open(url, data, timeout)

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:525, in OpenerDirector.open(self, fullurl, data, timeout) 523 for processor in self.process_response.get(protocol, []): 524 meth = getattr(processor, meth_name) --> 525 response = meth(req, response) 527 return response

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:634, in HTTPErrorProcessor.http_response(self, request, response) 631 # According to RFC 2616, "2xx" code indicates that the client's 632 # request was successfully received, understood, and accepted. 633 if not (200 <= code < 300): --> 634 response = self.parent.error( 635 'http', request, response, code, msg, hdrs) 637 return response

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:557, in OpenerDirector.error(self, proto, args) 555 http_err = 0 556 args = (dict, proto, meth_name) + args --> 557 result = self._call_chain(args) 558 if result: 559 return result

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:496, in OpenerDirector._call_chain(self, chain, kind, meth_name, args) 494 for handler in handlers: 495 func = getattr(handler, meth_name) --> 496 result = func(args) 497 if result is not None: 498 return result

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:749, in HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers) 746 fp.read() 747 fp.close() --> 749 return self.parent.open(new, timeout=req.timeout)

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:525, in OpenerDirector.open(self, fullurl, data, timeout) 523 for processor in self.process_response.get(protocol, []): 524 meth = getattr(processor, meth_name) --> 525 response = meth(req, response) 527 return response

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:634, in HTTPErrorProcessor.http_response(self, request, response) 631 # According to RFC 2616, "2xx" code indicates that the client's 632 # request was successfully received, understood, and accepted. 633 if not (200 <= code < 300): --> 634 response = self.parent.error( 635 'http', request, response, code, msg, hdrs) 637 return response

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:557, in OpenerDirector.error(self, proto, args) 555 http_err = 0 556 args = (dict, proto, meth_name) + args --> 557 result = self._call_chain(args) 558 if result: 559 return result

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:496, in OpenerDirector._call_chain(self, chain, kind, meth_name, args) 494 for handler in handlers: 495 func = getattr(handler, meth_name) --> 496 result = func(args) 497 if result is not None: 498 return result

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:749, in HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers) 746 fp.read() 747 fp.close() --> 749 return self.parent.open(new, timeout=req.timeout)

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:525, in OpenerDirector.open(self, fullurl, data, timeout) 523 for processor in self.process_response.get(protocol, []): 524 meth = getattr(processor, meth_name) --> 525 response = meth(req, response) 527 return response

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:634, in HTTPErrorProcessor.http_response(self, request, response) 631 # According to RFC 2616, "2xx" code indicates that the client's 632 # request was successfully received, understood, and accepted. 633 if not (200 <= code < 300): --> 634 response = self.parent.error( 635 'http', request, response, code, msg, hdrs) 637 return response

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:563, in OpenerDirector.error(self, proto, args) 561 if http_err: 562 args = (dict, 'default', 'http_error_default') + orig_args --> 563 return self._call_chain(args)

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:496, in OpenerDirector._call_chain(self, chain, kind, meth_name, args) 494 for handler in handlers: 495 func = getattr(handler, meth_name) --> 496 result = func(args) 497 if result is not None: 498 return result

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:643, in HTTPDefaultErrorHandler.http_error_default(self, req, fp, code, msg, hdrs) 642 def http_error_default(self, req, fp, code, msg, hdrs): --> 643 raise HTTPError(req.full_url, code, msg, hdrs, fp)

HTTPError: HTTP Error 404: Not Found`

m-kovalsky commented 2 months ago

Specifying ‘extended=True’ calls a function which gets the guardrails from Microsoft’s online documentation. The site (see below) is down at the moment which is causing this 404 error. It’s a known issue and should be resolved shortly by Microsoft.

https://learn.microsoft.com/en-us/fabric/get-started/directlake-overview


From: GilbertQue @.> Sent: Sunday, April 21, 2024 1:51:30 AM To: m-kovalsky/fabric_cat_tools @.> Cc: Subscribed @.***> Subject: [m-kovalsky/fabric_cat_tools] fct.get_lakehouse_tables - HTTPError: HTTP Error 404: Not Found (Issue #4)

HI there

I am using version 0.3.0 and when I try and run the following notebook function below

`##########################################

Get Table Data

########################################## import fabric_cat_tools as fct

Get Table Details Reference: https://github.com/m-kovalsky/fabric_cat_tools?tab=readme-ov-file#get_lakehouse_tables Get the Lakehouse Table sizes into Pandas Dataframe

df_tableSizes = fct.get_lakehouse_tables( extended = True)`

I am then getting the error below, this was working on Thursday.

`HTTPError Traceback (most recent call last) Cell In[10], line 10 4 import fabric_cat_tools as fct 6 # Get Table Details 7 # Reference: https://github.com/m-kovalsky/fabric_cat_tools?tab=readme-ov-file#get_lakehouse_tables 8 9 # Get the Lakehouse Table sizes into Pandas Dataframe ---> 10 df_tableSizes = fct.get_lakehouse_tables( 11 extended = True) 13 # ## Remove Invalid Characters from Column Names 14 df_tableSizes.columns = df_tableSizes.columns.str.replace(r'(.[)|(].)', '', regex=True)

File /nfs4/pyenv-e2048f8d-14e9-4483-9558-0f3a2c89bfda/lib/python3.10/site-packages/fabric_cat_tools/GetLakehouseTables.py:92, in get_lakehouse_tables(lakehouse, workspace, extended, count_rows) 90 else: 91 sku_value = get_sku_size(workspace) ---> 92 guardrail = get_directlake_guardrails_for_sku(sku_value) 94 spark = SparkSession.builder.getOrCreate() 96 intColumns = ['Files', 'Row Groups', 'Table Size']

File /nfs4/pyenv-e2048f8d-14e9-4483-9558-0f3a2c89bfda/lib/python3.10/site-packages/fabric_cat_tools/Guardrails.py:71, in get_directlake_guardrails_for_sku(sku_size) 56 def get_directlake_guardrails_for_sku(sku_size): 58 """ 59 60 This function obtains guardrails for a given SKU size. (...) 68 This function returns a pandas dataframe showing the guardrails for the SKU size. 69 """ ---> 71 df = get_direct_lake_guardrails() 72 filtered_df = df[df['Fabric/Power BI SKUs'] == sku_size] 74 return filtered_df

File /nfs4/pyenv-e2048f8d-14e9-4483-9558-0f3a2c89bfda/lib/python3.10/site-packages/fabric_cat_tools/Guardrails.py:22, in get_direct_lake_guardrails() 7 """ 8 9 This function shows the Direct Lake guardrails based on Microsoft documentation. (...) 17 This function returns a pandas dataframe showing the guardrails by SKU. 18 """ 20 url = 'https://learn.microsoft.com/power-bi/enterprise/directlake-overview' ---> 22 tables = pd.read_html(url) 23 df = tables[0] 24 df['Fabric/Power BI SKUs'] = df['Fabric/Power BI SKUs'].str.split('/')

File ~/cluster-env/trident_env/lib/python3.10/site-packages/pandas/util/_decorators.py:331, in deprecate_nonkeyword_arguments..decorate..wrapper(*args, *kwargs) 325 if len(args) > num_allow_args: 326 warnings.warn( 327 msg.format(arguments=_format_argument_list(allow_args)), 328 FutureWarning, 329 stacklevel=find_stack_level(), 330 ) --> 331 return func(args, **kwargs)

File ~/cluster-env/trident_env/lib/python3.10/site-packages/pandas/io/html.py:1205, in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only, extract_links) 1201 validate_header_arg(header) 1203 io = stringify_path(io) -> 1205 return _parse( 1206 flavor=flavor, 1207 io=io, 1208 match=match, 1209 header=header, 1210 index_col=index_col, 1211 skiprows=skiprows, 1212 parse_dates=parse_dates, 1213 thousands=thousands, 1214 attrs=attrs, 1215 encoding=encoding, 1216 decimal=decimal, 1217 converters=converters, 1218 na_values=na_values, 1219 keep_default_na=keep_default_na, 1220 displayed_only=displayed_only, 1221 extract_links=extract_links, 1222 )

File ~/cluster-env/trident_env/lib/python3.10/site-packages/pandas/io/html.py:986, in _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs) 983 p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links) 985 try: --> 986 tables = p.parse_tables() 987 except ValueError as caught: 988 # if io is an io-like object, check if it's seekable 989 # and try to rewind it before trying the next parser 990 if hasattr(io, "seekable") and io.seekable():

File ~/cluster-env/trident_env/lib/python3.10/site-packages/pandas/io/html.py:262, in _HtmlFrameParser.parse_tables(self) 254 def parse_tables(self): 255 """ 256 Parse and return all tables from the DOM. 257 (...) 260 list of parsed (header, body, footer) tuples from tables. 261 """ --> 262 tables = self._parse_tables(self._build_doc(), self.match, self.attrs) 263 return (self._parse_thead_tbody_tfoot(table) for table in tables)

File ~/cluster-env/trident_env/lib/python3.10/site-packages/pandas/io/html.py:821, in _LxmlFrameParser._build_doc(self) 819 pass 820 else: --> 821 raise e 822 else: 823 if not hasattr(r, "text_content"):

File ~/cluster-env/trident_env/lib/python3.10/site-packages/pandas/io/html.py:802, in _LxmlFrameParser._build_doc(self) 800 try: 801 if is_url(self.io): --> 802 with urlopen(self.io) as f: 803 r = parse(f, parser=parser) 804 else: 805 # try to parse the input in the simplest way

File ~/cluster-env/trident_env/lib/python3.10/site-packages/pandas/io/common.py:265, in urlopen(*args, *kwargs) 259 """ 260 Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of 261 the stdlib. 262 """ 263 import urllib.request --> 265 return urllib.request.urlopen(args, **kwargs)

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:216, in urlopen(url, data, timeout, cafile, capath, cadefault, context) 214 else: 215 opener = _opener --> 216 return opener.open(url, data, timeout)

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:525, in OpenerDirector.open(self, fullurl, data, timeout) 523 for processor in self.process_response.get(protocol, []): 524 meth = getattr(processor, meth_name) --> 525 response = meth(req, response) 527 return response

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:634, in HTTPErrorProcessor.http_response(self, request, response) 631 # According to RFC 2616, "2xx" code indicates that the client's 632 # request was successfully received, understood, and accepted. 633 if not (200 <= code < 300): --> 634 response = self.parent.error( 635 'http', request, response, code, msg, hdrs) 637 return response

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:557, in OpenerDirector.error(self, proto, args) 555 http_err = 0 556 args = (dict, proto, meth_name) + args --> 557 result = self._call_chain(args) 558 if result: 559 return result

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:496, in OpenerDirector._call_chain(self, chain, kind, meth_name, args) 494 for handler in handlers: 495 func = getattr(handler, meth_name) --> 496 result = func(args) 497 if result is not None: 498 return result

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:749, in HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers) 746 fp.read() 747 fp.close() --> 749 return self.parent.open(new, timeout=req.timeout)

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:525, in OpenerDirector.open(self, fullurl, data, timeout) 523 for processor in self.process_response.get(protocol, []): 524 meth = getattr(processor, meth_name) --> 525 response = meth(req, response) 527 return response

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:634, in HTTPErrorProcessor.http_response(self, request, response) 631 # According to RFC 2616, "2xx" code indicates that the client's 632 # request was successfully received, understood, and accepted. 633 if not (200 <= code < 300): --> 634 response = self.parent.error( 635 'http', request, response, code, msg, hdrs) 637 return response

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:557, in OpenerDirector.error(self, proto, args) 555 http_err = 0 556 args = (dict, proto, meth_name) + args --> 557 result = self._call_chain(args) 558 if result: 559 return result

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:496, in OpenerDirector._call_chain(self, chain, kind, meth_name, args) 494 for handler in handlers: 495 func = getattr(handler, meth_name) --> 496 result = func(args) 497 if result is not None: 498 return result

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:749, in HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers) 746 fp.read() 747 fp.close() --> 749 return self.parent.open(new, timeout=req.timeout)

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:525, in OpenerDirector.open(self, fullurl, data, timeout) 523 for processor in self.process_response.get(protocol, []): 524 meth = getattr(processor, meth_name) --> 525 response = meth(req, response) 527 return response

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:634, in HTTPErrorProcessor.http_response(self, request, response) 631 # According to RFC 2616, "2xx" code indicates that the client's 632 # request was successfully received, understood, and accepted. 633 if not (200 <= code < 300): --> 634 response = self.parent.error( 635 'http', request, response, code, msg, hdrs) 637 return response

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:563, in OpenerDirector.error(self, proto, args) 561 if http_err: 562 args = (dict, 'default', 'http_error_default') + orig_args --> 563 return self._call_chain(args)

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:496, in OpenerDirector._call_chain(self, chain, kind, meth_name, args) 494 for handler in handlers: 495 func = getattr(handler, meth_name) --> 496 result = func(args) 497 if result is not None: 498 return result

File ~/cluster-env/trident_env/lib/python3.10/urllib/request.py:643, in HTTPDefaultErrorHandler.http_error_default(self, req, fp, code, msg, hdrs) 642 def http_error_default(self, req, fp, code, msg, hdrs): --> 643 raise HTTPError(req.full_url, code, msg, hdrs, fp)

HTTPError: HTTP Error 404: Not Found`

— Reply to this email directly, view it on GitHubhttps://github.com/m-kovalsky/fabric_cat_tools/issues/4, or unsubscribehttps://github.com/notifications/unsubscribe-auth/AHBQBNRBK6XHETDOSJR3XZTY6L5QFAVCNFSM6AAAAABGQ3W6AKVHI2DSMVQWIX3LMV43ASLTON2WKOZSGI2TINZWG4ZDAOI. You are receiving this because you are subscribed to this thread.Message ID: @.***>

m-kovalsky commented 2 months ago

The Microsoft website is working again, but they changed the name of one of the columns, so I published a new release (0.3.1) which fixes that issue.

GilbertQue commented 2 months ago

Awesome, thanks for getting this fixed!