Closed AmericanY closed 3 years ago
import trio
import httpx
from bs4 import BeautifulSoup
import pandas as pd
from functools import partial


async def main(url):
    """Download the 'Table' workbook linked from *url* and print its
    'Master Data' sheet.

    Scrapes the page for the first ``.file-link`` element containing the
    word "Table", streams that file to disk, then loads it with pandas.
    The blocking ``pd.read_excel`` call runs in a worker thread so the
    trio event loop is not stalled.
    """
    async with httpx.AsyncClient(timeout=None) as client:
        r = await client.get(url)
        soup = BeautifulSoup(r.text, 'lxml')
        # ':-soup-contains' is a soupsieve pseudo-class: first file link
        # whose text mentions "Table".
        tfile = soup.select_one('.file-link:-soup-contains(Table)').a['href']
        async with client.stream('GET', tfile) as r:
            # Derive the local filename from the Content-Disposition
            # header.  The header may be missing (default to ''), and the
            # filename token may be quoted (RFC 6266) — strip the quotes
            # so we don't create a file literally named '"foo.xlsb"'.
            cd = r.headers.get('content-disposition', '')
            fname = cd.split('=')[-1].strip('"\'') or 'download.xlsb'
            async with await trio.open_file(fname, 'wb') as f:
                async for chunk in r.aiter_bytes():
                    await f.write(chunk)
        # pd.read_excel is synchronous/blocking; hand it off to a thread.
        df = await trio.to_thread.run_sync(
            partial(pd.read_excel, fname,
                    sheet_name='Master Data', engine="pyxlsb"))
        print(df)


if __name__ == "__main__":
    trio.run(main, 'https://rigcount.bakerhughes.com/na-rig-count')
Reading the sheet using sheet_name='Master Data' works fine, but sheet_name = 1 reads only about 1.5k rows!
sheet_name='Master Data'
sheet_name = 1
https://stackoverflow.com/a/68418399/7658985
The problem is that the Excel file has 2 hidden sheets, and the 2nd sheet really has only 1457 rows; the Master Data sheet is actually the 4th sheet, so sheet_name=3 will work.
Reading the sheet using
sheet_name='Master Data'
works fine, but sheet_name = 1
reads only about 1.5k rows!