AdamSulek / timelogs

0 stars 1 forks source link

[story] create prefect flow to load gsheet into db #68

Open vperluzzo opened 3 years ago

vperluzzo commented 3 years ago
vperluzzo commented 3 years ago

image

acivitillo commented 3 years ago

@jkrobicki and @AdamSulek any updates?

jkrobicki commented 3 years ago

Created table in postgres: image And trying to put excel into this db:

jkrobicki commented 3 years ago

Trying on this code to work:

import pandas as pd from sqlalchemy import create_engine import psycopg2

engine=create_engine("postresql+psycopg2://sqlpad:sqlpad/localhost:3001/Postgres demo")

def sheets(data, file): if(data=="Sheet1"): df=pd.read_excel(file, "Sheet1") df.to_sql(name="timelogs_excel", con=engine,if_exists="append", index=False)

with pd.ExcelFile("extract_timelogs_may_july.xlsx") as xls: for sheet_name in xls.sheet_names: sheets(sheet_name,xls)

jkrobicki commented 3 years ago

Changed code to: import pandas as pd from sqlalchemy import create_engine

engine = create_engine("postgresql://sqlpad:sqlpad@localhost:3001/Postgres_sqlpad", pool_pre_ping=True)

def sheets(file): df=pd.read_excel(file) print(df) df.to_sql(name="timelogs_excel", con=engine, if_exists="append", index=False)

jkrobicki commented 3 years ago

Working on a problem with connection to database: image

jkrobicki commented 3 years ago

Changing database to SQLite and trying to put two column table first

jkrobicki commented 3 years ago

Managed to insert specific columns into SQLite database image

jkrobicki commented 3 years ago

Working code for importing "extract_timelogs_may_july.xlsx" into SQLite database via csv: import sqlite3 import pandas as pd from sqlalchemy import create_engine from sqlalchemy import Table, Column, Integer, String, MetaData

meta = MetaData() file = "extract_timelogs_may_july.xlsx" output = "output.xlsx"

df = pd.read_excel(file) dfc = df.to_csv("dfc.csv") readcsv = pd.read_csv("dfc.csv")

dfr = readcsv.drop( labels=None, axis=0, index=None, columns="Unnamed: 0", level=None, inplace=False,errors='raise')

engine = create_engine('sqlite:///timelogs.db', echo = True) meta = MetaData()

timelogs = Table('timelogs', meta, Column('contractor', String(60)), Column('date', String(60)), Column('start', String(60)), Column('end', String(60)),
Column('work_element', String(60)), Column('calendar_month', String(60)),
Column('work_area', String(60)),
Column('client', String(60)),
Column('count_hours', String(60)),
Column('count_days', String(60)), Column('client_rate', String(60)),
Column('client_charges', String(60)), Column('invoice_month', String(60))
) meta.create_all(engine)

x = dfr.to_sql(name="timelogs", con=engine, if_exists="append", index=False)

results = engine.execute("select * from timelogs")

final = pd.DataFrame(data=results, columns=dfr.columns) final

jkrobicki commented 3 years ago

Final code for Prefect Flow:

import sqlite3 import pandas as pd from sqlalchemy import create_engine from sqlalchemy import Table, Column, Integer, String, MetaData import prefect from prefect import task, Flow engine = create_engine('sqlite:///timelogs.db', echo = True)

@task def read_excel(file): df = pd.read_excel(file) return df @task def to_csv(df): dfc = df.to_csv("dfc.csv") return dfc @task def read_csv(): readcsv = pd.read_csv("dfc.csv") dfr = readcsv.drop( labels=None, axis=0, index=None, columns="Unnamed: 0", level=None, inplace=False,errors='raise') return dfr

@task def create_table(): meta = MetaData()

timelogs = Table('timelogs', meta,
    Column('contractor', String(60)),
    Column('date', String(60)),
    Column('start', String(60)),
    Column('end', String(60)),             
    Column('work_element', String(60)),
    Column('calendar_month', String(60)),                
    Column('work_area', String(60)),            
    Column('client', String(60)),            
    Column('count_hours', String(60)),            
    Column('count_days', String(60)),
    Column('client_rate', String(60)),        
    Column('client_charges', String(60)),
    Column('invoice_month', String(60))             
            )
table = meta.create_all(engine)

@task def to_sql(dfr): tosql = dfr.to_sql(name="timelogs", con=engine, if_exists="append", index=False) return tosql

@task def read_table(): table = pd.read_sql_table("timelogs", con=engine) return table

with Flow("excel_to_database") as flow: read_excel = read_excel("extract_timelogs_may_july.xlsx") to_csv = to_csv(read_excel) read_csv = read_csv() create_table() to_sql = to_sql(read_csv) read_table() flow.visualize() flow.run()

print(tabulate(df, headers = 'keys', tablefmt = 'psql'))