Open Maksim-Sheyngalts opened 2 years ago
Introduction
In [ ]:
# pandas provides the DataFrame API used by the cells below; numpy is imported alongside it.
import pandas as pd
import numpy as np
In [ ]:
In [ ]:
Exploring Data
In [ ]:
df.head(n=5)  # show the first five rows
In [ ]:
df.tail(n=5)  # show the last five rows
In [ ]:
df.sample(n=5)  # draw five rows at random
In [ ]:
df.shape # (number of rows, number of columns) of the DataFrame
In [ ]:
df.describe() # summary statistics for the numerical columns
In [ ]:
df.info() # prints each column's datatype and the DataFrame's memory footprint
Statistics
In [ ]:
df.describe() # Returns summary statistics for the numerical columns
In [ ]:
# Returns the mean of every numeric column.
# numeric_only=True skips text columns such as 'Title'; without it pandas >= 2.0
# raises a TypeError instead of silently dropping them.
df.mean(numeric_only=True)
In [ ]:
# Returns the pairwise correlation between the numeric columns of the DataFrame.
# Text columns such as 'Title' are dropped first: pandas >= 2.0 no longer ignores
# them and raises when it tries to convert strings to floats.
df.select_dtypes(include='number').corr()
In [ ]:
df.count() # Returns, per column, how many values are non-null
In [ ]:
df.max() # Returns the highest value found in each column
In [ ]:
df.min() # Returns the lowest value found in each column
In [ ]:
# Returns the median of every numeric column.
# numeric_only=True skips text columns such as 'Title'; without it pandas >= 2.0
# raises a TypeError instead of silently dropping them.
df.median(numeric_only=True)
In [ ]:
# Returns the standard deviation of every numeric column.
# numeric_only=True skips text columns such as 'Title'; without it pandas >= 2.0
# raises a TypeError instead of silently dropping them.
df.std(numeric_only=True)
Import Data from Files
Import csv
In [ ]:
# Load the comma-separated file into df, then display it as the cell output.
df = pd.read_csv('Data/my-data.csv', sep=',')
df
Import Excel (xlsx)
In [ ]:
# Load the Excel workbook into df using read_excel's default options.
df = pd.read_excel('Data/my-data.xlsx')
In [ ]:
# Load a specific worksheet and skip selected rows while reading.
df = pd.read_excel(
    'Data/my-data.xlsx',
    sheet_name='sheet1',  # the keyword is `sheet_name`; the old `sheetname` spelling was removed
    skiprows=[1],  # header data
)
Export Data to files
Export Excel (xlsx)
In [ ]:
# Write df to an Excel workbook at the given path.
df.to_excel('Data/my-data.xlsx')
Export csv
In [ ]:
# Write df to a CSV file: keep the header row, leave out the index column.
df.to_csv('Data/my-data.csv', header=True, index=False)
Column manipulation
Column Filter
In [ ]:
# Column filter: select the 'Title' and 'Rating' columns by indexing with a list of names.
df[['Title','Rating']]
In [ ]:
# Column filter through DataFrame.filter, passing the names of the columns to keep.
df.filter(['Title','Rating'])
Column Rename
In [ ]:
# Rename 'Title' -> 'a' and 'Rating' -> 'c' directly on df (inplace=True), then display it.
# NOTE(review): later cells still refer to 'Title' and 'Rating'; reload the data or
# rename back before running them.
df.rename(columns={'Title': 'a', 'Rating': 'c'}, inplace=True)
df
In [ ]:
Column Resorter/Reorder
In [ ]:
In [ ]:
Constant Value Column
In [ ]:
# Add a column whose value is the constant 23 in every row, then preview the result.
df['new_column'] = 23
df.head()
Math Formula
In [ ]:
# Derive a new column as the row-wise sum of 'Rating' and 'Votes',
# then preview it next to its two inputs.
df['Rating_Votes'] = df.Rating + df.Votes
df[['Rating_Votes', 'Rating', 'Votes']].head()
Number to String
In [ ]:
# Store 'Year' converted to strings in a new column; df.info() shows the resulting datatypes.
df['Year_str'] = df['Year'].astype(str)
df.info()
String to Number
In [ ]:
# Convert the string column 'Year_str' (created in the "Number to String" cell) to integers;
# df.info() shows the resulting datatypes.
df['Year_int'] = df['Year_str'].astype(int)
df.info()
Double to Int
In [ ]:
# Round 'Rating' to zero decimals before casting to int, so the value is rounded rather
# than truncated; then preview the new column next to the original.
df['Rating_int'] = df['Rating'].round(0).astype(int)
df[['Rating_int', 'Rating']].head()
String Replacer
In [ ]:
# Series.replace returns a new Series and leaves df untouched, so the result is
# assigned back to the column to make the replacement stick.
df['Title'] = df['Title'].replace('Prometheus', 'Alien')
df[df.Title == 'Prometheus']  # now empty: every 'Prometheus' title was replaced
String Manipulation
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
Date manipulation
In [ ]:
# Convert the date string '2010/11/12' into a pandas datetime value.
pd.to_datetime('2010/11/12')
Sort
In [ ]:
# Sort the rows by 'Title' in ascending order.
df.sort_values(by='Title', ascending=True)
In [ ]:
# Sort the rows by 'Director', then by 'Year', both in ascending order.
df.sort_values(by=['Director','Year'], ascending=True)
Row manipulation
Row Filter
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
Table Manipulation
Group By
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
Pivot / Unpivot
In [ ]:
In [ ]:
In [ ]:
In [ ]:
# Unpivot df4 with melt: 'index' stays as the identifier column, the remaining
# column names go into 'Year' and their values into 'Title'.
# NOTE(review): df4 is not defined in the visible cells — presumably built in an earlier pivot cell.
df4.melt(id_vars=['index'],var_name='Year',value_name='Title')
Join
In [ ]:
In [ ]:
In [ ]:
Concatenate
In [ ]:
df2 = df
# Append the rows of df2 to df (the columns should be the same in both dataframes).
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
pd.concat([df, df2])
In [ ]:
pd.concat([df, df2],axis=0) # concatenate two dataframes along axis 0, i.e. stack their rows
Import Data from Databases
Import from MySQL
In [ ]:
import pymysql

# Open a connection to the MySQL server (host, credentials and database are placeholders).
conn = pymysql.connect(host='localhost', port=3306, db='database', user='root', password='pw')
try:
    # Run the query and load the result set into a DataFrame.
    df = pd.read_sql_query("SELECT * FROM table LIMIT 3;", conn)
finally:
    # Always release the connection, even if the query fails.
    conn.close()
df.tail(100)
Import from Teradata
In [ ]:
import teradata

# Create the UdaExec application context; the connect() call below uses `udaExec`,
# which the original cell never defined.
udaExec = teradata.UdaExec(appName="pandas_cheat_sheet", version="1.0", logConsole=False)

# Make a connection
session = udaExec.connect(
    method="odbc",
    USEREGIONALSETTINGS="N",
    system="dwh",
    username="root",
    password="pw",
)

query = "SELECT * FROM DATABASEX.TABLENAMEX"

# Reading query to df
df = pd.read_sql(query, session)

# do something with df, e.g.
print(df.head())  # to see the first 5 rows
Import from SAP HANA
In [ ]:
import pyhdb

# Open a connection to the SAP HANA server (host and credentials are placeholders).
connection = pyhdb.connect(
    host="localhost",
    port=30015,
    user="root",  # the original was missing the closing quote here
    password="pw",
)
print(connection.isconnected())

# The table name is double-quoted inside the SQL text, hence the escaped quotes.
query = "SELECT * FROM HDB_REPORT.\"Table\""
df = pd.read_sql(query, connection)

# do something with df, e.g.
print(df.head())  # to see the first 5 rows
https://github.com/deganza/jupyter_pandas_cheat_sheet/blob/main/Jupyter_Pandas_Cheat_Sheet.ipynb