pandas入门 - Githubissues

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Show up to 50 columns when displaying wide DataFrames. The fully qualified
# option name is used because the bare 'max_columns' pattern can match more
# than one registered option in recent pandas releases, which is rejected as
# ambiguous.
pd.set_option('display.max_columns', 50)
%matplotlib inline

pandas是python的一个用于数据分析的函数库，通过数据结构作为入门的出发点，我们来看一下如何使用这个函数库。

pandas最常用的数据结构有2种：Series 和 DataFrame

#series 是一个一维的对象，类似数组
# create a Series with an arbitrary list
s = pd.Series([7, 'Heisenberg', 3.14, -1789710578, 'Happy Eating!'])
s

0                7
1       Heisenberg
2             3.14
3      -1789710578
4    Happy Eating!
dtype: object

#默认的索引是0-N,可以通过index自定义索引
s = pd.Series([7, 'Heisenberg', 3.14, -1789710578, 'Happy Eating!'],
              index=['A', 'Z', 'C', 'Y', 'E'])
s

A                7
Z       Heisenberg
C             3.14
Y      -1789710578
E    Happy Eating!
dtype: object

#通过dict创建series
d = {'Chicago': 1000, 'New York': 1300, 'Portland': 900, 'San Francisco': 1100,
     'Austin': 450, 'Boston': None}
cities = pd.Series(d)
cities

Austin            450.0
Boston              NaN
Chicago          1000.0
New York         1300.0
Portland          900.0
San Francisco    1100.0
dtype: float64

cities['Chicago']

1000.0

cities[['Chicago', 'Portland', 'San Francisco']]

Chicago          1000.0
Portland          900.0
San Francisco    1100.0
dtype: float64

cities[cities < 1000]

Austin      450.0
Portland    900.0
dtype: float64

# 改变series的数值
print('Old value:', cities['Chicago'])
cities['Chicago'] = 1400
print('New value:', cities['Chicago'])

('Old value:', 1000.0)
('New value:', 1400.0)

print('Seattle' in cities)
print('San Francisco' in cities)

False
True

# divide city values by 3
cities / 3

Austin           150.000000
Boston                  NaN
Chicago          466.666667
New York         433.333333
Portland         300.000000
San Francisco    366.666667
dtype: float64

#dataframe是一组series，它们共用同一个index

#读取数据
#自定义数据
data = {'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
        'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions', 'Lions', 'Lions'],
        'wins': [11, 8, 10, 15, 11, 6, 10, 4],
        'losses': [5, 8, 6, 1, 5, 10, 6, 12]}
football = pd.DataFrame(data, columns=['year', 'team', 'wins', 'losses'])
football

	year	team	wins	losses
0	2010	Bears	11	5
1	2011	Bears	8	8
2	2012	Bears	10	6
3	2011	Packers	15	1
4	2012	Packers	11	5
5	2010	Lions	6	10
6	2011	Lions	10	6
7	2012	Lions	4	12

#读取csv数据
from_csv = pd.read_csv('test.csv')
from_csv.head()

	Year	Age	Tm	Lg	W	L	W-L%	ERA	G	GS	GF	SV	IP	H	R	ER	HR	BB	IBB	SO	HBP	BK	WP	BF	ERA+	WHIP	H/9	HR/9	BB/9	SO/9	SO/BB	Awards
0	1995	25	NYY	AL	5	3	0.625	5.51	19	10	2	0	67.0	71	43	41	11	30	0	51	2	1	0	301	84	1.507	9.5	1.5	4.0	6.9	1.70	NaN
1	1996	26	NYY	AL	8	3	0.727	2.09	61	0	14	5	107.2	73	25	25	1	34	3	130	2	0	1	425	240	0.994	6.1	0.1	2.8	10.9	3.82	CYA-3MVP-12
2	1997	27	NYY	AL	6	4	0.600	1.88	66	0	56	43	71.2	65	17	15	5	20	6	68	0	0	2	301	239	1.186	8.2	0.6	2.5	8.5	3.40	ASMVP-25
3	1998	28	NYY	AL	3	0	1.000	1.91	54	0	49	36	61.1	48	13	13	3	17	1	36	1	0	0	246	233	1.060	7.0	0.4	2.5	5.3	2.12	NaN

#没有表头的csv数据
cols = ['num', 'game', 'date', 'team', 'home_away', 'opponent',
        'result', 'quarter', 'distance', 'receiver', 'score_before',
        'score_after']
no_headers = pd.read_csv('peyton-passing-TDs-2012.csv', sep=',', header=None,
                         names=cols)

#读excel
#需要安装xlrd 库
data = {'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
        'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions', 'Lions', 'Lions'],
        'wins': [11, 8, 10, 15, 11, 6, 10, 4],
        'losses': [5, 8, 6, 1, 5, 10, 6, 12]}
football = pd.DataFrame(data, columns=['year', 'team', 'wins', 'losses'])
football

	year	team	wins	losses
0	2010	Bears	11	5
1	2011	Bears	8	8
2	2012	Bears	10	6
3	2011	Packers	15	1
4	2012	Packers	11	5
5	2010	Lions	6	10
6	2011	Lions	10	6
7	2012	Lions	4	12

football.to_excel('football.xlsx', index=False)

/Library/Python/2.7/site-packages/pandas/io/excel.py:784: DeprecationWarning: Call to deprecated function remove_sheet (Use wb.remove(worksheet) or del wb[sheetname]).
  self.book.remove_sheet(self.book.worksheets[0])
/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/encoder.py:207: DeprecationWarning: Interpreting naive datetime as local 2018-08-16 15:55:04.426237. Please add timezone info to timestamps.
  chunks = self.iterencode(o, _one_shot=True)

# delete the DataFrame
del football

/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/encoder.py:207: DeprecationWarning: Interpreting naive datetime as local 2018-08-16 15:56:02.944848. Please add timezone info to timestamps.
  chunks = self.iterencode(o, _one_shot=True)

# read from Excel
football = pd.read_excel('football.xlsx', 'Sheet1')
football

	year	team	wins	losses
0	2010	Bears	11	5
1	2011	Bears	8	8
2	2012	Bears	10	6
3	2011	Packers	15	1
4	2012	Packers	11	5
5	2010	Lions	6	10
6	2011	Lions	10	6
7	2012	Lions	4	12

/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/encoder.py:207: DeprecationWarning: Interpreting naive datetime as local 2018-08-16 15:56:16.107466. Please add timezone info to timestamps.
  chunks = self.iterencode(o, _one_shot=True)

football

	losses	team	wins	year
0	5	bears	11	2010
1	8	packer	3	2011
2	6	bears	3	2012

dataframe应用

数据读入


# pass in column names for each CSV
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols,
                    encoding='latin-1')

users.head()

	user_id	age	sex	occupation	zip_code
0	1	24	M	technician	85711
1	2	53	F	other	94043
2	3	23	M	writer	32067
3	4	24	M	technician	43537
4	5	33	F	other	15213

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols,
                      encoding='latin-1')

# the movies file contains columns indicating the movie's genres
# let's only load the first five columns of the file with usecols
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=m_cols, usecols=range(5),
                     encoding='latin-1')

movies.head()

	movie_id	title	release_date	video_release_date	imdb_url
0	1	Toy Story (1995)	01-Jan-1995	NaN	http://us.imdb.com/M/title-exact?Toy%20Story%2...
1	2	GoldenEye (1995)	01-Jan-1995	NaN	http://us.imdb.com/M/title-exact?GoldenEye%20(...
2	3	Four Rooms (1995)	01-Jan-1995	NaN	http://us.imdb.com/M/title-exact?Four%20Rooms%...
3	4	Get Shorty (1995)	01-Jan-1995	NaN	http://us.imdb.com/M/title-exact?Get%20Shorty%...
4	5	Copycat (1995)	01-Jan-1995	NaN	http://us.imdb.com/M/title-exact?Copycat%20(1995)

数据概览

movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 5 columns):
movie_id              1682 non-null int64
title                 1682 non-null object
release_date          1681 non-null object
video_release_date    0 non-null float64
imdb_url              1679 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 65.8+ KB

users.describe()

	user_id	age
count	943.000000	943.000000
mean	472.000000	34.051962
std	272.364951	12.192740
min	1.000000	7.000000
25%	236.500000	25.000000
50%	472.000000	31.000000
75%	707.500000	43.000000
max	943.000000	73.000000

movies.tail(3)

	movie_id	title	release_date	video_release_date	imdb_url
1679	1680	Sliding Doors (1998)	01-Jan-1998	NaN	http://us.imdb.com/Title?Sliding+Doors+(1998)
1680	1681	You So Crazy (1994)	01-Jan-1994	NaN	http://us.imdb.com/M/title-exact?You%20So%20Cr...
1681	1682	Scream of Stone (Schrei aus Stein) (1991)	08-Mar-1996	NaN	http://us.imdb.com/M/title-exact?Schrei%20aus%...

#查看部分数据
print(users[['age', 'zip_code']].head())
print('\n')

# can also store in a variable to use later
columns_you_want = ['occupation', 'sex'] 
print(users[columns_you_want].head())

   age zip_code
0   24    85711
1   53    94043
2   23    32067
3   24    43537
4   33    15213

   occupation sex
0  technician   M
1       other   F
2      writer   M
3  technician   M
4       other   F

# users older than 25
print(users[users.age > 25].head(3))
print('\n')

# users aged 40 AND male
print(users[(users.age == 40) & (users.sex == 'M')].head(3))
print('\n')

# users younger than 30 OR female
print(users[(users.sex == 'F') | (users.age < 30)].head(3))

   user_id  age sex occupation zip_code
1        2   53   F      other    94043
4        5   33   F      other    15213
5        6   42   M  executive    98101

     user_id  age sex  occupation zip_code
18        19   40   M   librarian    02138
82        83   40   M       other    44133
115      116   40   M  healthcare    97232

   user_id  age sex  occupation zip_code
0        1   24   M  technician    85711
1        2   53   F       other    94043
2        3   23   M      writer    32067

print(users.set_index('user_id').head())
print('\n')

print(users.head())
print("\n^^^ I didn't actually change the DataFrame. ^^^\n")

with_new_index = users.set_index('user_id')
print(with_new_index.head())
print("\n^^^ set_index actually returns a new DataFrame. ^^^\n")

         age sex  occupation zip_code
user_id                              
1         24   M  technician    85711
2         53   F       other    94043
3         23   M      writer    32067
4         24   M  technician    43537
5         33   F       other    15213

   user_id  age sex  occupation zip_code
0        1   24   M  technician    85711
1        2   53   F       other    94043
2        3   23   M      writer    32067
3        4   24   M  technician    43537
4        5   33   F       other    15213

^^^ I didn't actually change the DataFrame. ^^^

         age sex  occupation zip_code
user_id                              
1         24   M  technician    85711
2         53   F       other    94043
3         23   M      writer    32067
4         24   M  technician    43537
5         33   F       other    15213

^^^ set_index actually returns a new DataFrame. ^^^

with_new_index.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 943 entries, 1 to 943
Data columns (total 4 columns):
age           943 non-null int64
sex           943 non-null object
occupation    943 non-null object
zip_code      943 non-null object
dtypes: int64(1), object(3)
memory usage: 36.8+ KB

users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 5 columns):
user_id       943 non-null int64
age           943 non-null int64
sex           943 non-null object
occupation    943 non-null object
zip_code      943 non-null object
dtypes: int64(2), object(3)
memory usage: 36.9+ KB

2个表之间的操作

JOIN

left_frame = pd.DataFrame({'key': range(5), 
                           'left_value': ['a', 'b', 'c', 'd', 'e']})
right_frame = pd.DataFrame({'key': range(2, 7), 
                           'right_value': ['f', 'g', 'h', 'i', 'j']})
print(left_frame)
print('\n')
print(right_frame)

   key left_value
0    0          a
1    1          b
2    2          c
3    3          d
4    4          e

   key right_value
0    2           f
1    3           g
2    4           h
3    5           i
4    6           j

pd.merge(left_frame, right_frame, on='key', how='inner')

	key	left_value	right_value
0	2	c	f
1	3	d	g
2	4	e	h

pd.merge(left_frame, right_frame, left_on='key', right_index=True)

	key	key_x	left_value	key_y	right_value
0	0	0	a	2	f
1	1	1	b	3	g
2	2	2	c	4	h
3	3	3	d	5	i
4	4	4	e	6	j

#left outer join

pd.merge(left_frame, right_frame, on='key', how='left')

	key	left_value	right_value
0	0	a	NaN
1	1	b	NaN
2	2	c	f
3	3	d	g
4	4	e	h

#right outer join

pd.merge(left_frame, right_frame, on='key', how='right')

	key	left_value	right_value
0	2	c	f
1	3	d	g
2	4	e	h
3	5	NaN	i
4	6	NaN	j

# full outer join

pd.merge(left_frame, right_frame, on='key', how='outer')

	key	left_value	right_value
0	0	a	NaN
1	1	b	NaN
2	2	c	f
3	3	d	g
4	4	e	h
5	5	NaN	i
6	6	NaN	j

Combining 合并，直接将2个表拼接在一起，通过参数 axis 控制是沿竖直方向（axis=0，默认）还是水平方向（axis=1）拼接

pd.concat([left_frame, right_frame])

	key	left_value	right_value
0	0	a	NaN
1	1	b	NaN
2	2	c	NaN
3	3	d	NaN
4	4	e	NaN
0	2	NaN	f
1	3	NaN	g
2	4	NaN	h
3	5	NaN	i
4	6	NaN	j

pd.concat([left_frame, right_frame], axis=1)

	key	left_value	key	right_value
0	0	a	2	f
1	1	b	3	g
2	2	c	4	h
3	3	d	5	i
4	4	e	6	j

split-apply-combine 的策略

headers = ['name', 'title', 'department', 'salary']
chicago = pd.read_csv('city-of-chicago-salaries.csv', 
                      header=0,
                      names=headers,
                      converters={'salary': lambda x: float(x.replace('$', ''))})
chicago.head()

	name	title	department	salary
0	AARON, ELVIA J	WATER RATE TAKER	WATER MGMNT	85512.0
1	AARON, JEFFERY M	POLICE OFFICER	POLICE	75372.0
2	AARON, KIMBERLEI R	CHIEF CONTRACT EXPEDITER	GENERAL SERVICES	80916.0
3	ABAD JR, VICENTE M	CIVIL ENGINEER IV	WATER MGMNT	99648.0
4	ABBATACOLA, ROBERT J	ELECTRICAL MECHANIC	AVIATION	89440.0

#groupby 返回的是一个 DataFrameGroupBy 对象（不是 DataFrame），含有多种可使用的方法
by_dept = chicago.groupby('department')
by_dept

<pandas.core.groupby.DataFrameGroupBy object at 0x1100f82d0>

print(by_dept.count().head()) # NOT NULL records within each column
print('\n')
print(by_dept.size().tail()) # total records for each department

                   name  title  salary
department                            
ADMIN HEARNG         42     42      42
ANIMAL CONTRL        61     61      61
AVIATION           1218   1218    1218
BOARD OF ELECTION   110    110     110
BOARD OF ETHICS       9      9       9

department
ADMIN HEARNG           42
ANIMAL CONTRL          61
AVIATION             1218
BOARD OF ELECTION     110
BOARD OF ETHICS         9
dtype: int64

# Aggregate salary per department and show rows 20-24 of each result.
# numeric_only=True limits every aggregation to numeric columns; without it
# the text columns (name, title) are included, which makes mean/median raise
# TypeError on recent pandas releases.
print(by_dept.sum(numeric_only=True)[20:25]) # total salaries of each department
print('\n')
print(by_dept.mean(numeric_only=True)[20:25]) # average salary of each department
print('\n')
print(by_dept.median(numeric_only=True)[20:25]) # take that, RDBMS!

                       salary
department                   
HUMAN RESOURCES     4850928.0
INSPECTOR GEN       4035150.0
IPRA                7006128.0
LAW                31883920.2
LICENSE APPL COMM     65436.0

                         salary
department                     
HUMAN RESOURCES    71337.176471
INSPECTOR GEN      80703.000000
IPRA               82425.035294
LAW                70853.156000
LICENSE APPL COMM  65436.000000

                    salary
department                
HUMAN RESOURCES    68496.0
INSPECTOR GEN      76116.0
IPRA               82524.0
LAW                66492.0
LICENSE APPL COMM  65436.0

#我们想看title最多的5个部门
#SELECT department, COUNT(DISTINCT title)
#FROM chicago
#GROUP BY department
#ORDER BY 2 DESC
#LIMIT 5;
by_dept.title.nunique().sort_values(ascending=False)[:5]

department
WATER MGMNT    153
TRANSPORTN     150
POLICE         130
AVIATION       125
HEALTH         118
Name: title, dtype: int64

def ranker(df):
    """Return a copy of ``df`` with a 1-based ``dept_rank`` column added.

    The first row gets rank 1, the second rank 2, and so on. The rows must
    therefore already be sorted by salary in descending order for rank 1 to
    mean "highest paid".

    Args:
        df: DataFrame holding the rows of a single group.

    Returns:
        A new DataFrame with an additional integer ``dept_rank`` column. The
        frame passed in is left unmodified.
    """
    # Build a new frame rather than assigning into ``df``: pandas documents
    # mutating the object passed to groupby.apply as unsupported.
    return df.assign(dept_rank=np.arange(1, len(df) + 1))

chicago.sort_values('salary', ascending=False, inplace=True)
chicago = chicago.groupby('department').apply(ranker)
print(chicago[chicago.dept_rank == 1].head(7))

                         name                     title      department  \
18039     MC CARTHY,  GARRY F  SUPERINTENDENT OF POLICE          POLICE   
8004           EMANUEL,  RAHM                     MAYOR  MAYOR'S OFFICE   
25588       SANTIAGO,  JOSE A         FIRE COMMISSIONER            FIRE   
763    ANDOLINO,  ROSEMARIE S  COMMISSIONER OF AVIATION        AVIATION   
4697     CHOUCAIR,  BECHARA N    COMMISSIONER OF HEALTH          HEALTH   
21971      PATTON,  STEPHEN R       CORPORATION COUNSEL             LAW   
12635      HOLT,  ALEXANDRA D                BUDGET DIR   BUDGET & MGMT   

         salary  dept_rank  
18039  260004.0          1  
8004   216210.0          1  
25588  202728.0          1  
763    186576.0          1  
4697   177156.0          1  
21971  173664.0          1  
12635  169992.0          1

chicago[chicago.department == "LAW"][:5]

	name	title	department	salary	dept_rank
21971	PATTON, STEPHEN R	CORPORATION COUNSEL	LAW	173664.0	1
6311	DARLING, LESLIE M	FIRST ASST CORPORATION COUNSEL	LAW	149160.0	2
17680	MARTINICO, JOSEPH P	CHIEF LABOR NEGOTIATOR	LAW	144036.0	3
22357	PETERS, LYNDA A	CITY PROSECUTOR	LAW	139932.0	4
31383	WONG JR, EDWARD J	DEPUTY CORPORATION COUNSEL	LAW	137076.0	5

# pass in column names for each CSV
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols,
                    encoding='latin-1')

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols,
                      encoding='latin-1')

# the movies file contains columns indicating the movie's genres
# let's only load the first five columns of the file with usecols
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=m_cols, usecols=range(5),
                     encoding='latin-1')

# create one merged DataFrame
movie_ratings = pd.merge(movies, ratings)
lens = pd.merge(movie_ratings, users)

# Preview the first rows of the movies table (all loaded columns).
movies.head()

	movie_id	title	release_date	video_release_date	imdb_url
0	1	Toy Story (1995)	01-Jan-1995	NaN	http://us.imdb.com/M/title-exact?Toy%20Story%2...
1	2	GoldenEye (1995)	01-Jan-1995	NaN	http://us.imdb.com/M/title-exact?GoldenEye%20(...
2	3	Four Rooms (1995)	01-Jan-1995	NaN	http://us.imdb.com/M/title-exact?Four%20Rooms%...
3	4	Get Shorty (1995)	01-Jan-1995	NaN	http://us.imdb.com/M/title-exact?Get%20Shorty%...
4	5	Copycat (1995)	01-Jan-1995	NaN	http://us.imdb.com/M/title-exact?Copycat%20(1995)

ratings[ratings.movie_id == 1].head()

	user_id	movie_id	rating	unix_timestamp
24	308	1	4	887736532
454	287	1	5	875334088
957	148	1	4	877019411
971	280	1	4	891700426
1324	66	1	3	883601324

movie_ratings.head()

	movie_id	title	release_date	video_release_date	imdb_url	user_id	rating	unix_timestamp
0	1	Toy Story (1995)	01-Jan-1995	NaN	http://us.imdb.com/M/title-exact?Toy%20Story%2...	308	4	887736532
1	1	Toy Story (1995)	01-Jan-1995	NaN	http://us.imdb.com/M/title-exact?Toy%20Story%2...	287	5	875334088
2	1	Toy Story (1995)	01-Jan-1995	NaN	http://us.imdb.com/M/title-exact?Toy%20Story%2...	148	4	877019411
3	1	Toy Story (1995)	01-Jan-1995	NaN	http://us.imdb.com/M/title-exact?Toy%20Story%2...	280	4	891700426
4	1	Toy Story (1995)	01-Jan-1995	NaN	http://us.imdb.com/M/title-exact?Toy%20Story%2...	66	3	883601324

What are the 25 most rated movies?

most_rated = lens.groupby('title').size().sort_values(ascending=False)[:25]
most_rated

title
Star Wars (1977)                             583
Contact (1997)                               509
Fargo (1996)                                 508
Return of the Jedi (1983)                    507
Liar Liar (1997)                             485
English Patient, The (1996)                  481
Scream (1996)                                478
Toy Story (1995)                             452
Air Force One (1997)                         431
Independence Day (ID4) (1996)                429
Raiders of the Lost Ark (1981)               420
Godfather, The (1972)                        413
Pulp Fiction (1994)                          394
Twelve Monkeys (1995)                        392
Silence of the Lambs, The (1991)             390
Jerry Maguire (1996)                         384
Chasing Amy (1997)                           379
Rock, The (1996)                             378
Empire Strikes Back, The (1980)              367
Star Trek: First Contact (1996)              365
Back to the Future (1985)                    350
Titanic (1997)                               350
Mission: Impossible (1996)                   344
Fugitive, The (1993)                         336
Indiana Jones and the Last Crusade (1989)    331
dtype: int64

# 或者
lens.title.value_counts()[:25]

Star Wars (1977)                             583
Contact (1997)                               509
Fargo (1996)                                 508
Return of the Jedi (1983)                    507
Liar Liar (1997)                             485
English Patient, The (1996)                  481
Scream (1996)                                478
Toy Story (1995)                             452
Air Force One (1997)                         431
Independence Day (ID4) (1996)                429
Raiders of the Lost Ark (1981)               420
Godfather, The (1972)                        413
Pulp Fiction (1994)                          394
Twelve Monkeys (1995)                        392
Silence of the Lambs, The (1991)             390
Jerry Maguire (1996)                         384
Chasing Amy (1997)                           379
Rock, The (1996)                             378
Empire Strikes Back, The (1980)              367
Star Trek: First Contact (1996)              365
Titanic (1997)                               350
Back to the Future (1985)                    350
Mission: Impossible (1996)                   344
Fugitive, The (1993)                         336
Indiana Jones and the Last Crusade (1989)    331
Name: title, dtype: int64

Which movies are most highly rated?

movie_stats = lens.groupby('title').agg({'rating': [np.size, np.mean]})
movie_stats.head()

	rating
	size	mean
title
'Til There Was You (1997)	9	2.333333
1-900 (1994)	5	2.600000
101 Dalmatians (1996)	109	2.908257
12 Angry Men (1957)	125	4.344000
187 (1997)	41	3.024390

# sort by rating average
movie_stats.sort_values([('rating', 'mean')], ascending=False).head()

	rating
	size	mean
title
They Made Me a Criminal (1939)	1	5.0
Marlene Dietrich: Shadow and Light (1996)	1	5.0
Saint of Fort Washington, The (1993)	2	5.0
Someone Else's America (1995)	1	5.0
Star Kid (1997)	3	5.0

atleast_100 = movie_stats['rating']['size'] >= 100
movie_stats[atleast_100].sort_values([('rating', 'mean')], ascending=False)[:15]

	rating
	size	mean
title
Close Shave, A (1995)	112	4.491071
Schindler's List (1993)	298	4.466443
Wrong Trousers, The (1993)	118	4.466102
Casablanca (1942)	243	4.456790
Shawshank Redemption, The (1994)	283	4.445230
Rear Window (1954)	209	4.387560
Usual Suspects, The (1995)	267	4.385768
Star Wars (1977)	583	4.358491
12 Angry Men (1957)	125	4.344000
Citizen Kane (1941)	198	4.292929
To Kill a Mockingbird (1962)	219	4.292237
One Flew Over the Cuckoo's Nest (1975)	264	4.291667
Silence of the Lambs, The (1991)	390	4.289744
North by Northwest (1959)	179	4.284916
Godfather, The (1972)	413	4.283293

#这是错误的方法
movie_stats.sort_values([('rating', 'mean'),('rating', 'size')], ascending=False).head()

	rating
	size	mean
title
Prefontaine (1997)	3	5.0
Star Kid (1997)	3	5.0
Saint of Fort Washington, The (1993)	2	5.0
Santa with Muscles (1996)	2	5.0
Aiqing wansui (1994)	1	5.0

most_50 = lens.groupby('movie_id').size().sort_values(ascending=False)[:50]

users.age.plot.hist(bins=30)
plt.title("Distribution of users' ages")
plt.ylabel('count of users')
plt.xlabel('age');

png

pandas 绑定了 matplotlib

对用户按年龄分组（分箱，binning）

labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79']
lens['age_group'] = pd.cut(lens.age, range(0, 81, 10), right=False, labels=labels)
lens[['age', 'age_group']].drop_duplicates()[:10]

	age	age_group
0	60	60-69
397	21	20-29
459	33	30-39
524	30	30-39
782	23	20-29
995	29	20-29
1229	26	20-29
1664	31	30-39
1942	24	20-29
2270	32	30-39

lens.groupby('age_group').agg({'rating': [np.size, np.mean]})

	rating
	size	mean
age_group
0-9	43	3.767442
10-19	8181	3.486126
20-29	39535	3.467333
30-39	25696	3.554444
40-49	15021	3.591772
50-59	8704	3.635800
60-69	2623	3.648875
70-79	197	3.649746

lens.set_index('movie_id', inplace=True)

by_age = lens.loc[most_50.index].groupby(['title', 'age_group'])
by_age.rating.mean().head(15)

title                 age_group
Air Force One (1997)  10-19        3.647059
                      20-29        3.666667
                      30-39        3.570000
                      40-49        3.555556
                      50-59        3.750000
                      60-69        3.666667
                      70-79        3.666667
Alien (1979)          10-19        4.111111
                      20-29        4.026087
                      30-39        4.103448
                      40-49        3.833333
                      50-59        4.272727
                      60-69        3.500000
                      70-79        4.000000
Aliens (1986)         10-19        4.050000
Name: rating, dtype: float64

by_age.rating.mean().unstack(1).fillna(0)[10:20]

age_group	0-9	10-19	20-29	30-39	40-49	50-59	60-69	70-79
title
E.T. the Extra-Terrestrial (1982)	0.0	3.680000	3.609091	3.806818	4.160000	4.368421	4.375000	0.000000
Empire Strikes Back, The (1980)	4.0	4.642857	4.311688	4.052083	4.100000	3.909091	4.250000	5.000000
English Patient, The (1996)	5.0	3.739130	3.571429	3.621849	3.634615	3.774648	3.904762	4.500000
Fargo (1996)	0.0	3.937500	4.010471	4.230769	4.294118	4.442308	4.000000	4.333333
Forrest Gump (1994)	5.0	4.047619	3.785714	3.861702	3.847826	4.000000	3.800000	0.000000
Fugitive, The (1993)	0.0	4.320000	3.969925	3.981481	4.190476	4.240000	3.666667	0.000000
Full Monty, The (1997)	0.0	3.421053	4.056818	3.933333	3.714286	4.146341	4.166667	3.500000
Godfather, The (1972)	0.0	4.400000	4.345070	4.412844	3.929412	4.463415	4.125000	0.000000
Groundhog Day (1993)	0.0	3.476190	3.798246	3.786667	3.851064	3.571429	3.571429	4.000000
Independence Day (ID4) (1996)	0.0	3.595238	3.291429	3.389381	3.718750	3.888889	2.750000	0.000000

哪部电影男、女的评价差异最大

lens.reset_index('movie_id', inplace=True)
pivoted = lens.pivot_table(index=['movie_id', 'title'],
                           columns=['sex'],
                           values='rating',
                           fill_value=0)
pivoted.head()

	sex	F	M
movie_id	title
1	Toy Story (1995)	3.789916	3.909910
2	GoldenEye (1995)	3.368421	3.178571
3	Four Rooms (1995)	2.687500	3.108108
4	Get Shorty (1995)	3.400000	3.591463
5	Copycat (1995)	3.772727	3.140625

pivoted['diff'] = pivoted.M - pivoted.F
pivoted.head()

	sex	F	M	diff
movie_id	title
1	Toy Story (1995)	3.789916	3.909910	0.119994
2	GoldenEye (1995)	3.368421	3.178571	-0.189850
3	Four Rooms (1995)	2.687500	3.108108	0.420608
4	Get Shorty (1995)	3.400000	3.591463	0.191463
5	Copycat (1995)	3.772727	3.140625	-0.632102

pivoted.reset_index('movie_id', inplace=True)

disagreements = pivoted[pivoted.movie_id.isin(most_50.index)]['diff']
disagreements.head()

title
Toy Story (1995)             0.119994
Twelve Monkeys (1995)        0.300315
Dead Man Walking (1995)     -0.043452
Mr. Holland's Opus (1995)   -0.244160
Braveheart (1995)            0.031136
Name: diff, dtype: float64

disagreements = pivoted[pivoted.movie_id.isin(most_50.index)]['diff']
disagreements.sort_values().plot(kind='barh', figsize=[9, 15])
plt.title('Male vs. Female Avg. Ratings\n(Difference > 0 = Favored by Men)')
plt.ylabel('Title')
plt.xlabel('Average Rating Difference');

png

AprilJoy / blog

pandas入门 #5

dataframe应用

数据读入

数据概览

2个表之间的操作

JOIN

split-apply-combine 的策略

What are the 25 most rated movies?

Which movies are most highly rated?

对用户按年龄分组（分箱，binning）

哪部电影男、女的评价差异最大

	key	left_value	right_value
0	0	a	NaN
1	1	b	NaN
2	2	c	NaN
3	3	d	NaN
4	4	e	NaN
0	2	NaN	f
1	3	NaN	g
2	4	NaN	h
3	5	NaN	i
4	6	NaN	j

	key	left_value	right_value
0	0	a	NaN
1	1	b	NaN
2	2	c	NaN
3	3	d	NaN
4	4	e	NaN
0	2	NaN	f
1	3	NaN	g
2	4	NaN	h
3	5	NaN	i
4	6	NaN	j

	key	left_value	right_value
0	0	a	NaN
1	1	b	NaN
2	2	c	NaN
3	3	d	NaN
4	4	e	NaN
0	2	NaN	f
1	3	NaN	g
2	4	NaN	h
3	5	NaN	i
4	6	NaN	j