AprilJoy / blog

想订阅的点Watch,欢迎 Star,您的鼓励是我写作的动力!
https://apriljoy.github.io/Demo_deploy
0 stars 0 forks source link

pandas入门 #5

Open AprilJoy opened 6 years ago

AprilJoy commented 6 years ago
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('max_columns', 50)
%matplotlib inline

pandas是python的一个用于数据分析的函数库,通过数据结构作为入门的出发点,我们来看一下如何使用这个函数库。

pandas只有2种数据结构,series 和 dataframe

#series 是一个一维的对象,类似数组
# create a Series with an arbitrary list
s = pd.Series([7, 'Heisenberg', 3.14, -1789710578, 'Happy Eating!'])
s
0                7
1       Heisenberg
2             3.14
3      -1789710578
4    Happy Eating!
dtype: object
#默认的索引是0-N,可以通过index自定义索引
s = pd.Series([7, 'Heisenberg', 3.14, -1789710578, 'Happy Eating!'],
              index=['A', 'Z', 'C', 'Y', 'E'])
s
A                7
Z       Heisenberg
C             3.14
Y      -1789710578
E    Happy Eating!
dtype: object
#通过dict创建series
d = {'Chicago': 1000, 'New York': 1300, 'Portland': 900, 'San Francisco': 1100,
     'Austin': 450, 'Boston': None}
cities = pd.Series(d)
cities
Austin            450.0
Boston              NaN
Chicago          1000.0
New York         1300.0
Portland          900.0
San Francisco    1100.0
dtype: float64
cities['Chicago']
1000.0
cities[['Chicago', 'Portland', 'San Francisco']]
Chicago          1000.0
Portland          900.0
San Francisco    1100.0
dtype: float64
cities[cities < 1000]
Austin      450.0
Portland    900.0
dtype: float64
# 改变series的数值
print('Old value:', cities['Chicago'])
cities['Chicago'] = 1400
print('New value:', cities['Chicago'])
('Old value:', 1000.0)
('New value:', 1400.0)
print('Seattle' in cities)
print('San Francisco' in cities)
False
True
# divide city values by 3
cities / 3
Austin           150.000000
Boston                  NaN
Chicago          466.666667
New York         433.333333
Portland         300.000000
San Francisco    366.666667
dtype: float64
#dataframe是一组series,他们共用同一个index
#读取数据
#自定义数据
data = {'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
        'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions', 'Lions', 'Lions'],
        'wins': [11, 8, 10, 15, 11, 6, 10, 4],
        'losses': [5, 8, 6, 1, 5, 10, 6, 12]}
football = pd.DataFrame(data, columns=['year', 'team', 'wins', 'losses'])
football
year team wins losses
0 2010 Bears 11 5
1 2011 Bears 8 8
2 2012 Bears 10 6
3 2011 Packers 15 1
4 2012 Packers 11 5
5 2010 Lions 6 10
6 2011 Lions 10 6
7 2012 Lions 4 12
#读取csv数据
from_csv = pd.read_csv('test.csv')
from_csv.head()
Year Age Tm Lg W L W-L% ERA G GS GF CG SHO SV IP H R ER HR BB IBB SO HBP BK WP BF ERA+ WHIP H/9 HR/9 BB/9 SO/9 SO/BB Awards
0 1995 25 NYY AL 5 3 0.625 5.51 19 10 2 0 0 0 67.0 71 43 41 11 30 0 51 2 1 0 301 84 1.507 9.5 1.5 4.0 6.9 1.70 NaN
1 1996 26 NYY AL 8 3 0.727 2.09 61 0 14 0 0 5 107.2 73 25 25 1 34 3 130 2 0 1 425 240 0.994 6.1 0.1 2.8 10.9 3.82 CYA-3MVP-12
2 1997 27 NYY AL 6 4 0.600 1.88 66 0 56 0 0 43 71.2 65 17 15 5 20 6 68 0 0 2 301 239 1.186 8.2 0.6 2.5 8.5 3.40 ASMVP-25
3 1998 28 NYY AL 3 0 1.000 1.91 54 0 49 0 0 36 61.1 48 13 13 3 17 1 36 1 0 0 246 233 1.060 7.0 0.4 2.5 5.3 2.12 NaN
#没有表头的csv数据
cols = ['num', 'game', 'date', 'team', 'home_away', 'opponent',
        'result', 'quarter', 'distance', 'receiver', 'score_before',
        'score_after']
no_headers = pd.read_csv('peyton-passing-TDs-2012.csv', sep=',', header=None,
                         names=cols)
#读excel
#需要安装xlrd 库
data = {'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
        'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions', 'Lions', 'Lions'],
        'wins': [11, 8, 10, 15, 11, 6, 10, 4],
        'losses': [5, 8, 6, 1, 5, 10, 6, 12]}
football = pd.DataFrame(data, columns=['year', 'team', 'wins', 'losses'])
football
year team wins losses
0 2010 Bears 11 5
1 2011 Bears 8 8
2 2012 Bears 10 6
3 2011 Packers 15 1
4 2012 Packers 11 5
5 2010 Lions 6 10
6 2011 Lions 10 6
7 2012 Lions 4 12
football.to_excel('football.xlsx', index=False)
/Library/Python/2.7/site-packages/pandas/io/excel.py:784: DeprecationWarning: Call to deprecated function remove_sheet (Use wb.remove(worksheet) or del wb[sheetname]).
  self.book.remove_sheet(self.book.worksheets[0])
/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/encoder.py:207: DeprecationWarning: Interpreting naive datetime as local 2018-08-16 15:55:04.426237. Please add timezone info to timestamps.
  chunks = self.iterencode(o, _one_shot=True)
# delete the DataFrame
del football
/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/encoder.py:207: DeprecationWarning: Interpreting naive datetime as local 2018-08-16 15:56:02.944848. Please add timezone info to timestamps.
  chunks = self.iterencode(o, _one_shot=True)
# read from Excel
football = pd.read_excel('football.xlsx', 'Sheet1')
football
year team wins losses
0 2010 Bears 11 5
1 2011 Bears 8 8
2 2012 Bears 10 6
3 2011 Packers 15 1
4 2012 Packers 11 5
5 2010 Lions 6 10
6 2011 Lions 10 6
7 2012 Lions 4 12
/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/encoder.py:207: DeprecationWarning: Interpreting naive datetime as local 2018-08-16 15:56:16.107466. Please add timezone info to timestamps.
  chunks = self.iterencode(o, _one_shot=True)
football
losses team wins year
0 5 bears 11 2010
1 8 packer 3 2011
2 6 bears 3 2012

dataframe应用

数据读入


# pass in column names for each CSV
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols,
                    encoding='latin-1')
users.head()
user_id age sex occupation zip_code
0 1 24 M technician 85711
1 2 53 F other 94043
2 3 23 M writer 32067
3 4 24 M technician 43537
4 5 33 F other 15213
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols,
                      encoding='latin-1')
# the movies file contains columns indicating the movie's genres
# let's only load the first five columns of the file with usecols
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=m_cols, usecols=range(5),
                     encoding='latin-1')
movies.head()
movie_id title release_date video_release_date imdb_url
0 1 Toy Story (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Toy%20Story%2...
1 2 GoldenEye (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?GoldenEye%20(...
2 3 Four Rooms (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Four%20Rooms%...
3 4 Get Shorty (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Get%20Shorty%...
4 5 Copycat (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Copycat%20(1995)

数据概览

movies.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 5 columns):
movie_id              1682 non-null int64
title                 1682 non-null object
release_date          1681 non-null object
video_release_date    0 non-null float64
imdb_url              1679 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 65.8+ KB
users.describe()
user_id age
count 943.000000 943.000000
mean 472.000000 34.051962
std 272.364951 12.192740
min 1.000000 7.000000
25% 236.500000 25.000000
50% 472.000000 31.000000
75% 707.500000 43.000000
max 943.000000 73.000000
movies.tail(3)
movie_id title release_date video_release_date imdb_url
1679 1680 Sliding Doors (1998) 01-Jan-1998 NaN http://us.imdb.com/Title?Sliding+Doors+(1998)
1680 1681 You So Crazy (1994) 01-Jan-1994 NaN http://us.imdb.com/M/title-exact?You%20So%20Cr...
1681 1682 Scream of Stone (Schrei aus Stein) (1991) 08-Mar-1996 NaN http://us.imdb.com/M/title-exact?Schrei%20aus%...
#查看部分数据
print(users[['age', 'zip_code']].head())
print('\n')

# can also store in a variable to use later
columns_you_want = ['occupation', 'sex'] 
print(users[columns_you_want].head())
   age zip_code
0   24    85711
1   53    94043
2   23    32067
3   24    43537
4   33    15213

   occupation sex
0  technician   M
1       other   F
2      writer   M
3  technician   M
4       other   F
# users older than 25
print(users[users.age > 25].head(3))
print('\n')

# users aged 40 AND male
print(users[(users.age == 40) & (users.sex == 'M')].head(3))
print('\n')

# users younger than 30 OR female
print(users[(users.sex == 'F') | (users.age < 30)].head(3))
   user_id  age sex occupation zip_code
1        2   53   F      other    94043
4        5   33   F      other    15213
5        6   42   M  executive    98101

     user_id  age sex  occupation zip_code
18        19   40   M   librarian    02138
82        83   40   M       other    44133
115      116   40   M  healthcare    97232

   user_id  age sex  occupation zip_code
0        1   24   M  technician    85711
1        2   53   F       other    94043
2        3   23   M      writer    32067
print(users.set_index('user_id').head())
print('\n')

print(users.head())
print("\n^^^ I didn't actually change the DataFrame. ^^^\n")

with_new_index = users.set_index('user_id')
print(with_new_index.head())
print("\n^^^ set_index actually returns a new DataFrame. ^^^\n")
         age sex  occupation zip_code
user_id                              
1         24   M  technician    85711
2         53   F       other    94043
3         23   M      writer    32067
4         24   M  technician    43537
5         33   F       other    15213

   user_id  age sex  occupation zip_code
0        1   24   M  technician    85711
1        2   53   F       other    94043
2        3   23   M      writer    32067
3        4   24   M  technician    43537
4        5   33   F       other    15213

^^^ I didn't actually change the DataFrame. ^^^

         age sex  occupation zip_code
user_id                              
1         24   M  technician    85711
2         53   F       other    94043
3         23   M      writer    32067
4         24   M  technician    43537
5         33   F       other    15213

^^^ set_index actually returns a new DataFrame. ^^^
with_new_index.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 943 entries, 1 to 943
Data columns (total 4 columns):
age           943 non-null int64
sex           943 non-null object
occupation    943 non-null object
zip_code      943 non-null object
dtypes: int64(1), object(3)
memory usage: 36.8+ KB
users.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 5 columns):
user_id       943 non-null int64
age           943 non-null int64
sex           943 non-null object
occupation    943 non-null object
zip_code      943 non-null object
dtypes: int64(2), object(3)
memory usage: 36.9+ KB

2个表之间的操作

JOIN

left_frame = pd.DataFrame({'key': range(5), 
                           'left_value': ['a', 'b', 'c', 'd', 'e']})
right_frame = pd.DataFrame({'key': range(2, 7), 
                           'right_value': ['f', 'g', 'h', 'i', 'j']})
print(left_frame)
print('\n')
print(right_frame)
   key left_value
0    0          a
1    1          b
2    2          c
3    3          d
4    4          e

   key right_value
0    2           f
1    3           g
2    4           h
3    5           i
4    6           j
pd.merge(left_frame, right_frame, on='key', how='inner')
key left_value right_value
0 2 c f
1 3 d g
2 4 e h
pd.merge(left_frame, right_frame, left_on='key', right_index=True)
key key_x left_value key_y right_value
0 0 0 a 2 f
1 1 1 b 3 g
2 2 2 c 4 h
3 3 3 d 5 i
4 4 4 e 6 j
#left outer join

pd.merge(left_frame, right_frame, on='key', how='left')
key left_value right_value
0 0 a NaN
1 1 b NaN
2 2 c f
3 3 d g
4 4 e h
#right outer join

pd.merge(left_frame, right_frame, on='key', how='right')
key left_value right_value
0 2 c f
1 3 d g
2 4 e h
3 5 NaN i
4 6 NaN j
# full outer join

pd.merge(left_frame, right_frame, on='key', how='outer')
key left_value right_value
0 0 a NaN
1 1 b NaN
2 2 c f
3 3 d g
4 4 e h
5 5 NaN i
6 6 NaN j

Combining 合并,直接将2个表相加,通过参数控制水平方向相加还是数值方向

pd.concat([left_frame, right_frame])
key left_value right_value
0 0 a NaN
1 1 b NaN
2 2 c NaN
3 3 d NaN
4 4 e NaN
0 2 NaN f
1 3 NaN g
2 4 NaN h
3 5 NaN i
4 6 NaN j
pd.concat([left_frame, right_frame], axis=1)
key left_value key right_value
0 0 a 2 f
1 1 b 3 g
2 2 c 4 h
3 3 d 5 i
4 4 e 6 j

split-apply-combine 的策略

headers = ['name', 'title', 'department', 'salary']
chicago = pd.read_csv('city-of-chicago-salaries.csv', 
                      header=0,
                      names=headers,
                      converters={'salary': lambda x: float(x.replace('$', ''))})
chicago.head()
name title department salary
0 AARON, ELVIA J WATER RATE TAKER WATER MGMNT 85512.0
1 AARON, JEFFERY M POLICE OFFICER POLICE 75372.0
2 AARON, KIMBERLEI R CHIEF CONTRACT EXPEDITER GENERAL SERVICES 80916.0
3 ABAD JR, VICENTE M CIVIL ENGINEER IV WATER MGMNT 99648.0
4 ABBATACOLA, ROBERT J ELECTRICAL MECHANIC AVIATION 89440.0
#groupby 返回的是一个dataframe的对象,含有多种可使用的方法
by_dept = chicago.groupby('department')
by_dept
<pandas.core.groupby.DataFrameGroupBy object at 0x1100f82d0>
print(by_dept.count().head()) # NOT NULL records within each column
print('\n')
print(by_dept.size().tail()) # total records for each department
                   name  title  salary
department                            
ADMIN HEARNG         42     42      42
ANIMAL CONTRL        61     61      61
AVIATION           1218   1218    1218
BOARD OF ELECTION   110    110     110
BOARD OF ETHICS       9      9       9

department
ADMIN HEARNG           42
ANIMAL CONTRL          61
AVIATION             1218
BOARD OF ELECTION     110
BOARD OF ETHICS         9
dtype: int64
print(by_dept.sum()[20:25]) # total salaries of each department
print('\n')
print(by_dept.mean()[20:25]) # average salary of each department
print('\n')
print(by_dept.median()[20:25]) # take that, RDBMS!
                       salary
department                   
HUMAN RESOURCES     4850928.0
INSPECTOR GEN       4035150.0
IPRA                7006128.0
LAW                31883920.2
LICENSE APPL COMM     65436.0

                         salary
department                     
HUMAN RESOURCES    71337.176471
INSPECTOR GEN      80703.000000
IPRA               82425.035294
LAW                70853.156000
LICENSE APPL COMM  65436.000000

                    salary
department                
HUMAN RESOURCES    68496.0
INSPECTOR GEN      76116.0
IPRA               82524.0
LAW                66492.0
LICENSE APPL COMM  65436.0
#我们想看title最多的5个部门
#SELECT department, COUNT(DISTINCT title)
#FROM chicago
#GROUP BY department
#ORDER BY 2 DESC
#LIMIT 5;
by_dept.title.nunique().sort_values(ascending=False)[:5]
department
WATER MGMNT    153
TRANSPORTN     150
POLICE         130
AVIATION       125
HEALTH         118
Name: title, dtype: int64
def ranker(df):
    """Assigns a rank to each employee based on salary, with 1 being the highest paid.
    Assumes the data is DESC sorted."""
    df['dept_rank'] = np.arange(len(df)) + 1
    return df

chicago.sort_values('salary', ascending=False, inplace=True)
chicago = chicago.groupby('department').apply(ranker)
print(chicago[chicago.dept_rank == 1].head(7))
                         name                     title      department  \
18039     MC CARTHY,  GARRY F  SUPERINTENDENT OF POLICE          POLICE   
8004           EMANUEL,  RAHM                     MAYOR  MAYOR'S OFFICE   
25588       SANTIAGO,  JOSE A         FIRE COMMISSIONER            FIRE   
763    ANDOLINO,  ROSEMARIE S  COMMISSIONER OF AVIATION        AVIATION   
4697     CHOUCAIR,  BECHARA N    COMMISSIONER OF HEALTH          HEALTH   
21971      PATTON,  STEPHEN R       CORPORATION COUNSEL             LAW   
12635      HOLT,  ALEXANDRA D                BUDGET DIR   BUDGET & MGMT   

         salary  dept_rank  
18039  260004.0          1  
8004   216210.0          1  
25588  202728.0          1  
763    186576.0          1  
4697   177156.0          1  
21971  173664.0          1  
12635  169992.0          1  
chicago[chicago.department == "LAW"][:5]
name title department salary dept_rank
21971 PATTON, STEPHEN R CORPORATION COUNSEL LAW 173664.0 1
6311 DARLING, LESLIE M FIRST ASST CORPORATION COUNSEL LAW 149160.0 2
17680 MARTINICO, JOSEPH P CHIEF LABOR NEGOTIATOR LAW 144036.0 3
22357 PETERS, LYNDA A CITY PROSECUTOR LAW 139932.0 4
31383 WONG JR, EDWARD J DEPUTY CORPORATION COUNSEL LAW 137076.0 5
# pass in column names for each CSV
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols,
                    encoding='latin-1')

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols,
                      encoding='latin-1')

# the movies file contains columns indicating the movie's genres
# let's only load the first five columns of the file with usecols
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=m_cols, usecols=range(5),
                     encoding='latin-1')

# create one merged DataFrame
movie_ratings = pd.merge(movies, ratings)
lens = pd.merge(movie_ratings, users)
movies[].head()
movie_id title release_date video_release_date imdb_url
0 1 Toy Story (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Toy%20Story%2...
1 2 GoldenEye (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?GoldenEye%20(...
2 3 Four Rooms (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Four%20Rooms%...
3 4 Get Shorty (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Get%20Shorty%...
4 5 Copycat (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Copycat%20(1995)
ratings[ratings.movie_id == 1].head()
user_id movie_id rating unix_timestamp
24 308 1 4 887736532
454 287 1 5 875334088
957 148 1 4 877019411
971 280 1 4 891700426
1324 66 1 3 883601324
movie_ratings.head()
movie_id title release_date video_release_date imdb_url user_id rating unix_timestamp
0 1 Toy Story (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Toy%20Story%2... 308 4 887736532
1 1 Toy Story (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Toy%20Story%2... 287 5 875334088
2 1 Toy Story (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Toy%20Story%2... 148 4 877019411
3 1 Toy Story (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Toy%20Story%2... 280 4 891700426
4 1 Toy Story (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Toy%20Story%2... 66 3 883601324

What are the 25 most rated movies?

most_rated = lens.groupby('title').size().sort_values(ascending=False)[:25]
most_rated
title
Star Wars (1977)                             583
Contact (1997)                               509
Fargo (1996)                                 508
Return of the Jedi (1983)                    507
Liar Liar (1997)                             485
English Patient, The (1996)                  481
Scream (1996)                                478
Toy Story (1995)                             452
Air Force One (1997)                         431
Independence Day (ID4) (1996)                429
Raiders of the Lost Ark (1981)               420
Godfather, The (1972)                        413
Pulp Fiction (1994)                          394
Twelve Monkeys (1995)                        392
Silence of the Lambs, The (1991)             390
Jerry Maguire (1996)                         384
Chasing Amy (1997)                           379
Rock, The (1996)                             378
Empire Strikes Back, The (1980)              367
Star Trek: First Contact (1996)              365
Back to the Future (1985)                    350
Titanic (1997)                               350
Mission: Impossible (1996)                   344
Fugitive, The (1993)                         336
Indiana Jones and the Last Crusade (1989)    331
dtype: int64
# 或者
lens.title.value_counts()[:25]
Star Wars (1977)                             583
Contact (1997)                               509
Fargo (1996)                                 508
Return of the Jedi (1983)                    507
Liar Liar (1997)                             485
English Patient, The (1996)                  481
Scream (1996)                                478
Toy Story (1995)                             452
Air Force One (1997)                         431
Independence Day (ID4) (1996)                429
Raiders of the Lost Ark (1981)               420
Godfather, The (1972)                        413
Pulp Fiction (1994)                          394
Twelve Monkeys (1995)                        392
Silence of the Lambs, The (1991)             390
Jerry Maguire (1996)                         384
Chasing Amy (1997)                           379
Rock, The (1996)                             378
Empire Strikes Back, The (1980)              367
Star Trek: First Contact (1996)              365
Titanic (1997)                               350
Back to the Future (1985)                    350
Mission: Impossible (1996)                   344
Fugitive, The (1993)                         336
Indiana Jones and the Last Crusade (1989)    331
Name: title, dtype: int64

Which movies are most highly rated?

movie_stats = lens.groupby('title').agg({'rating': [np.size, np.mean]})
movie_stats.head()
rating
size mean
title
'Til There Was You (1997) 9 2.333333
1-900 (1994) 5 2.600000
101 Dalmatians (1996) 109 2.908257
12 Angry Men (1957) 125 4.344000
187 (1997) 41 3.024390
# sort by rating average
movie_stats.sort_values([('rating', 'mean')], ascending=False).head()
rating
size mean
title
They Made Me a Criminal (1939) 1 5.0
Marlene Dietrich: Shadow and Light (1996) 1 5.0
Saint of Fort Washington, The (1993) 2 5.0
Someone Else's America (1995) 1 5.0
Star Kid (1997) 3 5.0
atleast_100 = movie_stats['rating']['size'] >= 100
movie_stats[atleast_100].sort_values([('rating', 'mean')], ascending=False)[:15]
rating
size mean
title
Close Shave, A (1995) 112 4.491071
Schindler's List (1993) 298 4.466443
Wrong Trousers, The (1993) 118 4.466102
Casablanca (1942) 243 4.456790
Shawshank Redemption, The (1994) 283 4.445230
Rear Window (1954) 209 4.387560
Usual Suspects, The (1995) 267 4.385768
Star Wars (1977) 583 4.358491
12 Angry Men (1957) 125 4.344000
Citizen Kane (1941) 198 4.292929
To Kill a Mockingbird (1962) 219 4.292237
One Flew Over the Cuckoo's Nest (1975) 264 4.291667
Silence of the Lambs, The (1991) 390 4.289744
North by Northwest (1959) 179 4.284916
Godfather, The (1972) 413 4.283293
#这是错误的方法
movie_stats.sort_values([('rating', 'mean'),('rating', 'size')], ascending=False).head()
rating
size mean
title
Prefontaine (1997) 3 5.0
Star Kid (1997) 3 5.0
Saint of Fort Washington, The (1993) 2 5.0
Santa with Muscles (1996) 2 5.0
Aiqing wansui (1994) 1 5.0
most_50 = lens.groupby('movie_id').size().sort_values(ascending=False)[:50]
users.age.plot.hist(bins=30)
plt.title("Distribution of users' ages")
plt.ylabel('count of users')
plt.xlabel('age');

png

pandas 绑定了 matplotlib

绑定用户

labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79']
lens['age_group'] = pd.cut(lens.age, range(0, 81, 10), right=False, labels=labels)
lens[['age', 'age_group']].drop_duplicates()[:10]
age age_group
0 60 60-69
397 21 20-29
459 33 30-39
524 30 30-39
782 23 20-29
995 29 20-29
1229 26 20-29
1664 31 30-39
1942 24 20-29
2270 32 30-39
lens.groupby('age_group').agg({'rating': [np.size, np.mean]})
rating
size mean
age_group
0-9 43 3.767442
10-19 8181 3.486126
20-29 39535 3.467333
30-39 25696 3.554444
40-49 15021 3.591772
50-59 8704 3.635800
60-69 2623 3.648875
70-79 197 3.649746
lens.set_index('movie_id', inplace=True)
by_age = lens.loc[most_50.index].groupby(['title', 'age_group'])
by_age.rating.mean().head(15)
title                 age_group
Air Force One (1997)  10-19        3.647059
                      20-29        3.666667
                      30-39        3.570000
                      40-49        3.555556
                      50-59        3.750000
                      60-69        3.666667
                      70-79        3.666667
Alien (1979)          10-19        4.111111
                      20-29        4.026087
                      30-39        4.103448
                      40-49        3.833333
                      50-59        4.272727
                      60-69        3.500000
                      70-79        4.000000
Aliens (1986)         10-19        4.050000
Name: rating, dtype: float64
by_age.rating.mean().unstack(1).fillna(0)[10:20]
age_group 0-9 10-19 20-29 30-39 40-49 50-59 60-69 70-79
title
E.T. the Extra-Terrestrial (1982) 0.0 3.680000 3.609091 3.806818 4.160000 4.368421 4.375000 0.000000
Empire Strikes Back, The (1980) 4.0 4.642857 4.311688 4.052083 4.100000 3.909091 4.250000 5.000000
English Patient, The (1996) 5.0 3.739130 3.571429 3.621849 3.634615 3.774648 3.904762 4.500000
Fargo (1996) 0.0 3.937500 4.010471 4.230769 4.294118 4.442308 4.000000 4.333333
Forrest Gump (1994) 5.0 4.047619 3.785714 3.861702 3.847826 4.000000 3.800000 0.000000
Fugitive, The (1993) 0.0 4.320000 3.969925 3.981481 4.190476 4.240000 3.666667 0.000000
Full Monty, The (1997) 0.0 3.421053 4.056818 3.933333 3.714286 4.146341 4.166667 3.500000
Godfather, The (1972) 0.0 4.400000 4.345070 4.412844 3.929412 4.463415 4.125000 0.000000
Groundhog Day (1993) 0.0 3.476190 3.798246 3.786667 3.851064 3.571429 3.571429 4.000000
Independence Day (ID4) (1996) 0.0 3.595238 3.291429 3.389381 3.718750 3.888889 2.750000 0.000000

那部电影男、女的评价差异最大

lens.reset_index('movie_id', inplace=True)
pivoted = lens.pivot_table(index=['movie_id', 'title'],
                           columns=['sex'],
                           values='rating',
                           fill_value=0)
pivoted.head()
sex F M
movie_id title
1 Toy Story (1995) 3.789916 3.909910
2 GoldenEye (1995) 3.368421 3.178571
3 Four Rooms (1995) 2.687500 3.108108
4 Get Shorty (1995) 3.400000 3.591463
5 Copycat (1995) 3.772727 3.140625
pivoted['diff'] = pivoted.M - pivoted.F
pivoted.head()
sex F M diff
movie_id title
1 Toy Story (1995) 3.789916 3.909910 0.119994
2 GoldenEye (1995) 3.368421 3.178571 -0.189850
3 Four Rooms (1995) 2.687500 3.108108 0.420608
4 Get Shorty (1995) 3.400000 3.591463 0.191463
5 Copycat (1995) 3.772727 3.140625 -0.632102
pivoted.reset_index('movie_id', inplace=True)
disagreements = pivoted[pivoted.movie_id.isin(most_50.index)]['diff']
disagreements.head()
title
Toy Story (1995)             0.119994
Twelve Monkeys (1995)        0.300315
Dead Man Walking (1995)     -0.043452
Mr. Holland's Opus (1995)   -0.244160
Braveheart (1995)            0.031136
Name: diff, dtype: float64
disagreements = pivoted[pivoted.movie_id.isin(most_50.index)]['diff']
disagreements.sort_values().plot(kind='barh', figsize=[9, 15])
plt.title('Male vs. Female Avg. Ratings\n(Difference > 0 = Favored by Men)')
plt.ylabel('Title')
plt.xlabel('Average Rating Difference');

png