magigo / data_science_tool_book_code

9 stars 5 forks source link

第八章代码运行不了 #1

Open guanxin522 opened 7 years ago

guanxin522 commented 7 years ago

def read_excel():
    """Read the census statistics broken down by ethnic group / age / sex.

    Loads "A0201.xls" twice with different ``skiprows`` values: once to pick
    up the group names and the age labels, and once to pick up the numeric
    columns.

    Returns:
        An OrderedDict keyed by group name (spaces removed). Each value is an
        OrderedDict keyed by column header (text before the first "."), whose
        values are OrderedDicts mapping an age label to the cell value.

    NOTE(review): ``DataFrame.irow`` / ``DataFrame.icol`` are positional
    accessors from old pandas releases and no longer exist in current pandas;
    ``.iloc[...]`` is the positional replacement.
    """
    excel_content = pd.read_excel("A0201.xls", skiprows=2)
    # Row 0 by position -> drop its first cell -> keep every third remaining
    # cell -> plain Python list. Presumably each group name heads a run of
    # three columns, which is why the step is 3 -- TODO confirm against the
    # workbook layout.
    race_list = excel_content.irow(0)[1:][::3].tolist()

    # Strip the spaces embedded in the strings.
    # The lambda is an inline one-argument function: it converts each cell of
    # column 0 (from the third row onwards) to str and removes its spaces.
    # NOTE(review): on Python 3 map() returns an iterator, so the
    # age_list[...] indexing in get_num below would fail unless this is
    # wrapped in list(); presumably written for Python 2.
    age_list = map(lambda x: str(x).replace(" ", ""),
                   excel_content.icol(0)[2:].tolist())
    # Re-read the same workbook skipping four rows instead of two.
    excel_content = pd.read_excel("A0201.xls",
                                  skiprows=4)

    def get_num(lines):
        """Turn a column subset into {header: {age label: value}}."""
        ret_dict = OrderedDict()
        for k, v in lines.to_dict().items():
            new_v_dict = OrderedDict()
            for vk, vv in v.items():
                # vk is the row label; it is used as an index into age_list.
                new_v_dict[age_list[int(vk)]] = vv
            ret_dict[k.split(".", 1)[0]] = new_v_dict  # drop everything after the "." in each column header
        return ret_dict

    result_dict = OrderedDict()
    # x walks the column positions 1, 4, ..., 175: three columns per group.
    for i, x in enumerate(range(1, 178, 3)):
        ids = [x, x + 1, x + 2]
        race_list[i] = race_list[i].replace(" ", "")
        result_dict[race_list[i]] = get_num(excel_content.icol(ids))

    return result_dict

这是作者给出8.1.1节的代码,运行不了报了AttributeError: 'DataFrame' object has no attribute 'irow' ,查了原来是irow和icol属性被弃用了。看了官方的文档,是用iloc代替,文档全英文的我好难看懂啊,换了iloc之后运行又出错'DataFrame' object has no attribute 'tolist' ,还是运行不了。还有注释太少了,这本书属于入门书,注释少看不懂啊,excel_content.irow(0)[1:][::3].tolist(),这里也看不懂,书前面没有讲lambda,代码写了却没有对它加个注释,看起来很困难。这段代码运行不起来,第8章后面也运行不起来,希望作者及时更正下。

guanxin522 commented 7 years ago

根据课本使用pip install pandas 安装pandas应该是下了最新版,库里面有些东西就被弃用了

Tacode commented 7 years ago

根据作者的思路重新写的(主函数入口那里注意一下,上传文本时出了问题)

# -*- coding: utf-8 -*-

from __future__ import print_function

import sys
from collections import OrderedDict

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# Python 2 only: make utf-8 the implicit str<->unicode codec so the Chinese
# labels read from the workbook can be passed through str() and printed.
# reload() is not a builtin on Python 3 (whose default encoding is already
# utf-8), so the hack is skipped there instead of raising NameError.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding("utf-8")

def _columns_by_age(columns, age_list):
    """Turn a column subset into ``{header: {age label: value}}``.

    Args:
        columns: DataFrame slice holding the columns of one group.
        age_list: Age labels, indexable by the frame's integer row labels.

    Returns:
        OrderedDict mapping each column header (text before the first ".")
        to an OrderedDict of age label -> cell value.
    """
    ret_dict = OrderedDict()
    for header, by_row in columns.to_dict('dict').items():
        by_age = OrderedDict()
        for row_label, value in by_row.items():
            by_age[age_list[int(row_label)]] = value
        # Drop everything after the "." in the header (presumably the
        # ".1", ".2", ... suffix pandas appends to repeated column names).
        ret_dict[header.split('.', 1)[0]] = by_age
    return ret_dict


def read_data(path='A0201.xls'):
    """Read the per-group / per-age census table from an Excel workbook.

    The workbook is read twice: skipping two rows to obtain the group names
    (row 0, every third cell after the first) and the age labels (column 0
    from the third row on), then skipping four rows to obtain the data
    columns themselves.

    Args:
        path: Workbook to read; defaults to the original hard-coded
            'A0201.xls'.

    Returns:
        OrderedDict keyed by group name (spaces removed); each value is the
        nested mapping built by ``_columns_by_age`` for that group's three
        columns.

    Raises:
        IOError/OSError: if the workbook cannot be opened (raised by pandas).
    """
    header_frame = pd.read_excel(path, skiprows=2)
    race_list = header_frame.iloc[0, 1::3].tolist()

    # Build a real list: on Python 3 map() returns an iterator, which cannot
    # be indexed by row label in _columns_by_age.
    age_list = [str(label).replace(' ', '')
                for label in header_frame.iloc[2:, 0].tolist()]

    data_frame = pd.read_excel(path, skiprows=4)

    result_dict = OrderedDict()
    # Column positions 1..177 in runs of three (59 groups); the bounds are
    # hard-coded for the layout of the A0201 sheet.
    for i, first_col in enumerate(range(1, 178, 3)):
        ids = [first_col, first_col + 1, first_col + 2]
        race = str(race_list[i].replace(' ', ''))
        result_dict[race] = _columns_by_age(data_frame.iloc[:, ids], age_list)

    return result_dict

def calc_mean(d):
    """Return the count-weighted mean of the numeric age keys in *d*.

    Args:
        d: Mapping of age label (str) -> count. Entries whose key is not
            made up solely of digits are ignored.

    Returns:
        sum(age * count) / sum(count) over the digit-keyed entries, as a
        float.

    Raises:
        ZeroDivisionError: if no digit-keyed entry contributes (with plain
            Python numbers, when the counted total is zero).
    """
    total = 0
    total_age = 0
    for age, count in d.items():
        if age.isdigit():
            total += count  # running total of the population counted so far
            total_age += int(age) * count
    # float() keeps this a true division on Python 2 as well.
    return total_age / float(total)

if __name__ == '__main__':
    import io
    import json

    # Parse the workbook once and reuse the serialized result for both the
    # console dump and the file, instead of calling read_data() twice.
    census_json = json.dumps(read_data(), ensure_ascii=False, indent=4)
    print(census_json)

    # Write the JSON data to a .json file.
    # ensure_ascii=False leaves the Chinese labels unescaped, so the file is
    # written explicitly as utf-8 rather than in the platform default codec.
    if isinstance(census_json, bytes):
        # Python 2 json.dumps may hand back a byte string; io.open needs text.
        census_json = census_json.decode("utf-8")
    with io.open("record.json", "w", encoding="utf-8") as f:
        f.write(census_json)