本工程爬取三个媒体网站的元信息与音频信息,并将爬取到的信息存储到 MongoDB 中。三个网站分别是:喜马拉雅 FM、蜻蜓 FM、考拉 FM(点播内容)。
使用
项目基于 Scrapy 框架,运行在 CentOS 上,需要 MongoDB 环境。爬虫配置位于 ./m_spider/settings.py;项目在 Python 2.7 的虚拟环境下运行,相关依赖见 ./requirements.txt。
环境准备完毕后,使用 scrapy list 查看所有爬虫,使用 scrapy crawl <爬虫名> 爬取数据。
以下是爬取后存入 MongoDB 的文档示例(mongo shell 输出格式,依次来自考拉 FM、喜马拉雅 FM、蜻蜓 FM):
{
"_id" : ObjectId("571a1447e138237497458fd8"),
"sourceUrl" : "http://www.kaolafm.com/category/1165",
"categoryName" : "军事",
"pageSize" : 24,
"totalCounts" : 69,
"subCategorys" : [
{
"linkType" : 1,
"categoryName" : "军情速递",
"hasSub" : 0,
"imageAoyo" : "",
"categoryId" : 1314,
"imageAoyoEffect" : "",
"logo" : "http://image.kaolafm.net/mz/images/201603/3fff5ffd-d6a4-437c-90c3-44b49494c59e/default.jpg",
"id" : -1
},
{
"linkType" : 1,
"categoryName" : "军史档案",
"hasSub" : 0,
"imageAoyo" : "",
"categoryId" : 1315,
"imageAoyoEffect" : "",
"logo" : "http://image.kaolafm.net/mz/images/201603/7166226b-bddf-4fb1-a417-168db80bacbe/default.jpg",
"id" : -1
},
{
"linkType" : 1,
"categoryName" : "武器装备",
"hasSub" : 0,
"imageAoyo" : "",
"categoryId" : 1313,
"imageAoyoEffect" : "",
"logo" : "http://image.kaolafm.net/mz/images/201603/deb0658e-4407-4614-b1bf-a82df010f2d5/default.jpg",
"id" : -1
}
],
"totalPages" : 3,
"categoryId" : 1165
}
{
"_id" : ObjectId("571a148ae138237497458fef"),
"pageSize" : 20,
"audios" : [
{
"audioId" : NumberLong("1000002233810"),
"playUrl" : "http://image.kaolafm.net/mz/outopus_16/201602/28010074-86d8-4e66-932f-737a3ec63154.opus"
},
{
"audioId" : NumberLong("1000002233813"),
"playUrl" : "http://image.kaolafm.net/mz/outopus_16/201602/a308d101-8dbd-45a1-baf3-719cd7f2850a.opus"
},
{
"audioId" : NumberLong("1000002233844"),
"playUrl" : "http://image.kaolafm.net/mz/outopus_16/201602/9bd44ce2-9ff4-41e2-b446-9c15793e56c7.opus"
},
{
"audioId" : NumberLong("1000002233847"),
"playUrl" : "http://image.kaolafm.net/mz/outopus_16/201602/0c42b3d3-421e-4c26-a9ed-5c9c304219ae.opus"
},
{
"audioId" : NumberLong("1000002233901"),
"playUrl" : "http://image.kaolafm.net/mz/outopus_16/201602/b8dba548-deb5-488a-bbcd-57465746240f.opus"
},
{
"audioId" : NumberLong("1000002233919"),
"playUrl" : "http://image.kaolafm.net/mz/outopus_16/201602/1116158e-4fe7-48ca-b5f8-14e95912de00.opus"
},
{
"audioId" : NumberLong("1000002256643"),
"playUrl" : "http://image.kaolafm.net/mz/outopus_16/201602/bdda8b4e-622c-4a87-8a15-932b85397a0c.opus"
},
{
"audioId" : NumberLong("1000002280876"),
"playUrl" : "http://image.kaolafm.net/mz/outopus_16/201603/62d6d392-2a91-4f0d-b09d-3e06c6e2a967.opus"
},
{
"audioId" : NumberLong("1000002296060"),
"playUrl" : "http://image.kaolafm.net/mz/outopus_16/201603/73f0d359-40e8-4b3d-ba0a-0ebc5ef640a9.opus"
},
{
"audioId" : NumberLong("1000002330497"),
"playUrl" : "http://image.kaolafm.net/mz/outopus_32/201603/0169223d-7b6c-4b07-a5c1-ab656e073aa7.opus"
}
],
"commentCount" : 0,
"shortDesc" : "岁月留声,光阴永存!",
"albumPicUrl" : "http://image.kaolafm.net/mz/images/201602/1d5ab80e-b2ce-4101-a8be-67d74efd9782/default.png",
"categoryName" : "公益",
"index" : 22,
"fullDescs" : "岁月留声,光阴永存!",
"followedNum" : 6,
"categoryPage" : 6,
"lastModifyTime" : ISODate("2016-04-22T20:09:44.368Z"),
"type" : 0,
"categoryId" : 994,
"status" : "更新中",
"anchors" : [
{
"des" : "",
"name" : "月上岚",
"img" : ""
}
],
"tags" : [
"光音工坊",
"月上岚",
"CV",
"广播剧",
"配音"
],
"audioCounts" : 10,
"comeFromId" : null,
"comeFrom" : "光音工坊",
"albumId" : NumberLong("1100000117692"),
"produce" : "用户分享 ",
"uploaderId" : 2490178,
"updateDay" : "不定期更新",
"uploadUserName" : "月上岚",
"listenNum" : 14006,
"shareUrl" : "http://m.kaolafm.com/share/zj.html?albumId=1100000117692",
"albumName" : "光音工坊",
"sumPage" : 1
}
{
"_id" : ObjectId("571a1488e138237497458fe5"),
"updateTime" : "2016-02-05",
"uploaderName" : "月上岚",
"duration" : 237976,
"uuid" : "19ae59de088311e699c0782bcb3b9846",
"crawledCount" : 1,
"crawledTime" : ISODate("2016-04-22T20:09:44.597Z"),
"createTime" : NumberLong("1454638740000"),
"orderNum" : 1,
"audioPicUrl" : null,
"sendToCNRTime" : null,
"audioId" : NumberLong("1000002233810"),
"likedNum" : 0,
"commentNum" : 0,
"m3u8PlayUrl" : "http://image.kaolafm.net/mz/aac_64/201602/28010074-86d8-4e66-932f-737a3ec63154/playlist.m3u8",
"fileSize" : 1863117,
"uploaderId" : 2490178,
"mp3PlayUrl" : "http://image.kaolafm.net/mz/audios/201602/28010074-86d8-4e66-932f-737a3ec63154.mp3",
"playUrl" : "http://image.kaolafm.net/mz/outopus_16/201602/28010074-86d8-4e66-932f-737a3ec63154.opus",
"audioDesc" : null,
"audioName" : "校园灵异广播剧——《轮回》(片花)",
"listenNum" : 866,
"audioDownloadDir" : "/var/crawler/kl/audios/full/90f4be0f22dd148698d09d5fad1556ff2bda6a07.m3u8",
"shareUrl" : "http://m.kaolafm.com/share/jm.html?audioId=1000002233810",
"accPlayUrl" : null,
"album_title" : "光音工坊",
"checksum" : "d68260612135cc53cb15d498fd338ec9"
}
{
"_id" : ObjectId("571a2dfce1382377590c23ce"),
"cid" : "3",
"href" : "http://www.ximalaya.com/dq/book/",
"nameText" : "有声书",
"subCategorys" : [
"言情",
"悬疑",
"幻想",
"历史",
"都市",
"文学",
"社科",
"武侠",
"读客图书",
"QQ阅读",
"果麦文化",
"中信出版",
"博集天卷",
"速播专区",
"推理世界",
"正能量有声书"
],
"cname" : "book"
}
{
"_id" : ObjectId("571a2e0fe1382377590c23fa"),
"categoryName" : "【商业财经】",
"imgSrc" : "http://fdfs.xmcdn.com/group8/M01/74/FD/wKgDYFX-z1XAbxRIAAEx5osVmk0793_web_large.jpg",
"tags" : [
"财经资讯",
"财经评论",
"创业密码",
"商业聚焦",
"股指期货"
],
"crawledCount" : 1,
"album_id" : "2983857",
"audios" : [
{
"id" : "8850477",
"album_id" : 2983857
}
],
"uploadUserName" : "\r\n 管理之声\r\n ",
"crawledTime" : ISODate("2016-04-22T21:58:39.106Z"),
"href" : "http://www.ximalaya.com/29947683/album/2983857",
"albumDesc" : "企业家自己的故事。",
"uploadUserUrl" : "http://www.ximalaya.com/zhubo/29947683/",
"playTime" : "177",
"album_name" : "企业家"
}
{
"_id" : ObjectId("571a2e0ce1382377590c23e6"),
"uid" : 4253278,
"album_id" : 389451,
"audioChecksum" : null,
"intro" : "",
"play_path_32" : "http://audio.xmcdn.com/group11/M00/37/78/wKgDa1WWnp7CUK4SAEMm6D8va3k417.m4a",
"duration" : 1452,
"id" : "6447415",
"uuid" : "4f2a32e0089211e69dce782bcb3b9846",
"title" : "张少佐评书黄杨传001",
"crawledCount" : 1,
"sendToCNRTime" : null,
"upload_id" : "u_5463419",
"cover_url" : "http://fdfs.xmcdn.com/group6/M07/4D/47/wKgDg1U0543TKCnMAAEAFfU4hWc143.jpg",
"play_count" : 3706,
"uploadUserName" : "\r\n 惜林槐香\r\n ",
"cover_url_142" : "http://fdfs.xmcdn.com/group6/M07/4D/47/wKgDg1U0543TKCnMAAEAFfU4hWc143_web_large.jpg",
"play_path_64" : "http://audio.xmcdn.com/group11/M00/37/80/wKgDbVWWnmDgH23vAFlLABTtPbA323.m4a",
"nickname" : "惜林槐香",
"category_name" : "comic",
"shares_count" : 0,
"favorites_count" : 12,
"play_path" : "http://audio.xmcdn.com/group11/M00/37/80/wKgDbVWWnmDgH23vAFlLABTtPbA323.m4a",
"created_at" : ISODate("2015-04-20T00:00:00Z"),
"audioDownloadDir" : "/var/crawler/xmly/audios/full/412b8fd00600331c1d6492624cde21076618b3b4.m4a",
"crawledTime" : ISODate("2016-04-22T21:58:36.779Z"),
"comments_count" : 5,
"album_title" : "张少佐《黄杨传》",
"category_title" : "相声评书",
"checksum" : "146b12a9635b0c55ccadcccec0a5ce2b"
}
{
"_id" : ObjectId("571a12d4e138237180c34984"),
"category" : " 音乐 ",
"subcategory" : "精选·专题",
"contentSource" : "www.qingting.fm",
"crawledCount" : 1,
"fullDescs" : "None",
"albumName" : "年代FM",
"audios" : [
{
"album_title" : "年代FM",
"audioName" : "70后怀旧金曲",
"sub_category_title" : "精选·专题",
"category_title" : " 音乐 ",
"playUrl" : "http://od.qingting.fm/live/3911402.m3u8"
},
{
"album_title" : "年代FM",
"audioName" : "80后爱听的歌",
"sub_category_title" : "精选·专题",
"category_title" : " 音乐 ",
"playUrl" : "http://od.qingting.fm/live/3999089.m3u8"
},
{
"album_title" : "年代FM",
"audioName" : "90后爱听的歌",
"sub_category_title" : "精选·专题",
"category_title" : " 音乐 ",
"playUrl" : "http://od.qingting.fm/live/3999976.m3u8"
}
],
"crawledTime" : ISODate("2016-04-22T20:02:28.808Z"),
"albumPicPath" : "",
"crawlType" : "qt_album",
"albumPicUrl" : "http://pic.qingting.fm/2015/0805/20150805111256939.jpg!400",
"crawlTime" : "2016-04-22 20:02:28"
}
{
"_id" : ObjectId("5719f56ee1382364a6740310"),
"audioName" : "大明演义 第1回",
"uuid" : "900655fe087011e68619782bcb3b9846",
"crawledCount" : 2,
"audioDownloadDir" : "/var/crawler/qt/audios/full/426e12396136ca0dd1ed227ba9d3bf6918028b8d.m4a",
"sub_category_title" : "评书演义",
"crawledTime" : ISODate("2016-04-22T17:57:02.707Z"),
"sendToCNRTime" : null,
"album_title" : "单田芳评书:大明演义",
"category_title" : " 历史 ",
"playUrl" : "http://od.qingting.fm/vod/00/00/0000000000000000000026028559_64.m4a",
"checksum" : "75b8326d258b1834c68f704d1b9ee16c"
}