wxyBUPT / sxs_spider

基于scrapy的音频网站爬取
12 stars 5 forks source link

本工程爬取三个媒体网站的元信息与音频信息,并将爬取到的信息存储到 MongoDB 中。三个网站分别是喜马拉雅fm、蜻蜓fm、考拉fm(点播内容)。

使用
项目使用 Scrapy 框架,运行在 CentOS 上,需要有 MongoDB 环境。爬虫配置在 ./m_spider/settings.py,项目在 Python 2.7 的虚拟环境下运行,相关依赖在 ./requirements.txt。

环境准备完毕后,使用 scrapy list 查看所有爬虫,再使用“scrapy crawl 爬虫名”爬取数据。

数据库格式(原始数据部分)

考拉fm

1、考拉类别信息

{
    "_id" : ObjectId("571a1447e138237497458fd8"),
    "sourceUrl" : "http://www.kaolafm.com/category/1165",
    "categoryName" : "军事",
    "pageSize" : 24,
    "totalCounts" : 69,
    "subCategorys" : [
        {
            "linkType" : 1,
            "categoryName" : "军情速递",
            "hasSub" : 0,
            "imageAoyo" : "",
            "categoryId" : 1314,
            "imageAoyoEffect" : "",
            "logo" : "http://image.kaolafm.net/mz/images/201603/3fff5ffd-d6a4-437c-90c3-44b49494c59e/default.jpg",
            "id" : -1
        },
        {
            "linkType" : 1,
            "categoryName" : "军史档案",
            "hasSub" : 0,
            "imageAoyo" : "",
            "categoryId" : 1315,
            "imageAoyoEffect" : "",
            "logo" : "http://image.kaolafm.net/mz/images/201603/7166226b-bddf-4fb1-a417-168db80bacbe/default.jpg",
            "id" : -1
        },
        {
            "linkType" : 1,
            "categoryName" : "武器装备",
            "hasSub" : 0,
            "imageAoyo" : "",
            "categoryId" : 1313,
            "imageAoyoEffect" : "",
            "logo" : "http://image.kaolafm.net/mz/images/201603/deb0658e-4407-4614-b1bf-a82df010f2d5/default.jpg",
            "id" : -1
        }
    ],
    "totalPages" : 3,
    "categoryId" : 1165
}

2、考拉网站专辑

{
    "_id" : ObjectId("571a148ae138237497458fef"),
    "pageSize" : 20,
    "audios" : [
        {
            "audioId" : NumberLong("1000002233810"),
            "playUrl" : "http://image.kaolafm.net/mz/outopus_16/201602/28010074-86d8-4e66-932f-737a3ec63154.opus"
        },
        {
            "audioId" : NumberLong("1000002233813"),
            "playUrl" : "http://image.kaolafm.net/mz/outopus_16/201602/a308d101-8dbd-45a1-baf3-719cd7f2850a.opus"
        },
        {
            "audioId" : NumberLong("1000002233844"),
            "playUrl" : "http://image.kaolafm.net/mz/outopus_16/201602/9bd44ce2-9ff4-41e2-b446-9c15793e56c7.opus"
        },
        {
            "audioId" : NumberLong("1000002233847"),
            "playUrl" : "http://image.kaolafm.net/mz/outopus_16/201602/0c42b3d3-421e-4c26-a9ed-5c9c304219ae.opus"
        },
        {
            "audioId" : NumberLong("1000002233901"),
            "playUrl" : "http://image.kaolafm.net/mz/outopus_16/201602/b8dba548-deb5-488a-bbcd-57465746240f.opus"
        },
        {
            "audioId" : NumberLong("1000002233919"),
            "playUrl" : "http://image.kaolafm.net/mz/outopus_16/201602/1116158e-4fe7-48ca-b5f8-14e95912de00.opus"
        },
        {
            "audioId" : NumberLong("1000002256643"),
            "playUrl" : "http://image.kaolafm.net/mz/outopus_16/201602/bdda8b4e-622c-4a87-8a15-932b85397a0c.opus"
        },
        {
            "audioId" : NumberLong("1000002280876"),
            "playUrl" : "http://image.kaolafm.net/mz/outopus_16/201603/62d6d392-2a91-4f0d-b09d-3e06c6e2a967.opus"
        },
        {
            "audioId" : NumberLong("1000002296060"),
            "playUrl" : "http://image.kaolafm.net/mz/outopus_16/201603/73f0d359-40e8-4b3d-ba0a-0ebc5ef640a9.opus"
        },
        {
            "audioId" : NumberLong("1000002330497"),
            "playUrl" : "http://image.kaolafm.net/mz/outopus_32/201603/0169223d-7b6c-4b07-a5c1-ab656e073aa7.opus"
        }
    ],
    "commentCount" : 0,
    "shortDesc" : "岁月留声,光阴永存!",
    "albumPicUrl" : "http://image.kaolafm.net/mz/images/201602/1d5ab80e-b2ce-4101-a8be-67d74efd9782/default.png",
    "categoryName" : "公益",
    "index" : 22,
    "fullDescs" : "岁月留声,光阴永存!",
    "followedNum" : 6,
    "categoryPage" : 6,
    "lastModifyTime" : ISODate("2016-04-22T20:09:44.368Z"),
    "type" : 0,
    "categoryId" : 994,
    "status" : "更新中",
    "anchors" : [
        {
            "des" : "",
            "name" : "月上岚",
            "img" : ""
        }
    ],
    "tags" : [
        "光音工坊",
        "月上岚",
        "CV",
        "广播剧",
        "配音"
    ],
    "audioCounts" : 10,
    "comeFromId" : null,
    "comeFrom" : "光音工坊",
    "albumId" : NumberLong("1100000117692"),
    "produce" : "用户分享 ",
    "uploaderId" : 2490178,
    "updateDay" : "不定期更新",
    "uploadUserName" : "月上岚",
    "listenNum" : 14006,
    "shareUrl" : "http://m.kaolafm.com/share/zj.html?albumId=1100000117692",
    "albumName" : "光音工坊",
    "sumPage" : 1
}

3、考拉网站的音频

{
    "_id" : ObjectId("571a1488e138237497458fe5"),
    "updateTime" : "2016-02-05",
    "uploaderName" : "月上岚",
    "duration" : 237976,
    "uuid" : "19ae59de088311e699c0782bcb3b9846",
    "crawledCount" : 1,
    "crawledTime" : ISODate("2016-04-22T20:09:44.597Z"),
    "createTime" : NumberLong("1454638740000"),
    "orderNum" : 1,
    "audioPicUrl" : null,
    "sendToCNRTime" : null,
    "audioId" : NumberLong("1000002233810"),
    "likedNum" : 0,
    "commentNum" : 0,
    "m3u8PlayUrl" : "http://image.kaolafm.net/mz/aac_64/201602/28010074-86d8-4e66-932f-737a3ec63154/playlist.m3u8",
    "fileSize" : 1863117,
    "uploaderId" : 2490178,
    "mp3PlayUrl" : "http://image.kaolafm.net/mz/audios/201602/28010074-86d8-4e66-932f-737a3ec63154.mp3",
    "playUrl" : "http://image.kaolafm.net/mz/outopus_16/201602/28010074-86d8-4e66-932f-737a3ec63154.opus",
    "audioDesc" : null,
    "audioName" : "校园灵异广播剧——《轮回》(片花)",
    "listenNum" : 866,
    "audioDownloadDir" : "/var/crawler/kl/audios/full/90f4be0f22dd148698d09d5fad1556ff2bda6a07.m3u8",
    "shareUrl" : "http://m.kaolafm.com/share/jm.html?audioId=1000002233810",
    "accPlayUrl" : null,
    "album_title" : "光音工坊",
    "checksum" : "d68260612135cc53cb15d498fd338ec9"
}

喜马拉雅fm

1、喜马拉雅类别

{
    "_id" : ObjectId("571a2dfce1382377590c23ce"),
    "cid" : "3",
    "href" : "http://www.ximalaya.com/dq/book/",
    "nameText" : "有声书",
    "subCategorys" : [
        "言情",
        "悬疑",
        "幻想",
        "历史",
        "都市",
        "文学",
        "社科",
        "武侠",
        "读客图书",
        "QQ阅读",
        "果麦文化",
        "中信出版",
        "博集天卷",
        "速播专区",
        "推理世界",
        "正能量有声书"
    ],
    "cname" : "book"
}

2、喜马拉雅网站的专辑信息

{
    "_id" : ObjectId("571a2e0fe1382377590c23fa"),
    "categoryName" : "【商业财经】",
    "imgSrc" : "http://fdfs.xmcdn.com/group8/M01/74/FD/wKgDYFX-z1XAbxRIAAEx5osVmk0793_web_large.jpg",
    "tags" : [
        "财经资讯",
        "财经评论",
        "创业密码",
        "商业聚焦",
        "股指期货"
    ],
    "crawledCount" : 1,
    "album_id" : "2983857",
    "audios" : [
        {
            "id" : "8850477",
            "album_id" : 2983857
        }
    ],
    "uploadUserName" : "\r\n            管理之声\r\n            ",
    "crawledTime" : ISODate("2016-04-22T21:58:39.106Z"),
    "href" : "http://www.ximalaya.com/29947683/album/2983857",
    "albumDesc" : "企业家自己的故事。",
    "uploadUserUrl" : "http://www.ximalaya.com/zhubo/29947683/",
    "playTime" : "177",
    "album_name" : "企业家"
}

3、喜马拉雅网站的音频信息

{
    "_id" : ObjectId("571a2e0ce1382377590c23e6"),
    "uid" : 4253278,
    "album_id" : 389451,
    "audioChecksum" : null,
    "intro" : "",
    "play_path_32" : "http://audio.xmcdn.com/group11/M00/37/78/wKgDa1WWnp7CUK4SAEMm6D8va3k417.m4a",
    "duration" : 1452,
    "id" : "6447415",
    "uuid" : "4f2a32e0089211e69dce782bcb3b9846",
    "title" : "张少佐评书黄杨传001",
    "crawledCount" : 1,
    "sendToCNRTime" : null,
    "upload_id" : "u_5463419",
    "cover_url" : "http://fdfs.xmcdn.com/group6/M07/4D/47/wKgDg1U0543TKCnMAAEAFfU4hWc143.jpg",
    "play_count" : 3706,
    "uploadUserName" : "\r\n            惜林槐香\r\n            ",
    "cover_url_142" : "http://fdfs.xmcdn.com/group6/M07/4D/47/wKgDg1U0543TKCnMAAEAFfU4hWc143_web_large.jpg",
    "play_path_64" : "http://audio.xmcdn.com/group11/M00/37/80/wKgDbVWWnmDgH23vAFlLABTtPbA323.m4a",
    "nickname" : "惜林槐香",
    "category_name" : "comic",
    "shares_count" : 0,
    "favorites_count" : 12,
    "play_path" : "http://audio.xmcdn.com/group11/M00/37/80/wKgDbVWWnmDgH23vAFlLABTtPbA323.m4a",
    "created_at" : ISODate("2015-04-20T00:00:00Z"),
    "audioDownloadDir" : "/var/crawler/xmly/audios/full/412b8fd00600331c1d6492624cde21076618b3b4.m4a",
    "crawledTime" : ISODate("2016-04-22T21:58:36.779Z"),
    "comments_count" : 5,
    "album_title" : "张少佐《黄杨传》",
    "category_title" : "相声评书",
    "checksum" : "146b12a9635b0c55ccadcccec0a5ce2b"
}

蜻蜓fm

1、蜻蜓fm 的专辑信息

{
    "_id" : ObjectId("571a12d4e138237180c34984"),
    "category" : " 音乐 ",
    "subcategory" : "精选·专题",
    "contentSource" : "www.qingting.fm",
    "crawledCount" : 1,
    "fullDescs" : "None",
    "albumName" : "年代FM",
    "audios" : [
        {
            "album_title" : "年代FM",
            "audioName" : "70后怀旧金曲",
            "sub_category_title" : "精选·专题",
            "category_title" : " 音乐 ",
            "playUrl" : "http://od.qingting.fm/live/3911402.m3u8"
        },
        {
            "album_title" : "年代FM",
            "audioName" : "80后爱听的歌",
            "sub_category_title" : "精选·专题",
            "category_title" : " 音乐 ",
            "playUrl" : "http://od.qingting.fm/live/3999089.m3u8"
        },
        {
            "album_title" : "年代FM",
            "audioName" : "90后爱听的歌",
            "sub_category_title" : "精选·专题",
            "category_title" : " 音乐 ",
            "playUrl" : "http://od.qingting.fm/live/3999976.m3u8"
        }
    ],
    "crawledTime" : ISODate("2016-04-22T20:02:28.808Z"),
    "albumPicPath" : "",
    "crawlType" : "qt_album",
    "albumPicUrl" : "http://pic.qingting.fm/2015/0805/20150805111256939.jpg!400",
    "crawlTime" : "2016-04-22 20:02:28"
}

2、蜻蜓fm 的音频信息

{
    "_id" : ObjectId("5719f56ee1382364a6740310"),
    "audioName" : "大明演义 第1回",
    "uuid" : "900655fe087011e68619782bcb3b9846",
    "crawledCount" : 2,
    "audioDownloadDir" : "/var/crawler/qt/audios/full/426e12396136ca0dd1ed227ba9d3bf6918028b8d.m4a",
    "sub_category_title" : "评书演义",
    "crawledTime" : ISODate("2016-04-22T17:57:02.707Z"),
    "sendToCNRTime" : null,
    "album_title" : "单田芳评书:大明演义",
    "category_title" : " 历史 ",
    "playUrl" : "http://od.qingting.fm/vod/00/00/0000000000000000000026028559_64.m4a",
    "checksum" : "75b8326d258b1834c68f704d1b9ee16c"
}