owner888 / phpspider

《我用爬虫一天时间“偷了”知乎一百万用户,只为证明PHP是世界上最好的语言 》所使用的程序
3.49k stars 1.17k forks source link

无法获取到列表页面的内容 #124

Open HelplessMan opened 6 years ago

HelplessMan commented 6 years ago

用phpspider 写了一个爬小说的的,但是我想在列表也获取内容的时候。遇到了问题。总是拿不到标题。而且方法也是按照官网文档的方式获取的。麻烦那位大佬帮忙看一下哈,或者说我这种方式写错了么。。 请大佬们指点一二。。。

报错提示:

2018-11-12 14:26:37 [debug] Find list page: https://www.kanshushenzhan.com/8311/
2018-11-12 14:26:37 [debug] Find list page: https://www.kanshushenzhan.com/8309/
2018-11-12 14:26:37 [debug] Find list page: https://www.kanshushenzhan.com/4359/
2018-11-12 14:26:37 [debug] Find list page: https://www.kanshushenzhan.com/4365/
2018-11-12 14:26:37 [warn] Selector article_title[//div[contains(@class,'article_title')]] not found, It's a must

详细代码

<?php
/**
 * Created by PhpStorm.
 * User: Administrator
 * Date: 2018/11/5
 * Time: 19:33
 */
require './vendor/autoload.php';
use phpspider\core\phpspider;
use phpspider\core\selector;

/* Do NOT delete this comment */
/* 不要删除这段注释 */

$configs = array(
    'name' => '看书神站',
    'log_show' => true,
    'domains' => array(
        'kanshushenzhan.com',
        'www.kanshushenzhan.com'
    ),
    'scan_urls' => array(
        "https://www.kanshushenzhan.com/all/"
    ),
    'content_url_regexes' => array(
        "https://www.kanshushenzhan.com/\d+/\d+.html"
    ),
    'list_url_regexes' => array(
        "https://www.kanshushenzhan.com/\d+/"
    ),
    'fields' => array(
        array(
            // 抽取列表页的文章标题
            'name' => "article_title",
            'selector' => "//div[contains(@class,'article_title')]",
            'required' => true
        ),
        array(
            // 抽取内容页的文章内容
            'name' => "article_content",
            'selector' => "//div[contains(@class,'content')]",
            'required' => false
        ),
        array(
            // 抽取内容页的文章作者
            'name' => "article_author",
            'selector' => "//div[contains(@class,'author')]",
            'required' => false
        ),
        array(
            // 抽取内容页的文章作者
            'name' => "article_category",
            'selector' => "//div[contains(@class,'category')]",
            'required' => false
        ),
        array(
            // 抽取内容页的文章作者
            'name' => "article_lasttime",
            'selector' => "//div[contains(@class,'renew')]",
            'required' => false
        ),
        array(
            // 抽取内容页的文章最新章节
            'name' => "article_novel",
            'selector' => "//div[contains(@class,'renew_date')]",
            'required' => false
        ),
        array(
            // 抽取内容页的文章作者
            'name' => "article_status",
            'selector' => "//a[contains(@class,'author')]",
            'required' => false
        ),
        array(
            // 抽取内容页的文章描述
            'name' => "article_desc",
            'selector' => "//div[contains(@class,'desc')]",
            'required' => false
        ),
        array(
            // 抽取内容页的文章图片
            'name' => "article_images",
            'selector' => "//div[contains(@class,'author')]",
            'required' => false
        ),
        array(
            // 抽取内容页的文章章节名称
            'name' => "article_novel_title",
            'selector' => "//div[contains(@class,'author')]",
            'required' => false
        ),
    ),
    'export' => array(
        'type' => 'csv',
        'file' => './kanshushenzhan.csv', // data目录下
    )
);
$spider = new phpspider($configs);
$spider->on_list_page = function ($page, $content, $phpspider) {
    //*[@id="yuedu"]/div[2]/ul/li[1]/a
    $url_title = selector::select($content, "//div[contains(@class,'chapterCon')]//a");
    $urls = selector::select($content, "//div[contains(@class,'chapterCon')]//@href");
    $article_title = selector::select($content, "//div[contains(@class,'bookPhr')]//h2");
    $article_author = selector::select($content, "//div[contains(@class,'bookPhr')]/dl[1]/dd[1]");
    $article_category = selector::select($content, "//div[contains(@class,'bookPhr')]/dl[1]/dd[2]");
    $article_renew = selector::select($content, "//div[contains(@class,'renew')]//a");
    $article_renew_date = selector::select($content, "//div[contains(@class,'renew')]//span");
    $article_desc = selector::select($content, "//div[contains(@class,'introCon')]//p");
    $article_count_click = selector::select($content, "//div[contains(@class,'bookPhr')]/dl[1]/dd[4]");
    $article_month_click = selector::select($content, "//div[contains(@class,'bookPhr')]/dl[1]/dd[5]");
    $article_week_click = selector::select($content, "//div[contains(@class,'bookPhr')]/dl[1]/dd[6]");
    $article_zishu = selector::select($content, "//div[contains(@class,'bookPhr')]/dl[1]/dd[7]");
    $article_shoucang = selector::select($content, "//div[contains(@class,'bookPhr')]/dl[1]/dd[8]");
    $page_views = '<div class="article_title">' . $article_title  .'</div><div class="author">' . $article_author . '</div><div class="category">' . $article_category . '</div>' .
                    '<div class="renew">' . $article_renew . '</div><div class="renew_date">' . $article_renew_date . '</div><div class="desc">' . $article_desc . '</div>'.
                    '<div class="count_click">' . $article_count_click . '</div><div class="month_click">' . $article_month_click . '</div><div class="week_click">' . $article_week_click . '</div>' .
                    '<div class="zishu">' . $article_zishu . '</div><div class="shoucang">' . $article_shoucang . '</div>';
    //var_dump($page_views);exit;
    $options = array(
        'method' => 'get',
        'context_data' => $page_views,
    );
    foreach ($urls as $url) {
        $phpspider->add_url($url, $options);
    }
    //print_r($page_views);exit;
    // 返回true继续提取其他列表页URL
    return true;

};
$spider->on_content_page = function ($page, $content, $phpspider) {
    //var_dump($content);exit;
    return true;
};
$spider->start();
lirko commented 6 years ago
'content_url_regexes' => array(
    "https://www.kanshushenzhan.com/\d+/\d+.html"
),
'list_url_regexes' => array(
    "https://www.kanshushenzhan.com/\d+/"
),

还没看出问题了么? Q643796448