Open HelplessMan opened 6 years ago
用phpspider 写了一个爬小说的的,但是我想在列表也获取内容的时候。遇到了问题。总是拿不到标题。而且方法也是按照官网文档的方式获取的。麻烦那位大佬帮忙看一下哈,或者说我这种方式写错了么。。 请大佬们指点一二。。。
报错提示: 2018-11-12 14:26:37 [debug] Find list page: https://www.kanshushenzhan.com/8311/ 2018-11-12 14:26:37 [debug] Find list page: https://www.kanshushenzhan.com/8309/ 2018-11-12 14:26:37 [debug] Find list page: https://www.kanshushenzhan.com/4359/ 2018-11-12 14:26:37 [debug] Find list page: https://www.kanshushenzhan.com/4365/ 2018-11-12 14:26:37 [warn] Selector article_title[//div[contains(@class,'article_title')]] not found, It's a must 详细代码
报错提示:
2018-11-12 14:26:37 [debug] Find list page: https://www.kanshushenzhan.com/8311/ 2018-11-12 14:26:37 [debug] Find list page: https://www.kanshushenzhan.com/8309/ 2018-11-12 14:26:37 [debug] Find list page: https://www.kanshushenzhan.com/4359/ 2018-11-12 14:26:37 [debug] Find list page: https://www.kanshushenzhan.com/4365/ 2018-11-12 14:26:37 [warn] Selector article_title[//div[contains(@class,'article_title')]] not found, It's a must
详细代码
<?php /** * Created by PhpStorm. * User: Administrator * Date: 2018/11/5 * Time: 19:33 */ require './vendor/autoload.php'; use phpspider\core\phpspider; use phpspider\core\selector; /* Do NOT delete this comment */ /* 不要删除这段注释 */ $configs = array( 'name' => '看书神站', 'log_show' => true, 'domains' => array( 'kanshushenzhan.com', 'www.kanshushenzhan.com' ), 'scan_urls' => array( "https://www.kanshushenzhan.com/all/" ), 'content_url_regexes' => array( "https://www.kanshushenzhan.com/\d+/\d+.html" ), 'list_url_regexes' => array( "https://www.kanshushenzhan.com/\d+/" ), 'fields' => array( array( // 抽取列表页的文章标题 'name' => "article_title", 'selector' => "//div[contains(@class,'article_title')]", 'required' => true ), array( // 抽取内容页的文章内容 'name' => "article_content", 'selector' => "//div[contains(@class,'content')]", 'required' => false ), array( // 抽取内容页的文章作者 'name' => "article_author", 'selector' => "//div[contains(@class,'author')]", 'required' => false ), array( // 抽取内容页的文章作者 'name' => "article_category", 'selector' => "//div[contains(@class,'category')]", 'required' => false ), array( // 抽取内容页的文章作者 'name' => "article_lasttime", 'selector' => "//div[contains(@class,'renew')]", 'required' => false ), array( // 抽取内容页的文章最新章节 'name' => "article_novel", 'selector' => "//div[contains(@class,'renew_date')]", 'required' => false ), array( // 抽取内容页的文章作者 'name' => "article_status", 'selector' => "//a[contains(@class,'author')]", 'required' => false ), array( // 抽取内容页的文章描述 'name' => "article_desc", 'selector' => "//div[contains(@class,'desc')]", 'required' => false ), array( // 抽取内容页的文章图片 'name' => "article_images", 'selector' => "//div[contains(@class,'author')]", 'required' => false ), array( // 抽取内容页的文章章节名称 'name' => "article_novel_title", 'selector' => "//div[contains(@class,'author')]", 'required' => false ), ), 'export' => array( 'type' => 'csv', 'file' => './kanshushenzhan.csv', // data目录下 ) ); $spider = new phpspider($configs); $spider->on_list_page = function ($page, $content, $phpspider) { //*[@id="yuedu"]/div[2]/ul/li[1]/a $url_title = selector::select($content, "//div[contains(@class,'chapterCon')]//a"); $urls = selector::select($content, "//div[contains(@class,'chapterCon')]//@href"); $article_title = selector::select($content, "//div[contains(@class,'bookPhr')]//h2"); $article_author = selector::select($content, "//div[contains(@class,'bookPhr')]/dl[1]/dd[1]"); $article_category = selector::select($content, "//div[contains(@class,'bookPhr')]/dl[1]/dd[2]"); $article_renew = selector::select($content, "//div[contains(@class,'renew')]//a"); $article_renew_date = selector::select($content, "//div[contains(@class,'renew')]//span"); $article_desc = selector::select($content, "//div[contains(@class,'introCon')]//p"); $article_count_click = selector::select($content, "//div[contains(@class,'bookPhr')]/dl[1]/dd[4]"); $article_month_click = selector::select($content, "//div[contains(@class,'bookPhr')]/dl[1]/dd[5]"); $article_week_click = selector::select($content, "//div[contains(@class,'bookPhr')]/dl[1]/dd[6]"); $article_zishu = selector::select($content, "//div[contains(@class,'bookPhr')]/dl[1]/dd[7]"); $article_shoucang = selector::select($content, "//div[contains(@class,'bookPhr')]/dl[1]/dd[8]"); $page_views = '<div class="article_title">' . $article_title .'</div><div class="author">' . $article_author . '</div><div class="category">' . $article_category . '</div>' . '<div class="renew">' . $article_renew . '</div><div class="renew_date">' . $article_renew_date . '</div><div class="desc">' . $article_desc . '</div>'. '<div class="count_click">' . $article_count_click . '</div><div class="month_click">' . $article_month_click . '</div><div class="week_click">' . $article_week_click . '</div>' . '<div class="zishu">' . $article_zishu . '</div><div class="shoucang">' . $article_shoucang . '</div>'; //var_dump($page_views);exit; $options = array( 'method' => 'get', 'context_data' => $page_views, ); foreach ($urls as $url) { $phpspider->add_url($url, $options); } //print_r($page_views);exit; // 返回true继续提取其他列表页URL return true; }; $spider->on_content_page = function ($page, $content, $phpspider) { //var_dump($content);exit; return true; }; $spider->start();
'content_url_regexes' => array( "https://www.kanshushenzhan.com/\d+/\d+.html" ), 'list_url_regexes' => array( "https://www.kanshushenzhan.com/\d+/" ),
还没看出问题了么? Q643796448
用phpspider 写了一个爬小说的的,但是我想在列表也获取内容的时候。遇到了问题。总是拿不到标题。而且方法也是按照官网文档的方式获取的。麻烦那位大佬帮忙看一下哈,或者说我这种方式写错了么。。 请大佬们指点一二。。。