code4craft / webmagic

A scalable web crawler framework for Java.
http://webmagic.io/
Apache License 2.0
11.43k stars 4.18k forks source link

pipeline自定义存入mysql报错空指针 #842

Closed kaori-seasons closed 4 years ago

kaori-seasons commented 6 years ago

您好,是这样的,打扰您工作十分抱歉。我在processor中定制了如下抽取逻辑,将爬取的数据存到了userDetailInfo对象,但不知道为什么报空指针异常,明明控制台打印出来确实有值。代码清单:

UserBaseInfoProcessor.java

package com.complone.zhihumagic.processor;

import com.complone.zhihumagic.model.UserDetailInfo;
import com.complone.zhihumagic.pipeline.UserBaseInfoPipeline;
import org.apache.commons.lang3.StringUtils;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.RedisScheduler;

import java.util.Date;
import java.util.List;

/**
 * WebMagic {@code PageProcessor} that walks Zhihu user profile pages.
 *
 * <p>For every downloaded page it (1) queues all links matching
 * {@link #TARGET_USER_BASE_INFO} as further requests and (2) extracts a
 * {@code UserDetailInfo} from the page via XPath and stores it in the page's
 * result fields under the key {@code "userDetailInfo"}, where a pipeline can
 * pick it up.
 */
public class UserBaseInfoProcessor implements PageProcessor {
    /** Seed URL the crawl starts from. */
    private static final String START_URL  = "https://www.zhihu.com/people/excited-vczh/activities";

    /** Regex selecting links to user profile pages. */
    private static final String TARGET_USER_BASE_INFO = "https://www\\.zhihu\\.com/people/[\\w-]+";

    // NOTE(review): the cookie values below look like a real logged-in session
    // (z_c0 in particular). Hard-coding credentials in source is risky; consider
    // loading them from configuration or the environment instead.
    private final Site site = Site.me().setCycleRetryTimes(5).setRetryTimes(5).setSleepTime(300).setTimeOut(3 * 60 * 1000)
            .setUserAgent(
                    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36")
            .setCharset("UTF-8")
            .addCookie("_xsrf", "f3jahdGruGNjLacAKaxX3SAjXJPtXWmn")
            .addCookie("_zap", "a1139d27-1bd3-4631-ab8d-8c939a7ef471")
            .addCookie("z_c0", "2|1:0|10:1540302879|4:z_c0|92:Mi4xM2ZhQkFnQUFBQUFBd0dBdzVQQ01EU1lBQUFCZ0FsVk5IM2E4WEFCMDNqbXhqaVJYSW9aaFpYd2g5eHo3WC12aXBR|09b5c10ce3f5cae4e4ab532937e4efc31d9773025214e84c29775d4930a13a2d")
            .addCookie("__utmc", "155987696.1522404687.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)") ;

    /**
     * Queues further profile links found on the page and extracts the user
     * details of the current page into the {@code "userDetailInfo"} field.
     *
     * @param page the downloaded page handed over by the spider
     */
    public void process(Page page) {
        // Extract the profile links once and queue them for crawling.
        List<String> profileLinks = page.getHtml().links().regex(TARGET_USER_BASE_INFO).all();
        page.addTargetRequests(profileLinks);

        UserDetailInfo userDetailInfo = new UserDetailInfo();
        userDetailInfo.setPageurl(xpathText(page, "//div[@itemprop='people']/meta[@itemprop='url']/@content"));
        userDetailInfo.setNickname(xpathText(page, "//div[@class='ProfileHeader-contentHead']/h1/span[1]/text()"));
        userDetailInfo.setBusiness(xpathText(page, "//div[@class='ProfileHeader-contentBody']/div/div/div[2]/text()"));
        userDetailInfo.setEmployment(xpathText(page, "//div[@class='ProfileHeader-contentBody']/div/div/div[1]/text()"));
        // This expression selects the element itself (no /text()), unlike the others.
        userDetailInfo.setPosition(xpathText(page, "//div[@class='ProfileHeader-contentBody']/div/div[@class='ProfileHeader-detail']"));
        // Hard-coded for now; presumably a placeholder until the real counter is
        // extracted. Shares / education / educationextra are not populated yet.
        userDetailInfo.setCollecters(1);
        userDetailInfo.setAddtime(new Date());

        // Status.
        // NOTE(review): an empty value maps to "active" and a non-empty one to
        // "noActive" — confirm this is not inverted.
        String status = xpathText(page, "//div[@class='Profile-sideColumn']/div[@class='Card']/div[2]/div[2]/div[2]/text()");
        if (StringUtils.isEmpty(status)) {
            userDetailInfo.setStatus("active");
        } else {
            userDetailInfo.setStatus("noActive");
        }

        // Gender; falls back to "unknow" when nothing is found on the page.
        String gender = xpathText(page, "//div[@class='Card FollowshipCard']/div/a[2]/div/strong/text()");
        if (StringUtils.isEmpty(gender)) {
            userDetailInfo.setGender("unknow");
        } else {
            userDetailInfo.setGender(gender);
        }

        page.putField("userDetailInfo", userDetailInfo);
    }

    /**
     * Evaluates an XPath expression against the page's HTML and returns the
     * selection converted with {@code toString()}.
     */
    private static String xpathText(Page page, String expression) {
        return page.getHtml().xpath(expression).toString();
    }

    /**
     * @return the site configuration (retries, sleep time, timeout, user agent,
     *         charset and cookies) used for this crawl
     */
    public Site getSite() {
        return site;
    }

    /**
     * Starts a single-threaded crawl from {@link #START_URL} using an in-memory
     * queue scheduler and a {@code UserBaseInfoPipeline} created with
     * {@code new}.
     */
    public static void main(String[] args) {

        Spider.create(new UserBaseInfoProcessor()).addUrl(START_URL)
                .scheduler(new QueueScheduler())
                .addPipeline(new UserBaseInfoPipeline())
                .thread(1).run();
    }

}

UserBaseInfoPipeline.java

package com.complone.zhihumagic.pipeline;

import com.complone.zhihumagic.mapper.UserDetailInfoMapper;
import com.complone.zhihumagic.model.UserBaseInfo;
import com.complone.zhihumagic.model.UserDetailInfo;
import org.springframework.beans.factory.annotation.Autowired;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;

import us.codecraft.webmagic.pipeline.PageModelPipeline;
import us.codecraft.webmagic.pipeline.Pipeline;

import java.util.Map;

/**
 * WebMagic {@code Pipeline} that persists the {@code UserDetailInfo} stored
 * under the result key {@code "userDetailInfo"} through
 * {@code UserDetailInfoMapper#insertOne}.
 *
 * <p>The mapper is only populated by {@code @Autowired} when this object is
 * managed by the Spring container. An instance created directly with
 * {@code new UserBaseInfoPipeline()} has no mapper; in that case either obtain
 * the pipeline from the Spring context or use
 * {@link #UserBaseInfoPipeline(UserDetailInfoMapper)}.
 */
public class UserBaseInfoPipeline implements Pipeline {

    @Autowired
    private UserDetailInfoMapper userDetailInfoMapper;

    /** Creates a pipeline whose mapper is expected to be injected by Spring. */
    public UserBaseInfoPipeline() {
    }

    /**
     * Creates a pipeline with an explicitly supplied mapper, for use outside
     * the Spring container.
     *
     * @param userDetailInfoMapper mapper used to insert extracted records
     */
    public UserBaseInfoPipeline(UserDetailInfoMapper userDetailInfoMapper) {
        this.userDetailInfoMapper = userDetailInfoMapper;
    }

    /**
     * Inserts the extracted user record, if any, and prints the mapper's
     * return value.
     *
     * @param resultItems results produced by the page processor
     * @param task        the running crawl task (unused)
     * @throws IllegalStateException if no mapper has been injected or supplied
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        // Fail fast with an actionable message instead of a bare NullPointerException.
        if (userDetailInfoMapper == null) {
            throw new IllegalStateException(
                    "userDetailInfoMapper is null: UserBaseInfoPipeline must be obtained from the Spring "
                            + "context or constructed with a UserDetailInfoMapper, not created with new "
                            + "UserBaseInfoPipeline()");
        }

        UserDetailInfo userDetailInfo = resultItems.get("userDetailInfo");
        if (userDetailInfo == null) {
            // Nothing was extracted for this page, so there is nothing to persist.
            return;
        }

        System.out.println(userDetailInfo.getPageurl()+" ------------------ "+userDetailInfo.getNickname());
        int row = userDetailInfoMapper.insertOne(userDetailInfo);
        System.out.println(row);
    }

}

ConsoleLog

https://www.zhihu.com/people/yang-gui-fu-52 ------------------ 杨贵福
10:43:11.299 [pool-1-thread-1] ERROR us.codecraft.webmagic.Spider - process request Request{url='https://www.zhihu.com/people/yang-gui-fu-52', method='null', extras={statusCode=200}, priority=0} error
java.lang.NullPointerException: null
    at com.complone.zhihumagic.pipeline.UserBaseInfoPipeline.process(UserBaseInfoPipeline.java:73)
    at us.codecraft.webmagic.Spider.processRequest(Spider.java:424)
    at us.codecraft.webmagic.Spider$1.run(Spider.java:322)
    at us.codecraft.webmagic.selector.thread.CountableThreadPool$1.run(CountableThreadPool.java:74)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
i-CNNN commented 5 years ago

断点调试一下这个变量: userDetailInfoMapper 你的UserBaseInfoPipeline类没有注入啊? userDetailInfoMapper 这个怎么会注入呢?

kaori-seasons commented 5 years ago

> 断点调试一下这个变量: userDetailInfoMapper 你的UserBaseInfoPipeline类没有注入啊? userDetailInfoMapper 这个怎么会注入呢?

已经解决了,需要用spring bean的生命周期管理。总结如下:https://www.jianshu.com/p/7c476c6c0b68