一、综述

1. 所用技术

# 详看pom
- SpringBoot 2.4.2
- Webmagic 0.7.4
- quartz 2.3.2
- Redis 

2. 架构

定时任务不断爬取微博榜单放入Redis,接口调取Redis数据

3. 数据结构

WeiboSpider.jpg

4. 目录结构

project:
│  Dockerfile
│  pom.xml
│  README.md
│  
└─src
    ├─main
    │  ├─java
    │  │  └─com
    │  │      └─pal
    │  │          │  WeiboSpiderApplication.java
    │  │          │  
    │  │          ├─controller
    │  │          │      WeiboSpiderController.java
    │  │          │      
    │  │          ├─process
    │  │          │      CommonWebSite.java
    │  │          │      DiscussProcess.java
    │  │          │      HotListProcess.java
    │  │          │      
    │  │          ├─quartz
    │  │          │      QuartzManager.java
    │  │          │      SpiderJob.java
    │  │          │      
    │  │          ├─service
    │  │          │  │  HotInfoService.java
    │  │          │  │  
    │  │          │  └─impl
    │  │          │          HotInfoServiceImpl.java
    │  │          │          
    │  │          └─untils
    │  │                  JedisUtils.java
    │  │                  SingleJedisUtils.java
    │  │                  
    │  └─resources
    │          application-dev.properties
    │          application-prod.properties
    │          application.properties
    │          log4j2.xml
    │          redis.properties

二、爬虫与定时任务

1. Webmagic

一定要用 0.7.4 版本, 0.7.3 有bug

1.1 公用Site

/**
 * Builds the WebMagic {@code Site} configuration (charset, timeout, retry policy,
 * cookie and browser-like request headers) returned by {@link #getCommonWebSite()}.
 */
public class CommonWebSite {

    /** Browser User-Agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80" +
                    " Safari/537.36 Core/1.47.516.400 QQBrowser/9.4.8188.400";

    /** Cookie value handed to the Site; stays {@code null} until {@link #setCookie(String)} runs. */
    private static String cookie;

    /**
     * Stores the cookie used by subsequently built Site objects.
     *
     * NOTE(review): Spring is documented not to inject {@code @Value} into static
     * methods, and this class is not declared as a bean in this snippet, so the
     * {@code spider.cookie} property is presumably never applied automatically —
     * confirm that something calls this setter explicitly.
     *
     * @param cookies cookie string captured from a logged-in browser session
     */
    @Value("${spider.cookie}")
    public static void setCookie(String cookies) {
        cookie = cookies;
    }

    /**
     * Creates a new Site carrying the shared crawl settings.
     *
     * @return a freshly configured Site
     */
    public static Site getCommonWebSite()
    {
        return Site.me()
                // response charset
                .setCharset("utf8")
                // download timeout: 10 seconds
                .setTimeOut(10 * 1000)
                // pause between retries: 3 seconds
                .setRetrySleepTime(3000)
                // number of retries per request
                .setRetryTimes(3)
                // cookie captured from the browser
                // NOTE(review): WebMagic's two-argument addCookie appears to take (name, value),
                // which would register a cookie *named* "s.weibo.com" — confirm the intended overload.
                .addCookie("s.weibo.com", cookie)
                // request headers that make the crawler look like a regular browser
                .addHeader("User-Agent", USER_AGENT)
                .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                .addHeader("Accept-Encoding", "gzip, deflate, sdch")
                .addHeader("Accept-Language", "zh-CN,zh;q=0.8")
                .addHeader("Connection", "keep-alive")
                .addHeader("Referer", "https://s.weibo.com");
    }
}

1.2 热榜爬取

/**
 * WebMagic page processor for the Weibo real-time hot list.
 *
 * For every hot-list row it appends the row text to the Redis list {@code HotList}
 * and the row's link to the Redis list {@code HotListUrl}, so that the same index
 * in both lists refers to the same entry.
 */
@Slf4j
public class HotListProcess implements PageProcessor {

    private static final Jedis jedis = SingleJedisUtils.getJedis();
    private final Site site = CommonWebSite.getCommonWebSite();

    /**
     * Extracts all hot-list rows from the page and stores them in Redis.
     *
     * @param page the downloaded hot-list page
     */
    @Override
    public void process(Page page)
    {
        // number of entries stored so far
        int count = 0;
        List<Selectable> list = page.getHtml().css("#pl_top_realtimehot td.td-02").nodes();

        for (Selectable card : list)
        {
            // link of the hot-list entry
            String contentUrl = card.links().get();
            if (contentUrl == null)
            {
                // Pushing the text without its URL would shift HotList and HotListUrl
                // against each other, so a row without a link is skipped entirely.
                log.warn("Skipping hot-list row without a link");
                continue;
            }
            // visible text of the entry
            String content = Jsoup.parse(card.toString()).text();

            // write both values so the two lists stay index-aligned
            jedis.rpush("HotList", content);
            jedis.rpush("HotListUrl", contentUrl);

            log.info(count + "  " + content + "  ");
            count++;
        }
    }

    @Override
    public Site getSite()
    {
        return site;
    }

}

1.3 调用爬取热榜

/**
 * Quartz job that refreshes the Weibo data kept in Redis.
 *
 * One run backs up the previous lists, crawls the hot list into
 * {@code HotList}/{@code HotListUrl}, then crawls the detail page of each hot-list
 * URL so that one JSON entry per URL is appended to {@code HotDetail}.
 */
@Slf4j
public class SpiderJob implements Job {
    public static String HotListUrl = "https://s.weibo.com/top/summary?cate=realtimehot";
    public static final Jedis jedis = SingleJedisUtils.getJedis();

    /** Placeholder appended to HotDetail when an entry could not be crawled. */
    private static final String ERROR_DETAIL = "[{'id': '99','content':'Error!'}]";

    /** Number of re-crawls allowed per run; the budget is shared by all entries. */
    private static final int MAX_RETRIES = 3;

    /**
     * Runs one full crawl cycle (hot list first, then the detail pages).
     *
     * @param jobExecutionContext Quartz execution context (not used)
     * @throws JobExecutionException declared by the Job contract; not thrown explicitly here
     */
    @Override
    public void execute(JobExecutionContext jobExecutionContext) throws JobExecutionException
    {
        // Keep the previous crawl result as a backup before the lists are rebuilt.
        backupList("HotList", "HotListBak");
        backupList("HotListUrl", "HotListUrlBak");

        Spider.create(new HotListProcess())
                .addUrl(HotListUrl)
                // Bloom-filter based de-duplication sized for up to 1,000,000 URLs
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(1000*1000)))
                .run();

        // Index 0 is left out here; presumably it is the pinned top entry — confirm.
        List<String> hotListUrl = jedis.lrange("HotListUrl", 1, 50);

        log.info("开始进行详细内容爬取!");
        backupList("HotDetail", "HotDetailBak");
        long startTime = System.currentTimeMillis();

        int retriesLeft = MAX_RETRIES;
        for (int i = 0; i < hotListUrl.size(); i++)
        {
            String url = hotListUrl.get(i);
            int position = i + 1;

            boolean stored = crawlDetail(url);
            // A crawl that appended nothing to HotDetail has failed; retry the SAME url
            // while the shared budget lasts. This also covers the last entry.
            while (!stored && retriesLeft > 0)
            {
                retriesLeft--;
                log.info("第 " + position + "失败,开始重新爬取该条!" + url);
                stored = crawlDetail(url);
            }
            if (!stored)
            {
                // Keep HotDetail index-aligned with the URL list even when crawling failed.
                log.info("多次尝试失败,填充错误提示数据!");
                jedis.rpush("HotDetail", ERROR_DETAIL);
            }
            log.info("第 " + position + " 条完成!");
        }

        log.info("详细内容爬取完成,耗时:" + (System.currentTimeMillis() - startTime) + " ms");
    }

    /** Replaces the backup list with the current list; does nothing to the backup's source when it is empty. */
    private static void backupList(String key, String backupKey)
    {
        jedis.del(backupKey);
        // RENAME fails on a missing key, so only rename a list that has entries.
        if (jedis.llen(key) > 0)
        {
            jedis.rename(key, backupKey);
        }
    }

    /** Crawls one detail page and reports whether it appended an entry to HotDetail. */
    private static boolean crawlDetail(String url)
    {
        long lengthBefore = jedis.llen("HotDetail");
        Spider.create(new DiscussProcess())
                .addUrl(url)
                .thread(5)
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(1000*1000)))
                .run();
        return jedis.llen("HotDetail") > lengthBefore;
    }
}

1.4 详细内容爬取

/**
 * WebMagic page processor for the detail (search result) page of one hot topic.
 *
 * All posts found on the page are collected into one list, which is appended as a
 * single JSON string to the Redis list {@code HotDetail}.
 */
@Slf4j
public class DiscussProcess implements PageProcessor {

    private static final Jedis jedis = SingleJedisUtils.getJedis();
    private final Site site = CommonWebSite.getCommonWebSite();

    /**
     * Extracts author, time, images, link, text and interaction counters of every post
     * on the page and stores the result in Redis.
     *
     * @param page the downloaded detail page
     */
    @Override
    public void process(Page page)
    {
        // locate the post cards
        List<Selectable> list = page.getHtml().css("#pl_feedlist_index div.card-wrap").nodes();
        List<Map<String, String>> detailList = new ArrayList<>();

        // Running id of the posts that are kept. It must live outside the loop;
        // otherwise every post would receive id "1".
        int count = 1;
        for (Selectable card : list)
        {
            Document doc = Jsoup.parse(card.toString());
            Map<String, String> detailsMap = new HashMap<>(7);

            // author: cards without an author link are skipped
            Elements authors = doc.select("a.name");
            if (authors.isEmpty())
            {
                continue;
            }
            detailsMap.put("author", authors.first().attr("nick-name"));

            // publish time, empty string when missing
            Elements from = doc.select("p[class=from]");
            detailsMap.put("time", from.isEmpty() ? "" : from.first().text());

            // Image urls as a JSON array. jsoup's select returns an empty Elements rather
            // than null, so the former null checks never fired; a post without images
            // yields "[]" exactly as before.
            Elements imageUrlElements = doc.select("div[node-type=feed_list_media_prev]")
                    .select("img[src~=(?i)\\.(png|jpe?g|gif)]");
            List<String> imgUrls = new ArrayList<>();
            imageUrlElements.forEach(element -> imgUrls.add(element.attr("src")));
            detailsMap.put("img", JSON.toJSONString(imgUrls));

            // prefer the expanded text when the page provides it
            String text = doc.select("p[node-type=feed_list_content]").text();
            String textFull = doc.select("p[node-type=feed_list_content_full]").text();
            String content = textFull.equals("") ? text : textFull;

            detailsMap.put("id", String.valueOf(count));
            detailsMap.put("contentUrl", card.css("div[class=avator]").links().get());
            detailsMap.put("content", content);
            detailsMap.put("discussNum", doc.select("div[class=card-act]").text());
            detailList.add(detailsMap);
            count++;
        }
        jedis.rpush("HotDetail", JSON.toJSONString(detailList));

    }

    @Override
    public Site getSite()
    {
        return site;
    }
}

1.5 调用爬取详情

/**
 * Quartz job, detail-crawl part: crawls the detail page of each URL stored in
 * {@code HotListUrl} so that one JSON entry per URL is appended to {@code HotDetail}.
 */
@Slf4j
public class SpiderJob implements Job {
    public static String HotListUrl = "https://s.weibo.com/top/summary?cate=realtimehot";
    public static final Jedis jedis = SingleJedisUtils.getJedis();

    /** Placeholder appended to HotDetail when an entry could not be crawled. */
    private static final String ERROR_DETAIL = "[{'id': '99','content':'Error!'}]";

    /** Number of re-crawls allowed per run; the budget is shared by all entries. */
    private static final int MAX_RETRIES = 3;

    /**
     * Crawls the detail pages for the current hot-list URLs.
     *
     * @param jobExecutionContext Quartz execution context (not used)
     * @throws JobExecutionException declared by the Job contract; not thrown explicitly here
     */
    @Override
    public void execute(JobExecutionContext jobExecutionContext) throws JobExecutionException
    {
        // Index 0 is left out here; presumably it is the pinned top entry — confirm.
        List<String> hotListUrl = jedis.lrange("HotListUrl", 1, 50);
        log.info("开始进行详细内容爬取!");

        // Keep the previous details as a backup before the list is rebuilt.
        jedis.del("HotDetailBak");
        if (jedis.llen("HotDetail") > 0)
        {
            jedis.rename("HotDetail", "HotDetailBak");
        }
        long startTime = System.currentTimeMillis();

        int retriesLeft = MAX_RETRIES;
        for (int i = 0; i < hotListUrl.size(); i++)
        {
            String url = hotListUrl.get(i);
            int position = i + 1;

            boolean stored = crawlDetail(url);
            // A crawl that appended nothing to HotDetail has failed; retry the SAME url
            // while the shared budget lasts. This also covers the last entry.
            while (!stored && retriesLeft > 0)
            {
                retriesLeft--;
                log.info("第 " + position + "失败,开始重新爬取该条!" + url);
                stored = crawlDetail(url);
            }
            if (!stored)
            {
                // Keep HotDetail index-aligned with the URL list even when crawling failed.
                log.info("多次尝试失败,填充错误提示数据!");
                jedis.rpush("HotDetail", ERROR_DETAIL);
            }
            log.info("第 " + position + " 条完成!");
        }

        log.info("详细内容爬取完成,耗时:" + (System.currentTimeMillis() - startTime) + " ms");
    }

    /** Crawls one detail page and reports whether it appended an entry to HotDetail. */
    private static boolean crawlDetail(String url)
    {
        long lengthBefore = jedis.llen("HotDetail");
        Spider.create(new DiscussProcess())
                .addUrl(url)
                .thread(5)
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(1000*1000)))
                .run();
        return jedis.llen("HotDetail") > lengthBefore;
    }
}

2. 定时任务

/**
 * Registers the crawl job with a Quartz scheduler.
 */
@Service
public class QuartzManager {

    /**
     * Schedules {@code SpiderJob} to fire immediately and then every 600 seconds
     * (10 minutes) indefinitely, and starts the scheduler. A
     * {@code SchedulerException} is printed to stderr and not propagated.
     */
    public void runJob()
    {
        // Repeat every 10 minutes, forever.
        SimpleScheduleBuilder everyTenMinutes = SimpleScheduleBuilder.simpleSchedule()
                .withIntervalInSeconds(600)
                .repeatForever();

        Trigger spiderTrigger = TriggerBuilder.newTrigger()
                .withIdentity("trigger1", "group")
                .startNow()
                .withSchedule(everyTenMinutes)
                .build();

        JobDetail spiderJob = JobBuilder.newJob(SpiderJob.class)
                .withIdentity("WeiboSpider", "Spider")
                .build();

        // The scheduler binds the job to its trigger.
        SchedulerFactory factory = new StdSchedulerFactory();
        try
        {
            Scheduler quartz = factory.getScheduler();
            quartz.scheduleJob(spiderJob, spiderTrigger);
            quartz.start();
        }
        catch (SchedulerException e)
        {
            e.printStackTrace();
        }
    }
}

三、存储

1. Redis设置

# redis.properties
redis.host = 127.0.0.1
redis.port = 6379
redis.maxIdle = 10
redis.maxTotal = 30

2. 连接池

/**
 * Creates one shared {@code JedisPool} from the {@code redis} resource bundle
 * (redis.properties) and hands out connections from it.
 */
public class JedisUtils {
    private static JedisPool jp = null;
    private static String host = null;
    private static int port = 0;
    /**
     * Maximum number of idle connections kept in the pool.
     */
    private static int maxIdle = 0;

    /**
     * Maximum total number of connections in the pool.
     */
    private static int maxTotal = 0;

    static {
        ResourceBundle resourceBundle = ResourceBundle.getBundle("redis");
        host = resourceBundle.getString("redis.host");
        port = Integer.parseInt(resourceBundle.getString("redis.port"));
        maxIdle = Integer.parseInt(resourceBundle.getString("redis.maxIdle"));
        maxTotal = Integer.parseInt(resourceBundle.getString("redis.maxTotal"));

        JedisPoolConfig jedisPoolConfig = new JedisPoolConfig();
        jedisPoolConfig.setMaxIdle(maxIdle);
        jedisPoolConfig.setMaxTotal(maxTotal);
        // The config has to be passed to the pool; without it the configured
        // maxIdle/maxTotal limits are never applied.
        jp = new JedisPool(jedisPoolConfig, host, port);
    }

    /**
     * Borrows a connection from the pool.
     *
     * @return a Jedis connection; the caller should close it to return it to the pool
     */
    public static Jedis getJedis(){
        return jp.getResource();
    }
}

3. Jedis单例

Initialization On Demand Holder

/**
 * Provides one shared Jedis connection, obtained from {@code JedisUtils} the first
 * time {@link #getJedis()} is called.
 */
public class SingleJedisUtils {

    public SingleJedisUtils() {
    }

    /**
     * Returns the shared connection.
     *
     * @return the same Jedis instance on every call
     */
    public static Jedis getJedis(){
        return Holder.INSTANCE;
    }

    /** Nested holder whose static initializer fetches the connection from JedisUtils. */
    private static class Holder {
        static final Jedis INSTANCE = JedisUtils.getJedis();
    }
}

四、调用

1. Service

/**
 * Returns the hot list as a JSON array of strings.
 *
 * Reads the Redis list {@code HotList} from index 1 to the end (index 0 is left out).
 *
 * @return JSON array with the hot-list entries
 */
@Override
public String getHotListJson() {
    // try-with-resources returns the connection to the pool even when the query throws
    try (Jedis jedis = JedisUtils.getJedis()) {
        // end index -1 means "last element", so no separate LLEN call is needed
        List<String> hotList = jedis.lrange("HotList", 1, -1);
        return JSON.toJSONString(hotList);
    }
}

/**
 * Returns the crawled detail posts of one hot-list entry as a JSON array.
 *
 * @param index 1-based position of the entry; element {@code index - 1} of the Redis
 *              list {@code HotDetail} is read
 * @return the stored detail list re-serialized as JSON, or {@code "[]"} when
 *         {@code index} is smaller than 1 or no detail is stored at that position
 */
@Override
public String getHotDetailJson(int index) {
    // A negative Redis index counts from the tail, so positions below 1 are rejected here.
    if (index < 1) {
        return "[]";
    }
    // try-with-resources returns the connection to the pool even when the query throws
    try (Jedis jedis = JedisUtils.getJedis()) {
        // fetch only the requested element instead of loading the whole list
        String hotDetail = jedis.lindex("HotDetail", index - 1);
        if (hotDetail == null) {
            return "[]";
        }
        // parse and re-serialize so the response is normalized JSON
        List<?> list = JSON.parseObject(hotDetail, List.class);
        return JSON.toJSONString(list);
    }
}

五、其他

SpringBoot 2.4 需要增加如下配置才能使用之前版本的配置

# application.properties
spring.config.use-legacy-processing = true

深蓝
1 声望0 粉丝