一、综述
1. 所用技术
# 详看pom
- SpringBoot 2.4.2
- Webmagic 0.7.4
- quartz 2.3.2
- Redis
2. 架构
定时任务不断爬取微博榜单放入Redis,接口调取Redis数据
3. 数据结构:Redis 列表 HotList、HotListUrl、HotDetail(详见下文爬虫部分)
4. 目录结构
project:
│ Dockerfile
│ pom.xml
│ README.md
│
└─src
├─main
│ ├─java
│ │ └─com
│ │ └─pal
│ │ │ WeiboSpiderApplication.java
│ │ │
│ │ ├─controller
│ │ │ WeiboSpiderController.java
│ │ │
│ │ ├─process
│ │ │ CommonWebSite.java
│ │ │ DiscussProcess.java
│ │ │ HotListProcess.java
│ │ │
│ │ ├─quartz
│ │ │ QuartzManager.java
│ │ │ SpiderJob.java
│ │ │
│ │ ├─service
│ │ │ │ HotInfoService.java
│ │ │ │
│ │ │ └─impl
│ │ │ HotInfoServiceImpl.java
│ │ │
│ │ └─untils
│ │ JedisUtils.java
│ │ SingleJedisUtils.java
│ │
│ └─resources
│ application-dev.properties
│ application-prod.properties
│ application.properties
│ log4j2.xml
│ redis.properties
二、爬虫与定时任务
1. Webmagic
一定要用 0.7.4 版本,0.7.3 版本有 bug。
1.1 公用Set
public class CommonWebSite {
    // Cookie captured from a logged-in browser session; supplied via the
    // spider.cookie property.
    private static String cookie;

    // NOTE(review): Spring does not perform @Value injection on static
    // setters, and this class is not visibly registered as a bean — confirm
    // how the cookie actually gets populated, otherwise it stays null.
    @Value("${spider.cookie}")
    public static void setCookie(String cookies) {
        cookie = cookies;
    }

    /**
     * Builds the shared WebMagic Site configuration: UTF-8 charset, 10 s
     * timeout, 3 retries with a 3 s back-off, the captured cookie, and
     * browser-like request headers for s.weibo.com.
     */
    public static Site getCommonWebSite()
    {
        return Site.me()
                // charset of the crawled pages
                .setCharset("utf8")
                // request timeout
                .setTimeOut(10 * 1000)
                // delay between retries
                .setRetrySleepTime(3000)
                // retry count
                .setRetryTimes(3)
                // cookie captured via browser dev tools
                .addCookie("s.weibo.com", cookie)
                // FIX: the User-Agent was missing its leading 'M'
                // ("ozilla/5.0"), producing a malformed UA string.
                .addHeader("User-Agent",
                        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80" +
                        " Safari/537.36 Core/1.47.516.400 QQBrowser/9.4.8188.400")
                .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
                .addHeader("Accept-Encoding", "gzip, deflate, sdch")
                .addHeader("Accept-Language", "zh-CN,zh;q=0.8")
                .addHeader("Connection", "keep-alive")
                .addHeader("Referer", "https://s.weibo.com");
    }
}
1.2 热榜爬取
@Slf4j
public class HotListProcess implements PageProcessor {
    // Shared single Jedis connection from the holder-based singleton.
    private static final Jedis jedis = SingleJedisUtils.getJedis();
    private final Site site = CommonWebSite.getCommonWebSite();

    /**
     * Parses the Weibo realtime hot-search page: for each entry pushes its
     * title text onto the "HotList" Redis list and its link onto
     * "HotListUrl" (the two lists stay index-aligned).
     */
    @Override
    public void process(Page page)
    {
        // number of entries processed so far (used only for logging)
        int count = 0;
        List<Selectable> list = page.getHtml().css("#pl_top_realtimehot td.td-02").nodes();
        // iterate over every hot-search entry on the page
        for (Selectable card : list)
        {
            Document doc = Jsoup.parse(card.toString());
            // link to the topic's search-result page
            String contentUrl = card.links().get();
            // FIX: the original `text.equals("") ? "" : text` ternary was a
            // no-op (both arms yield the same value) — use the text directly.
            String content = doc.text();
            // persist to Redis
            jedis.rpush("HotList", content);
            jedis.rpush("HotListUrl", contentUrl);
            log.info(count + " " + content + " ");
            count++;
        }
    }

    @Override
    public Site getSite()
    {
        return site;
    }
}
1.3 调用爬取热榜
@Slf4j
public class SpiderJob implements Job {
    /** Weibo realtime hot-search summary page. */
    public static String HotListUrl = "https://s.weibo.com/top/summary?cate=realtimehot";
    // Shared single Jedis connection.
    // NOTE(review): safe only if Quartz never runs two executions of this
    // job concurrently — confirm the scheduler configuration.
    public static final Jedis jedis = SingleJedisUtils.getJedis();

    /**
     * One full crawl cycle: (1) back up and re-crawl the hot list,
     * (2) crawl the detail page for each hot-list entry, retrying failures
     * and inserting placeholders so list indices stay aligned.
     */
    @Override
    public void execute(JobExecutionContext jobExecutionContext) throws JobExecutionException
    {
        // --- Hot-list crawl ---
        // Back up the previous run's lists first so readers can keep
        // serving data while this run repopulates HotList/HotListUrl.
        jedis.del("HotListBak");
        jedis.del("HotListUrlBak");
        if (jedis.llen("HotList")>0){
            jedis.rename("HotList","HotListBak");
            jedis.rename("HotListUrl","HotListUrlBak");
        }
        Spider.create(new HotListProcess())
                .addUrl(HotListUrl)
                // .addPipeline(new ExcelPipeline())
                //.thread(5) // would crawl with 5 threads
                // Bloom-filter deduplication, up to 1,000,000 URLs
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(1000*1000)))
                .run();
        // Range starts at 1 — presumably skips the pinned entry at index 0;
        // TODO confirm against the crawled list layout.
        List<String> hotListUrl = jedis.lrange("HotListUrl", 1, 50);
        log.info("开始进行详细内容爬取!");
        // --- Detail crawl: back up previous details the same way ---
        jedis.del("HotDetailBak");
        if (jedis.llen("HotDetail")>0 ){
            jedis.rename("HotDetail","HotDetailBak");
        }
        long startTime = System.currentTimeMillis();
        // count = number of detail pages expected to be stored so far
        AtomicInteger count = new AtomicInteger(0);
        // NOTE(review): maxTry is a global retry budget for the whole run,
        // never reset after a successful retry — confirm this is intended
        // (it is not a per-item retry count).
        int maxTry = 3;
        for (int i = 0; i<hotListUrl.size();i++){
            Long hotDetailLen = jedis.llen("HotDetail");
            // If the stored-detail length fell behind the expected count,
            // the previous crawl pushed nothing — i.e. it failed.
            if (hotDetailLen < count.get())
            {
                if (maxTry > 0)
                {
                    // Step both the loop index and the expected count back
                    // so the failed URL is crawled again.
                    log.info("第 " + count.get() +"失败,开始重新爬取该条!" + hotListUrl.get(count.get()));
                    i--;
                    count.getAndDecrement();
                    maxTry --;
                }
                else
                {
                    // Retries exhausted: push placeholder data so detail
                    // indices stay aligned with the hot list.
                    log.info("多次尝试失败,填充错误提示数据!");
                    jedis.rpush("HotDetail","[{'id': '99','content':'Error!'}]");
                }
            }
            count.getAndIncrement();
            Spider.create(new DiscussProcess())
                    .addUrl(hotListUrl.get(i))
                    .thread(5)
                    .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(1000*1000)))
                    .run();
            log.info("第 "+count+" 条完成!");
        }
        log.info("详细内容爬取完成,耗时:" + (System.currentTimeMillis() - startTime) + " ms");
    }
}
1.4 详细内容爬取
@Slf4j
public class DiscussProcess implements PageProcessor {
    // Shared single Jedis connection from the holder-based singleton.
    private static final Jedis jedis = SingleJedisUtils.getJedis();
    private final Site site = CommonWebSite.getCommonWebSite();

    /**
     * Parses one hot-topic search-result page: extracts author, time,
     * images, content, link and discussion stats for every post card, then
     * pushes the whole page as a single JSON array onto the "HotDetail"
     * Redis list (one list element per crawled page).
     */
    @Override
    public void process(Page page)
    {
        // CSS-locate every post card on the page
        List<Selectable> list = page.getHtml().css("#pl_feedlist_index div.card-wrap").nodes();
        List<Map<String, String>> detailList = new ArrayList<>();
        // FIX: the counter must live outside the loop — it was re-initialized
        // to 1 on every iteration, so every post got id "1".
        int count = 1;
        for (Selectable card : list)
        {
            Document doc = Jsoup.parse(card.toString());
            Map<String, String> detailsMap = new HashMap<>(7);
            // Author: skip cards without an author link (ads/placeholders).
            if (doc.select("a.name").first() != null)
            {
                detailsMap.put("author", doc.select("a.name").first().attr("nick-name"));
            }
            else
            {
                continue;
            }
            // Post time; empty string when missing.
            if (doc.select("p[class=from]").first() != null)
            {
                detailsMap.put("time", doc.select("p[class=from]").first().text());
            }
            else
            {
                detailsMap.put("time", "");
            }
            // FIX: Jsoup's select() never returns null (it returns an empty
            // Elements), so the original null-check always took the else
            // branch and "0" was never stored. Test emptiness instead so
            // "0" marks "no images" as intended.
            Elements imageUrlElements = doc.select("div[node-type=feed_list_media_prev]")
                    .select("img[src~=(?i)\\.(png|jpe?g|gif)]");
            if (imageUrlElements.isEmpty())
            {
                detailsMap.put("img", "0");
            }
            else
            {
                List<String> imgUrls = new ArrayList<>();
                imageUrlElements.forEach(element -> imgUrls.add(element.attr("src")));
                detailsMap.put("img", JSON.toJSONString(imgUrls));
            }
            // Prefer the expanded ("full") text when the post is truncated.
            String text = doc.select("p[node-type=feed_list_content]").text();
            String textFull = doc.select("p[node-type=feed_list_content_full]").text();
            String content = textFull.equals("") ? text : textFull;
            detailsMap.put("id", String.valueOf(count));
            detailsMap.put("contentUrl", card.css("div[class=avator]").links().get());
            detailsMap.put("content", content);
            detailsMap.put("discussNum", doc.select("div[class=card-act]").text());
            detailList.add(detailsMap);
            count++;
        }
        jedis.rpush("HotDetail", JSON.toJSONString(detailList));
    }

    @Override
    public Site getSite()
    {
        return site;
    }
}
1.5 调用爬取详情
@Slf4j
public class SpiderJob implements Job {
    /** Weibo realtime hot-search summary page. */
    public static String HotListUrl = "https://s.weibo.com/top/summary?cate=realtimehot";
    // Shared single Jedis connection; assumes no concurrent job executions.
    public static final Jedis jedis = SingleJedisUtils.getJedis();

    /**
     * Detail-crawl phase: crawls the detail page for each hot-list entry,
     * retrying failures (shared budget of 3) and inserting placeholder data
     * when retries are exhausted so list indices stay aligned.
     */
    @Override
    public void execute(JobExecutionContext jobExecutionContext) throws JobExecutionException
    {
        // Range starts at 1 — presumably skips the pinned entry at index 0;
        // TODO confirm.
        List<String> hotListUrl = jedis.lrange("HotListUrl", 1, 50);
        log.info("开始进行详细内容爬取!");
        // Back up the previous run's details before repopulating.
        jedis.del("HotDetailBak");
        if (jedis.llen("HotDetail")>0 ){
            jedis.rename("HotDetail","HotDetailBak");
        }
        long startTime = System.currentTimeMillis();
        // count = number of detail pages expected to be stored so far
        AtomicInteger count = new AtomicInteger(0);
        // NOTE(review): maxTry is a global retry budget for the whole run,
        // never reset after a successful retry — confirm this is intended.
        int maxTry = 3;
        for (int i = 0; i<hotListUrl.size();i++){
            Long hotDetailLen = jedis.llen("HotDetail");
            // If the stored-detail length fell behind the expected count,
            // the previous crawl pushed nothing — i.e. it failed.
            if (hotDetailLen < count.get())
            {
                if (maxTry > 0)
                {
                    // Step both indices back so the failed URL is retried.
                    log.info("第 " + count.get() +"失败,开始重新爬取该条!" + hotListUrl.get(count.get()));
                    i--;
                    count.getAndDecrement();
                    maxTry --;
                }
                else
                {
                    // Retries exhausted: push placeholder data to keep the
                    // detail list aligned with the hot list.
                    log.info("多次尝试失败,填充错误提示数据!");
                    jedis.rpush("HotDetail","[{'id': '99','content':'Error!'}]");
                }
            }
            count.getAndIncrement();
            Spider.create(new DiscussProcess())
                    .addUrl(hotListUrl.get(i))
                    .thread(5)
                    .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(1000*1000)))
                    .run();
            log.info("第 "+count+" 条完成!");
        }
        log.info("详细内容爬取完成,耗时:" + (System.currentTimeMillis() - startTime) + " ms");
    }
}
2. 定时任务
@Service
public class QuartzManager {
public void runJob()
{
JobDetail jobDetail = JobBuilder.newJob(SpiderJob.class).withIdentity("WeiboSpider", "Spider").build();
//10分钟执行一次 一直执行
Trigger trigger = TriggerBuilder.newTrigger().withIdentity("trigger1", "group").startNow()
.withSchedule(SimpleScheduleBuilder.simpleSchedule().withIntervalInSeconds(600).repeatForever()).build();
// 定义一个 Schedule,用于绑定任务和触发器
SchedulerFactory sf = new StdSchedulerFactory();
Scheduler scheduler = null;
try
{
scheduler = sf.getScheduler();
scheduler.scheduleJob(jobDetail, trigger);
scheduler.start();
}
catch (SchedulerException e)
{
e.printStackTrace();
}
}
}
三、存储
1. Redis设置
# redis.properties
redis.host = 127.0.0.1
redis.port = 6379
redis.maxIdle = 10
redis.maxTotal = 30
2. 连接池
public class JedisUtils {
    private static JedisPool jp = null;
    private static String host = null;
    private static int port = 0;
    /**
     * Maximum idle connections kept in the pool.
     */
    private static int maxIdle = 0;
    /**
     * Maximum total connections.
     */
    private static int maxTotal = 0;

    static {
        // Load connection settings from redis.properties on the classpath.
        ResourceBundle resourceBundle = ResourceBundle.getBundle("redis");
        host = resourceBundle.getString("redis.host");
        port = Integer.parseInt(resourceBundle.getString("redis.port"));
        maxIdle = Integer.parseInt(resourceBundle.getString("redis.maxIdle"));
        maxTotal = Integer.parseInt(resourceBundle.getString("redis.maxTotal"));
        JedisPoolConfig jedisPoolConfig = new JedisPoolConfig();
        jedisPoolConfig.setMaxIdle(maxIdle);
        jedisPoolConfig.setMaxTotal(maxTotal);
        // FIX: the config object was built but never passed to the pool, so
        // maxIdle/maxTotal were silently ignored by `new JedisPool(host, port)`.
        jp = new JedisPool(jedisPoolConfig, host, port);
    }

    /**
     * Borrows a connection from the pool; callers must close() it to
     * return it to the pool.
     */
    public static Jedis getJedis(){
        return jp.getResource();
    }
}
3. Jedis单例
Initialization On Demand Holder
/**
 * Provides one lazily-created, shared Jedis connection using the
 * Initialization-On-Demand-Holder idiom (thread-safe lazy init via class
 * loading).
 *
 * NOTE(review): the single shared Jedis instance is reused by all spider
 * classes; presumably crawls run sequentially — confirm before introducing
 * any concurrency, otherwise borrow per-call from JedisUtils instead.
 */
public class SingleJedisUtils {
    public SingleJedisUtils() {
    }
    private static class SingletonHolder{
        // Created on first call to getJedis(); borrowed once from the pool
        // and never returned.
        public final static Jedis instance = JedisUtils.getJedis();
    }
    public static Jedis getJedis(){
        return SingletonHolder.instance;
    }
}
四、调用
1. Service
/**
 * Returns the current hot list from Redis as a JSON array of strings.
 * Range starts at index 1 — presumably skips the pinned entry at index 0,
 * matching the detail crawler's lrange(1, 50); TODO confirm.
 */
@Override
public String getHotListJson() {
    Jedis jedis = JedisUtils.getJedis();
    List<String> hotList = jedis.lrange("HotList", 1, jedis.llen("HotList"));
    // Return the pooled connection.
    jedis.close();
    return JSON.toJSONString(hotList);
}
/**
 * Returns the detail JSON for one hot-list entry.
 *
 * @param index 1-based position in the hot list
 * @return the stored detail array re-serialized as standard JSON, or "[]"
 *         when the index is out of range
 */
@Override
public String getHotDetailJson(int index) {
    Jedis jedis = JedisUtils.getJedis();
    try {
        // FIX: read only the requested element. The original loaded the
        // whole HotDetail list sized by llen("HotList") — the wrong key —
        // and also fetched an unused copy of HotList.
        String detailJson = jedis.lindex("HotDetail", index - 1);
        if (detailJson == null) {
            // Out-of-range index: return an empty array instead of
            // throwing IndexOutOfBoundsException.
            return "[]";
        }
        // Parse and re-serialize to normalize the stored (single-quoted)
        // JSON into standard JSON, as the original did.
        List<Map<String, String>> list = (List) JSON.parseObject(detailJson, List.class);
        return JSON.toJSONString(list);
    } finally {
        // FIX: always return the pooled connection, even on error.
        jedis.close();
    }
}
五、其他
SpringBoot 2.4 需要增加如下配置才能使用之前版本的配置方式
# application.properties
spring.config.use-legacy-processing = true
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。