本程序将展示使用Jsoup爬取网站信息的示例,只是用于对Jsoup的学习,不会做其他使用
1. 新建一个springboot项目
添加Jsoup的依赖,以及mysql和mybatis的依赖,其中后面的依赖用于将爬取到的数据存入mysql数据库中
<!-- Spring Boot web starter -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-web</artifactId>
</dependency>

<!-- Jsoup: fetches and parses the crawled HTML pages -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>

<!-- tk.mybatis common mapper starter -->
<dependency>
    <groupId>tk.mybatis</groupId>
    <artifactId>mapper-spring-boot-starter</artifactId>
    <version>2.0.4</version>
</dependency>

<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-jdbc</artifactId>
    <version>2.2.1.RELEASE</version>
</dependency>

<dependency>
    <groupId>org.mybatis.spring.boot</groupId>
    <artifactId>mybatis-spring-boot-starter</artifactId>
    <version>2.0.1</version>
</dependency>

<!-- MySQL JDBC driver -->
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.48</version>
</dependency>

<!-- Druid connection pool (referenced by spring.datasource.type) -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>druid</artifactId>
    <version>1.1.1</version>
</dependency>
2. 配置文件application.yml
主要配置数据库的链接字符串信息
# HTTP port the application listens on
server:
  port: 7999
spring:
  servlet:
    multipart:
      max-request-size: 100MB # maximum size of a whole multipart request
      max-file-size: 20MB # maximum size of a single uploaded file
  http:
    encoding:
      charset: utf-8
      force: true
      enabled: true
  # MySQL connection served through a Druid pool
  datasource:
    platform: mysql
    type: com.alibaba.druid.pool.DruidDataSource
    initialSize: 5
    minIdle: 3
    maxActive: 500
    maxWait: 60000
    timeBetweenEvictionRunsMillis: 60000
    minEvictableIdleTimeMillis: 30000
    validationQuery: select 1
    testOnBorrow: true
    poolPreparedStatements: true
    maxPoolPreparedStatementPerConnectionSize: 20
    driverClassName: com.mysql.jdbc.Driver
    url: jdbc:mysql://localhost:3306/job?serverTimezone=UTC&useSSL=false&useUnicode=true&characterEncoding=utf-8&useAffectedRows=true&rewriteBatchedStatements=true
    username: root
    password: root
3. springboot启动类
额外添加了对mybatis mapper的扫描
1 package com.devin.jobsearch;
2
3 import org.springframework.boot.SpringApplication;
4 import org.springframework.boot.autoconfigure.SpringBootApplication;
5 import tk.mybatis.spring.annotation.MapperScan;
6
7
8 @MapperScan("com.devin.jobsearch.mapper")
9 @SpringBootApplication
10 public class JobSearchApplication {
11
12 public static void main(String[] args) {
13 SpringApplication.run(JobSearchApplication.class, args);
14 }
15
16 }
4. 数据库表和对应的model
-- Storage for the crawled job postings: one row per posting, keyed by job_id.
-- Every column is stored as text exactly as scraped (salary and date included);
-- the long free-text fields (job_detail, job_company_desc) use TEXT.
CREATE TABLE `job` (
`job_id` varchar(128) NOT NULL,
`job_name` varchar(512) DEFAULT NULL,
`job_detail` text,
`job_company_name` varchar(512) DEFAULT NULL,
`job_company_image` varchar(512) DEFAULT NULL,
`job_company_desc` text,
`job_company_url` varchar(512) DEFAULT NULL,
`job_url` varchar(512) DEFAULT NULL,
`job_location` varchar(128) DEFAULT NULL,
`job_location_detail` varchar(4000) DEFAULT NULL,
`job_salary` varchar(512) DEFAULT NULL,
`job_date` varchar(128) DEFAULT NULL,
`job_restrict_str` varchar(512) DEFAULT NULL,
PRIMARY KEY (`job_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
1 package com.devin.jobsearch.model;
2
3 import javax.persistence.Table;
4
5 /**
6 * @author Devin Zhang
7 * @className JobModel
8 * @description TODO
9 * @date 2020/4/22 9:42
10 */
11 @Table(name = "job")
12 public class JobModel {
13 private String jobId;
14 private String jobName;
15 private String jobDetail;
16 private String jobCompanyName;
17 private String jobCompanyImage;
18 private String jobCompanyDesc;
19 private String jobCompanyUrl;
20 private String jobUrl;
21 private String jobLocation;
22 private String jobLocationDetail;
23 private String jobSalary;
24 private String jobDate;
25 private String jobRestrictStr;
26
27 public String getJobId() {
28 return jobId;
29 }
30
31 public void setJobId(String jobId) {
32 this.jobId = jobId;
33 }
34
35 public String getJobName() {
36 return jobName;
37 }
38
39 public void setJobName(String jobName) {
40 this.jobName = jobName;
41 }
42
43 public String getJobDetail() {
44 return jobDetail;
45 }
46
47 public void setJobDetail(String jobDetail) {
48 this.jobDetail = jobDetail;
49 }
50
51 public String getJobCompanyName() {
52 return jobCompanyName;
53 }
54
55 public void setJobCompanyName(String jobCompanyName) {
56 this.jobCompanyName = jobCompanyName;
57 }
58
59 public String getJobCompanyDesc() {
60 return jobCompanyDesc;
61 }
62
63 public void setJobCompanyDesc(String jobCompanyDesc) {
64 this.jobCompanyDesc = jobCompanyDesc;
65 }
66
67 public String getJobCompanyUrl() {
68 return jobCompanyUrl;
69 }
70
71 public void setJobCompanyUrl(String jobCompanyUrl) {
72 this.jobCompanyUrl = jobCompanyUrl;
73 }
74
75 public String getJobUrl() {
76 return jobUrl;
77 }
78
79 public void setJobUrl(String jobUrl) {
80 this.jobUrl = jobUrl;
81 }
82
83 public String getJobLocation() {
84 return jobLocation;
85 }
86
87 public void setJobLocation(String jobLocation) {
88 this.jobLocation = jobLocation;
89 }
90
91 public String getJobLocationDetail() {
92 return jobLocationDetail;
93 }
94
95 public void setJobLocationDetail(String jobLocationDetail) {
96 this.jobLocationDetail = jobLocationDetail;
97 }
98
99 public String getJobSalary() {
100 return jobSalary;
101 }
102
103 public void setJobSalary(String jobSalary) {
104 this.jobSalary = jobSalary;
105 }
106
107 public String getJobDate() {
108 return jobDate;
109 }
110
111 public void setJobDate(String jobDate) {
112 this.jobDate = jobDate;
113 }
114
115 public String getJobCompanyImage() {
116 return jobCompanyImage;
117 }
118
119 public void setJobCompanyImage(String jobCompanyImage) {
120 this.jobCompanyImage = jobCompanyImage;
121 }
122
123 public String getJobRestrictStr() {
124 return jobRestrictStr;
125 }
126
127 public void setJobRestrictStr(String jobRestrictStr) {
128 this.jobRestrictStr = jobRestrictStr;
129 }
130
131 @Override
132 public String toString() {
133 return "JobModel{" +
134 "jobId='" + jobId + '\'' +
135 ", jobName='" + jobName + '\'' +
136 ", jobDetail='" + jobDetail + '\'' +
137 ", jobCompanyName='" + jobCompanyName + '\'' +
138 ", jobCompanyImage='" + jobCompanyImage + '\'' +
139 ", jobCompanyDesc='" + jobCompanyDesc + '\'' +
140 ", jobCompanyUrl='" + jobCompanyUrl + '\'' +
141 ", jobUrl='" + jobUrl + '\'' +
142 ", jobLocation='" + jobLocation + '\'' +
143 ", jobLocatonDetail='" + jobLocationDetail + '\'' +
144 ", jobSalary='" + jobSalary + '\'' +
145 ", jobDate='" + jobDate + '\'' +
146 ", jobRestrictStr='" + jobRestrictStr + '\'' +
147 '}';
148 }
149 }
5. 数据库操作mapper类 和 mapper配置文件
因为使用了tkmybatis,所以mapper类 和 mapper配置文件中不需要额外添加任何代码和配置
JobMapper.java
1 package com.devin.jobsearch.mapper;
2
3 import com.devin.jobsearch.model.JobModel;
4 import tk.mybatis.mapper.common.Mapper;
5 import tk.mybatis.mapper.common.MySqlMapper;
6
7 /**
8 * @author Devin Zhang
9 * @className JobMapper
10 * @description TODO
11 * @date 2020/4/22 16:24
12 */
13
14 public interface JobMapper extends Mapper<JobModel>, MySqlMapper<JobModel> {
15 }
JobMapper.xml
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd" >
<mapper namespace="com.devin.jobsearch.mapper.JobMapper" >
    <!-- Intentionally empty: no hand-written statements are needed with the tk.mybatis base mappers. -->
</mapper>
6. 工具类
SearchUtil.java 主要用于Jsoup加载url返回一个Document对象
1 package com.devin.jobsearch.util;
2
3
4 import org.jsoup.Jsoup;
5 import org.jsoup.nodes.Document;
6 import org.springframework.stereotype.Component;
7
8 /**
9 * @author Devin Zhang
10 * @className SearchUtil
11 * @description TODO
12 * @date 2020/4/22 10:07
13 */
14
15 @Component
16 public class SearchUtil {
17
18 public Document getDocument(String url) throws Exception {
19 Document document = Jsoup
20 .connect(url)
21 .header("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36")
22 .get();
23 return document;
24 }
25 }
FileUtil.java 主要用于记录错误信息
1 package com.devin.jobsearch.util;
2
3 import org.springframework.stereotype.Component;
4
5 import java.io.BufferedWriter;
6 import java.io.File;
7 import java.io.FileOutputStream;
8 import java.io.OutputStreamWriter;
9
10 /**
11 * @author Devin Zhang
12 * @className FileUtil
13 * @description TODO
14 * @date 2020/4/22 17:56
15 */
16 @Component
17 public class FileUtil {
18
19 private String filePath = "D:\\data\\job\\fail.log";
20
21 public void writeLog(String log) {
22 BufferedWriter bw = null;
23 try {
24 bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(filePath), true)));
25 bw.write(log);
26 bw.flush();
27 } catch (Exception e) {
28 e.printStackTrace();
29 } finally {
30 try {
31 bw.close();
32 } catch (Exception e) {
33 e.printStackTrace();
34 }
35 }
36 }
37 }
7. 爬虫的逻辑处理
我们打开51job的搜索页,分别查看首页,第2页,第3页,可以看到变化只是在访问页码的参数上有变化,所以我们可以循环去爬取整个的职位信息
首页
https://search.51job.com/list...1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
第二页
https://search.51job.com/list...2.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
第三页
https://search.51job.com/list...3.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
代码实现:
首先创建一个接口,接口中定义了两个方法,一个用于获取总的页数,一个用于循环去爬取数据
爬虫的方法实现了该接口,后续如果我们要爬取其他网站,只需要实现该接口,编写逻辑即可
IJobHandle.java
1 package com.devin.jobsearch.Service;
2
/**
 * Contract for a crawler that targets one job site. Supporting another site means providing
 * another implementation of this interface.
 *
 * @author Devin Zhang
 * @date 2020/4/22 16:39
 */
public interface IJobHandle {

    /**
     * Determines how many result pages the site currently offers.
     *
     * @return the total number of pages
     * @throws Exception if the page count cannot be obtained
     */
    int getJobPage() throws Exception;

    /**
     * Crawls the site page by page.
     *
     * @throws Exception if the crawl cannot be carried out
     */
    void handle() throws Exception;
}
Job51SearchHandle.java
1 package com.devin.jobsearch.Service;
2
3 import com.devin.jobsearch.mapper.JobMapper;
4 import com.devin.jobsearch.model.JobModel;
5 import com.devin.jobsearch.util.FileUtil;
6 import com.devin.jobsearch.util.SearchUtil;
7 import org.jsoup.nodes.Document;
8 import org.jsoup.nodes.Element;
9 import org.jsoup.select.Elements;
10 import org.springframework.stereotype.Service;
11 import org.springframework.util.CollectionUtils;
12 import org.springframework.util.StringUtils;
13
14 import javax.annotation.Resource;
15 import java.util.ArrayList;
16 import java.util.List;
17 import java.util.regex.Matcher;
18 import java.util.regex.Pattern;
19
20 /**
21 * @author Devin Zhang
22 * @className Job51SearchHandle
23 * @description TODO
24 * @date 2020/4/21 16:41
25 */
26
27 @Service
28 public class Job51SearchHandle implements IJobHandle {
29
30 @Resource
31 private SearchUtil searchUtil;
32 @Resource
33 private JobMapper jobMapper;
34 @Resource
35 private FileUtil fileUtil;
36
37 private static final String PAGEPATTERN = "pagePattern";
38 private static final String JOB51URL = "https://search.51job.com/list/000000,000000,0000,00,9,99,%2B,2,pagePattern.html?lang=c&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=";
39
40
41 /**
42 * 获取51job总共有多少页
43 *
44 * @return
45 * @throws Exception
46 */
47 @Override
48 public int getJobPage() throws Exception {
49 String url = JOB51URL;
50 url = url.replace(PAGEPATTERN, "1");
51 Document document = searchUtil.getDocument(url);
52 return Integer.parseInt(document.getElementById("hidTotalPage").val());
53 }
54
55
56 /**
57 * 分页爬取51job
58 */
59 @Override
60 public void handle() throws Exception {
61 //目标地址
62 int pageTotal = this.getJobPage();
63 List<JobModel> jobModelList = null;
64 for (int page = 1; page <= pageTotal; page++) {
65 try {
66 jobModelList = new ArrayList();
67 System.out.println("开始爬取第:" + page + "页的数据");
68 String url = JOB51URL;
69 url = url.replace(PAGEPATTERN, page + "");
70 Document document = searchUtil.getDocument(url);
71 // 右侧导航栏
72 Elements nav_com = document.getElementsByClass("el");
73 for (Element element : nav_com) {
74 if (element.children().first().tagName("p").hasClass("t1") &&
75 element.children().first().tagName("p").children().hasClass("check")) {
76
77 String jobName = element.children().first().tagName("p").children().tagName("span").text();
78 String jobUrl = element.children().first().tagName("p").child(2).child(0).attr("href");
79 String companyName = element.child(1).text();
80 String companyUrl = element.child(1).child(0).attr("href");
81 String jobLocation = element.child(2).text();
82 String jobSalary = element.child(3).text();
83 String jobDate = element.child(4).text();
84
85 JobModel jobModel = new JobModel();
86 jobModel.setJobName(jobName);
87 jobModel.setJobUrl(jobUrl);
88 jobModel.setJobCompanyName(companyName);
89 jobModel.setJobCompanyUrl(companyUrl);
90 jobModel.setJobLocation(jobLocation);
91 jobModel.setJobSalary(jobSalary);
92 jobModel.setJobDate(jobDate);
93
94 //爬取明细
95 Document detailDocument = searchUtil.getDocument(jobUrl);
96
97 String jobRestrict = detailDocument.getElementsByClass("msg ltype").text();
98 String jobDesc = detailDocument.getElementsByClass("bmsg job_msg inbox").text();
99 String jobLocationDetail = "";
100 if (detailDocument.getElementsByClass("bmsg inbox").size() > 0) {
101 jobLocationDetail = detailDocument.getElementsByClass("bmsg inbox").first().child(0).text();
102 }
103 String companyDesc = detailDocument.getElementsByClass("tmsg inbox").text();
104 String companyImage = "";
105 if (detailDocument.getElementsByClass("com_name himg").size() > 0) {
106 companyImage = detailDocument.getElementsByClass("com_name himg").first().child(0).attr("src");
107 }
108
109
110 jobModel.setJobRestrictStr(jobRestrict);
111 jobModel.setJobDetail(jobDesc);
112 jobModel.setJobLocationDetail(jobLocationDetail);
113 jobModel.setJobCompanyDesc(companyDesc);
114 jobModel.setJobCompanyImage(companyImage);
115
116 String jobId = "";
117 String patternStr = "/[0-9]*.html";
118 Pattern pattern = Pattern.compile(patternStr);
119 Matcher matcher = pattern.matcher(jobUrl);
120 if (matcher.find()) {
121 jobId = matcher.group();
122 jobId = jobId.replaceAll(".html", "").replaceAll("/","");
123 }
124 if (StringUtils.isEmpty(jobId)) {
125 patternStr = "jobid=[0-9]*";
126 pattern = Pattern.compile(patternStr);
127 matcher = pattern.matcher(jobUrl);
128 if (matcher.find()) {
129 jobId = matcher.group();
130 jobId = jobId.replaceAll("jobid=", "");
131 }
132 }
133 if (StringUtils.isEmpty(jobId)) {
134 patternStr = "#[0-9]*";
135 pattern = Pattern.compile(patternStr);
136 matcher = pattern.matcher(jobUrl);
137 if (matcher.find()) {
138 jobId = matcher.group();
139 jobId = jobId.replaceAll("#", "");
140 }
141 }
142 jobModel.setJobId(jobId);
143 System.out.println(jobModel);
144 jobModelList.add(jobModel);
145 }
146 }
147 if (!CollectionUtils.isEmpty(jobModelList)) {
148 System.out.println("第" + page + "页数据,开始插入数据");
149 jobMapper.insertList(jobModelList);
150 }
151 Thread.sleep(3000); //sleep 3 秒,防止访问太频繁,被禁掉
152 } catch (Exception e) {
153 e.printStackTrace();
154 if (null != jobModelList) {
155 fileUtil.writeLog(jobModelList.toString());
156 }
157 }
158 }
159 }
160
161 }
8. 调用
我们新建一个controller,访问时直接调用
1 package com.devin.jobsearch.controller;
2
3 import com.devin.jobsearch.Service.Job51SearchHandle;
4 import org.springframework.web.bind.annotation.GetMapping;
5 import org.springframework.web.bind.annotation.RequestMapping;
6 import org.springframework.web.bind.annotation.RestController;
7
8 import javax.annotation.Resource;
9
10 /**
11 * @author Devin Zhang
12 * @className JobController
13 * @description TODO
14 * @date 2020/4/22 16:36
15 */
16 @RestController
17 @RequestMapping("/job")
18 public class JobController {
19
20 @Resource
21 private Job51SearchHandle job51SearchHandle;
22
23 @GetMapping("/51jobHandle")
24 public String handle51JobController() throws Exception {
25 job51SearchHandle.handle();
26 return "success";
27 }
28 }
访问: localhost:7999/job/51jobHandle 即可触发爬取
最后,再次说明,本文章的目的只是为了学习Jsoup,爬取到的数据也不会用作其他使用,毕竟爬虫爬的好,牢饭吃的饱。
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。