webmagic爬取网页数据,【分页爬取内容】见上一篇博文https://segmentfault.com/a/1190000020005655
webmagic的官方文档见: http://webmagic.io/docs/zh/ 可查阅根据不同选择器 获取节点信息等内容
根据页面的生成方式,网页内容基本上可以通过以下三种方法爬取:
一.静态页面【最常见的】,能通过webmagic的常规方法直接爬取数据
二.一些动态生成网页,需要在爬虫程序里使用浏览器驱动将数据渲染到页面上之后再爬取
三.从js请求中能获取数据的网页,可直接构造http请求获取数据
下文将罗列针对这三种爬取方式的webmagic使用,文章较长,可根据你的需要【爬取方式】取用
一. 静态页面爬取
示例:
import com.boe.mps.jrj.dataas.entity.BigDeposit;
import org.joda.time.DateTime;
import org.springframework.stereotype.Repository;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawler for large-denomination certificate of deposit products.
 *
 * <p>The page is treated as static HTML: every element with the CSS class
 * {@code ebdp-pc4promote-circularcontainer-wrapper} counts as one product, and the
 * product's details are read from the element whose id is {@code circularcontainer<i>}.
 */
@Repository
public class BigDepositProcessor implements PageProcessor{
    private static Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /**
     * @return the shared site settings (3 retries, 100 ms sleep between requests)
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Extracts one {@code BigDeposit} per product container, stores the list in the page
     * result under the key {@code "bigDeposit"} and prints each item to standard output.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        // Timestamp shared by every record extracted from this page.
        String updateTime = DateTime.now().toString(DateUtils.DATE_FORMAT_LONG_FULL);
        // Number of product containers on the page.
        int containerCount = page.getHtml().$(".ebdp-pc4promote-circularcontainer-wrapper").nodes().size();
        List<BigDeposit> deposits = new ArrayList<>();
        for (int idx = 0; idx < containerCount; idx++) {
            String container = "//*[@id=circularcontainer" + idx + "]";
            String row = container + "/div[2]/table/tbody/tr";
            BigDeposit deposit = new BigDeposit();
            // All <td> cells of the product table; their count decides the column layout below.
            List<Selectable> cells = page.getHtml().xpath(row + "/td").nodes();
            deposit.setItemName(page.getHtml().xpath(container + "/div[1]/span[1]/span/text()").get());
            deposit.setItemRate(page.getHtml().xpath(row + "/td[2]/a/text()").get());
            deposit.setUpdateTime(updateTime);
            // With more than 7 cells the price and grading columns are shifted right by one.
            boolean narrowLayout = cells.size() <= 7;
            int priceColumn = narrowLayout ? 4 : 5;
            int gradingColumn = narrowLayout ? 6 : 7;
            deposit.setStartDepositPrice(page.getHtml().xpath(row + "/td[" + priceColumn + "]/text()").get());
            deposit.setGrading(page.getHtml().xpath(row + "/td[" + gradingColumn + "]/text()").get());
            deposits.add(deposit);
        }
        page.putField("bigDeposit", deposits);
        // Print the crawled content.
        for (BigDeposit deposit : deposits) {
            System.out.println(deposit);
        }
    }

    /**
     * Crawler test entry point: crawls the product page with 5 threads and prints the
     * results to the console.
     */
    public static void main(String[] args) {
        // URL to crawl.
        String bigdeposit="https://mybank.icbc.com.cn/servlet/ICBCBaseReqServletNoSession?dse_operationName=per_accountQueryFixedProductsOutOp&cmd=0&NormalOrBooking=0&IN_CURRFLAG=&IN_APPID=02&IN_SAVETYPE=&IN_BIGFLAG=1&JJGFLAG=0&Area_code=1001";
        // Build the spider; ConsolePipeline prints the crawl result to the console.
        Spider.create(new BigDepositProcessor())
                .addPipeline(new ConsolePipeline())
                .addUrl(bigdeposit)
                .thread(5)
                .run();
    }
}
/**
* Entity class holding one crawled BigDeposit record.
* All crawled values are kept as the raw strings read from the page.
*/
@Data
public class BigDeposit {
private Long id;
/** Product name. */
private String itemName;
/** Product interest rate (%). */
private String itemRate;
/** Minimum deposit amount (yuan). */
private String startDepositPrice;
/** Trading increment (step between allowed amounts). */
private String grading;
/** Update time. */
private String updateTime;
}
二. 动态页面chromedriver先渲染再爬取
webmagic调用chromedriver驱动,先渲染页面,再爬取数据
示例:
import com.boe.mps.jrj.dataas.entity.Bond;
import org.joda.time.DateTime;
import org.springframework.stereotype.Repository;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawler for the full list of bond products.
 *
 * <p>The first page yields the total record count, from which the URLs of all result
 * pages (8 records per page) are queued. Each page's bond entries are then read from
 * the child {@code div}s of the element with id {@code ebdp-pc4promote-nationaldebtList}.
 */
@Repository
public class BondProcessor implements PageProcessor{
    /** Paging URL without the trailing {@code beginPos} value (1-based index of the first record). */
    private static final String PAGE_URL_PREFIX = "https://mybank.icbc.com.cn/icbc/newperbank/nationaldebt/nationaldebt_infoquery_product_nosession.jsp?pageFlag=0&qryBeginPos=1&jSonStrFilter=&isFilterFlag=0&QryTypex=0&matureYear=aaa&term2=aaa&keywords=&remainTerm=aaa&debtType=aaa&couponRate=aaa&currTypeFilter=aaa&pos=0&pos1=0&OrderString=0%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C&ExtendTableDisplayFlag=2&beginPos=";
    /** Label that precedes the total record count in the paging info element. */
    private static final String TOTAL_LABEL = "总记录数:";
    /** Number of records shown per result page. */
    private static final int PAGE_SIZE = 8;

    String updateTime = DateTime.now().toString(DateUtils.DATE_FORMAT_LONG_FULL);
    long lastTime = DateTime.now().getMillis();
    private static Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /**
     * @return the shared site settings (3 retries, 100 ms sleep between requests)
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Queues the URLs of all result pages and extracts the bond entries of the current
     * page into the page result under the key {@code "bond"}.
     *
     * @param page the rendered page
     */
    @Override
    public void process(Page page) {
        long execTime = DateTime.now().getMillis();
        // Refresh the update time only when more than one minute has passed, so that all
        // pages processed in one crawl carry the same timestamp.
        if((execTime - lastTime) > 1000*60){
            updateTime = DateTime.now().toString(DateUtils.DATE_FORMAT_LONG_FULL);
            lastTime = execTime;
        }
        // Determine the total number of records to derive the page URLs.
        int total = parseTotalRecords(page.getHtml().xpath("//*[@id=lbInfo]").get());
        System.err.println("ss=="+total);
        for (int i = 0; i < total; i += PAGE_SIZE) {
            // Paged crawling: put each page's URL into the spider's task list.
            page.addTargetRequest(PAGE_URL_PREFIX + (i + 1));
        }
        List<Selectable> nodes1 = page.getHtml().xpath("//*[@id=ebdp-pc4promote-nationaldebtList]/div").nodes();
        System.out.println("tiaoshu="+nodes1.size());
        List<Bond> list = new ArrayList<>();
        // XPath positions are 1-based, e.g.
        // //*[@id="ebdp-pc4promote-nationaldebtList"]/div[2]/div[1]/div[1]/div[1]/a
        for (int i = 1; i <= nodes1.size(); i++) {
            list.add(extractBond(page, i));
        }
        page.putField("bond",list);
        list.forEach(e->{
            System.out.println(e);
        });
    }

    /** Builds one {@code Bond} from the {@code position}-th (1-based) entry of the list container. */
    private Bond extractBond(Page page, int position) {
        String entry = "//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div[" + position + "]";
        String header = entry + "/div[1]/div[1]";
        String quotes = entry + "/div[1]/div[2]/div[1]/ul";
        String details = entry + "/div[1]/div[3]/dl";

        // The entry is flagged as "推荐" when it contains the recommendation marker element.
        String s0 = page.getHtml().xpath("//*[@id=ebdp-pc4promote-nationaldebtList]/div["+position+"]/div[3]").$(".ebdp-pc4promote-tuijian").get()==null?"":"推荐";
        String s1 = text(page, header + "/div[1]/span[1]/text()");
        String s2 = text(page, header + "/div[1]/span[2]/text()");
        String s3 = text(page, header + "/div[1]/span[3]/text()");
        String s4 = text(page, quotes + "/li[2]/div[4]/p/text()");
        String s5 = text(page, quotes + "/li[2]/div[4]/p/b/text()");

        Bond bond = new Bond();
        bond.setItemName(text(page, header + "/div[1]/a/text()"));
        bond.setItemFeature(s0+" "+s1+" "+s2+" "+s3);
        bond.setTradingHours(text(page, header + "/div[2]/span[2]/text()"));
        bond.setClientBuyingRate(text(page, quotes + "/li[2]/div[2]/span/text()"));
        bond.setClientBuyingFullPrice(text(page, quotes + "/li[2]/div[3]/b/text()"));
        bond.setClientSellingRate(text(page, quotes + "/li[3]/div[2]/span/text()"));
        bond.setClientSellingFullPrice(text(page, quotes + "/li[3]/div[3]/b/text()"));
        bond.setRemainTimeLimit(s4+s5);
        // Each detail value is the part after ":" of its OWN <dd> text.
        bond.setCouponRate(detailValue(page, details, 1));
        bond.setAccruedInterest(detailValue(page, details, 2));
        bond.setCurrentPaymentDate(detailValue(page, details, 3));
        bond.setCurrentInterestIncome(detailValue(page, details, 4));
        bond.setInterestFrequency(detailValue(page, details, 5));
        bond.setCouponBondValue(detailValue(page, details, 6));
        bond.setExpireDate(detailValue(page, details, 7));
        bond.setHoldExpireInterestIncome(detailValue(page, details, 8));
        bond.setClientBuyingNetPrice(detailValue(page, details, 9));
        bond.setClientSellingNetPrice(detailValue(page, details, 10));
        bond.setItemType(detailValue(page, details, 11));
        bond.setUpdateTime(updateTime);
        return bond;
    }

    /** Returns the first match of {@code xpath} on the page; the caller must expect {@code null} when nothing matches. */
    private static String text(Page page, String xpath) {
        return page.getHtml().xpath(xpath).get();
    }

    /** Reads the {@code index}-th (1-based) {@code <dd>} of the details list and returns its value part. */
    private static String detailValue(Page page, String detailsXpath, int index) {
        return valueAfterColon(text(page, detailsXpath + "/dd[" + index + "]/text()"));
    }

    /** Returns the text after the first ":" in {@code text}, or "" when {@code text} is null or has no ":". */
    private static String valueAfterColon(String text) {
        if (text == null) {
            return "";
        }
        int colon = text.indexOf(":");
        return colon >= 0 ? text.substring(colon + 1) : "";
    }

    /**
     * Parses the number between the total-record label and the following "条" in the
     * paging info text.
     *
     * @return the record count, or 0 when the text is missing or not in the expected form
     *         (no further pages are queued in that case)
     */
    private static int parseTotalRecords(String info) {
        if (info == null) {
            return 0;
        }
        int labelAt = info.indexOf(TOTAL_LABEL);
        if (labelAt < 0) {
            return 0;
        }
        int start = labelAt + TOTAL_LABEL.length();
        // Search for the unit only after the label so an earlier "条" cannot be picked up.
        int end = info.indexOf("条", start);
        if (end < 0) {
            return 0;
        }
        try {
            return Integer.parseInt(info.substring(start, end).trim());
        } catch (NumberFormatException e) {
            System.err.println("Unparseable total record count: " + info.substring(start, end));
            return 0;
        }
    }

    /**
     * Test example: crawls the first result page (and the pages it queues) through a
     * chromedriver-backed SeleniumDownloader and prints the results to the console.
     */
    public static void main(String[] args) {
        String url = PAGE_URL_PREFIX + 1;
        // Point to the config file of the local chromedriver_linux64 installation.
        // NOTE(review): the key spelling "selenuim_config" is presumably the one read by
        // webmagic-selenium — confirm before "correcting" it.
        System.setProperty("selenuim_config", "/home/myfile/tool_station/chromedriver_linux64/config.ini");
        // Use the SeleniumDownloader so pages are rendered by the browser driver before extraction.
        Spider.create(new BondProcessor()).thread(1)
                .addPipeline(new ConsolePipeline())
                .addUrl(url)
                .setDownloader(new SeleniumDownloader("/home/myfile/tool_station/chromedriver_linux64/chromedriver").setSleepTime(1000))
                .run();
    }
}
chromedriver驱动的下载及配置
chromedriver下载路径[http://chromedriver.storage.googleapis.com/index.html](http://chromedriver.storage.googleapis.com/index.html)
【请下载与你的浏览器版本相同的chromedriver包】
config.ini文件配置如下:
driver=chrome
#chrome_exec_path=/usr/bin/google-chrome-stable
chrome_driver_loglevel=DEBUG
本例实体类如下:
/**
* Entity class holding one crawled bond record.
* All crawled values are kept as the raw strings read from the page.
*/
@Data
public class Bond {
private Long id;
/** Bond name. */
private String itemName;
/** Customer buy yield to maturity. */
private String clientBuyingRate;
/** Customer buy full price. */
private String clientBuyingFullPrice;
/** Customer sell yield. */
private String clientSellingRate;
/** Customer sell full price. */
private String clientSellingFullPrice;
/** Coupon rate. */
private String couponRate;
/** Accrued interest. */
private String accruedInterest;
/** Interest payment date of the current period. */
private String currentPaymentDate;
/** Interest income of the current period. */
private String currentInterestIncome;
/** Interest payment frequency. */
private String interestFrequency;
/** Bond face value. */
private String couponBondValue;
/** Maturity date. */
private String expireDate;
/** Interest income when held to maturity. */
private String holdExpireInterestIncome;
/** Customer buy net price. */
private String clientBuyingNetPrice;
/** Customer sell net price. */
private String clientSellingNetPrice;
/** Bond type. */
private String itemType;
/** Bond features. */
private String itemFeature;
/** Remaining term. */
private String remainTimeLimit;
/** Trading hours. */
private String tradingHours;
/** Update time. */
private String updateTime;
}
三.js中请求能看到数据的动态渲染网页
示例:
import com.boe.mps.jrj.dataas.entity.ExchangeMarket;
import org.joda.time.DateTime;
import org.springframework.stereotype.Repository;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Json;
import us.codecraft.webmagic.selector.JsonPathSelector;
import java.util.*;
/**
 * Crawler that reads exchange-market quotes from a JSON response obtained with a
 * hand-built POST request, instead of parsing rendered HTML.
 */
@Repository
public class ExchangeMarketProcessor implements PageProcessor{
    private static Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /**
     * @return the shared site settings (3 retries, 100 ms sleep between requests)
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Converts every element of the response's {@code rf} array into an
     * {@code ExchangeMarket} and stores the list under the key {@code "exchangeMarket"}.
     *
     * @param page the downloaded JSON response
     */
    @Override
    public void process(Page page) {
        String updateTime = DateTime.now().toString(DateUtils.DATE_FORMAT_LONG_FULL);
        // Dump the raw JSON for inspection.
        System.out.println("==" + page.getJson().get());
        // Data returned by the POST request: each element of "rf" is one quote.
        List<String> rows = new JsonPathSelector("$.rf").selectList(page.getRawText());
        List<ExchangeMarket> markets = new ArrayList<>();
        for (String row : rows) {
            markets.add(toExchangeMarket(new Json(row), updateTime));
        }
        page.putField("exchangeMarket", markets);
    }

    /** Maps one JSON quote object to an entity stamped with {@code updateTime}. */
    private static ExchangeMarket toExchangeMarket(Json quote, String updateTime) {
        ExchangeMarket market = new ExchangeMarket();
        market.setItemName(quote.jsonPath("$.proName").get());
        market.setRisefall(quote.jsonPath("$.riseSign").get());
        market.setBankBuyingPrice(quote.jsonPath("$.buyRate").get());
        market.setBankSellingPrice(quote.jsonPath("$.sellRate").get());
        market.setMiddlePrice(quote.jsonPath("$.middPrice").get());
        market.setDayRisefallRange(quote.jsonPath("$.openprice_dr").get());
        market.setDayRisefallValue(quote.jsonPath("$.openprice_dv").get());
        market.setYearRisefallRange(quote.jsonPath("$.openprice_yr").get());
        market.setUpdateTime(updateTime);
        return market;
    }

    /**
     * Test entry point: sends the form-encoded POST request and prints the extracted
     * results to the console.
     */
    public static void main(String[] args) {
        // Form parameters of the POST request.
        Map<String, Object> agriculturalMap = new HashMap<>();
        agriculturalMap.put("tranCode","A00513");

        // Build the POST request.
        String agriculturalUrl="https://mybank.icbc.com.cn/ctp/ctpservlet/EbdpAjaxServlet";
        Request exchangeMarketRequest = new Request(agriculturalUrl);
        exchangeMarketRequest.setMethod(HttpConstant.Method.POST);
        exchangeMarketRequest.setRequestBody(HttpRequestBody.form(agriculturalMap,"utf-8"));

        Spider.create(new ExchangeMarketProcessor())
                .addPipeline(new ConsolePipeline())
                .addRequest(exchangeMarketRequest)
                .thread(1)
                .run();
    }
}
该例中实体类如下:
/**
* Entity class holding one crawled exchange-market quote.
* All crawled values are kept as the raw strings read from the response.
*/
@Data
public class ExchangeMarket {
/** Primary key. */
private Long id;
/**
* Product (instrument) name.
*/
private String itemName;
/**
* Rise/fall indicator.
*/
private String risefall;
/**
* Bank buying price.
*/
private String bankBuyingPrice;
/**
* Bank selling price.
*/
private String bankSellingPrice;
/**
* Middle price.
*/
private String middlePrice;
/**
* Rise/fall value for the current day.
*/
private String dayRisefallValue;
/**
* Rise/fall percentage for the current day.
*/
private String dayRisefallRange;
/**
* Rise/fall percentage for the current year.
*/
private String yearRisefallRange;
/**
* Update time.
*/
private String updateTime;
}
该例的请求url【见上文代码】:https://mybank.icbc.com.cn/ctp/ctpservlet/EbdpAjaxServlet
对于js请求,可以先用Postman调用一下,确认请求方式及所需的参数等内容。
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。