使用webmagic爬取网页数据。【分页爬取内容】见上一篇博文: https://segmentfault.com/a/1190000020005655
webmagic的官方文档见: http://webmagic.io/docs/zh/ ,可在其中查阅如何使用不同的选择器获取节点信息等内容。
根据页面的生成方式,网页内容基本上可以通过以下三种方法爬取:
一. 静态页面【最常见的】:能通过webmagic的常规方法直接爬取数据
二. 动态生成的网页:需要在爬虫程序里使用浏览器驱动先将数据渲染到页面上,之后再爬取
三. 能从js请求中获取数据的网页:可直接构造http请求获取数据
下文将依次罗列这三种爬取方式的webmagic用法。文章较长,可根据你需要的【爬取方式】取用。

一. 静态页面爬取
示例:

import com.boe.mps.jrj.dataas.entity.BigDeposit;
import org.joda.time.DateTime;
import org.springframework.stereotype.Repository;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.util.ArrayList;
import java.util.List;

/**
 * Crawler for large-denomination certificates of deposit (static page example).
 *
 * <p>Each product lives in an element whose id is {@code circularcontainer<i>};
 * the number of products is taken from the count of
 * {@code .ebdp-pc4promote-circularcontainer-wrapper} nodes on the page.
 */
@Repository
public class BigDepositProcessor implements PageProcessor{

    private static Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /** Returns the shared site settings (3 retries, 100 ms sleep). */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Extracts one {@code BigDeposit} per product container, stores the list
     * under the result key {@code "bigDeposit"} and prints every entry.
     *
     * @param page the downloaded page to extract from
     */
    @Override
    public void process(Page page) {
        // One timestamp shared by every record extracted from this page.
        String crawledAt = DateTime.now().toString(DateUtils.DATE_FORMAT_LONG_FULL);
        // The wrapper nodes are only counted; the fields are read by container id.
        int containerCount = page.getHtml().$(".ebdp-pc4promote-circularcontainer-wrapper").nodes().size();
        List<BigDeposit> deposits = new ArrayList<>();
        for (int idx = 0; idx < containerCount; idx++) {
            String container = "//*[@id=circularcontainer" + idx + "]";
            String cellPath = container + "/div[2]/table/tbody/tr/td";

            // All td cells of this product's table; their count decides the column layout.
            List<Selectable> cells = page.getHtml().xpath(cellPath).nodes();

            BigDeposit deposit = new BigDeposit();
            deposit.setItemName(textAt(page, container + "/div[1]/span[1]/span/text()"));
            deposit.setItemRate(textAt(page, cellPath + "[2]/a/text()"));
            deposit.setUpdateTime(crawledAt);

            // With at most 7 cells the price/grading sit in columns 4/6, otherwise in 5/7.
            boolean shortRow = cells.size() <= 7;
            int priceColumn = shortRow ? 4 : 5;
            int gradingColumn = shortRow ? 6 : 7;
            deposit.setStartDepositPrice(textAt(page, cellPath + "[" + priceColumn + "]/text()"));
            deposit.setGrading(textAt(page, cellPath + "[" + gradingColumn + "]/text()"));

            deposits.add(deposit);
        }
        page.putField("bigDeposit", deposits);
        // Print what was crawled.
        for (BigDeposit deposit : deposits) {
            System.out.println(deposit);
        }
    }

    /** Evaluates {@code xpath} against the page HTML and returns the first match. */
    private static String textAt(Page page, String xpath) {
        return page.getHtml().xpath(xpath).get();
    }

    /** Manual test entry point: crawls the deposit page and prints the results to the console. */
    public static void main(String[] args) {
        // URL to crawl.
        String startUrl = "https://mybank.icbc.com.cn/servlet/ICBCBaseReqServletNoSession?dse_operationName=per_accountQueryFixedProductsOutOp&cmd=0&NormalOrBooking=0&IN_CURRFLAG=&IN_APPID=02&IN_SAVETYPE=&IN_BIGFLAG=1&JJGFLAG=0&Area_code=1001";
        // Build the spider; ConsolePipeline prints the extracted fields.
        Spider.create(new BigDepositProcessor())
                .addPipeline(new ConsolePipeline())
                .addUrl(startUrl)
                .thread(5)
                .run();
    }

}

/**
 * Entity class for one large-denomination certificate of deposit record.
 *
 * <p>All crawled values are kept as strings. {@code @Data} is presumably
 * Lombok's annotation (its import is not shown here) — confirm.
 */
@Data
public class BigDeposit {
    private Long id;
    /** Product name */
    private String itemName;
    /** Product interest rate, in percent */
    private String itemRate;
    /** Minimum deposit amount (yuan) */
    private String startDepositPrice;
    /** Trading increment */
    private String grading;
    /** Update time */
    private String updateTime;
}

二. 动态页面:先用chromedriver渲染,再爬取
webmagic调用chromedriver驱动,先把页面渲染出来,再爬取数据
示例:

import com.boe.mps.jrj.dataas.entity.Bond;
import org.joda.time.DateTime;
import org.springframework.stereotype.Repository;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.util.ArrayList;
import java.util.List;

/**
 * Crawler for the full list of bond products (dynamic page example; the page
 * is rendered through a Selenium downloader before extraction).
 *
 * <p>The instance fields {@code updateTime} and {@code lastTime} are mutated by
 * {@link #process(Page)} without synchronization; the example runs the spider
 * with a single thread.
 */
@Repository
public class BondProcessor implements PageProcessor{

    /** Marker preceding the total record count inside the {@code lbInfo} node. */
    private static final String TOTAL_MARKER = "总记录数:";
    /** Unit character that follows the total record count. */
    private static final String TOTAL_SUFFIX = "条";
    /** List-page URL without the trailing {@code beginPos} value. */
    private static final String LIST_URL_PREFIX = "https://mybank.icbc.com.cn/icbc/newperbank/nationaldebt/nationaldebt_infoquery_product_nosession.jsp?pageFlag=0&qryBeginPos=1&jSonStrFilter=&isFilterFlag=0&QryTypex=0&matureYear=aaa&term2=aaa&keywords=&remainTerm=aaa&debtType=aaa&couponRate=aaa&currTypeFilter=aaa&pos=0&pos1=0&OrderString=0%7C0%7C0%7C0%7C0%7C0%7C0%7C0%7C&ExtendTableDisplayFlag=2&beginPos=";

    String updateTime = DateTime.now().toString(DateUtils.DATE_FORMAT_LONG_FULL);
    long lastTime = DateTime.now().getMillis();

    private static Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /** Returns the shared site settings (3 retries, 100 ms sleep). */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Queues every list page, then extracts one {@code Bond} per row of the
     * current page, stores the list under the result key {@code "bond"} and
     * prints every entry.
     *
     * @param page the rendered page to extract from
     */
    @Override
    public void process(Page page) {
        long execTime = DateTime.now().getMillis();
        // Refresh the timestamp only when more than a minute has passed, so that
        // all pages of one paginated crawl carry the same update time.
        if((execTime - lastTime) > 1000*60){
            updateTime = DateTime.now().toString(DateUtils.DATE_FORMAT_LONG_FULL);
            lastTime = execTime;
        }

        // Paging: one request per block of 8 records, beginPos is 1-based.
        int total = parseTotalRecords(page.getHtml().xpath("//*[@id=lbInfo]").get());
        for (int i = 0; i < total; i+=8) {
            page.addTargetRequest(LIST_URL_PREFIX + (i+1));
        }

        List<Selectable> rows = page.getHtml().xpath("//*[@id=ebdp-pc4promote-nationaldebtList]/div").nodes();
        System.out.println("tiaoshu="+rows.size());
        List<Bond> list = new ArrayList<>();
        // XPath positions are 1-based.
        for (int i = 1; i <= rows.size(); i++) {
            list.add(extractBond(page, i));
        }

        page.putField("bond",list);
        for (Bond bond : list) {
            System.out.println(bond);
        }
    }

    /**
     * Reads the total record count from the text of the {@code lbInfo} node.
     *
     * @param info node content, may be {@code null} when the node is absent
     * @return the parsed count, or 0 when the node or the count marker is missing
     * @throws NumberFormatException if the text between the markers is not an integer
     */
    private static int parseTotalRecords(String info) {
        if (info == null) {
            return 0;
        }
        int markerAt = info.indexOf(TOTAL_MARKER);
        if (markerAt < 0) {
            return 0;
        }
        int from = markerAt + TOTAL_MARKER.length();
        // Search for the unit only after the marker so an earlier occurrence cannot break the slice.
        int to = info.indexOf(TOTAL_SUFFIX, from);
        if (to < 0) {
            return 0;
        }
        String sum = info.substring(from, to).trim();
        System.err.println("ss=="+sum);
        return Integer.parseInt(sum);
    }

    /** Builds the {@code Bond} for the {@code index}-th (1-based) row of the product list. */
    private Bond extractBond(Page page, int index) {
        // e.g. //*[@id="ebdp-pc4promote-nationaldebtList"]/div[2]/div[1]/div[1]/div[1]/a
        String row = "//*[@id=\"ebdp-pc4promote-nationaldebtList\"]/div["+index+"]";
        String header = row + "/div[1]/div[1]/div[1]";
        String quotes = row + "/div[1]/div[2]/div[1]/ul";
        String details = row + "/div[1]/div[3]/dl";

        // "推荐" (recommended) when the row carries the ebdp-pc4promote-tuijian badge.
        String recommended = page.getHtml().xpath("//*[@id=ebdp-pc4promote-nationaldebtList]/div["+index+"]/div[3]").$(".ebdp-pc4promote-tuijian").get()==null?"":"推荐";
        String feature1 = text(page, header + "/span[1]/text()");
        String feature2 = text(page, header + "/span[2]/text()");
        String feature3 = text(page, header + "/span[3]/text()");
        String remainLabel = text(page, quotes + "/li[2]/div[4]/p/text()");
        String remainValue = text(page, quotes + "/li[2]/div[4]/p/b/text()");

        Bond bond = new Bond();
        bond.setItemName(text(page, header + "/a/text()"));
        bond.setItemFeature(recommended+" "+feature1+" "+feature2+" "+feature3);
        bond.setTradingHours(text(page, row + "/div[1]/div[1]/div[2]/span[2]/text()"));
        bond.setClientBuyingRate(text(page, quotes + "/li[2]/div[2]/span/text()"));
        bond.setClientBuyingFullPrice(text(page, quotes + "/li[2]/div[3]/b/text()"));
        bond.setClientSellingRate(text(page, quotes + "/li[3]/div[2]/span/text()"));
        bond.setClientSellingFullPrice(text(page, quotes + "/li[3]/div[3]/b/text()"));
        bond.setRemainTimeLimit(remainLabel+remainValue);

        // Each dd holds "label:value"; every field takes the value from ITS OWN dd.
        // (Previously all of them were sliced out of the coupon-rate text.)
        bond.setCouponRate(detailValue(page, details, 1));
        bond.setAccruedInterest(detailValue(page, details, 2));
        bond.setCurrentPaymentDate(detailValue(page, details, 3));
        bond.setCurrentInterestIncome(detailValue(page, details, 4));
        bond.setInterestFrequency(detailValue(page, details, 5));
        bond.setCouponBondValue(detailValue(page, details, 6));
        bond.setExpireDate(detailValue(page, details, 7));
        bond.setHoldExpireInterestIncome(detailValue(page, details, 8));
        bond.setClientBuyingNetPrice(detailValue(page, details, 9));
        bond.setClientSellingNetPrice(detailValue(page, details, 10));
        bond.setItemType(detailValue(page, details, 11));
        bond.setUpdateTime(updateTime);
        return bond;
    }

    /** Evaluates {@code xpath} against the page HTML and returns the first match (may be {@code null}). */
    private static String text(Page page, String xpath) {
        return page.getHtml().xpath(xpath).get();
    }

    /** Returns the value part of the {@code position}-th dd under {@code detailsPath}. */
    private static String detailValue(Page page, String detailsPath, int position) {
        return valueAfterColon(text(page, detailsPath + "/dd[" + position + "]/text()"));
    }

    /**
     * Returns everything after the first ":" of {@code text}, or an empty
     * string when {@code text} is {@code null} or contains no colon.
     */
    private static String valueAfterColon(String text) {
        if (text == null) {
            return "";
        }
        int colonAt = text.indexOf(":");
        return colonAt >= 0 ? text.substring(colonAt + 1) : "";
    }

    /** Manual test entry point: crawls the bond list through chromedriver and prints the results. */
    public static void main(String[] args) {
        String url = LIST_URL_PREFIX + 1;
        // Location of the chromedriver config file on this machine.
        // NOTE(review): the key is spelled "selenuim_config" exactly as in the original;
        // this appears to be the key webmagic-selenium reads — do not "correct" it without checking.
        System.setProperty("selenuim_config", "/home/myfile/tool_station/chromedriver_linux64/config.ini");
        // Download pages through SeleniumDownloader so they are rendered before extraction.
        Spider.create(new BondProcessor()).thread(1)
                .addPipeline(new ConsolePipeline())
                .addUrl(url)
                .setDownloader(new SeleniumDownloader("/home/myfile/tool_station/chromedriver_linux64/chromedriver").setSleepTime(1000))
                .run();
    }
}

chromedriver驱动的下载及配置

chromedriver下载地址: [http://chromedriver.storage.googleapis.com/index.html](http://chromedriver.storage.googleapis.com/index.html)
【请下载与你的Chrome浏览器版本相匹配的chromedriver包】
config.ini文件配置如下:
driver=chrome
#chrome_exec_path=/usr/bin/google-chrome-stable
chrome_driver_loglevel=DEBUG

本例实体类如下:
// Entity class for one bond product record; all crawled values are kept as strings.
// @Data is presumably Lombok's annotation (its import is not shown here) — confirm.
@Data
public class Bond {
    private Long id;
    /** Bond name */
    private String itemName;
    /** Yield to maturity for customer purchase */
    private String clientBuyingRate;
    /** Full (dirty) trading price for customer purchase */
    private String clientBuyingFullPrice;
    /** Yield for customer sale */
    private String clientSellingRate;
    /** Full (dirty) trading price for customer sale */
    private String clientSellingFullPrice;
    /** Coupon rate */
    private String couponRate;
    /** Accrued interest */
    private String accruedInterest;
    /** Interest payment date of the current period */
    private String currentPaymentDate;
    /** Interest income of the current period */
    private String currentInterestIncome;
    /** Interest payment frequency */
    private String interestFrequency;
    /** Face value of the bond */
    private String couponBondValue;
    /** Maturity date */
    private String expireDate;
    /** Interest income when held to maturity */
    private String holdExpireInterestIncome;
    /** Net (clean) price for customer purchase */
    private String clientBuyingNetPrice;
    /** Net (clean) price for customer sale */
    private String clientSellingNetPrice;
    /** Bond type */
    private String itemType;
    /** Bond features */
    private String itemFeature;
    /** Remaining term */
    private String remainTimeLimit;
    /** Trading hours */
    private String tradingHours;
    /** Update time */
    private String updateTime;
    }

三. 能从js请求中直接看到数据的动态渲染网页
示例:

import com.boe.mps.jrj.dataas.entity.ExchangeMarket;
import org.joda.time.DateTime;
import org.springframework.stereotype.Repository;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Json;
import us.codecraft.webmagic.selector.JsonPathSelector;

import java.util.*;

/**
 * Crawler for exchange-market quotes: the data is fetched directly from the
 * JSON endpoint the page's script calls, so no HTML rendering is involved.
 */
@Repository
public class ExchangeMarketProcessor implements PageProcessor{

    private static Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /** Returns the shared site settings (3 retries, 100 ms sleep). */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Prints the raw JSON response, maps every element selected by
     * {@code $.rf} to an {@code ExchangeMarket} and stores the list under the
     * result key {@code "exchangeMarket"}.
     *
     * @param page the downloaded JSON response
     */
    @Override
    public void process(Page page) {
        // One timestamp shared by every record extracted from this response.
        String crawledAt = DateTime.now().toString(DateUtils.DATE_FORMAT_LONG_FULL);
        String responseBody = page.getJson().get();
        System.out.println("=="+responseBody);
        // Response of the POST request: each selected entry is one quote as a JSON string.
        List<String> rawQuotes = new JsonPathSelector("$.rf").selectList(page.getRawText());
        List<ExchangeMarket> quotes = new ArrayList<>();
        for (int idx = 0; idx < rawQuotes.size(); idx++) {
            quotes.add(toExchangeMarket(new Json(rawQuotes.get(idx)), crawledAt));
        }
        page.putField("exchangeMarket", quotes);
    }

    /** Copies the fields of one JSON quote into a new {@code ExchangeMarket}. */
    private static ExchangeMarket toExchangeMarket(Json quote, String crawledAt) {
        ExchangeMarket market = new ExchangeMarket();
        market.setItemName(quote.jsonPath("$.proName").get());
        market.setRisefall(quote.jsonPath("$.riseSign").get());
        market.setBankBuyingPrice(quote.jsonPath("$.buyRate").get());
        market.setBankSellingPrice(quote.jsonPath("$.sellRate").get());
        market.setMiddlePrice(quote.jsonPath("$.middPrice").get());
        market.setDayRisefallRange(quote.jsonPath("$.openprice_dr").get());
        market.setDayRisefallValue(quote.jsonPath("$.openprice_dv").get());
        market.setYearRisefallRange(quote.jsonPath("$.openprice_yr").get());
        market.setUpdateTime(crawledAt);
        return market;
    }

    /** Manual test entry point: sends the form POST and prints the extracted quotes. */
    public static void main(String[] args) {
        String endpoint = "https://mybank.icbc.com.cn/ctp/ctpservlet/EbdpAjaxServlet";

        // Form parameters of the POST request.
        Map<String, Object> formParams = new HashMap<>();
        formParams.put("tranCode","A00513");

        // Build the POST request.
        Request quoteRequest = new Request(endpoint);
        quoteRequest.setMethod(HttpConstant.Method.POST);
        quoteRequest.setRequestBody(HttpRequestBody.form(formParams,"utf-8"));

        Spider.create(new ExchangeMarketProcessor())
                .addPipeline(new ConsolePipeline())
                .addRequest(quoteRequest)
                .thread(1)
                .run();
    }
}

该例中实体类如下:

// Entity class for one exchange-market quote; all crawled values are kept as strings.
// @Data is presumably Lombok's annotation (its import is not shown here) — confirm.
@Data
public class ExchangeMarket {
    /** Primary key */
    private Long id;
    /**
     * Product / instrument name
     */
    private String itemName;
    /**
     * Rise or fall indicator
     */
    private String risefall;
    /**
     * Bank buying price
     */
    private String bankBuyingPrice;
    /**
     * Bank selling price
     */
    private String bankSellingPrice;
    /**
     * Middle price
     */
    private String middlePrice;
    /**
     * Rise/fall amount for the day
     */
    private String dayRisefallValue;
    /**
     * Rise/fall percentage for the day
     */
    private String dayRisefallRange;
    /**
     * Rise/fall percentage for the year
     */
    private String yearRisefallRange;
    /**
     * Update time
     */
    private String updateTime;
    }

该例的请求url【见上文代码】: https://mybank.icbc.com.cn/ctp/ctpservlet/EbdpAjaxServlet
对于js发出的请求,可以先用postman调用一下,确认请求方式及所需的参数等内容。


一片秋叶一树春
47 声望3 粉丝

贪君子之财,好美景之色,行正义之事,了前生之愿,爱此生之人!!!!!