WebMagic 自带一个 selenium 模块,其中实现了一个 SeleniumDownloader,但我觉得它的灵活性不够,所以参考它自己实现了一个。
首先是WebDriverPool用来管理WebDriver池:
import java.util.ArrayList;
import java.util.concurrent.BlockingDeque;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import net.xby1993.common.util.FileUtil;
/**
 * A bounded pool of PhantomJS-backed {@link WebDriver} instances.
 *
 * <p>Drivers are created lazily, one per request for a driver, until the pool
 * capacity is reached. After that, {@link #get()} waits until a driver is handed
 * back with {@link #returnToPool(WebDriver)} or a slot is freed with
 * {@link #close(WebDriver)}.
 *
 * @author taojw
 */
public class WebDriverPool {
    /** Capacity used by the no-argument constructor. */
    private static final int DEFAULT_CAPACITY = 5;

    /** How long a waiting caller sleeps on the queue before re-checking for a freed slot. */
    private static final long WAIT_INTERVAL_MILLIS = 500L;

    private static final String PHANTOMJS_PATH;
    private static final DesiredCapabilities caps = DesiredCapabilities.phantomjs();

    static {
        PHANTOMJS_PATH = FileUtil.getCommonProp("phantomjs.path");
        caps.setJavascriptEnabled(true);
        caps.setCapability(
                PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY,
                PHANTOMJS_PATH);
        caps.setCapability("takesScreenshot", false);
        caps.setCapability(
                PhantomJSDriverService.PHANTOMJS_PAGE_CUSTOMHEADERS_PREFIX
                        + "User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36");
        ArrayList<String> cliArgsCap = new ArrayList<String>();
        // See http://phantomjs.org/api/command-line.html
        cliArgsCap.add("--web-security=false");
        cliArgsCap.add("--ssl-protocol=any");
        cliArgsCap.add("--ignore-ssl-errors=true");
        cliArgsCap.add("--load-images=false"); // do not load images
        caps.setCapability(PhantomJSDriverService.PHANTOMJS_CLI_ARGS,
                cliArgsCap);
        caps.setCapability(
                PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_CLI_ARGS,
                new String[] {"--logLevel=INFO"});
    }

    private final Logger logger = LoggerFactory.getLogger(getClass());

    /** Maximum number of drivers that may be alive at the same time. */
    private final int capacity;

    /** Number of drivers created by this pool and not yet closed (idle or checked out). */
    private final AtomicInteger refCount = new AtomicInteger(0);

    /** Drivers that are currently idle and available for reuse. */
    private final BlockingDeque<WebDriver> innerQueue;

    /** Creates a pool with the default capacity of 5 drivers. */
    public WebDriverPool() {
        this(DEFAULT_CAPACITY);
    }

    /**
     * Creates a pool that holds at most {@code poolsize} drivers.
     *
     * @param poolsize maximum number of live drivers; must be positive
     * @throws IllegalArgumentException if {@code poolsize} is not positive
     *         (raised by the underlying {@link LinkedBlockingDeque})
     */
    public WebDriverPool(int poolsize) {
        this.capacity = poolsize;
        this.innerQueue = new LinkedBlockingDeque<WebDriver>(poolsize);
    }

    /**
     * Obtains a driver, reusing an idle one when possible, creating a new one
     * while the pool is below capacity, and otherwise waiting for one to become
     * available.
     *
     * @return a driver owned by the caller until it is passed to
     *         {@link #returnToPool(WebDriver)} or {@link #close(WebDriver)}
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public WebDriver get() throws InterruptedException {
        while (true) {
            WebDriver idle = innerQueue.poll();
            if (idle != null) {
                return idle;
            }
            if (reserveSlot()) {
                boolean created = false;
                try {
                    // Hand the new driver straight to the caller instead of routing it
                    // through the queue, where another waiting thread could take it.
                    WebDriver driver = createDriver();
                    created = true;
                    return driver;
                } finally {
                    if (!created) {
                        // Give the reserved slot back so a later call can retry.
                        refCount.decrementAndGet();
                    }
                }
            }
            // Wait with a timeout rather than indefinitely: a slot freed by close()
            // does not put anything on the queue, so we must loop to notice it.
            idle = innerQueue.poll(WAIT_INTERVAL_MILLIS, TimeUnit.MILLISECONDS);
            if (idle != null) {
                return idle;
            }
        }
    }

    /**
     * Hands a driver obtained from {@link #get()} back for reuse.
     *
     * @param webDriver the driver to make available again; must not be {@code null}
     * @throws NullPointerException if {@code webDriver} is {@code null}
     * @throws IllegalStateException if the idle queue is already full
     */
    public void returnToPool(WebDriver webDriver) {
        innerQueue.add(webDriver);
    }

    /**
     * Quits a driver and frees its slot so a replacement can be created.
     * A {@code null} argument is ignored.
     *
     * @param webDriver the driver to quit; may be {@code null}
     */
    public void close(WebDriver webDriver) {
        if (webDriver == null) {
            return;
        }
        // Free the slot first so it is released even when quit() throws.
        refCount.decrementAndGet();
        webDriver.quit();
    }

    /**
     * Quits every idle driver currently held by the pool. A failure to quit one
     * driver is logged and does not prevent the remaining drivers from being
     * closed. Drivers that are checked out at the time of the call are not affected.
     */
    public void shutdown() {
        WebDriver driver;
        while ((driver = innerQueue.poll()) != null) {
            try {
                close(driver);
            } catch (Exception e) {
                logger.warn("webdriverpool关闭失败", e);
            }
        }
    }

    /** Atomically claims one creation slot; returns false when the pool is at capacity. */
    private boolean reserveSlot() {
        while (true) {
            int current = refCount.get();
            if (current >= capacity) {
                return false;
            }
            if (refCount.compareAndSet(current, current + 1)) {
                return true;
            }
        }
    }

    /** Starts a new PhantomJS driver and applies the pool's page-load timeout. */
    private WebDriver createDriver() {
        WebDriver driver = new PhantomJSDriver(caps);
        try {
            // Tentative workaround for https://github.com/ariya/phantomjs/issues/11526
            driver.manage().timeouts().pageLoadTimeout(60, TimeUnit.SECONDS);
        } catch (RuntimeException e) {
            // Do not leak the freshly started browser process if configuration fails.
            try {
                driver.quit();
            } catch (Exception quitFailure) {
                logger.warn("webdriverpool关闭失败", quitFailure);
            }
            throw e;
        }
        return driver;
    }
}
之后便是SeleniumDownloader:
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.UrlUtils;
import java.util.Map;
/**
 * A WebMagic {@link Downloader} that renders pages with a Selenium
 * {@link WebDriver} borrowed from a {@link WebDriverPool}.
 *
 * <p>After a page is loaded, an optional downloader-wide {@link SeleniumAction}
 * and an optional per-request action (stored in the request extras under the key
 * {@code "action"}) are run against the driver before the HTML is captured.
 *
 * @author taojw
 */
public class SeleniumDownloader implements Downloader {
    private static final Logger log = LoggerFactory.getLogger(SeleniumDownloader.class);

    /** Milliseconds to wait after navigation before reading the page; 3 seconds by default. */
    private int sleepTime = 3000;

    /** Action applied to every downloaded page; may be {@code null}. */
    private SeleniumAction action = null;

    private WebDriverPool webDriverPool = new WebDriverPool();

    /** Creates a downloader with a default pool, a 3 second wait and no global action. */
    public SeleniumDownloader() {
    }

    /**
     * @param sleepTime milliseconds to wait after navigation
     * @param pool pool to borrow drivers from; a default pool is used when {@code null}
     */
    public SeleniumDownloader(int sleepTime, WebDriverPool pool) {
        this(sleepTime, pool, null);
    }

    /**
     * @param sleepTime milliseconds to wait after navigation
     * @param pool pool to borrow drivers from; a default pool is used when {@code null}
     * @param action action run on every page after it has loaded; may be {@code null}
     */
    public SeleniumDownloader(int sleepTime, WebDriverPool pool, SeleniumAction action) {
        this.sleepTime = sleepTime;
        this.action = action;
        if (pool != null) {
            webDriverPool = pool;
        }
    }

    /**
     * Sets the wait applied after navigation.
     *
     * @param sleepTime milliseconds to wait
     * @return this downloader, for chaining
     */
    public SeleniumDownloader setSleepTime(int sleepTime) {
        this.sleepTime = sleepTime;
        return this;
    }

    /**
     * Sets the action that is run on every downloaded page.
     *
     * @param action the global action; {@code null} disables it
     */
    public void setOperator(SeleniumAction action) {
        this.action = action;
    }

    /**
     * Loads the request URL in a pooled driver, runs the configured actions and
     * returns the resulting HTML as a {@link Page}.
     *
     * <p>The borrowed driver is always released: it goes back to the pool after a
     * successful download and is closed when anything fails, so a failing page
     * cannot drain the pool.
     *
     * @param request the request to download
     * @param task the task supplying the {@link Site} configuration
     * @return the downloaded page; a page marked as skipped when the URL could not
     *         be loaded; or {@code null} when the thread was interrupted while
     *         waiting for a driver
     */
    @Override
    public Page download(Request request, Task task) {
        WebDriver webDriver;
        try {
            webDriver = webDriverPool.get();
        } catch (InterruptedException e) {
            // Restore the flag so the owning thread pool can see the interruption.
            Thread.currentThread().interrupt();
            log.warn("interrupted", e);
            return null;
        }
        log.info("downloading page {}", request.getUrl());
        Page page = new Page();
        boolean reusable = false;
        try {
            try {
                webDriver.get(request.getUrl());
                Thread.sleep(sleepTime);
            } catch (InterruptedException e) {
                // Keep going with whatever has rendered so far, but preserve the flag.
                Thread.currentThread().interrupt();
                log.warn("interrupted while waiting for page " + request.getUrl(), e);
            } catch (Exception e) {
                log.warn("failed to load page " + request.getUrl(), e);
                page.setSkip(true);
                return page; // the finally block closes the driver
            }
            // WindowUtil.changeWindow(webDriver);
            WebDriver.Options manage = webDriver.manage();
            Site site = task.getSite();
            // NOTE(review): cookies are added after navigation, so they presumably only
            // affect later requests made through this driver - confirm this is intended.
            if (site.getCookies() != null) {
                for (Map.Entry<String, String> cookieEntry : site.getCookies()
                        .entrySet()) {
                    Cookie cookie = new Cookie(cookieEntry.getKey(),
                            cookieEntry.getValue());
                    manage.addCookie(cookie);
                }
            }
            manage.window().maximize();
            if (action != null) {
                action.execute(webDriver);
            }
            SeleniumAction reqAction = (SeleniumAction) request.getExtra("action");
            if (reqAction != null) {
                reqAction.execute(webDriver);
            }
            WebElement webElement = webDriver.findElement(By.xpath("/html"));
            String content = webElement.getAttribute("outerHTML");
            page.setRawText(content);
            page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content,
                    webDriver.getCurrentUrl())));
            page.setUrl(new PlainText(webDriver.getCurrentUrl()));
            page.setRequest(request);
            reusable = true;
            return page;
        } finally {
            if (reusable) {
                webDriverPool.returnToPool(webDriver);
            } else {
                discard(webDriver);
            }
        }
    }

    /** No-op: this downloader does not use the thread count. */
    @Override
    public void setThread(int thread) {
    }

    /** Closes a driver that must not be reused, logging instead of masking the original failure. */
    private void discard(WebDriver webDriver) {
        try {
            webDriverPool.close(webDriver);
        } catch (Exception e) {
            log.warn("failed to close web driver", e);
        }
    }
}
这里的扩展性主要体现在,我加入了SeleniumAction接口,可以在SeleniumDownloader初始化的时候配置一个全局的SeleniumAction,以及为每个Request配置对应的SeleniumAction。 SeleniumAction接口如下:
/**
 * A callback that performs Selenium operations on a driver.
 *
 * <p>{@code SeleniumDownloader} invokes it after a page has been loaded and before
 * the page HTML is captured, either as the downloader-wide action or as the
 * per-request action stored in the request extras under the key {@code "action"}.
 */
public interface SeleniumAction {
/**
 * Runs this action against the given driver.
 *
 * @param driver the driver on which the page has been loaded
 */
void execute(WebDriver driver);
}
它会获得一个WebDriver实例,你可以在里面进行任意的Selenium操作。
本部分到此结束。
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。