Java 爬虫代码优化

package com.company;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.http.HttpEntity;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
/**
 * Small crawler that downloads a listing page, extracts the link in every
 * company-name table cell, and downloads each linked page in turn.
 *
 * <p>All requests go through a single HTTP client that is configured once
 * with the User-Agent header, so individual requests do not need to set it.
 */
public class MingLuSpider {
    /** Prefix prepended to the {@code href} value found in each company cell. */
    private static final String BASE_URL = "https://gongshang.mingluji.com";

    /** User-Agent value sent with every request made by this crawler. */
    private static final String USER_AGENT =
            "Mozilla/5.0(Windows NT 6.1;Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0";

    /** Charset used to decode response bodies. */
    private static final String CHARSET = "utf-8";

    /** CSS selector matching the table cells that hold the company links. */
    private static final String COMPANY_CELL_SELECTOR = "td[class='views-field views-field-name']";

    /**
     * No-op kept only for source compatibility.
     *
     * <p>NOTE(review): because of the {@code void} return type this is an ordinary
     * method that happens to share the class name, not a constructor. Instances
     * are created through the implicit default constructor.
     */
    public void MingLuSpider() {
        // Intentionally empty: the original body only allocated an unused instance.
    }

    /**
     * Fetches the listing page at {@code url}, prints the absolute link of every
     * company found on it, and prints the body of each linked page.
     *
     * <p>Any failure (network error, malformed URL, missing response entity) stops
     * the crawl and is reported via {@code printStackTrace()}, as before; nothing
     * is rethrown from the body. The client and every response are closed on all
     * paths, including failures.
     *
     * @param url address of the listing page to crawl
     * @throws IOException declared for backward compatibility with existing callers
     */
    public void GetRequestData(String url) throws IOException {
        // The User-Agent is configured once on the client; it is applied to every
        // request that does not carry its own User-Agent header.
        try (CloseableHttpClient httpClient = HttpClients.custom().setUserAgent(USER_AGENT).build()) {
            Document listing = Jsoup.parse(fetchBody(httpClient, url));
            Elements companyCells = listing.select(COMPANY_CELL_SELECTOR);
            for (Element cell : companyCells) {
                String link = BASE_URL + cell.select("a").attr("href");
                System.out.println("每个公司链接为:" + link);
                System.out.println(fetchBody(httpClient, link));
                System.out.println("这个链接为");
                System.out.println(link);
            }
        } catch (Exception e) {
            // Kept from the original: report the failure and return normally.
            e.printStackTrace();
        }
    }

    /**
     * Executes a GET request and returns the decoded response body.
     *
     * <p>The response is closed before returning so its connection is released
     * even when reading the body fails.
     *
     * @param httpClient client used to execute the request
     * @param url        address to fetch
     * @return the response body decoded with {@link #CHARSET}
     * @throws IOException if executing the request or reading the body fails
     */
    private static String fetchBody(CloseableHttpClient httpClient, String url) throws IOException {
        try (CloseableHttpResponse response = httpClient.execute(new HttpGet(url))) {
            HttpEntity entity = response.getEntity();
            return EntityUtils.toString(entity, CHARSET);
        }
    }
}

这里每次请求新的 URL 都需要 new HttpGet(link),而且每次还要设置同样的 header。有没有办法不必每次都 new,并且 header 只需设置一次就可以?

阅读 1.5k
1 个回答

试试 fluent-hc 吧,它是 HttpClient 官方提供的封装,使用起来比直接使用 HttpClient 方便得多。

撰写回答
你尚未登录,登录后可以
  • 和开发者交流问题的细节
  • 关注并接收问题和回答的更新提醒
  • 参与内容的编辑和改进,让解决方法与时俱进
推荐问题