springBoot+webMagic实现网站爬虫的实例代码

网友投稿 359 2022-12-07


springBoot+webMagic实现网站爬虫的实例代码

前段时间公司项目需要抓取各类数据,py玩得不6,只好研究java爬虫方案,在此做一个总结。

开发环境:

springBoot 2.2.6、jdk1.8。

1、导入依赖

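<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>

<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>

<dependency>
    <groupId>com.google.guava</groupId>
    <artifactId>guava</artifactId>
    <version>16.0</version>
</dependency>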

话不多说,直接上代码。

基础案例

下面代码说明以一个类似列表的页面为例

package com.crawler.project.proTask;

import com.alibaba.fastjson.JSONObject;

import org.springframework.scheduling.annotation.Scheduled;

import us.codecraft.webmagic.Page;

import us.codecraft.webmagic.Site;

import us.codecraft.webmagic.Spider;

import us.codecraft.webmagic.processor.PageProcessor;

import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;

import us.codecraft.webmagic.scheduler.QueueScheduler;

import us.codecraft.webmagic.selector.Selectable;

import java.util.List;

/**
 * Example WebMagic {@link PageProcessor} for a site made of list pages and detail pages.
 *
 * <p>A page on which the list selector matches is treated as a list page: the first link of
 * every matched node and the next-page URL are queued. Any other page is treated as a detail
 * page and handed to {@link #handle(Page)}.
 *
 * <p>The string literals {@code "css selector"}, {@code "下一页的url"} and {@code "起始页url"}
 * are placeholders that must be replaced with real values for the target site.
 */
public class TaskProcessor implements PageProcessor {

    /** Crawl settings returned by {@link #getSite()}: charset, timeout, retry sleep time and cycle-retry count. */
    private Site site = Site.me()
            .setCharset("UTF-8")
            .setTimeOut(60 * 1000)
            .setRetrySleepTime(60 * 1000)
            .setCycleRetryTimes(5);

    /**
     * Crawl logic for one downloaded page.
     *
     * @param page the page handed over by the spider
     */
    @Override
    public void process(Page page) {
        // 1. Try to parse the list on the page.
        List<Selectable> list = page.getHtml().css("css selector").nodes();

        if (!list.isEmpty()) {
            // List page: queue the link of every list element.
            for (Selectable selectable : list) {
                String link = selectable.links().get();
                // A node without any link yields null; skip it instead of queueing nothing useful.
                if (link != null) {
                    page.addTargetRequest(link);
                }
            }
            // Also queue the URL of the next list page.
            page.addTargetRequest("下一页的url");
        } else {
            // Detail page of a single list element: extract the data we need.
            handle(page);
        }
    }

    /**
     * Extracts the data of a detail page and publishes it under the result key {@code "obj"}.
     *
     * @param page the detail page
     */
    private void handle(Page page) {
        // Example: the processed data ends up in a JSONObject.
        JSONObject tmp = new JSONObject();
        // The value stored here is what TaskPipeline reads back under the same key.
        // Per the original author's note: without a custom pipeline on the Spider,
        // the framework prints the result to the console by default.
        page.putField("obj",tmp);
    }

    /**
     * Supplies the crawl settings to the framework.
     *
     * @return the {@link Site} configuration of this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Scheduled entry point that builds a spider and runs the crawl.
     *
     * <p>NOTE(review): no Spring stereotype annotation is visible on this class; confirm it is
     * registered as a bean and that scheduling is enabled, otherwise this method never fires.
     */
    @Scheduled(initialDelay = 1 * 1000,fixedDelay = 2 * 1000)
    public void process(){
        System.out.println("开始执行爬虫抓取任务");
        Spider.create(new TaskProcessor()) // the class instantiated here must be this processor class
                .addUrl("起始页url")
                .addPipeline(new TaskPipeline()) // custom result handler; receives what handle() stored
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
                .thread(3) // worker threads; the author advises keeping this small, about the number of items per list page
                .run();
    }
}

package com.crawler.project.proTask;

import com.alibaba.fastjson.JSON;

import com.alibaba.fastjson.JSONObject;

import us.codecraft.webmagic.ResultItems;

import us.codecraft.webmagic.Task;

import us.codecraft.webmagic.pipeline.Pipeline;

/**
 * Pipeline that picks up the value stored under the result key {@code "obj"} and exposes it
 * as a {@link JSONObject} for custom business handling.
 */
public class TaskPipeline implements Pipeline {

    /**
     * Handles the result of one processed page.
     *
     * @param resultItems the fields collected for the page
     * @param task        the running crawl task (unused here)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Object obj = resultItems.getAll().get("obj");
        if (obj == null) {
            // Nothing was stored under "obj" (for example a list page): nothing to do.
            return;
        }

        // Reuse the object when it already is a JSONObject; otherwise parse its string form.
        JSONObject jsonObject = obj instanceof JSONObject
                ? (JSONObject) obj
                : JSON.parseObject(obj.toString());

        // Custom business handling of jsonObject goes here.
    }
}

特殊情况一

需根据链接下载图片或文件

eg:在上面说到的详情页中含有iframe。

1、首先获取iframe的src

// Read the iframe's src attribute. Check whether the value is an absolute or a relative path; a relative path must be prefixed with the main site URL.

String src = html.css("css selector", "src").toString();

// Load and parse the iframe document with jsoup

Document document = Jsoup.parse(new URL(src),1000);

// Pick the element we need (the last match of the selector)

Element ele = document.select("css selector").last();

// Read the link of the file to download from the element's href attribute

String downUrl = ele.attr("href");

// Download the file behind the link; the call returns the name of the saved file

String fileName = downloadFile(downUrl);

/**
 * Downloads the file behind a URL into {@code STATIC_FILEPATH} under a random
 * {@code <uuid>.mp3} name.
 *
 * <p>This is a best-effort operation: every failure is reported on stderr and signalled by a
 * {@code null} return value instead of an exception.
 *
 * @param fileUrl absolute URL of the file to download
 * @return the generated file name, or {@code null} when the URL is blank or the download failed
 * @throws FileNotFoundException declared for compatibility with existing callers; failures are
 *                               caught inside this method
 */
public String downloadFile(String fileUrl) throws FileNotFoundException{
    if (fileUrl == null || fileUrl.trim().isEmpty()) {
        // No usable URL: same result as a failed download, without the noise of a stack trace.
        return null;
    }

    File file = null;
    try{
        URL httpUrl = new URL(fileUrl);
        String fileName = UUID.randomUUID().toString() + ".mp3";
        file = new File(this.STATIC_FILEPATH + fileName);
        System.out.println("============保存文件方法被调用===============");
        // Bounded connect/read timeouts (milliseconds) so a stalled server cannot block
        // a crawler thread forever.
        FileUtils.copyURLToFile(httpUrl, file, 10 * 1000, 60 * 1000);
        return fileName;
    }catch (Exception e){
        e.printStackTrace();
        // Do not leave a partially written file behind; the name is freshly generated,
        // so this can never remove a pre-existing file.
        if (file != null && file.exists() && !file.delete()) {
            System.err.println("could not remove partial download " + file);
        }
        return null;
    }
}

特殊情况二

有些https站点 无法直接使用WebMagic默认的下载器下载,此时我们可以根据站点ssl类型修改下载器。

在项目中创建一个包用于存放自定义(修改)的下载器类

(!!!摘自webMagic框架中HttpClientDownloader,基于此类修改!!!)

/*

此下载器类中需要使用一个自定义的生成器(HttpClientGenerator),而不是WebMagic自带的生成器

*/

package com.crawler.project.spider_download;

import org.apache.commons.io.IOUtils;

import org.apache.http.HttpResponse;

import org.apache.http.client.methods.CloseableHttpResponse;

import org.apache.http.impl.client.CloseableHttpClient;

import org.apache.http.util.EntityUtils;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import us.codecraft.webmagic.Page;

import us.codecraft.webmagic.Request;

import us.codecraft.webmagic.Site;

import us.codecraft.webmagic.Task;

import us.codecraft.webmagic.downloader.AbstractDownloader;

import us.codecraft.webmagic.downloader.HttpClientRequestContext;

import us.codecraft.webmagic.downloader.HttpUriRequestConverter;

import us.codecraft.webmagic.proxy.Proxy;

import us.codecraft.webmagic.proxy.ProxyProvider;

import us.codecraft.webmagic.selector.PlainText;

import us.codecraft.webmagic.utils.CharsetUtils;

import us.codecraft.webmagic.utils.HttpClientUtils;

import java.io.IOException;

import java.nio.charset.Charset;

import java.util.HashMap;

import java.util.Map;

/**

* The http downloader based on HttpClient.

*

* @author code4crafter@gmail.com

* @since 0.1.0

*/

/**
 * The http downloader based on HttpClient.
 *
 * <p>Copy of WebMagic's downloader that obtains its clients from the custom
 * {@code HttpClientGenerator} of this project, so SSL parameters can be adjusted there.
 * One client is cached per site domain.
 *
 * @author code4crafter@gmail.com
 * @since 0.1.0
 */
public class HttpClientDownloader extends AbstractDownloader {

    private Logger logger = LoggerFactory.getLogger(getClass());

    /** Clients already built, keyed by site domain. Guarded by {@code this}. */
    private final Map<String, CloseableHttpClient> httpClients = new HashMap<>();

    // Custom generator: this must resolve to the project's own HttpClientGenerator class,
    // not the HttpClientGenerator shipped inside the WebMagic dependency.
    private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();

    private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();

    private ProxyProvider proxyProvider;

    /** When true, response headers are copied onto the resulting page. */
    private boolean responseHeader = true;

    public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
        this.httpUriRequestConverter = httpUriRequestConverter;
    }

    public void setProxyProvider(ProxyProvider proxyProvider) {
        this.proxyProvider = proxyProvider;
    }

    /**
     * Returns the cached client for the site's domain, building it on first use.
     *
     * @param site the site being crawled; {@code null} is passed straight to the generator
     * @return the client to use for this site
     */
    private CloseableHttpClient getHttpClient(Site site) {
        if (site == null) {
            return httpClientGenerator.getClient(null);
        }
        String domain = site.getDomain();
        // The whole lookup runs under the lock: HashMap must not be read while another
        // thread may be inserting into it.
        synchronized (this) {
            CloseableHttpClient httpClient = httpClients.get(domain);
            if (httpClient == null) {
                httpClient = httpClientGenerator.getClient(site);
                httpClients.put(domain, httpClient);
            }
            return httpClient;
        }
    }

    /**
     * Downloads one request.
     *
     * @param request the request to execute
     * @param task    the running task; it and its site must not be null
     * @return the downloaded page, or the {@code Page.fail()} page when an I/O error occurred
     * @throws NullPointerException if {@code task} or its site is null
     */
    @Override
    public Page download(Request request, Task task) {
        if (task == null || task.getSite() == null) {
            throw new NullPointerException("task or site can not be null");
        }
        CloseableHttpResponse httpResponse = null;
        CloseableHttpClient httpClient = getHttpClient(task.getSite());
        Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null;
        HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
        Page page = Page.fail();
        try {
            httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
            // A charset set on the request wins over the site-wide charset.
            String charset = request.getCharset() != null ? request.getCharset() : task.getSite().getCharset();
            page = handleResponse(request, charset, httpResponse, task);
            onSuccess(request);
            logger.info("downloading page success {}", request.getUrl());
            return page;
        } catch (IOException e) {
            logger.warn("download page {} error", request.getUrl(), e);
            onError(request);
            return page;
        } finally {
            if (httpResponse != null) {
                //ensure the connection is released back to pool
                EntityUtils.consumeQuietly(httpResponse.getEntity());
            }
            if (proxyProvider != null && proxy != null) {
                proxyProvider.returnProxy(proxy, page, task);
            }
        }
    }

    @Override
    public void setThread(int thread) {
        httpClientGenerator.setPoolSize(thread);
    }

    /**
     * Converts an HTTP response into a {@link Page}.
     *
     * @param request      the request that produced the response
     * @param charset      charset to decode the body with; detected from the response when null
     * @param httpResponse the response to read
     * @param task         the running task
     * @return the populated page, marked as successfully downloaded
     * @throws IOException if the body cannot be read or decoded
     */
    protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
        byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
        String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
        Page page = new Page();
        page.setBytes(bytes);
        // Binary content keeps only the raw bytes; text content is additionally decoded.
        if (!request.isBinaryContent()){
            if (charset == null) {
                charset = getHtmlCharset(contentType, bytes);
            }
            page.setCharset(charset);
            page.setRawText(new String(bytes, charset));
        }
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
        page.setDownloadSuccess(true);
        if (responseHeader) {
            page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
        }
        return page;
    }

    /**
     * Detects the charset of a response body, falling back to the JVM default charset.
     *
     * @param contentType  value of the Content-Type header, or an empty string
     * @param contentBytes raw response body
     * @return the charset name to decode the body with
     * @throws IOException if detection fails while reading the content
     */
    private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
        String charset = CharsetUtils.detectCharset(contentType, contentBytes);
        if (charset == null) {
            charset = Charset.defaultCharset().name();
            logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
        }
        return charset;
    }
}

然后在自定义的HttpClientGenerator类中修改有关ssl的参数

(!!!摘自webMagic框架中HttpClientGenerator,基于此类修改!!!)

/*

自定义的HttpClientGenerator生成器

*/

package com.sealion_crawler.project.spider_download;

import org.apache.http.HttpException;

import org.apache.http.HttpRequest;

import org.apache.http.HttpRequestInterceptor;

import org.apache.http.client.CookieStore;

import org.apache.http.config.Registry;

import org.apache.http.config.RegistryBuilder;

import org.apache.http.config.SocketConfig;

import org.apache.http.conn.socket.ConnectionSocketFactory;

import org.apache.http.conn.socket.PlainConnectionSocketFactory;

import org.apache.http.conn.ssl.DefaultHostnameVerifier;

import org.apache.http.conn.ssl.SSLConnectionSocketFactory;

import org.apache.http.impl.client.*;

import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;

import org.apache.http.impl.cookie.BasicClientCookie;

import org.apache.http.protocol.HttpContext;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import us.codecraft.webmagic.Site;

import us.codecraft.webmagic.downloader.CustomRedirectStrategy;

import javax.net.ssl.SSLContext;

import javax.net.ssl.TrustManager;

import javax.net.ssl.X509TrustManager;

import java.io.IOException;

import java.security.KeyManagementException;

import java.security.NoSuchAlgorithmException;

import java.security.cert.CertificateException;

import java.security.cert.X509Certificate;

import java.util.Map;

/**

* @author code4crafter@gmail.com

* @since 0.4.0

*/

public class HttpClientGenerator {

private transient Logger logger = LoggerFactory.getLogger(getClass());

private PoolingHttpClientConnectionManager connectionManager;

public HttpClientGenerator() {

Registry reg = RegistryBuilder.create()

.register("http", PlainConnectionSocketFactory.INSTANCE)

.register("https", buildSSLConnectionSocketFactory())

.build();

connectionManager = new PoolingHttpClientConnectionManager(reg);

connectionManager.setDefaultMaxPerRoute(100);

}

/*

此方法中设置ssl有关参数。

*/

private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {

http:// try {

return new SSLConnectionSocketFactory(createIgnoreVerifySSL(), new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"},

null,

new DefaultHostnameVerifier()); // 优先绕过安全证书

} catch (KeyManagementException e) {

logger.error("ssl connection fail", e);

} catch (NoSuchAlgorithmException e) {

logger.error("ssl connection fail", e);

}

return SSLConnectionSocketFactory.getSocketFactory();

}

private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {

// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法

X509TrustManager trustManager = new X509TrustManager() {

@Override

public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {

}

@Override

public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {

}

@Override

public X509Certificate[] getAcceptedIssuers() {

return null;

}

};

/*

下面为当前框架默认参数

SSLContext sc = SSLContext.getInstance("SSLv3");

可修改为需要的ssl参数类型

*/

SSLContext sc = SSLContext.getInstance("TLS");

sc.init(null, new TrustManager[] { trustManager }, null);

return sc;

}

public HttpClientGenerator setPoolSize(int poolSize) {

connectionManager.setMaxTotal(poolSize);

return this;

}

public CloseableHttpClient getClient(Site site) {

return generateClient(site);

}

private CloseableHttpClient generateClient(Site site) {

HttpClientBuilder httpClientBuilder = HttpClients.custom();

httpClientBuilder.setConnectionManager(connectionManager);

if (site.getUserAgent() != null) {

httpClientBuilder.setUserAgent(site.getUserAgent());

} else {

httpClientBuilder.setUserAgent("");

}

if (site.isUseGzip()) {

httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {

public void process(

final HttpRequest request,

final HttpContext context) throws HttpException, IOException {

if (!request.containsHeader("Accept-Encoding")) {

request.addHeader("Accept-Encoding", "gzip");

}

}

});

}

//解决post/redirect/post 302跳转问题

httpClientBuilder.setRedirectStrategy(new CustomRedirectStrategy());

SocketConfig.Builder socketConfigBuilder = SocketConfig.custom();

socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true);

socketConfigBuilder.setSoTimeout(site.getTimeOut());

SocketConfig socketConfig = socketConfigBuilder.build();

httpClientBuilder.setDefaultSocketConfig(socketConfig);

connectionManager.setDefaultSocketConfig(socketConfig);

httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));

generateCookie(httpClientBuilder, site);

return httpClientBuilder.build();

}

private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) {

if (site.isDisableCookieManagement()) {

httpClientBuilder.disableCookieManagement();

return;

}

CookieStore cookieStore = new BasicCookieStore();

for (Map.Entry cookieEntry : site.getCookies().entrySet()) {

BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());

cookie.setDomain(site.getDomain());

cookieStore.addCookie(cookie);

}

for (Map.Entry> domainEntry : site.getAllCookies().entrySet()) {

for (Map.Entry cookieEntry : domainEntry.getValue().entrySet()) {

BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());

cookie.setDomain(domainEntry.getKey());

cookieStore.addCookie(cookie);

}

}

httpClientBuilder.setDefaultCookieStore(cookieStore);

}

}

好了,基于WebMagic框架实现爬虫(包括jsoup的使用)的总结就到这里了。


版权声明:本文内容由网络用户投稿,版权归原作者所有,本站不拥有其著作权,亦不承担相应法律责任。如果您发现本站中有涉嫌抄袭或描述失实的内容,请联系我们jiasou666@gmail.com 处理,核实后本网站将在24小时内删除侵权内容。

上一篇:Springboot actuator生产就绪功能实现解析
下一篇:详解Java中Math.round()的取整规则
相关文章

 发表评论

暂时没有评论,来抢沙发吧~