springboot+webmagic实现java爬虫jdbc及mysql的方法

网友投稿 295 2023-01-23


springboot+webmagic实现java爬虫jdbc及mysql的方法

前段时间需要爬取网页上的信息,自己对于爬虫没有任何了解,就了解了一下webmagic,写了个简单的爬虫。

一、首先介绍一下webmagic:

webmagic采用完全模块化的设计,功能覆盖整个爬虫的生命周期(链接提取、页面下载、内容抽取、持久化),支持多线程抓取,分布式抓取,并支持自动重试、自定义UA/cookie等功能。

实现理念:

Maven依赖:

us.codecraft

webmagic-core

0.7.3

us.codecraft

webmagic-extension

0.7.3

us.codecraft

webmagic-extension

0.7.3

org.slf4j

slf4j-log4j12

jdbc模式:

ublic class CsdnBlogDao {

private Connection conn = null;

private Statement stmt = null;

public CsdnBlogDao() {

try {

Class.forName("com.mysql.jdbc.Driver");

String url = "jdbc:mysql://localhost:3306/test?"

+ "user=***&password=***3&useUnicode=true&characterEncoding=UTF8";

conn = DriverManager.getConnection(url);

stmt = conn.createStatement();

} catch (ClassNotFoundException e) {

e.printStackTrace();

} catch (SQLException e) {

e.printStackTrace();

}

}

public int add(CsdnBlog csdnBlog) {

try {

String sql = "INSERT INTO `test`.`csdnblog` (`keyes`, `titles`, `content` , `dates`, `tags`, `category`, `views`, `comments`, `copyright`) VALUES (?, ?, ?, ?, ?, ?, ?, ?,?);";

PreparedStatement ps = conn.prepareStatement(sql);

ps.setInt(1, csdnBlog.getKey());

ps.setString(2, csdnBlog.getTitle());

ps.setString(3,csdnBlog.getContent());

ps.setString(4, csdnBlog.getDates());

ps.setString(5, csdnBlog.getTags());

ps.setString(6, csdnBlog.getCategory());

http:// ps.setInt(7, csdnBlog.getView());

ps.setInt(8, csdnBlog.getComments());

ps.setInt(9, csdnBlog.getCopyright());

return ps.executeUpdate();

} catch (SQLException e) {

e.printStackTrace();

}

return -1;

}

}

实体类:

public class CsdnBlog {

private int key;// 编号

private String title;// 标题

private String dates;// 日期

private String tags;// 标签

private String category;// 分类

private int view;// 阅读人数

private int comments;// 评论人数

private int copyright;// 是否原创

http://private String content; //文字内容

public String getContent() {

return content;

}

public void setContent(String content) {

this.content = content;

}

public int getKey() {

return key;

}

public void setKey(int key) {

this.key = key;

}

public String getTitle() {

return title;

}

public void setTitle(String title) {

this.title = title;

}

public String getDates() {

return dates;

}

public void setDates(String dates) {

this.dates = dates;

}

public String getTags() {

return tags;

}

public void setTags(String tags) {

this.tags = tags;

}

public String getCategory() {

return category;

}

public void setCategory(String category) {

this.category = category;

}

public int getView() {

return view;

}

public void setView(int view) {

this.view = view;

}

public int getComments() {

return comments;

}

public void setComments(int comments) {

this.comments = comments;

}

public int getCopyright() {

return copyright;

}

public void setCopyright(int copyright) {

this.copyright = copyright;

}

@Override

public String toString() {

return "CsdnBlog [key=" + key + ", title=" + title + ", content=" + content + ",dates=" + dates + ", tags=" + tags + ", category="

+ category + ", view=" + view + ", comments=" + comments + ", copyright=" + copyright + "]";

}

}

启动类:

public class CsdnBlogPageProcessor implements PageProcessor {

private static String username="CHENYUFENG1991"; // 设置csdn用户名

private static int size = 0;// 共抓取到的文章数量

// 抓取网站的相关配置,包括:编码、抓取间隔、重试次数等

private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

public Site getSite() {

return site;

}

// process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑

public void process(Page page) {

// 列表页

if (!page.getUrl().regex("http://blog\\.csdn\\.net/" + username + "/article/details/\\d+").match()) {

// 添加所有文章页

page.addTargetRequests(page.getHtml().xpath("//div[@id='article_list']").links()// 限定文章列表获取区域

.regex("/" + username + "/article/details/\\d+")

.replace("/" + username + "/", "http://blog.csdn.net/" + username + "/")// 巧用替换给把相对url转换成绝对url

.all());

// 添加其他列表页

page.addTargetRequests(page.getHtml().xpath("//div[@id='papelist']").links()// 限定其他列表页获取区域

.regex("/" + username + "/article/list/\\d+")

.replace("/" + username + "/", "http://blog.csdn.net/" + username + "/")// 巧用替换给把相对url转换成绝对url

.all());

// 文章页

} else {

size++;// 文章数量加1

// 用CsdnBlog类来存抓取到的数据,方便存入数据库

CsdnBlog csdnBlog = new CsdnBlog();

// 设置编号

csdnBlog.setKey(Integer.parseInt(

page.getUrl().regex("http://blog\\.csdn\\.net/" + username + "/article/details/(\\d+)").get()));

// 设置标题

csdnBlog.setTitle(

page.getHtml().xpath("//div[@class='article_title']//span[@class='link_title']/a/text()").get());

//设置内容

csdnBlog.setContent(

page.getHtml().xpath("//div[@class='article_content']/allText()").get());

// 设置日期

csdnBlog.setDates(

page.getHtml().xpath("//div[@class='article_r']/span[@class='link_postdate']/text()").get());

// 设置标签(可以有多个,用,来分割)

csdnBlog.setTags(listToString(page.getHtml().xpath("//div[@class='article_l']/span[@class='link_categories']/a/allText()").all()));

// 设置类别(可以有多个,用,来分割)

csdnBlog.setCategory(listToString(page.getHtml().xpath("//div[@class='category_r']/label/span/text()").all()));

// 设置阅读人数

csdnBlog.setView(Integer.parseInt(page.getHtml().xpath("//div[@class='article_r']/span[@class='link_view']")

.regex("(\\d+)人阅读").get()));

// 设置评论人数

csdnBlog.setComments(Integer.parseInt(page.getHtml()

.xpath("//div[@class='article_r']/span[@class='link_comments']").regex("\\((\\d+)\\)").get()));

// 设置是否原创

csdnBlog.setCopyright(page.getHtml().regex("bog_copyright").match() ? 1 : 0);

// 把对象存入数据库

new CsdnBlogDao().add(csdnBlog);

// 把对象输出控制台

System.out.println(csdnBlog);

}

}

// 把list转换为string,用,分割

public static String listToString(List stringList) {

if (stringList == null) {

return null;

}

StringBuilder result = new StringBuilder();

boolean flag = false;

for (String string : stringList) {

if (flag) {

result.append(",");

} else {

flag = true;

}

result.append(string);

}

return result.toString();

}

public static void main(String[] args) {

long startTime, endTime;

System.out.println("【爬虫开始】...");

startTime = System.currentTimeMillis();

// 从用户博客首页开始抓,开启5个线程,启动爬虫

Spider.create(new CsdnBlogPageProcessor()).addUrl("http://blog.csdn.net/" + username).thread(5).run();

endTime = System.currentTimeMillis();

System.out.println("【爬虫结束】共抓取" + size + "篇文章,耗时约" + ((endTime - startTime) / 1000) + "秒,已保存到数据库,请查收!");

}

}

使用mysql类型:

public class GamePageProcessor implements PageProcessor {

private static final Logger logger = LoggerFactory.getLogger(GamePageProcessor.class);

private static DianJingService d;

private static BannerService bs;

private static SportService ss;

private static YuLeNewsService ys;

private static UpdateService ud ;

// 抓取网站的相关配置,包括:编码、抓取间隔、重试次数等

private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

public Site getSite() {

return site;

}

// process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑

public static void main(String[] args) {

ConfigurableApplicationContext context= SpringApplication.run(GamePageProcessor.clahttp://ss, args);

d = context.getBean(DianJingService.class);

//Spider.create(new GamePageProcessor()).addUrl("网址").thread(5).run();

}

public void process(Page page) {

Selectable url = page.getUrl();

if (url.toString().equals("网址")) {

DianJingVideo dv = new DianJingVideo();

List ls = page.getHtml().xpath("//div[@class='v']/div[@class='v-meta va']/div[@class='v-meta-title']/a/text()").all();

//hrefs

List ls1 = page.getHtml().xpath("//div[@class='v']/div[@class='v-link']/a/@href").all();//获取a标签的href

List ls2 = page.getHtml().xpath("//div[@class='v']/div[@class='v-meta va']/div[@class='v-meta-entry']/div[@class='v-meta-data']/span[@class='r']/text()").all();

//photo

List ls3 = page.getHtml().xpath("//div[@class='v']/div[@class='v-thumb']/img/@src").all();

for (int i = 0; i < 5; i++) {

dv.setTitles(ls.get(i));

dv.setCategory("");

dv.setDates(ls2.get(i));

dv.setHrefs(ls1.get(i));

dv.setPhoto(ls3.get(i));

dv.setSources("");

d.addVideo(dv);

}

}

}

Controller:

@Controller

@RequestMapping(value = "/dianjing")

public class DianJingController {

@Autowired

private DianJingService s;

/*

手游

*/

@RequestMapping("/dianjing")

@ResponseBody

public Object dianjing(){

List list = s.find2();

jsONObject jo = new JSONObject();

if(list!=null){

jo.put("code",0);

jo.put("success",true);

jo.put("count",list.size());

jo.put("list",list);

}

return jo;

}

}

实体类就不展示了

dao层

@Insert("insert into dianjing (titles,dates,category,hrefs,photo,sources) values(#{titles},#{dates},#{category},#{hrefs},#{photo},#{sources})")

int adddj(DianJing dj);


版权声明:本文内容由网络用户投稿,版权归原作者所有,本站不拥有其著作权,亦不承担相应法律责任。如果您发现本站中有涉嫌抄袭或描述失实的内容,请联系我们jiasou666@gmail.com 处理,核实后本网站将在24小时内删除侵权内容。

上一篇:主机共享文件系统(主机共享文件系统设置)
下一篇:Java实现JDBC连接数据库简单案例
相关文章

 发表评论

暂时没有评论,来抢沙发吧~