看了標哥(代码忘烦恼)的爬虫博客,两篇博客写得简单易懂,自己也跟着学了一下,本文的代码几乎就是在其基础上改动而来。
Java爬虫、java爬取前程无忧(51job)。这篇文章在博客里贴的代码不完整,建议看 github 源码。
准备工作:
创建一个普通maven工程 pom依赖
<!-- Third-party libraries used by the crawler: HTTP access, MySQL/MyBatis persistence, HTML/JSON parsing. -->
<dependencies>
    <!-- Apache HttpClient: issues the HTTP GET requests -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.9</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpcore</artifactId>
        <version>4.4.11</version>
    </dependency>
    <!-- MySQL JDBC driver -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>8.0.15</version>
    </dependency>
    <!-- MyBatis: maps the BiliBili entity to the bilibili table -->
    <dependency>
        <groupId>org.mybatis</groupId>
        <artifactId>mybatis</artifactId>
        <version>3.5.2</version>
    </dependency>
    <!-- jsoup was declared twice with identical coordinates; one declaration is enough -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.12.1</version>
    </dependency>
    <!-- fastjson: parses the JSON returned by the API.
         NOTE(review): check the security advisories for this fastjson version before using it outside a demo. -->
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.47</version>
    </dependency>
</dependencies>
项目结构
分析页面
https://www.bilibili.com/anime/index/#season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page=1
我们从这找到了相关的json数据
访问获取到的url:(改变page参数,json数据随之发生改变,所以确定该url为爬取入口)
https://api.bilibili.com/pgc/season/index/result?season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page=1&season_type=1&pagesize=20&type=1
开始爬取
创建数据库
-- Target table for the crawler: one row per inserted list item.
-- `orders` holds the entity's `order` property and `index_show` holds `indexShow`
-- (see the result map in BiliBiliMapper.xml).
CREATE TABLE `bilibili` (
  `id`         int(11)       NOT NULL AUTO_INCREMENT,
  `title`      varchar(1024) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
  `cover`      varchar(1024) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
  `orders`     varchar(1024) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
  `index_show` varchar(1024) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
根据json字段创建BiliBili实体类
package com
.scitc
.model
;
/**
 * Mutable bean describing one crawled list item.
 *
 * <p>Instances are created from the API's JSON and persisted through
 * {@code BiliBiliMapper}; the mapper XML maps {@code order} to the
 * {@code orders} column and {@code indexShow} to {@code index_show}.
 */
public class BiliBili {

    /** Database primary key; filled in by the insert statement's generated key. */
    private Integer id;
    private String title;
    private String cover;
    private String order;
    private String indexShow;

    /** No-argument constructor, kept explicit for bean-style instantiation. */
    public BiliBili() {
    }

    public Integer getId() {
        return this.id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getCover() {
        return this.cover;
    }

    public void setCover(String cover) {
        this.cover = cover;
    }

    public String getOrder() {
        return this.order;
    }

    public void setOrder(String order) {
        this.order = order;
    }

    public String getIndexShow() {
        return this.indexShow;
    }

    public void setIndexShow(String indexShow) {
        this.indexShow = indexShow;
    }

    /**
     * Renders every field as {@code BiliBili{id=..., title='...', ...}};
     * unset fields appear as the text {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("BiliBili{");
        text.append("id=").append(id);
        text.append(", title='").append(title).append('\'');
        text.append(", cover='").append(cover).append('\'');
        text.append(", order='").append(order).append('\'');
        text.append(", indexShow='").append(indexShow).append('\'');
        text.append('}');
        return text.toString();
    }
}
BiliBiliMapper
/**
 * MyBatis mapper for the {@code bilibili} table.
 *
 * <p>The SQL behind each method lives in {@code BiliBiliMapper.xml} under the
 * statement id equal to the method name.
 */
public interface BiliBiliMapper {

    /**
     * Selects every row of the {@code bilibili} table.
     *
     * @return all stored entries
     */
    List<BiliBili> biliBiliList();

    /**
     * Inserts one entry; the mapped statement writes the generated key back
     * into {@code biliBili.id}.
     *
     * @param biliBili entry to store
     * @return the value MyBatis reports for the insert (presumably the affected row count)
     */
    int insert(BiliBili biliBili);
}
BiliBiliMapper.xml
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper
        PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<!-- SQL statements backing com.scitc.mapper.BiliBiliMapper. -->
<mapper namespace="com.scitc.mapper.BiliBiliMapper">

    <!-- Column-to-property mapping; note orders -> order and index_show -> indexShow. -->
    <resultMap id="biliResultMapper" type="com.scitc.model.BiliBili">
        <id     column="id"         property="id"        jdbcType="INTEGER"/>
        <result column="title"      property="title"     jdbcType="VARCHAR"/>
        <result column="cover"      property="cover"     jdbcType="VARCHAR"/>
        <result column="orders"     property="order"     jdbcType="VARCHAR"/>
        <result column="index_show" property="indexShow" jdbcType="VARCHAR"/>
    </resultMap>

    <!-- Inserts one row and writes the auto-increment key back into the entity's id. -->
    <insert id="insert"
            parameterType="com.scitc.model.BiliBili"
            useGeneratedKeys="true"
            keyProperty="id"
            keyColumn="id">
        INSERT INTO bilibili(title, cover, orders, index_show)
        VALUES (#{title}, #{cover}, #{order}, #{indexShow})
    </insert>

    <!-- Returns every stored row. -->
    <select id="biliBiliList" resultMap="biliResultMapper">
        SELECT * FROM bilibili
    </select>

</mapper>
jdbc.properties
# JDBC connection settings; mybatis-config.xml reads them through ${jdbc.*} placeholders.
jdbc.driver=com.mysql.cj.jdbc.Driver
jdbc.url=jdbc:mysql://localhost:3306/user?useUnicode=true&characterEncoding=utf-8&serverTimezone=GMT%2B8
jdbc.user=root
jdbc.password=123456
mybatis-config.xml
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE configuration
        PUBLIC "-//mybatis.org//DTD Config 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-config.dtd">
<!-- MyBatis bootstrap configuration: one JDBC-managed, pooled environment plus the mapper list. -->
<configuration>
    <!-- Supplies the ${jdbc.*} values used below. -->
    <properties resource="jdbc.properties"/>

    <environments default="development">
        <environment id="development">
            <transactionManager type="JDBC"/>
            <dataSource type="POOLED">
                <property name="driver"   value="${jdbc.driver}"/>
                <property name="url"      value="${jdbc.url}"/>
                <property name="username" value="${jdbc.user}"/>
                <property name="password" value="${jdbc.password}"/>
            </dataSource>
        </environment>
    </environments>

    <mappers>
        <mapper resource="BiliBiliMapper.xml"/>
    </mappers>
</configuration>
HTTPUtils
/**
 * Helper for issuing HTTP GET requests with Apache HttpClient.
 */
public class HTTPUtils {

    /**
     * Executes a GET request against {@code url}.
     *
     * <p>The method never returns {@code null}. If the request fails with an
     * {@link IOException}, the error is reported on stderr and a placeholder
     * response is returned. The placeholder carries status 503 rather than 200,
     * so callers that check the status code do not mistake a failed request
     * for a successful one with a missing body.
     *
     * @param client client used to execute the request
     * @param url    absolute URL to fetch
     * @return the server's response, or a synthetic 503 response without an entity on I/O failure
     */
    public static HttpResponse getHtml(HttpClient client, String url) {
        HttpGet getMethod = new HttpGet(url);
        try {
            return client.execute(getMethod);
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("建立客户端出现异常");
            // Placeholder so callers can still read a status line; deliberately not 200.
            return new BasicHttpResponse(
                    HttpVersion.HTTP_1_1,
                    HttpStatus.SC_SERVICE_UNAVAILABLE,
                    "Service Unavailable");
        }
    }
}
BiliBiliParse
/**
 * Parses the API's JSON payload and stores the contained entries through MyBatis.
 */
public class BiliBiliParse {

    /** Classpath location of the MyBatis configuration. */
    private static final String MYBATIS_CONFIG = "mybatis-config.xml";

    /** Built on first use and then reused, so only one connection pool exists. */
    private static SqlSessionFactory sqlSessionFactory;

    /**
     * Reads {@code data.list} from the JSON text and inserts every element
     * into the database.
     *
     * <p>All rows of one call are committed together; if an insert throws, the
     * session is closed without a commit. When the payload has no
     * {@code data.list}, a message is printed and nothing is written.
     *
     * @param entity JSON response body
     * @throws IllegalStateException if the MyBatis configuration cannot be loaded
     */
    public static void creteDate(String entity) {
        List<BiliBili> biliBilis = parseList(entity);
        if (biliBilis == null) {
            System.err.println("响应中没有可解析的data.list");
            return;
        }
        if (biliBilis.isEmpty()) {
            return;
        }
        // try-with-resources returns the connection to the pool even when an insert fails.
        try (SqlSession sqlSession = sessionFactory().openSession()) {
            BiliBiliMapper biliMapper = sqlSession.getMapper(BiliBiliMapper.class);
            for (BiliBili biliBili : biliBilis) {
                biliMapper.insert(biliBili);
            }
            sqlSession.commit();
        }
    }

    /** Extracts {@code data.list} as entities; returns {@code null} when the field is absent. */
    private static List<BiliBili> parseList(String entity) {
        JSONObject root = JSONObject.parseObject(entity);
        JSONObject data = root == null ? null : root.getJSONObject("data");
        String listJson = data == null ? null : data.getString("list");
        return listJson == null ? null : JSON.parseArray(listJson, BiliBili.class);
    }

    /** Lazily builds the shared {@link SqlSessionFactory} from the classpath configuration. */
    private static synchronized SqlSessionFactory sessionFactory() {
        if (sqlSessionFactory == null) {
            try (InputStream config = Resources.getResourceAsStream(MYBATIS_CONFIG)) {
                sqlSessionFactory = new SqlSessionFactoryBuilder().build(config);
            } catch (IOException e) {
                throw new IllegalStateException("Cannot load " + MYBATIS_CONFIG, e);
            }
        }
        return sqlSessionFactory;
    }
}
URLHandle
/**
 * Fetches one API page and hands the response body to the parser.
 */
public class URLHandle {

    /**
     * Requests {@code url}, prints the status code and, on a 200 response with
     * a body, passes the body text to {@code BiliBiliParse.creteDate}.
     *
     * <p>Responses that are not processed are still consumed so that the
     * underlying connection is released.
     *
     * @param client client used for the request
     * @param url    page URL to fetch
     */
    public static void urlParser(HttpClient client, String url) {
        HttpResponse response = HTTPUtils.getHtml(client, url);
        int statusCode = response.getStatusLine().getStatusCode();
        System.out.println("响应状态码" + statusCode);

        if (statusCode != 200) {
            // Discard the body (a null entity is tolerated) to free the connection.
            EntityUtils.consumeQuietly(response.getEntity());
            return;
        }
        if (response.getEntity() == null) {
            // EntityUtils.toString rejects a null entity with an unchecked exception.
            System.err.println("解析entity失败");
            return;
        }
        try {
            String entity = EntityUtils.toString(response.getEntity(), "utf-8");
            System.out.println("开始解析...");
            BiliBiliParse.creteDate(entity);
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("解析entity失败");
        }
    }
}
App
public class App
{
public static void main( String
[] args
)
{
System
.out
.println("正在生成客户端...");
HttpClient client
= null
;
System
.out
.println("客户端生成完毕.");
int pageSize
= 149;
for(int page
=1;page
<=pageSize
;page
++){
String url
= "https://api.bilibili.com/pgc/season/index/result?season_version=-1&area=-1&is_finish=-1©right=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page="
+ page
+ "&season_type=1&pagesize=20&type=1";
System
.err
.println("开始爬取第:" + page
+ " 页的数据");
System
.out
.println("正在生成客户端...");
client
= HttpClientBuilder
.create().build();
System
.out
.println("客户端生成完毕.");
System
.out
.println("开始响应客户端...");
URLHandle
.urlParser(client
, url
);
}
System
.out
.println("全部爬取完成");
}
}
运行效果
查看数据库
虽然我这个没有什么用,不过还是挺好玩的。
项目地址
github