Java 爬取BiliBili追番排行榜

mac2022-06-30  34

看了標哥(代码忘烦恼)的爬虫博客,两篇博客写得简单易懂,自己也跟着学了一下,本文几乎就是在它们的基础上做的改动。

《Java爬虫》、《java爬取前程无忧(51job)》。后一篇文章的博客代码不完整,建议看 github 源码。

准备工作:

创建一个普通maven工程

pom依赖

<dependencies>
    <!-- HTTP client used to fetch the JSON API -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.9</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpcore</artifactId>
        <version>4.4.11</version>
    </dependency>
    <!-- MySQL JDBC driver (com.mysql.cj.jdbc.Driver) -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>8.0.15</version>
    </dependency>
    <!-- MyBatis maps the BiliBili entity to the bilibili table -->
    <dependency>
        <groupId>org.mybatis</groupId>
        <artifactId>mybatis</artifactId>
        <version>3.5.2</version>
    </dependency>
    <!-- Declared once; the original listed this dependency twice. -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.12.1</version>
    </dependency>
    <!-- JSON parsing of the API response.
         NOTE(review): fastjson 1.2.47 has publicly reported autoType
         deserialization issues; consider upgrading. -->
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.47</version>
    </dependency>
</dependencies>

项目结构

分析页面

https://www.bilibili.com/anime/index/#season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page=1

我们从这找到了相关的json数据

访问获取到的url(改变page参数,json数据也随之改变,所以确定该url为爬取入口):

https://api.bilibili.com/pgc/season/index/result?season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page=1&season_type=1&pagesize=20&type=1

开始爬取

创建数据库 CREATE TABLE `bilibili` ( `id` int(11) NOT NULL AUTO_INCREMENT, `title` varchar(1024) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL, `cover` varchar(1024) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL, `orders` varchar(1024) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL, `index_show` varchar(1024) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL, PRIMARY KEY (`id`) ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8; 根据json字段创建BiliBili实体类 package com.scitc.model; public class BiliBili{ private Integer id; private String title; private String cover;//封面图 private String order;//追番人数 private String indexShow;//总集数 public BiliBili() { } public Integer getId() { return id; } public void setId(Integer id) { this.id = id; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getCover() { return cover; } public void setCover(String cover) { this.cover = cover; } public String getOrder() { return order; } public void setOrder(String order) { this.order = order; } public String getIndexShow() { return indexShow; } public void setIndexShow(String indexShow) { this.indexShow = indexShow; } @Override public String toString() { return "BiliBili{" + "id=" + id + ", title='" + title + '\'' + ", cover='" + cover + '\'' + ", order='" + order + '\'' + ", indexShow='" + indexShow + '\'' + '}'; } } BiliBiliMapper public interface BiliBiliMapper { int insert(BiliBili biliBili); List<BiliBili> biliBiliList(); } BiliBiliMapper.xml <?xml version="1.0" encoding="UTF-8" ?> <!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd"> <mapper namespace="com.scitc.mapper.BiliBiliMapper"> <resultMap id="biliResultMapper" type="com.scitc.model.BiliBili"> <id column="id" property="id" jdbcType="INTEGER"/> <result column="title" property="title" jdbcType="VARCHAR"/> <result column="cover" property="cover" jdbcType="VARCHAR"/> <result 
column="orders" property="order" jdbcType="VARCHAR"/> <result column="index_show" property="indexShow" jdbcType="VARCHAR"/> </resultMap> <insert id="insert" parameterType="com.scitc.model.BiliBili" keyProperty="id" keyColumn="id" useGeneratedKeys="true"> INSERT INTO bilibili(title,cover,orders,index_show) VALUES (#{title},#{cover},#{order},#{indexShow}) </insert> <select id="biliBiliList" resultMap="biliResultMapper"> SELECT * FROM bilibili </select> </mapper> jdbc.properties jdbc.url=jdbc:mysql://localhost:3306/user?useUnicode=true&characterEncoding=utf-8&serverTimezone=GMT%2B8 jdbc.driver=com.mysql.cj.jdbc.Driver jdbc.user=root jdbc.password=123456 mybatis-config.xml <?xml version="1.0" encoding="UTF-8" ?> <!DOCTYPE configuration PUBLIC "-//mybatis.org//DTD Config 3.0//EN" "http://mybatis.org/dtd/mybatis-3-config.dtd"> <configuration> <properties resource="jdbc.properties"/> <environments default="development"> <environment id="development"> <transactionManager type="JDBC"></transactionManager> <dataSource type="POOLED"> <property name="driver" value="${jdbc.driver}"/> <property name="url" value="${jdbc.url}"/> <property name="username" value="${jdbc.user}"/> <property name="password" value="${jdbc.password}"/> </dataSource> </environment> </environments> <mappers> <mapper resource="BiliBiliMapper.xml"/> </mappers> </configuration> HTTPUtils public class HTTPUtils { public static HttpResponse getHtml(HttpClient client, String url){ //获取响应文件,及HTML,采取get方式获取响应数据 HttpGet getMethod = new HttpGet(url); HttpResponse response= new BasicHttpResponse(HttpVersion.HTTP_1_1, HttpStatus.SC_OK,"OK"); //通过client执行get方法 try { response = client.execute(getMethod); } catch (IOException e) { e.printStackTrace(); System.err.println("建立客户端出现异常"); } return response; } } BiliBiliParse public class BiliBiliParse { public static void creteDate(String entity){ //读取mybatis配置文件 String resources = "mybatis-config.xml"; InputStream resourceAsStream = null; try { resourceAsStream = 
Resources.getResourceAsStream(resources); } catch (IOException e) { e.printStackTrace(); } //得到连接对象注册sqlsession SqlSessionFactory sqlSessionFactory = new SqlSessionFactoryBuilder().build(resourceAsStream); SqlSession sqlSession = sqlSessionFactory.openSession(); BiliBiliMapper biliMapper = sqlSession.getMapper(BiliBiliMapper.class); JSONObject jsonObject = JSONObject.parseObject(entity); String jsonStr = jsonObject.getJSONObject("data").getString("list"); List<BiliBili> biliBilis = JSON.parseArray(jsonStr, BiliBili.class); //需要JDK 1.8 biliBilis.stream().map(e -> { int insert = biliMapper.insert((BiliBili) e); sqlSession.commit(); return insert; }).collect(Collectors.toList()); sqlSession.commit(); } } URLHandle public class URLHandle { public static void urlParser(HttpClient client, String url){ //获取响应资源 HttpResponse response = HTTPUtils.getHtml(client,url); //获取响应状态码 int statusCode = response.getStatusLine().getStatusCode(); System.out.println("响应状态码" + statusCode); if(statusCode ==200){ //页面编码 try { String entity = EntityUtils.toString(response.getEntity(),"utf-8"); System.out.println("开始解析..."); BiliBiliParse.creteDate(entity); } catch (IOException e) { e.printStackTrace(); System.err.println("解析entity失败"); } } } } App public class App { public static void main( String[] args ) { System.out.println("正在生成客户端..."); HttpClient client = null; System.out.println("客户端生成完毕."); int pageSize = 149;//总页数 for(int page=1;page<=pageSize;page++){ String url = "https://api.bilibili.com/pgc/season/index/result?season_version=-1&area=-1&is_finish=-1©right=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page=" + page + "&season_type=1&pagesize=20&type=1"; System.err.println("开始爬取第:" + page + " 页的数据"); System.out.println("正在生成客户端..."); client = HttpClientBuilder.create().build(); System.out.println("客户端生成完毕."); //开始解析 System.out.println("开始响应客户端..."); URLHandle.urlParser(client, url); } System.out.println("全部爬取完成"); } }

运行效果

查看数据库

虽然我这个没有什么用,不过还是挺好玩的。

项目地址

github

最新回复(0)