正则表达式主要用于操作字符串。它虽然减少了代码量,但可读性较差。
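逻辑运算符与分组:&& 与(交集);| 或;[abc] 匹配 a、b、c 中的一个,要么是 a,要么是 b,要么是 c。
() 表示组,第一对括号默认为第 1 组;()\1 表示第 1 组的内容再出现 1 次;()\1+ 表示第 1 组的内容再出现多次(至少 1 次)。
组的编号按左括号从左往右数:在 ((a)(b(c))) 中,第一组是 (a)(b(c)),第二组是 a,第三组是 b(c),第四组是 c。
\\ 表示反斜线 \ 本身;以下各项写在 Java 字符串中时,\ 均需要写成两个(\\)。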
预定义字符类:. 任意字符;\d 数字 [0-9];\s 空白字符 [ \t\n\x0B\f\r];\w 单词字符 [a-zA-Z0-9_]。
数量词:
X? 0 次或 1 次;X* 任意次(包括 0 次);X+ 至少 1 次;X{n} 正好 n 次;X{n,} 至少 n 次;X{n,m} 至少 n 次,最多 m 次。
边界匹配器:
^ 行的开头;$ 行的结尾(另外,在 replaceAll 等方法中,后一个参数可以用 $n 获取前一个参数(正则表达式)中第 n 组匹配到的内容);\b 单词边界;\B 非单词边界。
运行结果
爬虫练习
/**
 * Web-crawler exercise: a crawler is a program that collects data matching a
 * given rule from the internet. This one collects e-mail addresses, either from
 * a local file or from a web page.
 */
public class Test01 {

    /**
     * Rule for the addresses to collect: 5-15 ASCII letters/digits, one '@',
     * at least two word characters, one '.', then "com", "cn" or "net".
     * Compiled once because both sources apply the same rule to every line.
     */
    private static final Pattern MAIL_PATTERN =
            Pattern.compile("[0-9a-zA-Z]{5,15}@{1}\\w{2,}\\.{1}(com|cn|net)");

    /**
     * Prints every address found on the web page, one per line.
     *
     * @param args unused
     * @throws IOException if the page cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        for (String address : getMailWeb()) {
            System.out.println(address);
        }
    }

    /**
     * Collects e-mail addresses from the local file.
     *
     * @return all matches in the order they appear in the file (duplicates kept)
     * @throws IOException if the file cannot be opened or read
     */
    public static List<String> getMail() throws IOException {
        // The file name is the placeholder used by the notes ("local file path").
        // try-with-resources closes the reader even when reading fails.
        try (BufferedReader br = new BufferedReader(new FileReader("本地文件地址"))) {
            return collectMails(br);
        }
    }

    /**
     * Collects e-mail addresses from the web page.
     *
     * @return all matches in the order they appear in the page (duplicates kept)
     * @throws IOException if the page cannot be opened or read
     */
    public static List<String> getMailWeb() throws IOException {
        URL url = new URL("https://zhidao.baidu.com/question/1772307510687057620.html");
        // NOTE(review): the stream is decoded with the platform default charset,
        // as before. The pattern only matches ASCII characters, so this should not
        // matter for any ASCII-compatible page encoding - confirm if the rule changes.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream()))) {
            return collectMails(br);
        }
    }

    /** Reads the source line by line and gathers every substring matching the mail rule. */
    private static List<String> collectMails(BufferedReader reader) throws IOException {
        List<String> found = new ArrayList<>();
        String line;
        while ((line = reader.readLine()) != null) {
            Matcher m = MAIL_PATTERN.matcher(line);
            while (m.find()) {
                found.add(m.group());
            }
        }
        return found;
    }
}
// Run result
爬取小说,写入文件,下一章翻页
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Novel-crawler exercise: downloads a chapter page, extracts and tidies the
 * chapter text, saves it to a file and follows the "next chapter" link.
 */
public class PaChong {

    /**
     * Raw anchor tag of the most recently seen "next chapter" link.
     * Written by {@link #getNexString(String)}, read by {@link #getNex()}.
     */
    public static String next;

    /** A line that holds the chapter body. */
    private static final Pattern CONTENT_LINE =
            Pattern.compile("(div id=\"content\")(.)*(/div)");

    /** The anchor tag whose label is "next chapter" (下一章). */
    private static final Pattern NEXT_LINK =
            Pattern.compile("(<a href=\")(.){10,30}(\">下一章</a>)");

    /** Same anchor tag, with group 2 capturing the relative chapter path. */
    private static final Pattern NEXT_HREF =
            Pattern.compile("(<a href=\")(/{1}\\w{3,}/{1}\\w{3,}\\.(html))(\">下一章</a>)");

    // Clean-up rules used by ToolString, applied in this order.
    private static final Pattern NBSP = Pattern.compile("&nbsp;|\\u00A0");
    private static final Pattern SLASH_SPAN = Pattern.compile("(//)(.)*(//)");
    private static final Pattern LINE_BREAKS = Pattern.compile("(<br/>){1,2}");
    private static final Pattern SHORT_TAG = Pattern.compile("(<)(.){4,20}(>)");
    private static final Pattern STAR_TAIL = Pattern.compile("(\\*){4,20}(.)*");

    /** File that {@link #saveBook(String)} writes to. */
    private static final String DEFAULT_BOOK_PATH = "D:/1.doc";

    /** How many chapters main follows after the first one. */
    private static final int FOLLOW_UP_CHAPTERS = 2;

    /**
     * Prints the first chapter, saves it, then prints the next two chapters,
     * printing the address of the following chapter after each one.
     *
     * @param args unused
     * @throws IOException if a page cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        String str1 = "http://www.biqu6.com";

        // Download the first chapter once and reuse it for printing and saving.
        String firstChapter = ToolString(getMailWeb(str1 + "/1_1821/1157521.html"));
        System.out.println(firstChapter);
        System.out.println();
        if (saveBook(firstChapter)) {
            System.out.println("SiveOk");
        }
        System.out.println();
        System.out.println("下一章地址:" + str1 + getNex());

        for (int i = 0; i < FOLLOW_UP_CHAPTERS; i++) {
            System.out.println();
            System.out.println(ToolString(getMailWeb(str1 + getNex())));
            System.out.println();
            System.out.println("下一章地址:" + str1 + getNex());
        }
    }

    /**
     * Downloads a chapter page and returns the raw HTML lines that hold the
     * chapter body. As a side effect, every "next chapter" anchor found on the
     * page is stored through {@link #getNexString(String)} (the last one wins).
     *
     * @param inter absolute URL of the chapter page
     * @return the concatenated matching lines (whole lines, without line breaks);
     *         empty if no line matches
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    public static String getMailWeb(String inter) throws IOException {
        URL url = new URL(inter);
        StringBuilder book = new StringBuilder();
        // NOTE(review): the page is decoded with the platform default charset, as
        // before; the site's real encoding is not visible here - confirm.
        // try-with-resources closes the stream even when reading fails.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(url.openStream()))) {
            String line;
            while ((line = br.readLine()) != null) {
                // The greedy pattern runs to the last "/div" on the line, so a line
                // matches at most once: one find() is enough.
                if (CONTENT_LINE.matcher(line).find()) {
                    book.append(line);
                }
                Matcher link = NEXT_LINK.matcher(line);
                while (link.find()) {
                    getNexString(link.group());
                }
            }
        }
        return book.toString();
    }

    /**
     * Tidies raw chapter HTML into plain text.
     *
     * @param str raw HTML of the chapter body
     * @return the text with entities, ad spans and tags removed and
     *         {@code <br/>} turned into CRLF line breaks
     */
    public static String ToolString(String str) {
        // NOTE(review): the original replaced a single blank character with a
        // space here. Presumably the page's "&nbsp;" entity (or the U+00A0
        // character) was meant, so both are normalised - confirm against the page.
        str = NBSP.matcher(str).replaceAll(" ");
        // Drop everything between the first and the last "//" on a line.
        str = SLASH_SPAN.matcher(str).replaceAll("");
        // One or two consecutive <br/> become a single line break.
        str = LINE_BREAKS.matcher(str).replaceAll("\r\n");
        // Drop "<...>" spans with 4 to 20 characters between the brackets.
        str = SHORT_TAG.matcher(str).replaceAll("");
        // Drop a run of four or more '*' together with the rest of that line.
        str = STAR_TAIL.matcher(str).replaceAll("");
        return str;
    }

    /**
     * Extracts the relative path of the next chapter from the stored anchor tag,
     * e.g. {@code <a href="/1_1821/1157522.html">下一章</a>} gives
     * {@code /1_1821/1157522.html}.
     *
     * @return the relative path, or the stored text unchanged if it does not have
     *         the expected shape
     * @throws IllegalStateException if no "next chapter" link has been stored yet
     */
    public static String getNex() {
        if (next == null) {
            throw new IllegalStateException("no next-chapter link has been captured yet");
        }
        return NEXT_HREF.matcher(next).replaceAll("$2");
    }

    /**
     * Stores the raw anchor tag of the "next chapter" link.
     *
     * @param str the anchor tag as found in the page
     */
    public static void getNexString(String str) {
        next = str;
    }

    /**
     * Writes the text to the default book file.
     *
     * @param str text to write
     * @return {@code true} if the text was written completely, {@code false} otherwise
     */
    public static boolean saveBook(String str) {
        return saveBook(str, DEFAULT_BOOK_PATH);
    }

    /**
     * Writes the text to the given file, replacing any previous content.
     *
     * @param str  text to write
     * @param path file to write to
     * @return {@code true} if the text was written completely, {@code false} otherwise
     */
    public static boolean saveBook(String str, String path) {
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(path))) {
            bw.write(str);
        } catch (IOException e) {
            // Also reached when the implicit flush/close fails, so success is only
            // reported once the data has really left the buffer.
            e.printStackTrace();
            return false;
        }
        return true;
    }
}
// Run result
