Java中Web页面信息获取

package com.imooc.regex;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Reads an HTML file and prints every {@code <li>} item whose text consists of
 * a short run of Chinese characters immediately followed by ASCII letters.
 *
 * <p>For each match three lines are written to standard output: the whole
 * matched element, the Chinese part (group 1) and the Latin part (group 2).
 */
public class RegexSample {

    /** File that is read when no path is supplied on the command line. */
    private static final String DEFAULT_HTML_PATH =
            "D:\\eclipse-workspace\\regex\\WebContent\\sample.html";

    /**
     * Matches {@code <li>XXXX yyyy</li>} style items (without the space), where
     * group 1 is 2 to 8 characters in the range U+4E00..U+9FA5 and group 2 is
     * one or more ASCII letters. Compiled once because a Pattern is immutable
     * and reusable.
     */
    private static final Pattern ITEM_PATTERN =
            Pattern.compile("<li>([\\u4e00-\\u9fa5]{2,8})([a-zA-Z]+)</li>");

    /**
     * Program entry point.
     *
     * @param args optional; when {@code args[0]} is present it is used as the
     *             path of the HTML file to scan, otherwise the built-in
     *             default path is used
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_HTML_PATH;

        String content;
        try {
            content = readFile(path);
        } catch (IOException e) {
            // Without the file contents there is nothing meaningful to match,
            // so report the failure and stop instead of scanning partial data.
            System.err.println("Failed to read " + path + ": " + e);
            return;
        }

        Matcher m = ITEM_PATTERN.matcher(content);
        while (m.find()) {
            System.out.println(m.group(0)); // the whole matched <li> element
            System.out.println(m.group(1)); // first group: the Chinese text
            System.out.println(m.group(2)); // second group: the Latin letters
        }
    }

    /**
     * Reads a whole text file as UTF-8, normalising every line ending to
     * {@code '\n'} (a trailing newline is appended after the last line).
     *
     * @param path location of the file to read
     * @return the file contents
     * @throws IOException if the file cannot be opened or read
     */
    private static String readFile(String path) throws IOException {
        StringBuilder content = new StringBuilder();
        // try-with-resources closes the reader (and the wrapped streams) even
        // when readLine() fails part-way through the file.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(path), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\n');
            }
        }
        return content.toString();
    }

}

 

posted @ 2020-09-22 21:50  python成长中  阅读(295)  评论(0)    收藏  举报