爬虫爬取文章

package com.tjt;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.tjt.utils.FileUtilIO;
// Crawler: downloads the articles linked from the Sohu news front page and saves each one as a text file.
public class TestSpider {

    // Requirement 1: crawl the articles published on http://news.sohu.com/.
    // Requirement 2: for every article take its title and its body text and save them locally,
    // using the title as the file name and the body as the file content ("<title>.txt").

    /** Front page whose article links are crawled. */
    private static final String START_URL = "http://news.sohu.com/";

    /** Directory the article files are written to. */
    private static final String OUTPUT_DIR = "e:/a/1711F/";

    /**
     * Characters stripped from a title before it is used as a file name: the characters
     * Windows reserves in file names, which includes both path separators.
     */
    private static final String ILLEGAL_FILE_NAME_CHARS = "\\/:*?\"<>|";

    /**
     * Fetches the front page, follows every link found inside elements with the CSS class
     * {@code list16}, and writes the text of each {@code <article>} element of the linked
     * page to {@code OUTPUT_DIR + title + ".txt"} in UTF-8.
     *
     * @param args unused
     * @throws IOException if a page cannot be fetched or a file cannot be written
     */
    public static void main(String[] args) throws IOException {
        // jsoup parses HTML on the JVM and can also issue the HTTP request itself.
        // 1. Request the front page the way a browser would.
        Document frontPage = Jsoup.connect(START_URL).get();
        // 2. Select every element whose class attribute contains "list16".
        Elements linkLists = frontPage.select(".list16");
        // 3. Walk the anchors inside each of those elements.
        for (Element linkList : linkLists) {
            for (Element anchor : linkList.select("a")) {
                // Strings are immutable, so the cleaned-up title must be captured from the
                // helper's return value; calling replace() and ignoring the result changes nothing.
                String title = sanitizeFileName(anchor.attr("title"));
                // The href is the address of the article detail page, which has to be
                // requested separately to obtain the article text.
                String url = anchor.attr("href").trim();
                // attr() yields "" for a missing attribute; such anchors cannot produce
                // a usable file name or request, so skip them.
                if (title.isEmpty() || url.isEmpty()) {
                    continue;
                }
                // Links may be protocol-relative, e.g. //www.sohu.com/a/384139596_114941,
                // and become http://www.sohu.com/a/384139596_114941.
                if (!url.startsWith("http")) {
                    url = "http:" + url;
                }

                System.out.println(url);
                System.out.println(title);
                // Fetch the article detail page.
                Document articlePage = Jsoup.connect(url).get();
                // Every <article> element is written to the same target path, one after another.
                for (Element article : articlePage.select("article")) {
                    String content = article.text();
                    // Title becomes the file name, article text the file content.
                    FileUtilIO.writeFile(OUTPUT_DIR + title + ".txt", content, "utf-8");
                }
            }
        }
    }

    /**
     * Removes every character listed in {@link #ILLEGAL_FILE_NAME_CHARS} from the title and
     * trims surrounding whitespace.
     *
     * @param rawTitle title text as read from the page; {@code null} is treated as empty
     * @return the title with the listed characters removed, possibly empty
     */
    private static String sanitizeFileName(String rawTitle) {
        if (rawTitle == null) {
            return "";
        }
        StringBuilder cleaned = new StringBuilder(rawTitle.length());
        for (int i = 0; i < rawTitle.length(); i++) {
            char c = rawTitle.charAt(i);
            if (ILLEGAL_FILE_NAME_CHARS.indexOf(c) < 0) {
                cleaned.append(c);
            }
        }
        return cleaned.toString().trim();
    }
}

package com.tjt.utils;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;

/**
 * Helpers for writing and reading whole text files with an explicit charset.
 */
public class FileUtilIO {
    /**
     * Writes {@code content} to the file at {@code path} using the given charset,
     * replacing any existing file content. Missing parent directories are created first.
     *
     * @param path    target file path; may be a bare file name without a directory part
     * @param content text to write; {@code null} results in an empty file
     * @param charset name of the charset used to encode the text, e.g. {@code "utf-8"}
     * @throws IOException if the file cannot be opened or written, or the charset name is unsupported
     */
    public static void writeFile(String path, String content, String charset) throws IOException {
        File file = new File(path);
        // getParentFile() returns null when the path has no directory component,
        // so guard against it before checking for existence.
        File parent = file.getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
        // try-with-resources closes (and thereby flushes) the streams even when writing fails.
        // The FileOutputStream is its own resource so it is released if the charset is rejected.
        try (FileOutputStream out = new FileOutputStream(file);
                BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, charset))) {
            if (content != null) {
                writer.write(content);
            }
        }
    }

    /**
     * Reads the file line by line with the given charset and returns the lines joined
     * directly together. Line terminators are not part of the result.
     *
     * @param file    file to read
     * @param charset name of the charset used to decode the file, e.g. {@code "utf-8"}
     * @return the concatenated lines of the file; empty if the file has no content
     * @throws IOException if the file cannot be opened or read, or the charset name is unsupported
     */
    public static String readFile(File file, String charset) throws IOException {
        // Buffer is local to this call, so the unsynchronized StringBuilder is sufficient.
        StringBuilder joined = new StringBuilder();
        // try-with-resources closes the streams even when reading fails.
        try (FileInputStream in = new FileInputStream(file);
                BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset))) {
            String line;
            // readLine() strips the line terminator and returns null at end of input.
            while ((line = reader.readLine()) != null) {
                joined.append(line);
            }
        }
        return joined.toString();
    }
}

posted @ 2020-04-13 20:38 Tangjt 阅读(499) 评论(0) 收藏举报

刷新页面返回顶部

Tangjt

爬虫爬取文章

公告