Html批量转csv

不知道大家又没有遇到这样的问题，某些业务系统有导出数据功能，导出的数据都是存放在excel表格里面，需要批量转csv，

但是这样的文件不是标准的excel文档，本质是html文档

比如说，系统导出的文档是这样的

从这里我们可以看出来，感觉就是一个普通的excel文档，通过office也能正常打开，但是你通过编写代码批量转csv的时候，就出问题

我也是在无意中发现这不是标准的excle文档，我们通过文档编辑器打开试试

这明显就是html文件，只能怪这个业务系统的开发人员不够严谨了，现在需要我们来解决这样的问题

我们先在idea里面创建一个maven项目

package com.gong;


import java.io.*;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Scanner;

import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Jsoup解析html标签时类似于JQuery的一些符号
 *
 * @author chixh
 *
 */
public class HtmlParser {
    protected List<List<String>> data = new LinkedList<List<String>>();

    /**
     * 获取value值
     *
     * @param e
     * @return
     */
    public static String getValue(Element e) {
        return e.attr("value");
    }

    /**
     * 获取
     * <tr>
     * 和
     * </tr>
     * 之间的文本
     *
     * @param e
     * @return
     */
    public static String getText(Element e) {
        return e.text();
    }

    /**
     * 识别属性id的标签,一般一个html页面id唯一
     *
     * @param body
     * @param id
     * @return
     */
    public static Element getID(String body, String id) {
        Document doc = Jsoup.parse(body);
        // 所有#id的标签
        Elements elements = doc.select("#" + id);
        // 返回第一个
        return elements.first();
    }

    /**
     * 识别属性class的标签
     *
     * @param body
     * @param class
     * @return
     */
    public static Elements getClassTag(String body, String classTag) {
        Document doc = Jsoup.parse(body);
        // 所有#id的标签
        return doc.select("." + classTag);
    }

    /**
     * 获取tr标签元素组
     *
     * @param e
     * @return
     */
    public static Elements getTR(Element e) {
        return e.getElementsByTag("tr");
    }

    /**
     * 获取td标签元素组
     *
     * @param e
     * @return
     */
    public static Elements getTD(Element e) {
        return e.getElementsByTag("td");
    }
    /**
     * 获取表元组
     * @param table
     * @return
     */
    public static List<List<String>> getTables(Element table){
        List<List<String>> data = new ArrayList<>();

        for (Element etr : table.select("tr")) {
            List<String> listh=new ArrayList<>();
            //获取表头
            for(Element eth : etr.select("th")){
                String th=eth.text();
                listh.add(th);
            }
            if(!listh.isEmpty()) {
                data.add(listh);
            }
            List<String> list = new ArrayList<>();
            for (Element etd : etr.select("td")) {
                String temp = etd.text();
                //增加一行中的一列
                list.add(temp);
            }
            //增加一行
            if(!list.isEmpty()) {
                data.add(list);
            }
        }
        return data;
    }
    /**
     * 读html文件
     * @param fileName
     * @return
     */
    public static String readHtml(String fileName){
        FileInputStream fis = null;
        StringBuffer sb = new StringBuffer();
        try {
            fis = new FileInputStream(fileName);
            byte[] bytes = new byte[1024];
            while (-1 != fis.read(bytes)) {
                sb.append(new String(bytes));
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                fis.close();
            } catch (IOException e1) {
                e1.printStackTrace();
            }
        }
        return sb.toString();
    }

    public static void getFileName(String inputexecl,String outputcsv){
       // Document doc2 = Jsoup.parse(readHtml("E:\\datas\\小组成员学习情况统计(11).xls"));
        String path = inputexecl;
        File f = new File(path);
        if (!f.exists()) {
            System.out.println(path + " not exists");
            return;
        }

        File fa[] = f.listFiles();//获取该目录下所有文件和目录的绝对路径
        for (int i = 0; i < fa.length; i++) {
            File fs = fa[i];
            if (fs.isDirectory()) {
                System.out.println(fs.getName() + " [目录]");
            } else{
                String filepath= String.valueOf(fs);
                Document doc2 = Jsoup.parse(readHtml(filepath));
                Element table = doc2.select("table").first();
                //获取table表的内容，存放到List集合里面
                List<List<String>> list = getTables(table);
                for (List<String> list2 : list) {
                    for (String string : list2) {
                        System.out.print(string+",");
                    }
                    System.out.println();
                }
                String name= StringUtils.substringBeforeLast(fs.getName(),".");//获取文件名字部分
                //String newFilePath="E:\\datas\\csv\\小组成员学习.csv";
                String newFilePath=outputcsv+name+".csv";
                String savePath = newFilePath;
                File saveCSV = new File(savePath);
                String buffer="";
                try {
                    if(!saveCSV.exists())
                        saveCSV.createNewFile();
                    OutputStreamWriter write = new OutputStreamWriter(new FileOutputStream(saveCSV ),"UTF-8");
                    BufferedWriter writer = new BufferedWriter(write);
                    for(int j=0;j<list.size();j++){
                        List<String> list1=new ArrayList<String>();
                        buffer=list.get(j).toString();
                        System.out.println(buffer);
                        buffer = buffer.substring(1, buffer.lastIndexOf("]")).toString();
                        list1.add(buffer);
                        writer.write(buffer);
                        writer.newLine();
                    }
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

    }
    public static void main(String[] args) {
        System.out.println("请输入Execl数据所在路径");
        Scanner execl=new Scanner(System.in);
        String input=execl.nextLine(); //获取execl输入路径
        System.out.println("请输入csv文件数据的输出路径");
        Scanner csv=new Scanner(System.in);
        String  output = csv.nextLine();
        getFileName(input,output);

    }

}

pom.xml

<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.gong</groupId>
  <artifactId>csv</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>csv</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <dependency>
                 <groupId>net.sf.opencsv</groupId>
                 <artifactId>opencsv</artifactId>
                 <version>2.1</version>
             </dependency>
           <dependency>
                <groupId>org.apache.poi</groupId>
               <artifactId>ooxml-schemas</artifactId>
                <version>1.1</version>
               <type>pom</type>
            </dependency>
           <dependency>
                <groupId>org.apache.poi</groupId>
                <artifactId>poi</artifactId>
                <version>3.7</version>
           </dependency>
           <dependency>
                <groupId>org.apache.poi</groupId>
                <artifactId>ooxml-schemas</artifactId>
                <version>1.1</version>
            </dependency>
            <dependency>
                <groupId>org.apache.poi</groupId>
                <artifactId>poi-ooxml</artifactId>
                <version>3.7</version>
            </dependency>
            <dependency>
                <groupId>dom4j</groupId>
                <artifactId>dom4j</artifactId>
                <version>1.6.1</version>
            </dependency>
      <!-- https://mvnrepository.com/artifact/net.sourceforge.jexcelapi/jxl -->
      <dependency>
          <groupId>net.sourceforge.jexcelapi</groupId>
          <artifactId>jxl</artifactId>
          <version>2.6.12</version>
      </dependency>
      <dependency>
          <groupId>commons-lang</groupId>
          <artifactId>commons-lang</artifactId>
          <version>2.6</version>
      </dependency>
      <dependency>
          <groupId>org.jsoup</groupId>
          <artifactId>jsoup</artifactId>
          <version>1.11.3</version>
      </dependency>

  </dependencies>

  <build>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
      <plugins>
        <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
        <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
        <plugin>
          <artifactId>maven-site-plugin</artifactId>
          <version>3.7.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-project-info-reports-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
      </plugins>

    </pluginManagement>
  </build>
</project>

运行分别输入excel文档的目录和csv的输出目录就可以了，在这里提醒一下大家，如果使用我这段代码的话，excel文档的数据文件不能带有其他类型的文件。

posted on 2020-09-15 14:27 百里登峰阅读(1005) 评论(0) 收藏举报

刷新页面返回顶部

百里登风

导航

公告

Html批量转csv