node 读取超大Excel 文件,提取数据

之前是用 node-xlsx 来处理excel文件,主要是读取数据或者根据数据生成excel文件。不过,node-xlsx 似乎无法处理超大的excel(100MB以上),例如:

var xlsx = require('node-xlsx');
var sheets = xlsx.parse('./test.xlsx'); //获取所有sheets
 
文件中有一个sheet的体积比较大,得到的是一个空的数组,估计是因为内存加载不进去。想要解决这个问题,似乎只有一种方法,就是用stream的方式,一段一段提取excel里面的数据。
然而,node-xlsx 不支持用流的方式读取Excel,因为excel文件编码的原因,只有把excel转为xml或者csv,才能用流的方式处理。
 
 

 

感谢这位老哥,找到了xlsx-extract 这个库,完美解决用流的方式读取excel

 var powXLSX = require('xlsx-extract').XLSX;
    new powXLSX().extract('./test.xlsx', { sheet_all: true }) // 读取文件所有sheet,默认只读取第一张sheet,参数配置如下
      .on('sheet', function (sheet) {
        console.log('sheet', sheet);  // sheet is array [sheetname, sheetid, sheetnr]
      })
      .on('row', function (row) {
        console.log('row', row);  // row is a array of values or []
      })
      .on('cell', function (cell) {
        // console.log('cell', cell); //cell is a value or null
      })
      .on('error', function (err) {
        console.error('error', err);
      })
      .on('end', function (err) {
        console.log('eof');
      });

options = {
	// sheet selection (provide one of the following)
	sheet_name?: string; // select by sheet name
	sheet_nr?: string; // default "1" - select by number of the sheet starting on 1
	sheet_id?: string; // select by sheet id, e.g. "1"
	sheet_rid?: string; // select by internal sheet rid, e.g. "rId1'
	sheet_all?: boolean; // default false - select all sheets
	// sax parser selection
	parser?: string; // default "sax" - 'sax'|'expat'
	// row selection
	ignore_header?: number; // default 0 - the number of header lines to ignore
	include_empty_rows?: boolean; // default false - include empty rows in the middle/at start
	// how to output sheet, rows and cells
	format?: string; // default array - convert to 'array'||'json'||'tsv'||'obj'
	// tsv output options
	tsv_float_comma?: boolean; // default false - use "," als decimal point for floats
	tsv_delimiter?: string; // default '\t' - use specified character to field delimiter
	tsv_endofline?: string; // default depending on your operating system (node os.EOL) e.g. '\n'
	// cell value formats
	raw_values?: boolean;  // default false - do not apply cell formats (get values as string as in xlsx)
	round_floats?: boolean; // default true - round float values as the cell format defines (values will be reported as parsed floats otherwise)
	date1904?: boolean;   // default false - use date 1904 conversion
	ignore_timezone?: boolean; // default false - ignore timezone in date parsing
	convert_values?: { // apply cell number formats or not (values will be reported as strings otherwise)
		ints?: boolean;  // rounds to int if number format is for int
		floats?: boolean;  // rounds floats according to float number format
		dates?: boolean;  // converts xlsx date to js date
		bools?: boolean; // converts xlsx bool to js boolean
	};
	// xlsx structure options
	workfolder?: string; // default 'xl' - the workbook subfolder in zip structure
}
 

具体的实现原理可以去github 探究 xlsx-extract  

posted @ 2019-12-16 10:38  kzc275  阅读(...)  评论(...编辑  收藏