java poi sax方式处理大数据量excel文件

网友投稿 448 2022-12-17


java poi sax方式处理大数据量excel文件

系统需要用到一个导入excel文件的功能,使用poi组件常规方式读取excel时,内存耗尽,OutOfMemoryError,或者读取非常慢

所以写了一个工具类,使用poi sax方式读取excel,速度快很多,内存消耗可以接受。

测试结果如下:

.xlsx文件,35M大小,总4个sheel,

只读取第一个,37434行,54列

总行数:37434

读取耗时:39秒

打印耗时:17秒

主要代码如下:

ExcelUtils.class 主入口

package com.xxx.bi.utils.excel;

import java.util.List;

import java.util.Objects;

import org.apache.commons.lang3.StringUtils;

import com.google.common.collect.Lists;

public class ExcelUtils {

/** logger日志. */

// public static final Logger LOGGER = Logger.getLogger(ExcelUtils.class);

public ExcelUtils() {

}

/**

* 获取excel的表头

*

* @param filePath

* 文件路径

* @param headerNum

* 表头所在行数

* @return

*/

public static List getHeader(String filePath, int headerNum) {

if (StringUtils.isBlank(filePath)) {

throw new IllegalArgumentException("传入文件路径不能为空");

}

if (Objects.isNull(headerNum) || headerNum < 1) {

headerNum = 1;

}

try {

return LargeExcelFileReadUtil.getRoYCTLGfTwFromSheetOne(filePath, headerNum);

} catch (Exception e) {

// LOGGER.info("获取excel[" + filePath + "]表头失败,原因:", e);

e.printStackTrace();

}

return Lists.newArrayList();

}

/**

* 获取excel的所有数据

* 所有数据类型都是String

* 会以第一行数据的列数为总列数,所以第一行的数据必须都不为空,否则可能出java.lang.IndexOutOfBoundsException

*

* @param filePath

* 文件路径

* @param headerNum

* 表头所在行数

* @return

*/

public static List> getAllData(String filePath) {

if (StringUtils.isBlank(filePath)) {

throw new IllegalArgumentException("传入文件路径不能为空");

}

try {

return LargeExcelFileReadUtil.getRowsFromSheetOne(filePath);

} catch (Exception e) {

// LOGGER.info("获取excel[" + filePath + "]表头失败,原因:", e);

e.printStackTrace();

}

return Lists.newArrayList();

}

public static void main(String[] args) {

long start = System.currentTimeMillis();

String filepath = "C:/Users/Administrator/Desktop/05-作业调配表 -快递.xlsx";

// List result = ExcelUtils.getHeader(filepath, 1);

// for (String col : result) {

// System.out.println(col);

// }

List> result = ExcelUtils.getAllData(filepath);

long end = System.currentTimeMillis();

for (List list : result) {

System.out.println(list.toString());

}

long end1 = System.currentTimeMillis();

try {

Thread.sleep(1000l);

} catch (InterruptedException e) {

e.printStackTrace();

}

System.err.println("总行数:" + result.size());

System.err.println(("读取耗时:" + (end - start) / 1000) + "秒");

System.err.println(("打印耗时:" + (end1 - end) / 1000) + "秒");

}

}

LargeExcelFileReadUtil.class 真正的工具类

package com.xxx.bi.utils.excel;

import java.io.InputStream;

import java.util.List;

import java.util.Objects;

import org.apache.log4j.Logger;

import org.apache.poi.openxml4j.opc.OPCPackage;

import org.apache.poi.xssf.eventusermodel.XSSFReader;

import org.apache.poi.xssf.model.SharedStringsTable;

import org.xml.sax.InputSource;

import org.xml.sax.XMLReader;

import org.xml.sax.helpers.XMLReaderFactory;

public class LargeExcelFileReadUtil {

/** logger日志. */

public static final Logger LOGGER = Logger.getLogger(LargeExcelFileReadUtil.class);

// 处理一个sheet

public static List getRowFromSheetOne(String filename, Integer rowNum) throws Exception {

InputStream inputStream = null;

OPCPackage pkg = null;

SingleRowHandler singleRowHandler = null;

try {

pkg = OPCPackage.open(filename);

XSSFReader r = new XSSFReader(pkg);

SharedStringsTable sst = r.getSharedStringsTable();

singleRowHandler = new SingleRowHandler(sst, rowNum);

XMLReader parser = XMLReaderFactory.createXMLReader("com.sun.org.apache.xerces.internal.parsers.SAXParser");

parser.setContentHandler(singleRowHandler);

inputStream = r.getSheet("rId1");

InputSource sheetSource = new InputSource(inputStream);

parser.parse(sheetSource);

return singleRowHandler.getRow();

} catch (Exception e) {

String message = e.getMessage();

if (Objects.nonNull(rowNum) && Objects.nonNull(singleRowHandler)

&& SingleRowHandler.FINISH_ROW_MESSAGE.equalsIgnoreCase(message)) {

// 获取某一行数据完成 ,暂时不知道怎么能终止excel解析,直接抛出了异常,实际是成功的

return singleRowHandler.getRow();

}

throw e;

} finally {

if (Objects.nonNull(pkg)) {

pkg.close();

}

if (Objects.nonNull(inputStream)) {

inputStream.close();

}

}

}

// 处理一个sheet

public static List> getRowsFromSheetOne(String filename) throws Exception {

InputStream inputStream = null;

OPCPackage pkg = null;

MultiRowHandler multiRowHandler = null;

try {

pkg = OPCPackage.open(filename);

XSSFReader r = new XSSFReader(pkg);

SharedStringsTable sst = r.getSharedStringsTable();

multiRowHandler = new MultiRowHandler(sst);

XMLReader parser = XMLReaderFactory.createXMLReader("com.sun.org.apache.xerces.internal.parsers.SAXParser");

parser.setContentHandler(multiRowHandler);

inputStream = r.getSheet("rId1");

InputSource sheetSource = new InputSource(inputStream);

parser.parse(sheetSource);

return multiRowHandler.getRows();

} catch (Exception e) {

throw e;

} finally {

if (Objects.nonNull(pkg)) {

pkg.close();

}

if (Objects.nonNull(inputStream)) {

inputStream.close();

}

}

}

}

SingleRowHandler.class 当行处理类,可以只获取表头或表格中的某一行数据

package com.xxx.bi.utils.excel;

import java.util.ArrayList;

import java.util.List;

import java.util.Objects;

import java.util.regex.Pattern;

import org.apache.poi.xssf.model.SharedStringsTable;

import org.apache.poi.xssf.usermodel.XSSFRichTextString;

import org.xml.sax.Attributes;

import org.xml.sax.SAXException;

import org.xml.sax.helpers.DefaultHandler;

public class SingleRowHandler extends DefaultHandler {

public final static String FINISH_ROW_MESSAGE = "row data process finish";

private Integer rowNum = null;// rowNum不为空时则标示只需要获取这一行的数据

private int curRowNum = 1;

private String cellType = "";

private SharedStringsTable sst;

private String lastContents;

private boolean nextIsString;

private String cellPosition;

private List row = new ArrayList<>();

public List getRow() {

return row;

}

public SingleRowHandler(SharedStringsTable sst, Integer rowNum) {

this.sst = sst;

this.rowNum = rowNum;

}

public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {

if (name.equals("c")) {

cellPosition = attributes.getValue("r");

// 这是一个新行

if (Pattern.compile("^A[0-9]+$").matcher(cellPosition).find()) {

curRowNum = Integer.valueOf(cellPosition.substring(1));

}

cellType = "";

cellType = attributes.getValue("t");

if ("s".equals(cellType)) {

nextIsString = true;

} else {

nextIsString = false;

}

}

// 清楚缓存内容

lastContents = "";

if (Objects.nonNull(rowNum) && curRowNum > rowNum) {

// 获取某一行数据完成 ,暂时不知道怎么能终止excel解析,直接抛出了异常,实际是成功的

throw new SAXException(FINISH_ROW_MESSAGE);

}

}

public void endElement(String uri, String localName, String name) throws SAXException {

if (nextIsString) {

int idx = Integer.parseInt(lastContents);

lastContents = new XSSFRichTextString(sst.getEntryAt(idx)).toString();

nextIsString = false;

}

if (name.equals("v")) {

if (Objects.isNull(rowNum) || rowNum == curRowNum) {

row.add(lastContents);

}

}

}

public void characters(char[] ch, int start, int length) throws SAXException {

lastContents += new String(ch, start, length);

}

}

MultiRowHandler.class 获取excel所有行的数据

package com.xxx.bi.utils.excel;

import java.util.ArrayList;

import java.util.List;

import java.util.Objects;

import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;

import org.apache.poi.xssf.model.SharedStringsTable;

import org.apache.poi.xssf.usermodel.XSSFRichTextString;

import org.xml.sax.Attributes;

import org.xml.sax.SAXException;

import org.xml.sax.helpers.DefaultHandler;

/**

* 获取完整excel数据的handler

*

* @author Administrator

*

*/

public class MultiRowHandler extends DefaultHandler {

private int curRowNum = 0;// 行号,从1开始

private int curColIndex = -1;// 列索引,从0开始

private int colCnt = 0;// 列数,取第一行列数做为列总数

private String cellType = "";

private SharedStringsTable sst;

private String lastContents;

private boolean nextIsString;

private String cellPosition;

private List head = null;

private List curRowData = null;

private boolean curRowIsBlank = true;// 当前是个空行

private List> rows = new ArrayList<>();

public List> getRows() {

return rows;

}

public MultiRowHandler(SharedStringsTable sst) {

this.sst = sst;

}

@Override

public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {

if (name.equals("c")) {

cellPosition = attributes.getValue("r");

curColIndex = getColIndex(cellPosition);

// 这是一个新行

if (isNewRow(cellPosition)) {

curRowNum = getRowNum(cellPosition);

if (2 == curRowNum && Objects.nonNull(curRowData)) {

head = curRowData;

colCnt = head.size();

}

curRowData = getBlankRow(colCnt);

}

cellType = "";

cellType = attributes.getValue("t");

if ("s".equals(cellType)) {

nextIsString = true;

} else {

nextIsString = false;

}

}

// 清楚缓存内容

lastContents = "";

}

private boolean isNewRow(String cellPosition) {

// 坐标以A开头,后面跟数字 或者坐标行和当前行不一致的

boolean newRow = Pattern.compile("^A[0-9]+$").matcher(cellPosition).find();

if (!newRow) {

int cellRowNum = getRowNum(cellPosition);

newRow = (cellRowNum != curRowNum);

}

return newRow;

}

/**

* 根据列坐标获取行号,从1开始,返回0时标示出错

*

* @param cellPosition

* 列坐标,为A1,B23等

* @return 行号,从1开始,返回0是为失败

*/

private static int getRowNum(String cellPosition) {

String strVal = Pattern.compile("[^0-9]").matcher(cellPosition).replaceAll("").trim();// 获取坐标中的数字

if (StringUtils.isNotBlank(strVal)) {

return Integer.valueOf(strVal);

}

return 0;

}

/**

* 根据列坐标返回当前列索引,从0开始,返回-1时标示出错

* A1->0; B1->1...AA1->26

*

* @param cellPosition

* 列坐标,为A1,B23等

* @return 列索引,从0开始,返回-1是为失败,A1->0; B1->1...AA1->26

*/

private static int getColIndex(String cellPosition) {

int index = -1;

int num = 65;// A的Unicode码

int length = cellPosition.length();

for (int i = 0; i < length; i++) {

char c = cellPosition.charAt(i);

if (Character.isDigit(c)) {

break;// 确定指定的char值是否为数字

}

index = (index + 1) * 26 + (int) c - num;

}

return index;

}

/**

* 返回一个全部为空字符串的空行

*

* @param cnt

* @return

*/

private List getBlankRow(int cnt) {

List result = new ArrayList<>(cnt);

for (int i = 0; i < cnt; i++) {

result.add(i, "");

}

curRowIsBlank = true;

return result;

}

@Override

public void endElement(String uri, String localName, String name) throws SAXException {

if (nextIsString) {

int idx = Integer.parseInt(lastContents);

lastContents = new XSSFRichTextString(sst.getEntryAt(idx)).toString();

nextIsString = false;

}

if (name.equals("v")) {

// System.out.println(MessageFormat.format("当前列定位:{0},当前行:{1},当前列:{2},当前值:{3}",

// cellPosition, curRowNum,

// curColIndex, lastContents));

if (Objects.isNull(head)) {

curRowData.add(lastContents);

} else {

curRowData.set(curColIndex, lastContents);

}

curRowIsBlank = false;

// 这是一个新行

if (isNewRow(cellPosition)) {

if (Objects.nonNull(curRowData)) {

if (curRowIsBlank) {

curRowData.clear();// 如果当前行是空行,则清空当前行数据

}

rows.add(curRowData);

}

}

}

}

@Override

public void endDocument() throws SAXException {

if (Objects.nonNull(curRowData) && !curRowIsBlank) {

rows.add(curRowData);// 最后一行在上面不好加入,最后一行全是空行的不加入

}

super.endDocument();

}

@Override

public void characters(char[] ch, int start, int length) throws SAXException {

lastContents += new String(ch, start, length);

}

@Override

public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {

lastContents += "";

}

public static void main(String[] args) {

System.out.println(getColIndex("BC2"));

}

}


版权声明:本文内容由网络用户投稿,版权归原作者所有,本站不拥有其著作权,亦不承担相应法律责任。如果您发现本站中有涉嫌抄袭或描述失实的内容,请联系我们jiasou666@gmail.com 处理,核实后本网站将在24小时内删除侵权内容。

上一篇:Java switch多值匹配操作详解
下一篇:Java switch case数据类型原理解析
相关文章

 发表评论

暂时没有评论,来抢沙发吧~