Spark 通过第三方库 spark-excel 读取 Excel 文件时,如果文件过大,会出现 OOM(内存溢出)。
/**
 * Loads an Excel file into a Spark DataFrame via the spark-excel library.
 *
 * Sets `maxRowsInMemory` so spark-excel uses its streaming reader, which keeps
 * only a bounded window of rows in memory and avoids OOM on large workbooks.
 * See https://github.com/crealytics/spark-excel
 *
 * @param sparkSession active SparkSession used to build the reader
 * @param filePath     path to the Excel file to load
 * @param header       whether the first row of the sheet contains column names
 * @return the DataFrame loaded from the Excel file
 */
def sparkExcel(sparkSession: SparkSession, filePath: String, header: Boolean): DataFrame = {
  // Use string interpolation; the original println("...", filePath) auto-tupled
  // its two arguments and printed a tuple such as (----,path).
  println(s"--------------------sparkExcel-----: $filePath")
  import com.crealytics.spark.excel._
  val df = sparkSession.read.excel(
    header = header, // Required
    // dataAddress = "'My Sheet'!B3:C35", // Optional, default: "A1"
    treatEmptyValuesAsNulls = true, // Optional, default: true
    inferSchema = false, // Optional, default: false
    addColorColumns = false, // Optional, default: false
    // timestampFormat = "MM-dd-yyyy HH:mm:ss", // Optional, default: yyyy-mm-dd hh:mm:ss[.fffffffff]
    // Enables the streaming reader (keeps at most 20 rows buffered) — this is
    // the fix for the OOM described above. Default is None (whole file in memory).
    maxRowsInMemory = 20
    // excerptSize = 10, // Optional, default: 10. If set and if schema inferred, number of rows to infer schema from
    // workbookPassword = "pass" // Optional, default None. Requires unlimited strength JCE for older JVMs
  ).load(filePath)
  df.show(5) // debug preview of the first 5 rows; note this triggers a Spark action
  df
}
解决办法:设置 maxRowsInMemory 参数,启用 spark-excel 的流式读取,限制内存中缓存的行数,从而避免 OOM。
浙公网安备 33010602011771号