我正在使用spark-excel(com.crealytics.spark.excel)库来读取excel文件。如果excel文件中没有重复列,则库工作正常。如果excel文件中出现任何重复的列名,则抛出异常。如何克服这个错误。是否有任何解决方法可以解决这个问题。
线程中的异常" main" org.apache.spark.sql.AnalysisException:在数据模式中找到重复的列:
net territory
; 在org.apache.spark.sql.util.SchemaUtils $ .checkColumnNameDuplication(SchemaUtils.scala:85)
答案 0 :(得分:0)
Using spark excel API getting exception .
StructType schema = DataTypes.createStructType(new StructField[]{DataTypes.createStructField("CGISAI", DataTypes.StringType, true), DataTypes.createStructField("SALES TERRITORY", DataTypes.StringType, true)});
SQLContext sqlcxt = new SQLContext(jsc);
Dataset<Row> df = sqlcxt.read()
.format("com.crealytics.spark.excel")
.option("path", "file:///"+siteinfofile)
.option("useHeader", "true")
.option("spark.read.simpleMode", "true")
.option("treatEmptyValuesAsNulls", "true")
.option("inferSchema", "false")
.option("addColorColumns", "False")
.option("sheetName", "sheet1")
.option("startColumn", 22)
.option("endColumn", 23)
//.schema(schema)
.load();
return df;
This is the code I am using. I am using sparkexcel library from com.crealytics.spark.excel.
I want the solution to identify whether excel file has duplicate columns or not. if have duplicate columns, how to rename/eliminate the duplicate columns.
WorkAround is as below:
convert .xlsx file into .csv . using spark default csv api that can handle duplicate column names by renaming them automatically.
Below is the code to convert from xlsx to csv file.
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package com.huawei.java.tools;
/**
*
* @author Nanaji Jonnadula
*/
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.util.CellAddress;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.util.SAXHelper;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import javax.xml.parsers.ParserConfigurationException;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import static org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
public class ExcelXlsx2Csv {
private static class SheetToCSV implements SheetContentsHandler {
private boolean firstCellOfRow = false;
private int currentRow = -1;
private int currentCol = -1;
private StringBuffer lineBuffer = new StringBuffer();
/** * Destination for data */
private FileOutputStream outputStream;
public SheetToCSV(FileOutputStream outputStream) {
this.outputStream = outputStream;
}
@Override
public void startRow(int rowNum) {
/** * If there were gaps, output the missing rows: * outputMissingRows(rowNum - currentRow - 1); */
// Prepare for this row
firstCellOfRow = true;
currentRow = rowNum;
currentCol = -1;
lineBuffer.delete(0, lineBuffer.length()); //clear lineBuffer
}
@Override
public void endRow(int rowNum) {
lineBuffer.append('\n');
try {
outputStream.write(lineBuffer.substring(0).getBytes());
} catch (IOException e) {
System.out.println("save date to file error at row number: {}"+ currentCol);
throw new RuntimeException("save date to file error at row number: " + currentCol);
}
}
@Override
public void cell(String cellReference, String formattedValue, XSSFComment comment) {
if (firstCellOfRow) {
firstCellOfRow = false;
} else {
lineBuffer.append(',');
}
// gracefully handle missing CellRef here in a similar way as XSSFCell does
if (cellReference == null) {
cellReference = new CellAddress(currentRow, currentCol).formatAsString();
}
int thisCol = (new CellReference(cellReference)).getCol();
int missedCols = thisCol - currentCol - 1;
if (missedCols > 1) {
lineBuffer.append(',');
}
currentCol = thisCol;
if (formattedValue.contains("\n")) {
formattedValue = formattedValue.replace("\n", "");
}
formattedValue = "\"" + formattedValue + "\"";
lineBuffer.append(formattedValue);
}
@Override
public void headerFooter(String text, boolean isHeader, String tagName) {
// Skip, no headers or footers in CSV
}
}
private static void processSheet(StylesTable styles, ReadOnlySharedStringsTable strings,
SheetContentsHandler sheetHandler, InputStream sheetInputStream) throws Exception {
DataFormatter formatter = new DataFormatter();
InputSource sheetSource = new InputSource(sheetInputStream);
try {
XMLReader sheetParser = SAXHelper.newXMLReader();
ContentHandler handler = new XSSFSheetXMLHandler(
styles, null, strings, sheetHandler, formatter, false);
sheetParser.setContentHandler(handler);
sheetParser.parse(sheetSource);
} catch (ParserConfigurationException e) {
throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());
}
}
public static void process(String srcFile, String destFile,String sheetname_) throws Exception {
File xlsxFile = new File(srcFile);
OPCPackage xlsxPackage = OPCPackage.open(xlsxFile.getPath(), PackageAccess.READ);
ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(xlsxPackage);
XSSFReader xssfReader = new XSSFReader(xlsxPackage);
StylesTable styles = xssfReader.getStylesTable();
XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
int index = 0;
while (iter.hasNext()) {
InputStream stream = iter.next();
String sheetName = iter.getSheetName();
System.out.println(sheetName + " [index=" + index + "]");
if(sheetName.equals(sheetname_)){
FileOutputStream fileOutputStream = new FileOutputStream(destFile);
processSheet(styles, strings, new SheetToCSV(fileOutputStream), stream);
fileOutputStream.flush();
fileOutputStream.close();
}
stream.close();
++index;
}
xlsxPackage.close();
}
public static void main(String[] args) throws Exception {
ExcelXlsx2Csv.process("D:\\data\\latest.xlsx", "D:\\data\\latest.csv","sheet1"); //source , destination, sheetname
}
}