I'm trying to read an xlsb file into a temporary CSV file in Databricks and then read that CSV with Spark. I can do it with pandas, but I run into memory problems, which is why I'm writing it out to CSV instead.
I can't seem to get this working. After the read, I also need to delete the temp file.
When I try to read it into Spark, I get the following error:

'Path does not exist: dbfs:/tmp/tmp3chb73c9;'
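From what I can tell, `tempfile.NamedTemporaryFile` creates the file on the driver's local filesystem, but when Spark on Databricks gets a path with no scheme it seems to resolve it against DBFS, which would explain the error. A stripped-down sketch of what I mean (the temp file name is just whatever `tempfile` happens to generate):

```
import tempfile

# The temp file is created on the driver's local disk
with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
    local_path = f.name  # e.g. /tmp/tmp3chb73c9

# Passing the bare path to Spark makes it look on DBFS instead:
# spark.read.csv(local_path)  # -> Path does not exist: dbfs:/tmp/tmp3chb73c9
```

Here's the full code: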
```
from pyxlsb import open_workbook
import csv
import os
import re
import tempfile
from pyspark.sql.types import StructType, StructField, StringType

# jsonReader, configFilePath, path and unique_list are defined earlier in my notebook
sheets = ["Apr-Jun", "Jul-Sep"]
skip_rows = -1
required_fields_list = jsonReader(configFilePath, "requiredColumns")
excel_file = "/dbfs/{}".format(path)

with open_workbook(excel_file) as wb:
    # sheet names are defined in the config file
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as csvfile:
        wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        tempDir = os.path.dirname(csvfile.name)
        for sheet in sheets:
            # if count is still 0 at the end then no sheet had the required headers
            second_break = False
            header = []
            with wb.get_sheet(sheet) as worksheet_name:
                count = 0
                for i, row in enumerate(worksheet_name.rows()):
                    lrow = []
                    # the first row after skip_rows is read in as the header
                    if i == skip_rows + 1:
                        header.append([cell.v for cell in row])
                    elif i > skip_rows + 1:
                        count = count + 1
                        if count == 1:
                            # clean up the header and check it has the required columns
                            header = header[0]
                            header = [w.replace(' ', '_') if w is not None else w for w in header]
                            header = [w.replace('.', '') if w is not None else w for w in header]
                            if not all(elem in header for elem in required_fields_list):
                                second_break = True
                                break
                            header = unique_list(header)
                            colNames = header
                            for cell in row:
                                lrow.append(cell.v)
                            wr.writerow(lrow)
                        elif second_break:
                            continue
                        else:
                            for cell in row:
                                lrow.append(cell.v)
                            wr.writerow(lrow)

colSchema = StructType([StructField(colNames[i], StringType(), True) for i in range(len(colNames))])
df = spark.read.format("csv").option("header", "false").schema(colSchema).load(csvfile.name)
```
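I also still need to remove the temp file once Spark has read it, since `delete=False` leaves it behind. My plan is roughly this, with a cache and count first because Spark reads the file lazily:

```
import os

df.cache()
df.count()               # materialize the DataFrame before the file goes away
os.remove(csvfile.name)  # delete the temp CSV from the driver's local disk
```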