Creating a temporary CSV file and reading it into Spark

Time: 2019-12-12 12:58:15

Tags: csv databricks temporary-files xlsb

I am trying to read an xlsb file into a temporary csv file in Databricks and then read that csv with Spark. I can get it working with pandas, but I run into memory issues, so I am writing it out to a csv instead.

I can't seem to get this working. I also need to delete the file once it has been read (see the cleanup sketch after the code below).

When I try to read the csv into Spark, I get the following error:

'Path does not exist: dbfs:/tmp/tmp3chb73c9;'
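
From the error it looks like the bare path returned by `tempfile` (which points at the driver's local `/tmp`) is being resolved against `dbfs:` when Spark reads it. Below is a minimal sketch of the direction I have been experimenting with, assuming the `/dbfs` FUSE mount can be used as the temp directory so the same file is also addressable as `dbfs:/tmp/...` (I have not confirmed this is the right fix):

```
import tempfile

# Assumption: /dbfs is the FUSE mount of DBFS on the driver, so a file written
# under /dbfs/tmp/... is the same object Spark can read as dbfs:/tmp/...
with tempfile.NamedTemporaryFile(mode='w', suffix='.csv',
                                 dir='/dbfs/tmp', delete=False) as f:
    f.write('a,b\n1,2\n')
    local_path = f.name                               # e.g. /dbfs/tmp/tmpXXXX.csv

spark_path = local_path.replace('/dbfs', 'dbfs:', 1)  # e.g. dbfs:/tmp/tmpXXXX.csv
df = spark.read.format('csv').option('header', 'true').load(spark_path)
```

The full code is below: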

```
from pyxlsb import open_workbook
import csv
import os
import re
import tempfile
from pyspark.sql.types import StructType, StructField, StringType

sheets = ["Apr-Jun", "Jul-Sep"]
skip_rows = -1
# jsonReader, configFilePath, path and unique_list are helpers defined elsewhere
required_fields_list = jsonReader(configFilePath, "requiredColumns")
excel_file = "/dbfs/{}".format(path)

with open_workbook(excel_file) as wb:
  # sheet names are defined in the config file
  with tempfile.NamedTemporaryFile(mode='w', delete=False) as csvfile:
    wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
    tempDir = os.path.dirname(csvfile.name)
    for sheet in sheets:
      # count is reset per sheet; if it is still 0 at the end,
      # the sheet had no data rows after the header
      second_break = False
      header = []
      # open the worksheet for the current sheet name
      with wb.get_sheet(sheet) as worksheet_name:
        count = 0
        for i, row in enumerate(worksheet_name.rows()):
          lrow = []
          # the first row after skip_rows is read in as the header
          if i == skip_rows + 1:
            header.append([cell.v for cell in row])
          elif i > skip_rows + 1:
            count = count + 1
            if count == 1:
              # clean up the header names
              header = header[0]
              header = [w.replace(' ', '_') if w is not None else w for w in header]
              header = [w.replace('.', '') if w is not None else w for w in header]
              # skip the sheet if it is missing any of the required columns
              if not all(elem in header for elem in required_fields_list):
                second_break = True
                break
              header = unique_list(header)
              colNames = header
              for cell in row:
                lrow.append(cell.v)
              wr.writerow(lrow)
            else:
              if second_break:
                continue
              for cell in row:
                lrow.append(cell.v)
              wr.writerow(lrow)

    # build an all-string schema from the cleaned header and read the CSV back with Spark
    colSchema = StructType([StructField(colNames[i], StringType(), True) for i in range(len(colNames))])
    df = spark.read.format("csv").option("header", "false").schema(colSchema).load(csvfile.name)
```
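
For the delete-after-reading part, my rough plan is the sketch below. It assumes the cached DataFrame fits in memory; because Spark reads lazily, removing the CSV before the data has been materialized would make later actions on `df` fail.

```
import os

# Force the (lazy) CSV read to happen while the file still exists,
# keeping the data in Spark's cache.
df = df.cache()
df.count()

# Now the temporary file backing the DataFrame can be removed.
os.remove(csvfile.name)
```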

0 Answers:

No answers yet.