xlrd数组索引超出第二个文件的范围

时间:2015-03-31 04:53:02

标签: python csv xlrd

我有一个包含5个以上无效CSV文件的目录。我没有问题阅读文件,然后一次一个地写为“好”的CSV文件。但是当我尝试处理第二个文件时,我得到“IndexError:数组索引超出范围”

import xlrd
import csv, sys, os
import datetime, time
import logging
import Gmail_email

program = "CleanCSV"

date = datetime.datetime(1899, 12, 30)

argv0=""
argv1 = 'c:/tmp/checkEmail/' #input directory
argv2 = "f:/foo/in/bar-" #output directory

sys.argv = [argv0, argv1, argv2]

inDir = sys.argv[1]#input directory
outDir = sys.argv[2] #output directory
lList = [] #holder list to hold names of files to be processed

def processFiles():
    try: #Makes list of local files in lDir, Populates lList
        if os.listdir(inDir) == []: #checks for files in lDir
            logging.info('No Files to upload')
            exit()
        else:
            for file_name in os.listdir(inDir):
                #print file_name
                if os.path.isfile(inDir+file_name):
                    lList.append(file_name) # populate local dir list
                    if 'Thumbs.db' in lList: #remove windows thumbs file
                        lList.remove('Thumbs.db')
            logging.info('Files to be checked')
            logging.info('%s', lList )
            #print lList, 'lList'
    except Exception, e:
            Gmail_email.email(e, program)
            logging.warning('Error with local files') 
            logging.warning('%s', e)
            exit() 
    for each in lList: #calls on cleanup method for each file in lLIst
        filePath= inDir+each
        print filePath, "filepath"
        testFile(filePath)

def testFile(filePath):
    try:
        with open(filePath, "rb") as csvfile:
            spamreader= csv.reader(csvfile, delimiter=' ', quotechar='|')
            for row in spamreader:
                #print "good file, most likely"
                pass
    except Exception, e:
        logging.warning('Error with local files') 
        logging.warning('%s', e)
        #print "cleaing bad file", filePath
        cleanBadFile(filePath)

def cleanBadFile(filePath):
    timestr = time.strftime("%Y%m%d-%H%M%S")
    #print "bad file trying to clean"
    f = open(outDir+timestr+".csv", 'ab')
    try: #can i read the file
        workbook = xlrd.open_workbook(filePath)
        #will error here if bad xlrd cannot open it
        print workbook.sheet_names()
        #print workbook
    except Exception, e:
        #print e, " error"
        pass
    worksheet = workbook.sheet_by_name('Sheet')
    num_rows = worksheet.nrows - 1
    num_cells = worksheet.ncols - 1
    #print worksheet.ncols, 'num cells'
    curr_row = -1 
    while curr_row < num_rows: #goes over every row
        num_cells = worksheet.ncols - 1
        curr_row += 1
        row = worksheet.row(curr_row)
        print row, "row"
        curr_cell = -1
        print worksheet.row_len(curr_row), "row len"
        print curr_row, curr_cell, "curr row, curr cell"
        cell_type = worksheet.cell_type(curr_row, curr_cell)
        cell_value = worksheet.cell_value(curr_row, curr_cell)
        print '   ', cell_type, ':', cell_value
        values= []
        if cell_type == 0: #tests if first value in row is data 
            #assuming that good rows will have a value in the first cell of each row
            #if no data row is not copied to new file
            print "bad line"
            pass
        else:
            while curr_cell < num_cells:
                curr_cell += 1
                # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank
                print curr_row, "; ",curr_cell, " row and cell"
                cell_type = worksheet.cell_type(curr_row, curr_cell)
                cell_value = worksheet.cell_value(curr_row, curr_cell)
                #print cell_type, ":", cell_value
                if cell_type == xlrd.XL_CELL_DATE:
                    cell_value=datetime.timedelta(int(cell_value))
                    cell_value = str(date + cell_value)[:10]
                    #print cell_value, "cell value, cell date"

               values.append(cell_value)
            #print values, "values"
            csv.writer(f, delimiter=',',
                        quotechar=',', quoting=csv.QUOTE_MINIMAL).writerow( values )
    f.close()
    print f.closed
    print "ah"
    curr_cell= 0
    curr_row = 0            

#print "checking file:", readFile
processFiles()
#print "exit"
exit

错误消息

Traceback (most recent call last):
  File "F:\cleanCSV.py", line 132, in <module>
    processFiles()
  File "F:\cleanCSV.py", line 51, in processFiles
    testFile(filePath)
  File "F:\cleanCSV.py", line 64, in testFile
    cleanBadFile(filePath)
  File "F:\cleanCSV.py", line 106, in cleanBadFile
    cell_type = worksheet.cell_type(curr_row, curr_cell)
  File "C:\Python27\lib\site-packages\xlrd\sheet.py", line 413, in cell_type
    return self._cell_types[rowx][colx]
IndexError: array index out of range

我觉得我需要“重置”一个计数变量,但我认为我拥有它们。我不知道该怎么做。

2 个答案:

答案 0 :(得分:0)

导致异常curr_cell的行之前的两行被设置为-1,这不能是有效的单元索引。注释某些行进一步向下表明你希望它是行中的第一个单元格,因此索引应该是0而不是-1。

答案 1 :(得分:0)

我将+1(curr_cell+=1)向下移动了3行。

while curr_cell < num_cells:

                # Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank
                #print curr_row, "; ",curr_cell, " row and cell"
                cell_type = worksheet.cell_type(curr_row, curr_cell)
                cell_value = worksheet.cell_value(curr_row, curr_cell)
                print cell_type, ":", cell_value
                curr_cell += 1
                if cell_type == xlrd.XL_CELL_DATE:
                    cell_value=datetime.timedelta(int(cell_value))
                    cell_value = str(date + cell_value)[:10]
                    #print cell_value, "cell value, cell date"