我有一个包含5个以上无效CSV文件的目录。我没有问题阅读文件,然后一次一个地写为“好”的CSV文件。但是当我尝试处理第二个文件时,我得到“IndexError:数组索引超出范围”
import xlrd
import csv, sys, os
import datetime, time
import logging
import Gmail_email
program = "CleanCSV"
date = datetime.datetime(1899, 12, 30)
argv0=""
argv1 = 'c:/tmp/checkEmail/' #input directory
argv2 = "f:/foo/in/bar-" #output directory
sys.argv = [argv0, argv1, argv2]
inDir = sys.argv[1]#input directory
outDir = sys.argv[2] #output directory
lList = [] #holder list to hold names of files to be processed
def processFiles():
try: #Makes list of local files in lDir, Populates lList
if os.listdir(inDir) == []: #checks for files in lDir
logging.info('No Files to upload')
exit()
else:
for file_name in os.listdir(inDir):
#print file_name
if os.path.isfile(inDir+file_name):
lList.append(file_name) # populate local dir list
if 'Thumbs.db' in lList: #remove windows thumbs file
lList.remove('Thumbs.db')
logging.info('Files to be checked')
logging.info('%s', lList )
#print lList, 'lList'
except Exception, e:
Gmail_email.email(e, program)
logging.warning('Error with local files')
logging.warning('%s', e)
exit()
for each in lList: #calls on cleanup method for each file in lLIst
filePath= inDir+each
print filePath, "filepath"
testFile(filePath)
def testFile(filePath):
try:
with open(filePath, "rb") as csvfile:
spamreader= csv.reader(csvfile, delimiter=' ', quotechar='|')
for row in spamreader:
#print "good file, most likely"
pass
except Exception, e:
logging.warning('Error with local files')
logging.warning('%s', e)
#print "cleaing bad file", filePath
cleanBadFile(filePath)
def cleanBadFile(filePath):
timestr = time.strftime("%Y%m%d-%H%M%S")
#print "bad file trying to clean"
f = open(outDir+timestr+".csv", 'ab')
try: #can i read the file
workbook = xlrd.open_workbook(filePath)
#will error here if bad xlrd cannot open it
print workbook.sheet_names()
#print workbook
except Exception, e:
#print e, " error"
pass
worksheet = workbook.sheet_by_name('Sheet')
num_rows = worksheet.nrows - 1
num_cells = worksheet.ncols - 1
#print worksheet.ncols, 'num cells'
curr_row = -1
while curr_row < num_rows: #goes over every row
num_cells = worksheet.ncols - 1
curr_row += 1
row = worksheet.row(curr_row)
print row, "row"
curr_cell = -1
print worksheet.row_len(curr_row), "row len"
print curr_row, curr_cell, "curr row, curr cell"
cell_type = worksheet.cell_type(curr_row, curr_cell)
cell_value = worksheet.cell_value(curr_row, curr_cell)
print ' ', cell_type, ':', cell_value
values= []
if cell_type == 0: #tests if first value in row is data
#assuming that good rows will have a value in the first cell of each row
#if no data row is not copied to new file
print "bad line"
pass
else:
while curr_cell < num_cells:
curr_cell += 1
# Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank
print curr_row, "; ",curr_cell, " row and cell"
cell_type = worksheet.cell_type(curr_row, curr_cell)
cell_value = worksheet.cell_value(curr_row, curr_cell)
#print cell_type, ":", cell_value
if cell_type == xlrd.XL_CELL_DATE:
cell_value=datetime.timedelta(int(cell_value))
cell_value = str(date + cell_value)[:10]
#print cell_value, "cell value, cell date"
values.append(cell_value)
#print values, "values"
csv.writer(f, delimiter=',',
quotechar=',', quoting=csv.QUOTE_MINIMAL).writerow( values )
f.close()
print f.closed
print "ah"
curr_cell= 0
curr_row = 0
#print "checking file:", readFile
processFiles()
#print "exit"
exit
错误消息
Traceback (most recent call last):
File "F:\cleanCSV.py", line 132, in <module>
processFiles()
File "F:\cleanCSV.py", line 51, in processFiles
testFile(filePath)
File "F:\cleanCSV.py", line 64, in testFile
cleanBadFile(filePath)
File "F:\cleanCSV.py", line 106, in cleanBadFile
cell_type = worksheet.cell_type(curr_row, curr_cell)
File "C:\Python27\lib\site-packages\xlrd\sheet.py", line 413, in cell_type
return self._cell_types[rowx][colx]
IndexError: array index out of range
我觉得我需要“重置”一个计数变量,但我认为我拥有它们。我不知道该怎么做。
答案 0 :(得分:0)
导致异常curr_cell
的行之前的两行被设置为-1,这不能是有效的单元索引。注释某些行进一步向下表明你希望它是行中的第一个单元格,因此索引应该是0而不是-1。
答案 1 :(得分:0)
我将+1(curr_cell+=1
)向下移动了3行。
while curr_cell < num_cells:
# Cell Types: 0=Empty, 1=Text, 2=Number, 3=Date, 4=Boolean, 5=Error, 6=Blank
#print curr_row, "; ",curr_cell, " row and cell"
cell_type = worksheet.cell_type(curr_row, curr_cell)
cell_value = worksheet.cell_value(curr_row, curr_cell)
print cell_type, ":", cell_value
curr_cell += 1
if cell_type == xlrd.XL_CELL_DATE:
cell_value=datetime.timedelta(int(cell_value))
cell_value = str(date + cell_value)[:10]
#print cell_value, "cell value, cell date"