Question

我多年没有使用过Python并试图重新使用它。我有一个Input_file（.csv），我想解析并将输出存储在output.csv或.txt

我已经设法用下面的代码解析.csv文件，而且大多数情况下它可以正常工作，但是我无法在不出现以下错误（错误1）的情况下把结果保存到文件（问题1）

 import csv
 import re
 import itertools

 # Asker's first attempt: load PhoneCallData1.txt as space-delimited rows and
 # collect date -> cost pairs in `d`.
 # NOTE(review): the indentation below is inconsistent as pasted, so this does
 # not parse as-is; the comments describe the apparent intent.
 file_name = 'PhoneCallData1.txt'
  try:
   lol = list(csv.reader(open(file_name, 'r'), delimiter=' '))
   count =0
  except:
    print('File cannot be opened:',file_name)
    exit()

  try:
    fout = open('output.txt','w')

  except:
    # NOTE(review): `Print` (capital P) is not defined; reaching this line
    # would raise NameError instead of printing the message.
    Print("File cannot be written to:","OutputFile")
    exit()

 d = dict()

 for item in itertools.chain(lol): # iterates over every row; `item` itself is never used below
  # NOTE(review): the counter is incremented before it is used as an index, so
  # row 0 is never examined and the final pass indexes lol[len(lol)], which
  # raises IndexError.
  count +=1 # counter to keep track of row im looping through

  # NOTE(review): csv.reader yields strings, so this `is None` test is never
  # true; a row with fewer than 4 fields raises IndexError here instead.
  if lol[count][3] is None:
    print("value is not blank")
    count +=1
  else:   
    try:
        check_date = re.search(r'(\d+/\d+/\d+)', lol[count][3]) # match object if field 3 contains digits/digits/digits, else None
    except:
        continue

        # NOTE(review): re.compile's second argument is `flags`, not the text
        # to search, and an unescaped `$` is an end-of-string anchor rather
        # than a literal dollar sign.
        check_cost = re.compile(r'($+\d*)', lol[count][9]) # intended: detect a cost value in field 9
     # NOTE(review): `TRUE` is not a Python name (the constant is `True`), and
     # re.search returns a match object or None, never True.
     if check_date ==TRUE:
        try:
            key =lol[count][3] # the date field becomes the dictionary key
         except ValueError:
            continue
         if check_cost==TRUE:
            value = lol[count][9] # the cost field becomes the value
            d[key] = value 
            print (d[key])
            # fout.write((d[key])

# What if there is no value in the cell?
# I keep getting "IndexError: list index out of range", anyone know why?
# Is there a better way to do this?
# I only want to store the destination and the charge

现在是复杂的部分。我需要解析的文件在所需数据之前和之间有许多不相关的数据行。

数据格式

screenshot

我想做什么; 我想迭代两列数据，只存储其中包含日期或成本的行，而不管其他数据。

import csv
import re
import itertools

# Asker's simplified loop: load PhoneCallData1.txt as space-delimited rows and
# collect date -> cost pairs in `d`.
# NOTE(review): the indentation inside the loop is inconsistent as pasted, so
# this does not parse as-is; the comments describe the apparent intent.
lol = list(csv.reader(open('PhoneCallData1.txt', 'r'), delimiter=' '))
count =0


d = dict()

for item in itertools.chain(lol): # iterates over every row; `item` itself is never used below

 # NOTE(review): the counter is incremented before it is used as an index, so
 # row 0 is never examined and the final pass indexes lol[len(lol)], which
 # raises IndexError. Rows with fewer than 10 fields also raise IndexError on
 # the [9] lookups below.
 count +=1 # counter to keep track of row im looping through
 check_date = re.search(r'(\d+/\d+/\d+)', lol[count][3]) # match object if field 3 contains digits/digits/digits, else None
  # NOTE(review): re.compile's second argument is `flags`, not the text to
  # search, and an unescaped `$` is an end-of-string anchor rather than a
  # literal dollar sign.
  check_cost = re.compile(r'($+\d*)', lol[count][9]) # intended: detect a cost value in field 9
 # NOTE(review): `TRUE` is not a Python name (the constant is `True`), and
 # re.search returns a match object or None, never True.
 if check_date ==TRUE:
     key =lol[count][3] # the date field becomes the dictionary key
     if check_cost==TRUE:
         value = lol[count][9] # the cost field becomes the value
         d[key] = value 
         print (d[key])

#What if there is no value in the cell?
# I keep getting "IndexError: list index out of range", anyone know why?
# Is there a better way to do this?
# I only want to store the destination and the charges

我尝试过的; 我在加载数据后试图对数据进行索引，但这似乎没有用。我创建它只是为了查看超过一定长度的行，但它的代码很糟糕。我希望有更实用和可重用的东西。

import re

# Echo every input line longer than 50 characters to the screen and append it,
# followed by an extra newline, to sample_output.txt.
with open('PhoneCallData1.txt', 'r') as source, open('sample_output.txt', 'w') as sink:
    for record in source:
        if len(record) <= 50:
            continue
        print(record)
        sink.write(record + '\n')


import csv

# Load the tab-delimited export into a list of rows (each row is a list of
# strings). The csv module on Python 3 needs a text-mode file opened with
# newline=''; a file opened with 'rb' yields bytes and csv.reader rejects it.
# The `with` block closes the file as soon as the rows are read.
with open('PhoneCallData1.txt', 'r', newline='') as source:
    lol = list(csv.reader(source, delimiter='\t'))


# Indexing attempt, kept for reference.
# NOTE(review): lol[5] only exists when the file has at least six rows, and
# index [3] only when that row has at least four fields; check len() first to
# avoid "list index out of range".
#d = dict()
#key = lol[5][0]      # cell A7
#value = lol[5][3]    # cell D7
#d[key] = value       # add the entry to the dictionary

仍然不断出现索引超出范围（index out of range）的错误

import re
import csv


# Quick check that the date pattern picks a d/m/y-style date out of free text.
match = re.search(r'(\d+/\d+/\d+)', 'testing date 11/12/2017')
# print is a function on Python 3, as in the other snippets on this page.
print(match.group(1))

尝试使用正则表达式在第一列数据中搜索日期。

注意：我想尝试Pandas，但我觉得我需要从这里开始。任何帮助都会很棒。

Answer 1

是否需要解析下一条记录的判断条件必须明确。我曾经回答过一个类似的问题（answer a similar question）；同样地，有限状态机（finite-state machine）在这里可能会有帮助

主要代码是：

# Skeleton of a two-state finite-state machine for line-by-line parsing.
# NOTE(review): this is an outline, not runnable code. The loop header below
# is only a comment and the `elif` is indented deeper than its `if`.
state = 'init'
output = []
# for each line of the input (loop header left as a placeholder):
    if state == 'init':  # not inside a section yet: look for where parsing starts
         # test whether this line begins the section to parse
         state = 'start'
        elif state == 'start':  # inside a section: parse until it ends
             # parse the line (presumably collecting results in `output`)
             # test whether this line ends the section
             state = 'init'

Answer 2

import csv
import re
import itertools
import timeit
def parse_rows(rows):
    """Build the report lines for the call records found in *rows*.

    A row is treated as a call record when it has 8, 9 or 10 fields. For the
    8- and 9-field layouts the third field must also consist only of digits;
    10-field rows are accepted without that check. The charge is read from
    the last field, with any ``$`` removed before conversion.

    Args:
        rows: Iterable of rows, each a list of strings as produced by
            ``csv.reader``.

    Returns:
        A ``(lines, count, total)`` tuple. ``lines`` holds one
        ``"<field 3> : <charge> : <running total>\\n"`` string per accepted
        row, ``count`` is the number of rows seen (accepted or not), and
        ``total`` is the sum of the charges rounded to two decimals.
    """
    lines = []
    count = 0
    total = 0
    for row in rows:
        count += 1
        width = len(row)
        if width not in (8, 9, 10):
            continue
        if width != 10 and not row[2].isdigit():
            continue
        # In all three layouts the charge is the last field.
        charge = row[-1]
        try:
            amount = float(charge.replace('$', ''))
        except ValueError:
            # Not a number (header text, blank cell, ...): skip the row.
            continue
        # Round after every addition so float noise does not accumulate.
        total = round(total + amount, 2)
        lines.append(row[2] + " : " + charge + " : " + str(total) + "\n")
    return lines, count, total


def main(file_name='PhoneCallData.txt', output_name='output.txt'):
    """Parse *file_name* and write the per-call report to *output_name*.

    The report is echoed to the screen as well. The script exits with status
    1 when the input cannot be read or the output cannot be created.
    """
    start_time = timeit.default_timer()

    try:
        # newline='' lets the csv module do its own newline handling.
        with open(file_name, 'r', newline='') as fin:
            rows = list(csv.reader(fin, delimiter=' '))
    except (OSError, UnicodeDecodeError, csv.Error):
        print('File cannot be opened:', file_name)
        raise SystemExit(1)

    lines, count, total = parse_rows(rows)

    try:
        fout = open(output_name, 'w')
    except OSError:
        print("File cannot be written to:", "OutputFile")
        raise SystemExit(1)

    # The context manager closes the file even if a write fails part-way.
    with fout:
        for line in lines:
            print(line)
            fout.write(line)

        count_string = str(count)
        total_string = str(total)
        # Write to screen
        print(total_string + " Total\n")
        print("Rows parsed :" + count_string)
        # Write to file
        fout.write(count_string + " Rows were parsed\n")
        fout.write(total_string + " Total")
        # Time spent on the task; written directly after "Total", as before.
        string_elapsed = str(round(timeit.default_timer() - start_time, 2))
        fout.write(string_elapsed)
        print(string_elapsed + " seconds")


if __name__ == "__main__":
    main()

Python解析csv文件以存储在文件

数据格式

2 个答案: