我一直试图弄清楚为什么在运行下面的代码后数据库中缺少700条记录中的47条记录。请帮助看看这是Python中的编码错误还是内存限制。
def create_csv_file():
source_html = open(r'C:\\Users\\Admin\\SkyDrive\\eCommerce\\Servi-fied\\Raw Data\\EMA - Electricians (Raw).txt', 'r')
bs_object = BeautifulSoup(source_html, "html.parser")
data_out = open(r'C:\\Users\\Admin\\SkyDrive\\eCommerce\\Servi-fied\\Raw Data\\temp.csv', 'w+')
data_in = open(r'C:\\Users\\Admin\\SkyDrive\\eCommerce\\Servi-fied\\Raw Data\\temp.csv', 'r')
csv_file1 = open(r'C:\\Users\\Admin\\SkyDrive\\eCommerce\\Servi-fied\\Raw Data\\EMA - Electricians (Processed).csv', 'w+')
csv_file2 = open(r'C:\\Users\\Admin\\SkyDrive\\eCommerce\\Servi-fied\\Raw Data\\EMA - Electricians (Processed).csv', 'r')
csv_file3 = open(r'C:\\Users\\Admin\\SkyDrive\\eCommerce\\Servi-fied\\Raw Data\\EMA - Electricians (Processed).csv', 'w+')
writer1 = csv.writer(data_out, delimiter='<', skipinitialspace=True)
table = bs_object.find("table", {"id":"gasOfferSearch"})
rows = table.findAll("tr")
try:
# Iterates through the list, but skips the first record (i.e. the table header)
for row in rows[1:]:
csvRow = []
for cell in row.findAll(['td','th']):
# Replace "\n" with a whitespace; replace <br> tags with 5 whitespaces
line = str(cell).replace('\n', ' ').replace('<br>', ' ')
# Replace 2 or more spaces with "\n"
line = re.sub('\s{2,}', '*', line)
# Converts results to a BeautifulSoup object
line_bsObj = BeautifulSoup(line, "html.parser")
# Strips: Removes all tags and trailing and leading whitespaces
# Replace: Removes all quotation marks
csvRow.append(line_bsObj.get_text().strip().replace('"',''))
# Converts the string into a csv file
writer1.writerow(csvRow)
# Reads from the temp file and replaces all "<*" with "<"
# TODO: Issue - 47 records missing with replacement
temp_string = data_in.read().replace("<*", "<").replace("*\n", "")
csv_file1.write(temp_string)
# Clear the temp_string variable
temp_string = ""
for line in csv_file2.readlines():
temp_string += line.replace("*", "<", 1)
csv_file3.write(temp_string)
finally:
source_html.close()
csv_file1.close()
csv_file2.close()
data_out.close()
data_in.close()
# Remove the temp file
# os.remove('C:\\Users\\Admin\\SkyDrive\\eCommerce\\Servi-fied\\Raw Data\\temp.csv')
return None
答案 0 :(得分:0)
我不确切地知道出了什么问题,但这里有一些一般的建议:
csv_file[1,2,3]
相同)print
命令以仔细检查发生了什么:
for now in rows
之前放一个temp_string = data_in...
附近,以确保这些数字正确