以下代码无法在StringIO对象类型中创建临时csv文件。代码中某处有错误吗? " data_temp"变量保持搅拌空对象。
我正在使用StringIO对象以避免在磁盘上创建另一个文件。
from bs4 import BeautifulSoup
from io import StringIO
import csv
import re
# Creates a new csv file to import data to MySQL
def create_csv_file():
source_html = open(r'C:\\Users\\Admin\\OneDrive\\eCommerce\\Servi-fied\\Raw Data\\EMA - Electricians (Raw).txt', 'r')
bs_object = BeautifulSoup(source_html, "html.parser")
data_temp = StringIO()
csv_file1 = open(r'C:\\Users\\Admin\\OneDrive\\eCommerce\\Servi-fied\\Raw Data\\EMA - Electricians (Processed).csv', 'w+')
writer1 = csv.writer(data_temp, delimiter='<', skipinitialspace=True)
table = bs_object.find("table", {"id":"gasOfferSearch"})
rows = table.findAll("tr")
# Debugging statement
print("There are " + (len(rows) - 1).__str__() + " rows.")
try:
# Iterates through t he list, but skips the first record (i.e. the table header)
counter = 0
for row in rows[1:]:
csvRow = []
for cell in row.findAll(['td','th']):
# Replace "\n" with a whitespace; replace <br> tags with 5 whitespaces
line = str(cell).replace('\n', ' ').replace('<br>', ' ')
# Replace 2 or more spaces with "\n"
line = re.sub('\s{2,}', '*', line)
# Converts results to a BeautifulSoup object
line_bsObj = BeautifulSoup(line, "html.parser")
# Strips: Removes all tags and trailing and leading whitespaces
# Replace: Removes all quotation marks
csvRow.append(line_bsObj.get_text().strip().replace('"',''))
# Converts the string into a csv file
writer1.writerow(csvRow)
print(data_temp.readlines())
counter += 1
# Debugging statement
print("There are " + counter.__str__() + " rows.")
print(data_temp.readlines())
# Reads from the temp file and replaces all "<*" with "<"
csv_file1.write(
data_temp.read().replace("<*", "<").replace("*\n", "").replace("*", "<", 1)
)
finally:
source_html.close()
csv_file1.close()
return None
# Execute the following functions
create_csv_file()
答案 0 :(得分:3)
您正在写入StringIO对象data_temp
,然后立即尝试从中读取:
data_temp = StringIO()
writer1 = csv.writer(data_temp, delimiter='<', skipinitialspace=True)
...
writer1.writerow(csvRow)
print(data_temp.readlines())
在那一刻(以及稍后同上),data_temp
的“文件”指针位于流的末尾。所以你试图读取当前文件的末尾,导致没有数据。
如果你想以这种方式做事,seek
先到date_temp
开始,然后再阅读:
data_temp.seek(0)
result = data_temp.read()
(但是,如果没有深入研究您的代码,我会猜测还有另一种方法可以做你正在做的事情,而无需在临时对象上写入和读取。)