我有以下代码需要一个多小时才能运行。我相信它变慢的原因是附加数据帧是一项占用大量内存的操作。 有什么办法可以改善它,使其在更大的数据集(即> 25MB)上更快?
# Parse Reviews.txt (one record per 9 lines) into a DataFrame.
#
# Performance fix: the original appended one row at a time with
# `df.loc[len(df)] = ...`, which copies/reallocates the frame on every
# insert — quadratic overall, and the reason the script took >1 hour.
# Accumulating plain Python lists and constructing the DataFrame once
# at the end is linear and orders of magnitude faster on a 25MB+ file.
columns = ["X1", "NumHelpfulVotes", "NumVotes", "Product_ID", "Rating",
           "ReviewLength", "numExclamations", "HelpfulFraction"]
filename = "Reviews.txt"

# `with` guarantees the file is closed even if parsing raises.
with open(filename, "r", encoding="latin-1") as file:
    lines = file.readlines()

rows = []  # accumulate raw rows here; cheap appends, no DataFrame churn
# Each new entry starts every 9 lines; enumerate supplies the row index.
for index, i in enumerate(range(0, len(lines), 9)):
    # Product ID: text after the first colon, whitespace stripped.
    product_id = lines[i].split(":", 1)[1].strip()
    # Helpfulness is formatted "numHelpful/numVotes" after the colon.
    helpfulness_data = lines[i + 3].split(":", 1)[1].strip()
    helpful_str, _, votes_str = helpfulness_data.partition("/")
    numHelpful = int(helpful_str)   # number before "/"
    numVotes = int(votes_str)       # number after "/"
    # Star rating: number after the colon.
    rating = float(lines[i + 4].split(":", 1)[1])
    # Review text: everything after the first colon.
    review = lines[i + 7].split(":", 1)[1].strip()
    review_len = len(review)
    num_exclamations = review.count("!")  # count the exclamation marks
    # Guard against division by zero; NaN marks "no votes cast".
    # NOTE: `pd.np.nan` was removed in pandas 1.0 — use float("nan").
    helpful_fraction = numHelpful / numVotes if numVotes else float("nan")
    rows.append([index, numHelpful, numVotes, product_id, rating,
                 review_len, num_exclamations, helpful_fraction])

# One-shot construction: single allocation, proper dtype inference.
df = pd.DataFrame(rows, columns=columns)
df.head()