I am currently trying to build a database of comments and replies from Reddit. I am following this tutorial for the project.
I have gotten to the point where I need to read the Reddit comments file and store it into an SQLite database.
Processing 100k rows currently takes about 1m30s, which I think can be improved. I tried moving the project to my SSD, but that did not improve the speed. I also tried increasing the file buffer, but that did not help either. I can't see another way to read and loop over the file. Can anyone suggest a more efficient approach?
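One way to verify that the read-and-decode loop itself is not the slow part is to time it in isolation, with no database work at all (a minimal sketch; the timeframe value below is just an example, matching whichever RC_ dump is being processed):

import json
from datetime import datetime

timeframe = '2015-01'  # example value only; use the same value as the main script

start = datetime.now()
with open('D:/Projects/Personal/AI/ChatBot/RC_{}'.format(timeframe), buffering=1000) as f:
    for i, row in enumerate(f, 1):
        json.loads(row)  # decode only, no SQLite calls
        if i == 100000:
            break
print('Read + decode of 100k rows: {}'.format(datetime.now() - start))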
import json
from datetime import datetime

# timeframe, start_row, row_counter and paired_rows are initialized earlier in the
# script; format_data, acceptable, find_parent, find_existing_score and the
# sql_insert_* helpers are defined elsewhere, as in the tutorial.
with open('D:/Projects/Personal/AI/ChatBot/RC_{}'.format(timeframe), buffering=1000) as f:
    for row in f:
        row_counter += 1
        if row_counter > start_row:
            try:
                row = json.loads(row)
                parent_id = row['parent_id'].split('_')[1]
                body = format_data(row['body'])
                created_utc = row['created_utc']
                score = row['score']
                comment_id = row['id']
                subreddit = row['subreddit']
                parent_data = find_parent(parent_id)

                existing_comment_score = find_existing_score(parent_id)
                if existing_comment_score:
                    # A reply to this parent is already stored; replace it if this one scores higher.
                    if score > existing_comment_score:
                        if acceptable(body):
                            sql_insert_replace_comment(comment_id, parent_id, parent_data, body, subreddit, created_utc, score)
                else:
                    if acceptable(body):
                        if parent_data:
                            if score >= 2:
                                sql_insert_has_parent(comment_id, parent_id, parent_data, body, subreddit, created_utc, score)
                                paired_rows += 1
                        else:
                            sql_insert_no_parent(comment_id, parent_id, body, subreddit, created_utc, score)
            except Exception as e:
                print(str(e))

            # Progress report every 100k rows.
            if row_counter % 100000 == 0:
                print('Total Rows Read: {}, Paired Rows: {}, Time: {}'.format(row_counter, paired_rows, str(datetime.now())))
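For reference, the lookup helpers follow the tutorial: each call runs its own SELECT against a parent_reply table through a module-level sqlite3 cursor. A rough sketch of the two lookups (table, cursor, and query shapes follow the tutorial; this is an approximation, not my exact code):

import sqlite3

connection = sqlite3.connect('{}.db'.format(timeframe))  # as in the tutorial
c = connection.cursor()

def find_parent(pid):
    # Return the stored body of the parent comment, or False if we haven't seen it.
    try:
        c.execute("SELECT comment FROM parent_reply WHERE comment_id = ? LIMIT 1", (pid,))
        result = c.fetchone()
        return result[0] if result else False
    except Exception:
        return False

def find_existing_score(pid):
    # Return the score of an already-stored reply to this parent, or False if none.
    try:
        c.execute("SELECT score FROM parent_reply WHERE parent_id = ? LIMIT 1", (pid,))
        result = c.fetchone()
        return result[0] if result else False
    except Exception:
        return False

So every input row costs at least two SELECTs, plus an INSERT when the comment is kept, which I suspect is where most of the 1m30s goes rather than in the file read itself.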
{"subreddit":"SeattleWA","body":"They may just pull out of the Seattle market completely, at least until they have autonomous vehicles.","stickied":false,"gilded":0,"score":1,"author":"music4mic","link_id":"t3_5l79wh","retrieved_on":1485679711,"subreddit_id":"t5_2vbli","controversiality":0,"author_flair_css_class":null,"author_flair_text":null,"created_utc":1483228800,"distinguished":null,"parent_id":"t1_dbts1i6","id":"dbumnq7","edited":false}