我有一个约有50场演出的json文件,我已经对它进行了一些处理,并且不得不在中途停止,我在第16000000条目中,我无法承担开始处理的问题。开始是否有任何方法可以获得上述行并从那里开始?我试图让我的数据库上的最后一个条目说comment_id在那里搜索并从那里开始处理但是无法有效地执行此操作。
UPDATE 所以我在那里有一个while循环,但问题是即使有一个while循环,插入过程仍在进行中。如何阻止它执行,直到满足所述while循环条件。
import sqlite3
import json
from datetime import datetime
import time
timeframe = '2017-10'
sql_transaction = []
start_row = 0
cleanup = 1000000
connection = sqlite3.connect('{}.db'.format(timeframe))
c = connection.cursor()
def create_table():
c.execute("""CREATE TABLE IF NOT EXISTS
parent_reply(parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT, comment TEXT, subreddit TEXT, unix INT,
score INT)""")
def format_data(data):
data = data.replace('\n',' newlinechar ').replace('\r',' newlinechar ').replace('"',"'")
return data
def transaction_bldr(sql):
global sql_transaction
sql_transaction.append(sql)
if len(sql_transaction) > 1000:
c.execute('BEGIN TRANSACTION')
for s in sql_transaction:
try:
c.execute(s)
except:
pass
connection.commit()
sql_transaction = []
def sql_insert_replace_comment(commentid,parentid,parent,comment,subreddit,time,score):
try:
sql = """UPDATE parent_reply SET parent_id = ?, comment_id = ?, parent = ?, comment = ?, subreddit = ?, unix = ?, score = ?
WHERE parent_id =?;""".format(parentid, commentid, parent, comment, subreddit, int(time), score, parentid)
transaction_bldr(sql)
except Exception as e:
print('s0 insertion',str(e))
def sql_insert_has_parent(commentid,parentid,parent,comment,subreddit,time,score):
try:
sql = """INSERT INTO parent_reply (parent_id, comment_id, parent, comment, subreddit, unix, score)
VALUES ("{}","{}","{}","{}","{}",{},{});""".format(parentid, commentid, parent, comment, subreddit, int(time), score)
transaction_bldr(sql)
except Exception as e:
print('s0 insertion',str(e))
def sql_insert_no_parent(commentid,parentid,comment,subreddit,time,score):
try:
sql = """INSERT INTO parent_reply (parent_id, comment_id, comment, subreddit, unix, score)
VALUES ("{}","{}","{}","{}",{},{});""".format(parentid, commentid, comment, subreddit, int(time), score)
transaction_bldr(sql)
except Exception as e:
print('s0 insertion',str(e))
def acceptable(data):
if len(data.split(' ')) > 1000 or len(data) < 1:
return False
elif len(data) > 32000:
return False
elif data == '[deleted]':
return False
elif data == '[removed]':
return False
else:
return True
def find_parent(pid):
try:
sql = "SELECT comment FROM parent_reply WHERE comment_id = '{}' LIMIT 1".format(pid)
c.execute(sql)
result = c.fetchone()
if result != None:
return result[0]
else: return False
except Exception as e:
#print(str(e))
return False
def find_existing_score(pid):
try:
sql = "SELECT score FROM parent_reply WHERE parent_id = '{}' LIMIT 1".format(pid)
c.execute(sql)
result = c.fetchone()
if result != None:
return result[0]
else: return False
except Exception as e:
#print(str(e))
return False
if __name__ == '__main__':
create_table()
row_counter = 0
paired_rows = 0
with open('F:\pydata\RC_2017-10', buffering=1000) as f:
for row in f:
#print(row)
#time.sleep(555)
row_counter += 1
if row_counter > start_row:
try:
row = json.loads(row)
#print(row)
parent_id = row['parent_id'].split('_')[1]
body = format_data(row['body'])
created_utc = row['created_utc']
score = row['score']
comment_id = row['id']
subreddit = row['subreddit']
parent_data = find_parent(parent_id)
existing_comment_score = find_existing_score(parent_id)
while True:
#print('dasfdjasf;ldjasfdasflj')
if (comment_id == 'dnqil6j'):
print('true')
#print('asdfadsfasdf')
break
if existing_comment_score:
print('score')
#print('going for score bro')
if score > existing_comment_score:
if acceptable(body):
#print('inserting')
sql_insert_replace_comment(comment_id,parent_id,parent_data,body,subreddit,created_utc,score)
else:
if acceptable(body):
if parent_data:
if score >= 2:
sql_insert_has_parent(comment_id,parent_id,parent_data,body,subreddit,created_utc,score)
paired_rows += 1
else:
sql_insert_no_parent(comment_id,parent_id,body,subreddit,created_utc,score)
except Exception as e:
print(str(e))
if row_counter % 100000 == 0:
print('checking from here')
print('Total Rows Read: {}, Paired Rows: {}, Time: {}'.format(row_counter, paired_rows, str(datetime.now())))
if row_counter > start_row:
if row_counter % cleanup == 0:
print("Cleanin up!")
sql = "DELETE FROM parent_reply WHERE parent IS NULL"
c.execute(sql)
connection.commit()
c.execute("VACUUM")
connection.commit()