PostgreSQL, Python 3.5, SQLAlchemy
I have a main database with a journals table containing 30k+ journals. For each journal I have an RSS feed URL that I can parse to fetch new articles. If some required fields are missing from an article, there is a third-party API I can call to retrieve additional information about that article. I need to write a Python script that runs continuously and loops over the journals retrieved from the main DB several times a day.
The retrieved articles need to be validated before being saved into the FeedResponse DB. All valid articles are stored in the articles table; failed articles are saved in an error table together with the error message, and in the end only the successful articles are sent back to the main database.
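The articles and error tables of the FeedResponse DB are not shown below; a rough sketch of what they could look like, based only on the description above (all column names beyond journal_id and the error message are my guesses):

from sqlalchemy import Column, Integer, String, Text
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class Article(Base):
    __tablename__ = 'article'
    id = Column(Integer, primary_key=True, autoincrement=True)
    journal_id = Column(Integer)   # journal the article was fetched for
    title = Column(String)         # hypothetical required field
    url = Column(String)           # hypothetical required field

class Error(Base):
    __tablename__ = 'error'
    id = Column(Integer, primary_key=True, autoincrement=True)
    journal_id = Column(Integer)
    error_message = Column(Text)   # why validation failed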
Now I need some help designing a solution for this. So far I have:
There will be multiple worker threads that retrieve data from the main DB and parse the RSS feeds to get the list of articles, then validate the articles and call the third-party API where needed. After cleaning and validation, all articles retrieved for that journal are saved into the FeedResponse DB.
My problem:
For brevity I have only mentioned two of the parsers, but there are several others as well.
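Since there are several parsers, it may help to give them a common shape. A minimal sketch of such an interface; the RssParser and ApiParser signatures are inferred from the code below, and the base class is my own addition:

class BaseParser(object):
    """Common interface: every parser produces article data."""
    def parse(self):
        raise NotImplementedError

class RssParser(BaseParser):
    def __init__(self, rss_url, journal_id):
        self.rss_url = rss_url
        self.journal_id = journal_id

    def parse(self):
        # fetch and parse the feed, return the list of articles
        pass

class ApiParser(BaseParser):
    def __init__(self, article):
        self.article = article

    def parse(self):
        # enrich the article with data from the third-party API
        pass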
My implementation
I have a Process table in the FeedResponse DB that is used to identify which journals the worker threads need to pick up. Its schema is as follows:
class Process(Base):
    __tablename__ = 'process'
    id = Column(Integer, primary_key=True, autoincrement=True)
    worker_id = Column(Integer)
    journal_id = Column(Integer)
    time_started = Column(DateTime(timezone=True), nullable=True)
    time_finished = Column(DateTime(timezone=True), nullable=True)
    is_finished = Column(Boolean)
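For completeness, a minimal declarative setup that the model above assumes, with the Process class defined after Base; the engine URL and names here are placeholders, not from the original:

from sqlalchemy import create_engine, Column, Integer, Boolean, DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()
# hypothetical connection string for the FeedResponse DB
engine = create_engine('postgresql://user:password@localhost/feedresponse')
Session = sessionmaker(bind=engine)
# create the process table if it does not exist yet
Base.metadata.create_all(engine)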
manage_worker.py
import sys

from worker import Worker  # the Worker class defined in worker.py

worker_pool = []

def start_workers():
    count = int(sys.argv[1])
    if count > 1:
        for i in range(count):
            worker = Worker(i)
            worker_pool.append(worker)
    else:
        w = Worker(0, True)
        worker_pool.append(w)
    for w in worker_pool:
        w.start()
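A minimal entry point (not shown in the original) so the worker count can be passed on the command line, e.g. python manage_worker.py 4:

if __name__ == '__main__':
    start_workers()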
worker.py
import datetime
import threading
import time
from threading import Thread

from sqlalchemy import func

# RssParser, ApiParser, validate_articles and the db session object are
# assumed to be importable from elsewhere in the project

class Worker(Thread):
    def __init__(self, worker_id, run_single=False):
        self.worker_id = worker_id
        self.current_job_journal_id = None
        self.current_job_row_id = None
        self.current_highest_id = 0
        self.is_working = False
        self.run_single = run_single
        threading.Thread.__init__(self)

    def run(self):
        self.connect_db()  # start a session with both DBs: main DB and FeedResponse DB
        self.start_work()
    def start_work(self):
        while True:
            if not self.is_working:
                self.is_working = True
                # the max journal id from the main DB
                max_journal_id = self.get_max_journal_id()
                # tuple of (row id, journal id) for the row just inserted
                # into the process table in the FeedResponse DB
                new_process_row_id, new_journal_id = self.create_new_job(max_journal_id[0])
                self.current_job_journal_id = new_journal_id
                self.current_job_row_id = new_process_row_id
                # the journal row from the main DB for the new journal id
                journal_data = self.get_journal_data(new_journal_id)
                self.parse(journal_data, self.handle_is_finished)
    def parse(self, journal_data, handle_is_finished):
        time.sleep(2)
        articles = RssParser(journal_data.rss_url, journal_data.id)
        for i, a in enumerate(articles):
            if a.data_incomplete:  # placeholder check: article is missing required fields
                # fetch the missing data from the third-party API and
                # update the entry in the list of articles
                articles[i] = ApiParser(a)
        validate_articles(articles)
        handle_is_finished()
    def create_new_job(self, max_journal_id, last_call_id=None):
        if last_call_id is None:
            # get the latest parse date from the FeedResponse DB,
            # then the journal id of the process row started on that date
            max_date = db.session.query(func.max(Process.time_started)).scalar()
            max_process = db.session.query(Process).filter(Process.time_started == max_date).first()
            if max_process is not None:
                max_process_journal_id = max_process.journal_id
            else:
                max_process_journal_id = 0
        else:
            # if last_call_id has a value, use it as the latest processed journal id
            max_process_journal_id = last_call_id
        # move on to the next journal
        max_process_journal_id += 1
        # wrap around to 1 once we pass the highest journal id in the main DB
        if max_process_journal_id > max_journal_id:
            max_process_journal_id = 1
        # insert the new process row into the FeedResponse DB;
        # str.format is tolerable here only because all values are integers,
        # bound parameters would be safer
        new_max_process_id = db.session.execute(
            '''
            INSERT INTO process (worker_id, journal_id, time_started, time_finished, is_finished)
            SELECT
                {} as worker_id,
                CASE
                    WHEN {} <= {} THEN {}
                    ELSE 1
                END as journal_id,
                clock_timestamp() as time_started,
                null as time_finished,
                'FALSE' as is_finished
            RETURNING id
            '''.format(self.worker_id,
                       max_process_journal_id,
                       max_journal_id,
                       max_process_journal_id)).first().id
        return (new_max_process_id, max_process_journal_id)
    def handle_is_finished(self):
        current_process = db.session.query(Process).filter(Process.id == self.current_job_row_id).first()
        current_process.time_finished = datetime.datetime.now()
        current_process.is_finished = True
        db.session.commit()  # persist the finished state before releasing the worker
        self.current_job_journal_id = None
        self.current_job_row_id = None
        self.disconnect_db()
        self.is_working = False
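Finally, connect_db and disconnect_db are referenced above but never shown. A possible sketch, assuming one session per worker per database; everything here except the two method names is my assumption, and note that the code above instead goes through a shared db.session, so the two approaches would need to be reconciled:

    def connect_db(self):
        # open one independent session per worker for each database;
        # main_engine and feed_engine are assumed module-level engines,
        # sessionmaker comes from sqlalchemy.orm
        self.main_session = sessionmaker(bind=main_engine)()
        self.feed_session = sessionmaker(bind=feed_engine)()

    def disconnect_db(self):
        self.main_session.close()
        self.feed_session.close()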