我从Gerrit
API提取提交数据,提交号在226,000范围内。我必须在每次提交时向端点发出请求,这需要花费很长时间才能理解。我想知道如何才能最好地将线程应用到我当前的流程中。
我有两个类,一个Project
类,它向下钻取并检索与之关联的所有提交,并将它们保存为Commit
对象,其中包含随后循环执行所需的所有信息。得到与之关联的json
。我将它们全部拉到一个大的列表中,然后迭代调用get_data
和write_data
方法。
class Project(object):
def __init__(self, name):
self.name = name
self.commits = []
def add_commits(self, changes_list):
for change in changes_list:
change_id=change['change_id'],
revision_list=change['revisions']
self.commits.extend([Commit(rid, change_id)
for rid in revision_list.keys()])
def return_results(self, ger_obj, start=0):
self.ger = ger_obj
while True:
endpoint = (r'/changes/?q=project:{project}&o=ALL_REVISIONS&'
r'S={num}'.format(
project=self.name,
num=start
))
logging.info('Endpoint: {}'.format(endpoint))
try:
changes = ger_obj.get(endpoint)
self.add_commits(changes_list=changes)
except HTTPError:
break
start += 500
try:
if not changes[-1].get('_more_changes'):
break
except IndexError:
break
class Commit(object):
def __init__(self, rev_id, change_id):
self.rev_id = rev_id
self.change_id = change_id
def get_data(self, ger_obj):
endpoint = (r'/changes/{c_id}/revisions/{r_id}/commit'.format(
c_id=self.change_id[0],
r_id=self.rev_id
))
try:
self.data = ger_obj.get(endpoint)
except HTTPError as e:
logging.warning('Endpoint: {} did not return data'.format(
endpoint
))
else:
self.data['commitid'] = self.data.get('commit')
self.data['name'] = self.data.get('committer')['name']
self.data['email'] = self.data.get('committer')['email']
self.data['date'] = self.data.get('committer')['date']
hash = md5()
hash.update(json.dumps(self.data).encode('utf-8'))
self.data['etl_checksum_md5'] = hash.hexdigest()
self.data['etl_process_status'] = ETL_PROCESS_STATUS
self.data['etl_datetime_local'] = ETL_DATETIME_LOCAL
self.data['etl_pdi_version'] = ETL_PDI_VERSION
self.data['etl_pdi_build_version'] = ETL_PDI_BUILD_VERSION
self.data['etl_pdi_hostname'] = ETL_PDI_HOSTNAME
self.data['etl_pdi_ipaddress'] = ETL_PDI_IPADDRESS
self.data['message'] = self.data['message'].replace('\n', ' ').replace('|', '[pipe]')
def write_data(self, writer):
writer.writerow(self.data)
我认为实现线程的最佳位置是我在列表中拥有所有提交并准备迭代它们:
projects = [Project(value['id']) for value in project_data.values()]
for project in projects[:10]:
if project.name in bad_names.keys():
project.name = bad_names[project.name]
project.return_results(rest)
all_commits.extend(project.commits)
fieldnames = get_fieldnames(
'ods_gerrit.staging_gerrit_commits',
REDSHIFT_POSTGRES_INFO)
with open('testfile.csv', 'wb') as outf:
writer = DictWriter(
outf,
fieldnames=fieldnames,
extrasaction='ignore',
delimiter='|'
)
# Implement Threading?
for commit in all_commits:
commit.get_data(rest)
try:
commit.write_data(writer=writer)
except AttributeError:
continue
except Exception:
print commit.data, 'caused an exception.'
continue
我已经阅读了一些线程教程,并且不确定如何正确执行此操作。我特别担心由于锁定不当而覆盖数据。