Python: request URLs in parallel

Date: 2016-10-10 22:23:14

Tags: python parallel-processing python-requests

I created the following script to download images from an API endpoint, and it works as expected. The thing is that it is quite slow, because all the requests have to wait on each other. What is the correct way to still execute the steps synchronously for each item I want to fetch, but run the items themselves in parallel? The data comes from an online service called servicem8, so what I hope to achieve is:

  • Fetch all possible job IDs => keep the name and other info
  • Fetch the customer name
  • Fetch every attachment for the job

These three steps should be done for every job. So I could parallelize per job, because the jobs don't have to wait on each other.

Update

The problem: I don't understand how to make sure the three calls for each item are bundled together as one unit, so that the items are the only thing executed in parallel. For example, when I want to:

  • fetch an item (get name => get description => get id)

So is it the fetching of the items that I should run in parallel?

My existing code works fine, but it is slow:

import requests
import dateutil.parser
import shutil
import os

user = "test@test.com"
passw = "test"

print("Read json")
url = "https://api.servicem8.com/api_1.0/job.json"
r = requests.get(url, auth=(user, passw))

print("finished reading jobs.json file")
scheduled_jobs = []
if r.status_code == 200:
    for item in r.json():
        scheduled_date = item['job_is_scheduled_until_stamp']
        try:
            parsed_date = dateutil.parser.parse(scheduled_date)
            if (parsed_date.year, parsed_date.month, parsed_date.day) == (2016, 10, 10):
                url_customer = "https://api.servicem8.com/api_1.0/Company/{}.json".format(item['company_uuid'])
                c = requests.get(url_customer, auth=(user, passw))
                cus_name = c.json()['name']
                scheduled_jobs.append([item['uuid'], item['generated_job_id'], cus_name])

        except ValueError:
            pass

    for job in scheduled_jobs:
        print("fetch for job {}".format(job))
        url = "https://api.servicem8.com/api_1.0/Attachment.json?%24filter=related_object_uuid%20eq%20{}".format(job[
                                                                                                                 0])
        r = requests.get(url, auth=(user, passw))
        attachments = r.json()  # may be an empty list, in which case the loop simply does nothing
        for attachment in attachments:
            if attachment['active'] == 1 and attachment['file_type'] != '.pdf':
                print("fetch for attachment {}".format(attachment))
                url_staff = "https://api.servicem8.com/api_1.0/Staff.json?%24filter=uuid%20eq%20{}".format(
                    attachment['created_by_staff_uuid'])
                s = requests.get(url_staff, auth=(user, passw))
                for staff in s.json():
                    tech = "{}_{}".format(staff['first'], staff['last'])

                url = "https://api.servicem8.com/api_1.0/Attachment/{}.file".format(attachment[
                                                                                    'uuid'])
                r = requests.get(url, auth=(user, passw), stream=True)
                if r.status_code == 200:
                    creation_date = dateutil.parser.parse(
                        attachment['timestamp']).strftime("%d.%m.%y")
                    if not os.path.exists(os.getcwd() + "/{}/{}".format(job[2], job[1])):
                        os.makedirs(os.getcwd() + "/{}/{}".format(job[2], job[1]))
                    path = os.getcwd() + "/{}/{}/SC -O {} {}{}".format(
                        job[2], job[1], creation_date, tech.upper(), attachment['file_type'])
                    print("writing file to path {}".format(path))
                    with open(path, 'wb') as f:
                        r.raw.decode_content = True
                        shutil.copyfileobj(r.raw, f)
else:
    print(r.text)
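
One way to do what the question asks is a thread pool: each worker executes the three steps for one job sequentially, while the pool runs several jobs at once. Below is a minimal sketch along those lines, not a drop-in replacement: the date filtering and file download from above would still need to slot into process_job, and max_workers=5 is an arbitrary choice.

import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

user = "test@test.com"
passw = "test"

def process_job(item):
    # the steps for one job stay sequential: customer first, then attachments
    url_customer = "https://api.servicem8.com/api_1.0/Company/{}.json".format(item['company_uuid'])
    cus_name = requests.get(url_customer, auth=(user, passw)).json()['name']
    url = "https://api.servicem8.com/api_1.0/Attachment.json?%24filter=related_object_uuid%20eq%20{}".format(item['uuid'])
    attachments = requests.get(url, auth=(user, passw)).json()
    return item['uuid'], cus_name, attachments

r = requests.get("https://api.servicem8.com/api_1.0/job.json", auth=(user, passw))

# the jobs themselves run in parallel because they don't depend on each other
with ThreadPoolExecutor(max_workers=5) as pool:
    futures = [pool.submit(process_job, item) for item in r.json()]
    for future in as_completed(futures):
        uuid, cus_name, attachments = future.result()
        print(uuid, cus_name, len(attachments))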

Update [14/10]: I updated the code below following the hints that were given. Many thanks for that. The only thing left to optimize is the attachment download, but it works fine for now. A funny thing I learned along the way: you cannot create a folder named CON on a Windows machine :-) didn't know that.

I'm also using pandas to try to avoid some of the looping over my lists of dicts, but I'm not sure whether I'm as efficient as possible yet. The longest part is actually reading the full json files. I read them in completely because I couldn't find an API way to tell the API to only return my jobs from September 2016. The API query functionality seems to support eq / lt / gt.
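
For reference, this is the kind of server-side date filter I was looking for; a hedged sketch only, since I haven't verified that the $filter parameter accepts gt/lt on timestamp fields or this date format - only the uuid eq filter used in my code is confirmed to work:

# hypothetical date-range filter; the field name, operator and date format
# are assumptions, not verified against the ServiceM8 API
url = "https://api.servicem8.com/api_1.0/job.json?%24filter=job_is_scheduled_until_stamp%20gt%20'2016-09-01'"
r = requests.get(url, auth=(user, passw), headers=headers)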

import requests
import dateutil.parser
import shutil
import os
import pandas as pd

user = ""
passw = ""

FOLDER = os.getcwd()
headers = {"Accept-Encoding": "gzip, deflate"}

import grequests
urls = [
    'https://api.servicem8.com/api_1.0/job.json',
    'https://api.servicem8.com/api_1.0/Attachment.json',
    'https://api.servicem8.com/api_1.0/Staff.json',
    'https://api.servicem8.com/api_1.0/Company.json'
]

#Create a set of unsent Requests:

print("Read json files")
rs = (grequests.get(u, auth=(user, passw), headers=headers) for u in urls)
#Send them all at the same time:
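# note: map returns the responses in the same order as the urls list; a
# failed request comes back as None, so the unpacking below assumes all
# four calls succeeded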
jobs,attachments,staffs,companies = grequests.map(rs)

#create dataframes
df_jobs = pd.DataFrame(jobs.json())
df_attachments = pd.DataFrame(attachments.json())
df_staffs = pd.DataFrame(staffs.json())
df_companies = pd.DataFrame(companies.json())

# (the per-item requests from the first version are replaced by the four
# bulk grequests calls above)

scheduled_jobs = []

if jobs.status_code == 200:
    print("finished reading json file")
    for job in jobs.json():
        scheduled_date = job['job_is_scheduled_until_stamp']
        try:
            parsed_date = dateutil.parser.parse(scheduled_date)
            if parsed_date.year == 2016 and parsed_date.month == 9:
                cus_name = df_companies[df_companies.uuid == job['company_uuid']].iloc[0]['name'].upper()
                cus_name = cus_name.replace('/', '')  # avoid slashes in folder names
                scheduled_jobs.append([job['uuid'], job['generated_job_id'], cus_name])

        except ValueError:
            pass
    print("{} jobs to fetch".format(len(scheduled_jobs)))

    for job in scheduled_jobs:
        print("fetch for job attachments {}".format(job))
        #url = "https://api.servicem8.com/api_1.0/Attachment.json?%24filter=related_object_uuid%20eq%20{}".format(job[0])

        if attachments == []:
            pass
        for attachment in attachments.json():
            if attachment['related_object_uuid'] == job[0]:
                if attachment['active'] == 1 and attachment['file_type'] != '.pdf' and attachment['attachment_source'] != 'INVOICE_SIGNOFF':
                    for staff in staffs.json():
                        if staff['uuid'] == attachment['created_by_staff_uuid']:
                            tech = "{}_{}".format(
                                staff['first'].split()[-1].strip(), staff['last'])

                    creation_timestamp = dateutil.parser.parse(
                        attachment['timestamp'])
                    creation_date = creation_timestamp.strftime("%d.%m.%y")
                    creation_time = creation_timestamp.strftime("%H_%M_%S")

                    path = FOLDER + "/{}/{}/SC_-O_D{}_T{}_{}{}".format(
                        job[2], job[1], creation_date, creation_time, tech.upper(), attachment['file_type'])

                    # fetch attachment

                    if not os.path.isfile(path):
                        url = "https://api.servicem8.com/api_1.0/Attachment/{}.file".format(attachment[
                                                                                            'uuid'])
                        r = requests.get(url, auth=(user, passw), stream = True)
                        if r.status_code == 200:
                            if not os.path.exists(FOLDER + "/{}/{}".format(job[2], job[1])):
                                os.makedirs(
                                    FOLDER + "/{}/{}".format(job[2], job[1]))

                            print("writing file to path {}".format(path))
                            with open(path, 'wb') as f:
                                r.raw.decode_content = True
                                shutil.copyfileobj(r.raw, f)
                    else:
                        print("file already exists")
else:
    print(jobs.text)
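
The attachment downloads at the end of the loop are the part that is still sequential. One way they might be parallelized is to collect the (url, path) pairs in the loop instead of downloading inline, and hand them to a thread pool afterwards. A minimal sketch, where the downloads list and download_attachment are assumed helpers, not code from above:

import os
import shutil
from concurrent.futures import ThreadPoolExecutor

import requests

user = ""
passw = ""

def download_attachment(task):
    # task is a (url, path) pair prepared by the filtering loop above;
    # each path is assumed to include a directory component
    url, path = task
    r = requests.get(url, auth=(user, passw), stream=True)
    if r.status_code == 200:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
    return path

# downloads would be collected in the loop above instead of fetching inline
downloads = []  # list of (url, path) tuples

with ThreadPoolExecutor(max_workers=5) as pool:
    for finished in pool.map(download_attachment, downloads):
        print("finished {}".format(finished))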

1 Answer:

Answer 0 (score: 0)

The general idea is to use asynchronous URL requests, and there is a Python module for that called grequests - https://github.com/kennethreitz/grequests

From the docs:

import grequests
urls = [
    'http://www.heroku.com',
    'http://python-tablib.org',
    'http://httpbin.org',
    'http://python-requests.org',
    'http://fakedomain/',
    'http://kennethreitz.com'
]
#Create a set of unsent Requests:
rs = (grequests.get(u) for u in urls)
#Send them all at the same time:
grequests.map(rs)

And the responses:

[<Response [200]>, <Response [200]>, <Response [200]>, <Response [200]>, None, <Response [200]>]
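
The None in that list is what grequests.map returns for the request that failed (http://fakedomain/). map also accepts a size argument to cap how many requests run concurrently, and an exception_handler callback to see why something failed:

def on_error(request, exception):
    # called instead of silently mapping a failed request to None
    print("{} failed: {}".format(request.url, exception))

responses = grequests.map(rs, size=5, exception_handler=on_error)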