I'm trying to upload 100,000 data points to a web service backend. If I run them one at a time, it takes about 12 hours. The service supports up to 20 API calls simultaneously. How can I run these POSTs concurrently so the import goes faster?
def AddPushTokens():
    import requests
    import csv
    import json

    count = 0
    tokenList = []
    apikey = "12345"
    restkey = "12345"
    URL = "https://api.web.com/1/install/"
    headers = {'content-type': 'application/json', 'Application-Id': apikey, 'REST-API-Key': restkey}

    with open('/Users/name/Desktop/push-new.csv', 'rU') as csvfile:
        deviceTokens = csv.reader(csvfile, delimiter=',')
        for token in deviceTokens:
            deviceToken = token[0].replace("/", "")
            deviceType = "ios"
            pushToken = "pushtoken_" + deviceToken
            payload = {"deviceType": deviceType, "deviceToken": deviceToken, "channels": ["", pushToken]}
            r = requests.post(URL, data=json.dumps(payload), headers=headers)
            count = count + 1
            print "Count: " + str(count)
            print r.content
EDIT: I'm trying to use concurrent.futures. Where I'm confused is how to set this up so it pulls each token from the CSV and passes it to load_url? Also, I want to make sure that it runs through the first 20 requests, then picks up at 21 and runs the next set of 20.
import concurrent.futures
import requests

URLS = ['https://api.web.com/1/installations/',
        'https://api.web.com/1/installations/',
        'https://api.web.com/1/installations/',
        'https://api.web.com/1/installations/',
        'https://api.web.com/1/installations/',
        'https://api.web.com/1/installations/',
        'https://api.web.com/1/installations/',
        'https://api.web.com/1/installations/',
        'https://api.web.com/1/installations/',
        'https://api.web.com/1/installations/',
        'https://api.web.com/1/installations/',
        'https://api.web.com/1/installations/',
        'https://api.web.com/1/installations/',
        'https://api.web.com/1/installations/',
        'https://api.web.com/1/installations/',
        'https://api.web.com/1/installations/',
        'https://api.web.com/1/installations/',
        'https://api.web.com/1/installations/',
        'https://api.web.com/1/installations/',
        'https://api.web.com/1/installations/']
apikey="12345"
restkey="12345"
URL="https://api.web.com/1/installations/"
headers={'content-type': 'application/json','X-web-Application-Id': apikey,'X-web-REST-API-Key':restkey}
with open('/Users/name/Desktop/push-new.csv','rU') as csvfile:
deviceTokens=csv.reader(csvfile, delimiter=',')
for token in deviceTokens:
deviceToken=token[0].replace("/","")
deviceType="ios"
pushToken="pushtoken_"+deviceToken
payload={"deviceType": deviceType,"deviceToken":deviceToken,"channels":["",pushToken]}
r = requests.post(URL, data=json.dumps(payload), headers=headers)
# Retrieve a single page and report the url and contents
def load_url(token):
    URL = 'https://api.web.com/1/installations/'
    deviceToken = token[0].replace("/", "")
    deviceType = "ios"
    pushToken = "pushtoken_" + deviceToken
    payload = {"deviceType": deviceType, "deviceToken": deviceToken, "channels": ["", pushToken]}
    r = requests.post(URL, data=json.dumps(payload), headers=headers)
    count = count + 1
    print "Count: " + str(count)
    print r.content
# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
    # Start the load operations and mark each future with its URL
    future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%r page is %d bytes' % (url, len(data)))
EDIT: Updated based on the comments below
import concurrent.futures
import requests
import csv
import json

apikey = "ldy0eSCqPz9PsyOLAt35M2b0XrfDZT1NBW69Z7Bw"
restkey = "587XASjEYdQwH2UHruA1yeZfT0oX7uAUJ8kWTmE3"
URL = "https://api.parse.com/1/installations/"
headers = {'content-type': 'application/json', 'X-Parse-Application-Id': apikey, 'X-Parse-REST-API-Key': restkey}

with open('/Users/jgurwin/Desktop/push/push-new.csv', 'rU') as csvfile:
    deviceTokens = csv.reader(csvfile, delimiter=',')
    for device in deviceTokens:
        token = device[0].replace("/", "")

# Retrieve a single page and report the url and contents
def load_url(token):
    count = 0
    deviceType = "ios"
    pushToken = "pushtoken_" + token
    payload = {"deviceType": deviceType, "deviceToken": token, "channels": ["", pushToken]}
    r = requests.post(URL, data=json.dumps(payload), headers=headers)
    count = count + 1
    print "Count: " + str(count)
    print r.content

# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
    # Start the load operations and mark each future with its URL
    future_to_token = {executor.submit(load_url, token, 60): token for token in deviceTokens}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%r page is %d bytes' % (url, len(data)))
Answer 0 (score: 4)
The easy way to do this is with threads. The nearly-as-easy way is with gevent or a similar library (and grequests even ties gevent and requests together, so you don't have to figure out how to do so). The hard way is building an event loop (or, better, using something like Twisted or Tulip) and multiplexing the requests yourself.
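For a sense of the nearly-as-easy route, here's roughly what the grequests version looks like — a minimal sketch, not drop-in code, assuming the URL and headers from your question, that tokens is the list of device tokens read from your CSV, and with make_payload as an illustrative helper that isn't part of your original code:

import json
import grequests  # third-party library gluing gevent and requests together

URL = "https://api.web.com/1/installations/"
headers = {'content-type': 'application/json',
           'X-web-Application-Id': "12345",
           'X-web-REST-API-Key': "12345"}

# Hypothetical helper wrapping the payload-building logic from the question
def make_payload(token):
    return json.dumps({"deviceType": "ios", "deviceToken": token,
                       "channels": ["", "pushtoken_" + token]})

# Build unsent requests, then let grequests fire them through a gevent
# pool; size=20 caps concurrency at 20 in-flight requests.
reqs = [grequests.post(URL, data=make_payload(t), headers=headers) for t in tokens]
for r in grequests.map(reqs, size=20):
    print r.content

grequests.map returns the responses in request order, with None in place of any request that failed.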
But let's do it the easy way.
You don't want to run 100,000 threads at once. Besides the fact that it would take hundreds of GB of stack space, and your CPU would spend more time context-switching than running actual code, the service only supports 20 connections at a time. So, you want 20 threads.
So, how do you run 100,000 tasks on 20 threads? With a thread pool executor (or a bare thread pool).
The example in the concurrent.futures docs does almost exactly what you want, except that it does GETs instead of POSTs, and uses urllib instead of requests. Just change the load_url function to something like this:
def load_url(token):
    deviceToken = token[0].replace("/", "")
    # … your original code here …
    r = requests.post(URL, data=json.dumps(payload), headers=headers)
    return r.content
…and the example will work as-is.
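Put together, the whole thing comes out to something like this — a sketch, not a definitive implementation, assuming the CSV layout, URL, and placeholder keys from your question:

import concurrent.futures
import csv
import json
import requests

apikey = "12345"
restkey = "12345"
URL = "https://api.web.com/1/installations/"
headers = {'content-type': 'application/json',
           'X-web-Application-Id': apikey,
           'X-web-REST-API-Key': restkey}

def load_url(token):
    payload = {"deviceType": "ios", "deviceToken": token,
               "channels": ["", "pushtoken_" + token]}
    r = requests.post(URL, data=json.dumps(payload), headers=headers)
    return r.content

# Read every token up front so the pool isn't iterating over an open file.
with open('/Users/name/Desktop/push-new.csv', 'rU') as csvfile:
    tokens = [row[0].replace("/", "") for row in csv.reader(csvfile)]

with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
    # Submit one task per token; the pool keeps 20 in flight at a time.
    future_to_token = {executor.submit(load_url, t): t for t in tokens}
    for future in concurrent.futures.as_completed(future_to_token):
        token = future_to_token[future]
        try:
            print "%s uploaded: %s" % (token, future.result())
        except Exception as exc:
            print "%s generated an exception: %s" % (token, exc)

Note that the pool doesn't run requests in rigid batches of 20: as soon as any of the first 20 finishes, its thread picks up token 21, which keeps all 20 connections busy and is almost certainly what you want.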
Since you're using Python 2.x, you don't have the concurrent.futures module in the stdlib; you'll need the backport, futures.
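The backport lives on PyPI under the name futures, so installing it is one line, after which import concurrent.futures works unchanged on 2.x:

$ pip install futures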
In Python (at least CPython), only one thread at a time can do any CPU work. If your tasks spend far more time downloading over the network (I/O work) than building requests and parsing responses (CPU work), that's not a problem. But if that isn't true, you'll want to use processes instead of threads. That only requires replacing the ThreadPoolExecutor in the example with a ProcessPoolExecutor.
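The swap is mechanical — a sketch, with the caveat that load_url has to stay a picklable, top-level function for a process pool:

# Identical pattern; only the executor class changes. Work now runs in
# worker processes, so CPU-bound work no longer serializes on the GIL.
with concurrent.futures.ProcessPoolExecutor(max_workers=20) as executor:
    future_to_token = {executor.submit(load_url, t): t for t in tokens}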
If you want to do this entirely in the 2.7 stdlib, the thread and process pools built into multiprocessing make it nearly trivial. See Using a pool of workers and the Process Pools API, and see multiprocessing.dummy if you want to use threads instead of processes.
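With multiprocessing.dummy — a thread-backed clone of the multiprocessing API — the pool version of the upload is just a few lines; a sketch, reusing the load_url and tokens from above:

from multiprocessing.dummy import Pool  # same API as multiprocessing.Pool, but threads

pool = Pool(20)                       # 20 worker threads
results = pool.map(load_url, tokens)  # blocks until every upload has finished
pool.close()
pool.join()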
Answer 1 (score: 0)
Possibly overkill, but you might want to take a look at Celery.
tasks.py could be:
from celery import Celery
import requests

app = Celery('tasks', broker='amqp://guest@localhost//')

apikey = "12345"
restkey = "12345"
URL = "https://api.web.com/1/install/"
headers = {'content-type': 'application/json', 'Application-Id': apikey, 'REST-API-Key': restkey}

f = open('upload_data.log', 'a+')

@app.task
def upload_data(data, count):
    r = requests.post(URL, data=data, headers=headers)
    f.write("Count: %d\n%s\n\n" % (count, r.content))
Start the Celery workers:
$ celery -A tasks worker --loglevel=info -c 20
Then in another script:
import tasks

def AddPushTokens():
    import csv
    import json

    count = 0
    tokenList = []

    with open('/Users/name/Desktop/push-new.csv', 'rU') as csvfile:
        deviceTokens = csv.reader(csvfile, delimiter=',')
        for token in deviceTokens:
            deviceToken = token[0].replace("/", "")
            deviceType = "ios"
            pushToken = "pushtoken_" + deviceToken
            payload = {"deviceType": deviceType, "deviceToken": deviceToken, "channels": ["", pushToken]}
            r = tasks.upload_data.delay(json.dumps(payload), count)
            count = count + 1
NOTE: The above code is a sample. You may have to modify it for your requirements.