我用过这个脚本
from twisted.internet import reactor, threads
from urlparse import urlparse
import httplib
import itertools
concurrent = 200
finished=itertools.count(1)
reactor.suggestThreadPoolSize(concurrent)
def getStatus(ourl):
url = urlparse(ourl)
conn = httplib.HTTPConnection(url.netloc)
conn.request("HEAD", url.path)
res = conn.getresponse()
return res.status
def processResponse(response,url):
print response, url
processedOne()
def processError(error,url):
print "error", url#, error
processedOne()
def processedOne():
if finished.next()==added:
reactor.stop()
def addTask(url):
req = threads.deferToThread(getStatus, url)
req.addCallback(processResponse, url)
req.addErrback(processError, url)
added=0
for url in open('urllist.txt'):
added+=1
addTask(url.strip())
try:
reactor.run()
except KeyboardInterrupt:
reactor.stop()
当我尝试运行脚本$ python test.py
时它只是打印url而不是cUrl或发送HTTP请求..
我怎样才能为每个
发送HTTP或cURL流程由于
答案 0 :(得分:0)
使用inlineCallbacks
和deferToThread
测试代码。同时使用defer.gatherResults
知道何时处理了所有延迟(而不是OP中的计数器方法):
from twisted.internet import reactor, defer, utils
from twisted.internet.threads import deferToThread
from urlparse import urlparse
import httplib
threadDeferred = deferToThread.__get__
@threadDeferred
def get_url_head(url_arg):
url = urlparse(url_arg)
conn = httplib.HTTPConnection(url.netloc)
conn.request("HEAD", url.path)
res = conn.getresponse()
conn.close()
return res.status
@defer.inlineCallbacks
def check_url(sem,url_arg):
yield sem.acquire()
try:
result = yield get_url_head(url_arg)
defer.returnValue(result)
finally:
sem.release()
@defer.inlineCallbacks
def run(reactor,SEMAPHORE_SIZE=10):
sem = defer.DeferredSemaphore(SEMAPHORE_SIZE)
deferreds = []
failed_urls = []
responded_urls = []
with open('urllist.txt','r') as f:
for line in f:
url_arg = line.strip()
d = check_url(sem,url_arg)
d.addCallback(processResult,url_arg,responded_urls).addErrback(processErr,url_arg,failed_urls)
deferreds.append(d)
res = yield defer.gatherResults(deferreds)
# Do something else with failed_urls and responded_urls
reactor.callLater(0,reactor.stop)
def main():
from twisted.internet import reactor
reactor.callWhenRunning(run,reactor)
reactor.run()
def processResult(result,url_arg,responded_urls):
print "Reponse %s from %s" % (result,url_arg)
responded_urls.append((url_arg,result))
def processErr(err,url_arg,failed_urls):
print "Error checking %s: %s" % (url_arg,repr(err.value))
failed_urls.append((url_arg,err.value))
if __name__ == '__main__':
main()
答案 1 :(得分:0)
如果您的网址格式不包含' http://'然而, 如果他们确实包含' http://'在评论中有一个解决方案
import httplib
def requester(url):
host = url.split('/')[0]
#if urls do contain 'http://' --> host = url.split('/')[2].replace('http://','')
req = url[url.find(host)+len(host):]
conn = httplib.HTTPConnection(host)
conn.request("HEAD","/"+req)
response = conn.getresponse()
print response.status, response.reason
#if you want data...
#data = response.read()
#print data
for url in open(urls.txt):
try:
requester(url)
except Error,e:
print Error, e
此外,我建议您查看httplib