在 Python 中发起数千个 GET 请求的最佳方法

时间:2016-10-01 20:51:15

标签: python networking concurrency queue

我正在编写一个 Python 脚本:它接受一个 URL 列表作为参数,对每个 URL 执行 GET 请求,然后用 XPath 搜索返回的内容,以识别网站的指纹。当列表中只有大约 50 个站点时,脚本运行得非常顺利;但站点再多一些,程序就会越来越慢,直至完全停滞(通常在 150 个站点左右)。请向下滚动到主程序逻辑(Main App Logic)及其相关代码处。目前我只取数组中的前 50 个元素,这样运行正常,但超过这个数量,整个程序就会卡住。如有任何建议,不胜感激!

#!/usr/bin/python
# Web Scraper
# 1.0

# Imports for file
from multiprocessing.dummy import Pool as ThreadPool 
from threading import Thread
from Queue import Queue
from lxml import html
import requests
import time
import sys

# Get Raw HTML
def scrape(url):
    """Fetch ``url`` and parse the response body into an lxml HTML tree.

    Args:
        url: Absolute URL to request.

    Returns:
        The root element produced by ``html.fromstring`` when the response
        status equals ``requests.codes.ok``; otherwise ``False`` (non-OK
        status, network error, timeout, or a body that cannot be parsed).
    """
    try:
        # requests.get() manages its own short-lived session internally, so
        # there is nothing for this function to open or close.
        # NOTE: the timeout bounds the connection attempt and each socket
        # read, not the total time spent downloading the response.
        page = requests.get(url, timeout=2.0)
        if page.status_code != requests.codes.ok:
            return False
        return html.fromstring(page.content)
    except Exception:
        # Best-effort by design: a failure on one site is reported as
        # "no tree" so that a single bad URL cannot kill a worker thread.
        # Catching Exception (rather than everything) still lets
        # KeyboardInterrupt and SystemExit propagate.
        return False


# Format URL
def format_url(url):
    """Normalize a site address so it can be requested over HTTP(S).

    Surrounding whitespace (including a stray ``\\r`` from Windows line
    endings) is removed, ``http://`` is prepended when the address does not
    already start with an HTTP or HTTPS scheme, and a single trailing slash
    is dropped.

    Args:
        url: Site address, with or without a scheme.

    Returns:
        The normalized address as a string.
    """
    url = url.strip()
    # Only a *leading* scheme counts; "http://" appearing later (for example
    # inside a query string) must not suppress the prefix, and an existing
    # "https://" must not get a second scheme stacked in front of it.
    if not url.lower().startswith(("http://", "https://")):
        url = "http://" + url
    if url.endswith("/"):
        url = url[:-1]
    return url


# Check if WordPress Site
def check_wordpress(tree):
    """Report whether the page loads any script from a ``wp-content`` path.

    Args:
        tree: Parsed document exposing an lxml-style ``xpath`` method.

    Returns:
        ``True`` if at least one ``<script>`` element has a ``src``
        attribute containing ``wp-content``, otherwise ``False``.
    """
    wp_scripts = tree.xpath("//script[contains(@src,'wp-content')]")
    return bool(wp_scripts)



# Check WordPress Version
def wordpress_version(tree):
    """Extract the WordPress version advertised by the generator meta tag.

    Only the first ``<meta name="generator">`` content value is inspected.

    Args:
        tree: Parsed document exposing an lxml-style ``xpath`` method.

    Returns:
        The second whitespace-separated token of the generator string when
        its first token is exactly ``WordPress`` and a second token exists;
        otherwise the integer ``0``.
    """
    generators = tree.xpath("//meta[@name='generator']/@content")
    if not generators:
        return 0

    tokens = generators[0].split()
    if len(tokens) > 1 and tokens[0] == "WordPress":
        return tokens[1]
    return 0

# Find Contact Page
def find_contact_page(tree, base_url=None):
    """Print the first link on the page whose text mentions "Contact".

    The lookup tries, in order, ``<a>`` elements whose own text contains
    "Contact", then the parents of matching ``<span>`` elements, then the
    parents of matching ``<p>`` elements, and stops at the first query that
    yields an ``href``.  Nothing is printed when no query matches.

    Args:
        tree: Parsed document exposing an lxml-style ``xpath`` method.
        base_url: Site address prepended to root-relative links (those that
            start with ``/`` and contain no ``#``).  When omitted, the
            module-level ``url`` variable is used instead, which keeps
            existing one-argument calls working.

    Returns:
        None.  The link, if any, is written to standard output.
    """
    queries = (
        "//a[contains(text(),'Contact')]/@href",
        "//span[contains(text(),'Contact')]/../@href",
        "//p[contains(text(),'Contact')]/../@href",
    )
    contact = []
    for query in queries:
        contact = tree.xpath(query)
        if contact:
            break
    if not contact:
        return

    link = contact[0]
    # startswith() is safe on an empty href, whereas indexing link[0] is not.
    if '#' not in link and link.startswith('/'):
        if base_url is None:
            # Legacy fallback: rely on the module-level ``url`` name.
            base_url = url
        link = base_url + link
    print(link)


# Juicer method
def juice(url):
    """Fingerprint one site and return a one-line report for it.

    The address is normalized with ``format_url``, fetched with ``scrape``,
    and, when a tree is available, checked for WordPress markers.

    Args:
        url: Site address as read from the input list.

    Returns:
        A string made of the normalized address, a tab-padded separator and
        one of ``No XML tree``, ``Not WordPress`` or ``WordPress: <version>``.
    """
    url = format_url(url)
    tree = scrape(url)
    # scrape() signals failure with the False singleton, so test identity
    # rather than equality or truthiness of the returned object.
    if tree is False:
        return url + " \t\t\t No XML tree"
    if not check_wordpress(tree):
        return url + " \t\t\t Not WordPress"
    return url + " \t\t\t WordPress: " + str(wordpress_version(tree))


# Main App Logic Below ------------------------------------->


# Open list of websites from given argument, one address per line.
# Blank lines (such as the empty entry after a trailing newline) are skipped
# so they are never queued as URLs, and the file is closed once read.
with open(sys.argv[1], 'r') as url_file:
    urls = [line.strip() for line in url_file if line.strip()]

# Upper bound on how many sites a single run processes; set to None to
# process every address in the file.
max_sites = 50
batch = urls[:max_sites]


# Juice url
def juice_url():
    """Worker loop: take addresses off the queue and print their report.

    Runs forever; intended to be the target of a daemon thread.  Each queue
    item is always acknowledged with ``task_done()``, even when processing
    raises, because an unacknowledged item would make ``q.join()`` block
    forever.  Failures are reported on standard error and the loop carries
    on with the next address.
    """
    while True:
        url = q.get()
        try:
            print(juice(url))
        except Exception as exc:
            sys.stderr.write("%s \t\t\t Error: %r\n" % (url, exc))
        finally:
            q.task_done()


# Create concurrent queues
concurrent = 50
q = Queue(concurrent)
for i in range(concurrent):
    t = Thread(target=juice_url)
    t.daemon = True
    t.start()


# Add URL to Queue
time1 = time.time()
for url in batch:
    q.put(url)
q.join()

# Calculate total time; the average is taken over the sites actually
# processed, and an empty batch reports zero instead of dividing by zero.
total = time.time() - time1
if batch:
    average = total / len(batch)
else:
    average = 0.0
print("Total Time: %f" % total)
print("Average Time: %f" % average)

0 个答案:

没有答案