现在我正在处理一个python脚本,它接受一个url列表作为参数,然后对每个url执行GET请求,然后使用xpath搜索输出以指纹网站。当列表大约有50个站点时,它似乎就像一个魅力,但之后的任何事情都会导致程序减速到停止的程度(通常大约150个站点)。向下滚动到您看到主app逻辑及其相关代码的位置。现在我只是在数组中使用50个元素并且它工作正常,但是之后的任何事情都会使整个程序停止。任何建议将不胜感激!
#!/usr/bin/python
# Web Scraper
# 1.0
# Imports for file
from multiprocessing.dummy import Pool as ThreadPool
from threading import Thread
from Queue import Queue
from lxml import html
import requests
import time
import sys
# Get Raw HTML
def scrape(url):
try:
page = requests.get(url, timeout=2.0)
if page.status_code == requests.codes.ok:
html_page = html.fromstring(page.content)
s =requests.session()
s.close()
return html_page
else:
s =requests.session()
s.close()
return False
except:
s =requests.session()
s.close()
return False
# Format URL
def format_url(url):
if url.find("http://") == -1:
url = "http://"+url
if url[-1] == "/":
url = url[:-1]
return url
# Check if WordPress Site
def check_wordpress(tree):
scripts = tree.xpath("//script[contains(@src,'wp-content')]")
if len(scripts) > 0:
return True
return False
# Check WordPress Version
def wordpress_version(tree):
type = tree.xpath("//meta[@name='generator']/@content")
version = 0
if len(type) > 0:
details = type[0].split()
if len(details)>1 and details[0] == "WordPress":
if len(details) > 1:
version = details[1]
else:
version = type[0]
return version
# Find Contact Page
def find_contact_page(tree):
contact = tree.xpath("//a[contains(text(),'Contact')]/@href")
try_xpath = 1
while len(contact) == 0:
if try_xpath == 1:
contact = tree.xpath("//span[contains(text(),'Contact')]/../@href")
elif try_xpath == 2:
contact = tree.xpath("//p[contains(text(),'Contact')]/../@href")
elif try_xpath == 3:
break
try_xpath+=1
if len(contact) > 0:
contact = contact[0]
if contact.find('#') == -1:
if contact[0] == '/':
contact = url + "" + contact
print contact
# Juicer method
def juice(url):
url = format_url(url)
string = url
tree = scrape(url)
if tree == False:
return string + " \t\t\t No XML tree"
elif check_wordpress(tree) == True:
version = wordpress_version(tree)
return string + " \t\t\t WordPress: " + str(version)
else:
return string + " \t\t\t Not WordPress"
# Main App Logic Below ------------------------------------->
# Open list of websites from given argument
list = open(sys.argv[1],'r').read().split('\n')
# Juice url
def juice_url():
while True:
url = q.get()
result = juice(url)
print result
q.task_done()
# Create concurrent queues
concurrent = 50
q = Queue(concurrent)
for i in range(concurrent):
t = Thread(target=juice_url)
t.daemon = True
t.start()
# Add URL to Queue
time1 = time.time()
for url in list[0:50]:
q.put(url)
q.join()
# Calculate total time
total = time.time() - time1
print "Total Time: %f" % total
print "Average Time: %f" % (total/50)