I am clearly missing something here. I have been working on the same project for several days now. Stepping through it bit by bit, everything seemed to work fine. Then I added the part of main() that actually creates the comparison list, and suddenly it started throwing a "cannot pop from empty list" error at me, even though a print call I placed right before the pop() clearly shows that the list is not empty. Any idea what I am doing wrong? And will this monstrosity actually work the way I want it to? This is my first time working with threads. Here is the code in its entirety:
import urllib
import urllib2
import sys
from lxml.html import parse, tostring, fromstring
from urlparse import urlparse
import threading
class Crawler(threading.Thread):
    def __init__(self):
        self.links = []
        self.queue = []
        self.mal_list = []
        self.count = 0
        self.mal_set = set(self.mal_list)
        self.crawled = []
        self.crawled_set = set(self.crawled)
        self.links_set = set(self.links)
        self.queue.append(sys.argv[1])
        self.queue_set = set(self.queue)

    def run(self, max_depth):
        print(self.queue)
        while self.count < max_depth:
            tgt = self.queue.pop(0)
            if tgt not in self.mal_set:
                self.crawl(tgt)
            else:
                print("Malicious Link Found: {0}".format(tgt))
                continue
        sys.exit("Finished!")

    def crawl(self, tgt):
        url = urlparse(tgt)
        self.crawled.append(tgt)
        try:
            print("Crawling {0}".format(tgt))
            request = urllib2.Request(tgt)
            request.add_header("User-Agent", "Mozilla/5,0")
            opener = urllib2.build_opener()
            data = opener.open(request)
            self.count += 1
        except:
            return
        doc = parse(data).getroot()
        for tag in doc.xpath("//a[@href]"):
            old = tag.get('href')
            fixed = urllib.unquote(old)
            self.links.append(fixed)
        self.queue_links(self.links_set, url)

    def queue_links(self, links, url):
        for link in links:
            if link.startswith('/'):
                link = "http://" + url.netloc + "/" + link
            elif link.startswith('#'):
                continue
            elif link.startswith('http'):
                link = 'http://' + url.netloc + '/' + link
            if link.decode('utf-8') not in self.crawled_set:
                self.queue.append(link)

    def make_mal_list(self):
        """
        Open various malware and phishing related blacklists and create a list
        of URLs from which to compare to the crawled links
        """
        hosts1 = "hosts.txt"
        hosts2 = "MH-sitelist.txt"
        hosts3 = "urls.txt"
        with open(hosts1) as first:
            for line1 in first.readlines():
                link = "http://" + line1.strip()
                self.mal_list.append(link)
        with open(hosts2) as second:
            for line2 in second.readlines():
                link = "http://" + line2.strip()
                self.mal_list.append(link)
        with open(hosts3) as third:
            for line3 in third.readlines():
                link = "http://" + line3.strip()
                self.mal_list.append(link)


def main():
    crawler = Crawler()
    crawler.make_mal_list()
    crawler.run(25)

if __name__ == "__main__":
    main()
Answer 0 (score: 4)
First of all, I got a bit lost reading your code, so before anything else, a few remarks:
Too many instance variables. You don't need to create a new instance variable just to hold the set() of another one, as in this line: self.mal_set = set(self.mal_list) (a small sketch of the alternative follows these remarks).
And you repeat that same pattern several times.
If you want to use threading, then actually use it: as written, your code only ever creates a single thread. You should create something like 10 threads, each handling a batch of URLs to fetch, and don't forget to coordinate the work between them with a Queue.Queue.
EDIT: Ah, I forgot: indent your code :)
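For instance, here is a minimal sketch of what I mean about the sets (the helper name and usage line are made up, not your code): build the blacklist as one set, once, instead of keeping both a list and a mirrored set as instance variables.

# Hypothetical helper, just to illustrate the idea: one set, built once.
def load_blacklist(paths):
    mal_set = set()
    for path in paths:
        with open(path) as blacklist_file:
            for line in blacklist_file:
                mal_set.add("http://" + line.strip())
    return mal_set

# e.g. mal_set = load_blacklist(["hosts.txt", "MH-sitelist.txt", "urls.txt"])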
Now, about your problem:
Where do you ever fill self.queue? I don't see it anywhere. You only call make_mal_list(), which does nothing but initialize self.mal_list, so by the time you run your thread I think it's clear why self.queue is empty and you can't pop() from it, right?
EDIT 2:
I think your real case is more involved (the blacklists and all that), but you could start with something like this:
import threading
import Queue
import sys
import urllib
import urllib2
from lxml.html import parse
from urlparse import urlparse

THREAD_NUMBER = 10


class Crawler(threading.Thread):

    def __init__(self, queue, mal_urls):
        self.queue = queue
        self.mal_list = mal_urls
        self.crawled_set = set()  # per-thread record of already-crawled URLs
        threading.Thread.__init__(self)  # I forgot, thanks seriyPS :)

    def run(self):
        while True:
            # Grabs a url to fetch from the queue.
            url = self.queue.get()
            if url not in self.mal_list:
                self.crawl(url)
            else:
                print "Malicious Link Found: {0}".format(url)
            # Signals to the queue that the job is done.
            self.queue.task_done()

    def crawl(self, tgt):
        try:
            url = urlparse(tgt)
            print("Crawling {0}".format(tgt))
            request = urllib2.Request(tgt)
            request.add_header("User-Agent", "Mozilla/5,0")
            opener = urllib2.build_opener()
            data = opener.open(request)
        except:  # TODO: catch the explicit exceptions: URLError, ValueError, ...
            return
        self.crawled_set.add(tgt)
        doc = parse(data).getroot()
        for tag in doc.xpath("//a[@href]"):
            old = tag.get('href')
            fixed = urllib.unquote(old)
            # I don't think you need this, but maybe I'm mistaken.
            # self.links.append(fixed)
            # Add more URLs to the queue.
            self.queue_links(fixed, url)

    def queue_links(self, link, url):
        """I guess this method allows recursive download of the URLs that are
        extracted from the fetched web pages ????
        """
        # I changed the argument, so now "link" is just one URL.
        if link.startswith('/'):
            link = "http://" + url.netloc + "/" + link
        elif link.startswith('#'):
            return
        elif link.startswith('http'):
            link = 'http://' + url.netloc + '/' + link
        # Add URLs extracted from the HTML text to the queue so they get fetched.
        if link.decode('utf-8') not in self.crawled_set:
            self.queue.put(link)


def get_make_mal_list():
    """Open various malware and phishing related blacklists and create a list
    of URLs from which to compare to the crawled links
    """
    hosts1 = "hosts.txt"
    hosts2 = "MH-sitelist.txt"
    hosts3 = "urls.txt"
    mal_list = []
    with open(hosts1) as first:
        for line1 in first:
            link = "http://" + line1.strip()
            mal_list.append(link)
    with open(hosts2) as second:
        for line2 in second:
            link = "http://" + line2.strip()
            mal_list.append(link)
    with open(hosts3) as third:
        for line3 in third:
            link = "http://" + line3.strip()
            mal_list.append(link)
    return mal_list


def main():
    queue = Queue.Queue()
    # Get malicious URLs.
    mal_urls = set(get_make_mal_list())
    # Create THREAD_NUMBER threads and start them.
    for i in xrange(THREAD_NUMBER):
        cr = Crawler(queue, mal_urls)
        cr.daemon = True  # so the program can exit after queue.join()
        cr.start()
    # Put every URL that should be fetched in the queue.
    for url in sys.argv[1:]:
        queue.put(url)
    # Wait on the queue until everything has been processed.
    queue.join()

if __name__ == '__main__':
    main()
Answer 1 (score: 2)
A small off-topic note:
class Crawler(threading.Thread):
    def __init__(self):
        # your code
        threading.Thread.__init__(self)  # !!!
If you override __init__ in your Thread subclass, don't forget to call threading.Thread.__init__(self) yourself.
And of course you should use the Queue class from http://docs.python.org/library/queue.html to organize the job queue in a thread-safe way.
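For what it's worth, here is a minimal self-contained sketch of both points (the Worker class and the toy jobs are made up, not part of the question): a Thread subclass that calls threading.Thread.__init__ from its own __init__ and takes its work from a Queue.Queue.

import threading
import Queue  # "queue" in Python 3

class Worker(threading.Thread):
    def __init__(self, jobs):
        threading.Thread.__init__(self)  # don't forget the base initializer
        self.jobs = jobs

    def run(self):
        while True:
            item = self.jobs.get()   # blocks until a job is available
            print "processing", item
            self.jobs.task_done()    # tell the queue this job is finished

jobs = Queue.Queue()
for _ in xrange(3):
    worker = Worker(jobs)
    worker.daemon = True             # let the program exit after join()
    worker.start()
for item in ["a", "b", "c"]:
    jobs.put(item)
jobs.join()                          # wait until every job has been processed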
Answer 2 (score: 1)
My main language is C#, but the problem you are running into is a threading issue. Thread #1 checks that the list is not empty while thread #2 empties that list, and that is why you receive the exception.
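In Python terms, the race looks roughly like this (a sketch with made-up names, only to show the failure mode): the emptiness check and the pop() are two separate steps, so another thread can empty the list in between; holding a lock around both steps (or using a Queue) removes that window.

import threading

links = ["http://example.com"]
links_lock = threading.Lock()

def take_unsafe():
    # Another thread may pop the last item between the check and the pop.
    if links:
        return links.pop(0)
    return None

def take_safe():
    # The lock makes the check and the pop a single atomic step.
    with links_lock:
        if links:
            return links.pop(0)
        return None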
Answer 3 (score: 0)
list is not thread-safe. If you need a thread-safe data structure, use Queue.Queue (Python 2.x) or queue.Queue (Python 3.x).
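As a small sketch of what that swap could look like (the variable names here are made up): Queue.Queue does the locking for you and raises Queue.Empty instead of IndexError when nothing is left.

import Queue  # "queue" in Python 3

work = Queue.Queue()
work.put("http://example.com")

try:
    url = work.get(timeout=1)  # thread-safe; waits up to one second
except Queue.Empty:
    url = None                 # nothing left to process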
Answer 4 (score: 0)
Also, have a look at this snippet:
print(self.queue)
while self.count < max_depth:
    tgt = self.queue.pop(0)
You only print(self.queue) before the first iteration of the while loop, so self.queue.pop() can run through many iterations (and fetch many links) and only raise "cannot pop from empty list" once the queue really is empty!
Try this:
while self.count < max_depth:
    print(self.queue)
    tgt = self.queue.pop(0)
so you can see the state of the queue at the moment the exception is actually raised.
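Along the same lines, you could also catch the IndexError itself so the failure point is visible; here is a standalone sketch with stand-in variables (not the poster's attributes):

queue = ["http://example.com"]
count, max_depth = 0, 25

while count < max_depth:
    try:
        tgt = queue.pop(0)
    except IndexError:
        # The queue really was empty at this point.
        print("queue empty after {0} pages".format(count))
        break
    count += 1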