I am using a set class variable to keep track of the page links that have already been crawled by my web crawler. When I try to run my program, it tells me the set is a NoneType and cannot be iterated over.
class Creeper:
    project_name = ''
    base_url = ''
    domain_name = ''
    queue_file = ''
    crawled_file = ''
    queue = set()
    crawled = set()

    def __init__(self, base_url, project_name, domain_name):
        Creeper.project_name = project_name
        Creeper.base_url = base_url
        Creeper.domain_name = domain_name
        Creeper.queue_file = Creeper.project_name + '/queue.txt'
        Creeper.crawled_file = Creeper.project_name + '/crawled.txt'
        self.boot()
        self.crawl_page('First creeper', Creeper.base_url)

    @staticmethod
    def crawl_page(thread_name, page_url):
        if page_url not in Creeper.crawled:
            print(thread_name + ' now crawling ' + page_url)
            print('Queue: ' + str(len(Creeper.queue)) + ' | Crawled: ' + str(len(Creeper.crawled)))
            Creeper.queue_links(Creeper.gather_links(page_url))
            Creeper.queue.remove(page_url)
            Creeper.crawled.add(page_url)
            Creeper.update_files()
The if statement in crawl_page is where the problem occurs. The page_url being passed in should not be in the set initially, so the body of the if should execute; instead, I get the NoneType error.
Here is the current error message I am getting:
Traceback (most recent call last):
  File "/Users/courtmiddleton/Desktop/PycharmProjects/creepyCrawler/main.py", line 15, in <module>
    Creeper(HOMEPAGE, PROJECT_NAME, DOMAIN_NAME)
  File "/Users/courtmiddleton/Desktop/PycharmProjects/creepyCrawler/creeper.py", line 23, in __init__
    self.crawl_page('First creeper', Creeper.base_url)
  File "/Users/courtmiddleton/Desktop/PycharmProjects/creepyCrawler/creeper.py", line 34, in crawl_page
    if page_url not in Creeper.crawled:
TypeError: argument of type 'NoneType' is not iterable
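
For reference, the last line of the traceback is exactly what Python raises when the right-hand side of in is None rather than a set, i.e. Creeper.crawled is None at the moment the membership test runs. A two-line illustration:

# A membership test against None produces the same error as in the traceback above.
crawled = None
'https://example.com' in crawled  # TypeError: argument of type 'NoneType' is not iterable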
Answer 0 (score: 0)
Don't use @staticmethod, and replace every Creeper.x with self.x. You also need to add self to the method definitions, as shown below. I haven't tested your code, but at least at the design level this should be fine: with the Creeper.x class attributes, the attributes and the methods don't work together properly, and the .crawled attribute of Creeper stays an empty set from the moment the class is instantiated. @CharlesDuffy also points this out in his comment.
class Creeper:
    project_name = ''
    base_url = ''
    domain_name = ''
    queue_file = ''
    crawled_file = ''
    queue = set()
    crawled = set()

    def __init__(self, base_url, project_name, domain_name):
        self.project_name = project_name
        self.base_url = base_url
        self.domain_name = domain_name
        self.queue_file = self.project_name + '/queue.txt'
        self.crawled_file = self.project_name + '/crawled.txt'
        self.boot()
        self.crawl_page('First creeper', self.base_url)

    # @staticmethod
    def crawl_page(self, thread_name, page_url):
        if page_url not in self.crawled:
            print(thread_name + ' now crawling ' + page_url)
            print('Queue: ' + str(len(self.queue)) + ' | Crawled: ' + str(len(self.crawled)))
            self.queue_links(self.gather_links(page_url))
            self.queue.remove(page_url)
            self.crawled.add(page_url)
            self.update_files()
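
Note that the refactored class still depends on boot(), gather_links(), queue_links() and update_files(), which are not shown in the question. Purely as a hypothetical sketch of how it could be exercised, here are stand-in stubs for those missing methods (assuming the Creeper class above is defined):

# Hypothetical stand-ins for the methods the question does not show,
# just to make the refactored class runnable as a demo.
class DemoCreeper(Creeper):
    def boot(self):
        # Seed the queue with the start page so crawl_page can remove it later.
        self.queue.add(self.base_url)

    def gather_links(self, page_url):
        # A real crawler would fetch the page and parse its links;
        # returning an empty set keeps the demo offline.
        return set()

    def queue_links(self, links):
        # Queue only links that are neither queued nor crawled yet.
        for url in links:
            if url not in self.queue and url not in self.crawled:
                self.queue.add(url)

    def update_files(self):
        # The original presumably writes the sets to queue.txt/crawled.txt;
        # printing them is enough here.
        print('Queue set:', self.queue, '| Crawled set:', self.crawled)

DemoCreeper('https://example.com', 'demo_project', 'example.com')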
I recommend reading further into the Python class documentation.
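
As a quick illustration of the class-attribute versus instance-attribute distinction this answer relies on (not part of the original post): attributes assigned at class level are shared by every instance, while attributes assigned through self belong to a single object.

# Class attributes are shared by all instances; instance attributes are not.
class Example:
    shared = []              # one list, shared by every instance

    def __init__(self):
        self.own = []        # a fresh list for each instance

a, b = Example(), Example()
a.shared.append('x')
a.own.append('x')
print(b.shared)  # ['x'] -- mutated through `a`, visible from `b`
print(b.own)     # []    -- `b` has its own list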