I have been following a tutorial to build my web scraper. Everything is written, but when I run the scraper I get multiple errors, mostly around the threading library I'm using. If anyone has an idea why this is happening, please let me know.
There is also a second problem: when the spider reads the URLs from my hist file, it never puts them into crawled.txt. Once the code is all in place, the intended end result is that it visits each page, collects the keywords, and saves them as a list in a separate document. Here is the code for my spider.
main.py
import threading
from Queue import Queue
from Spider import *
from Gen_info import *

#PROJECT_NAME = 'HistoryForensics'
QUEUE_FILE = 'Chrome_Hist'
CRAWLED_FILE = 'Lewis_Hist/CrawledUrls.txt'
NUMBER_OF_THREADS = 5
ThreadQueue = Queue()

# Create Spiders for Threads (Spiders will die on exit)
def create_spiders():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()

# process next job in the queue
def work():
    while True:
        url = ThreadQueue.get()
        Spider.crawl_page(threading.currentThread().name, url)
        ThreadQueue.task_done()

# each queued link is a new job
def create_jobs():
    for link in file_to_set(QUEUE_FILE):
        ThreadQueue.put(link)
    ThreadQueue.join()
    crawl()

# Crawl queued URLs
def crawl():
    queued_links = file_to_set(QUEUE_FILE)
    if len(queued_links) > 0:
        print(str(len(queued_links)) + ' links in the queue')
        create_jobs()

create_spiders()
crawl()
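To make it clearer what I think the threading part is supposed to do, here is a stripped-down sketch of the same queue/worker pattern on its own. process_url is just a stand-in for Spider.crawl_page, and the two example URLs are made up:

import threading
from Queue import Queue  # Python 2, same import as in my main.py

NUMBER_OF_THREADS = 5
work_queue = Queue()

def process_url(thread_name, url):
    # stand-in for Spider.crawl_page
    print(thread_name + ' processing ' + url)

def worker():
    # each thread keeps pulling URLs off the queue until the program exits
    while True:
        url = work_queue.get()
        process_url(threading.currentThread().name, url)
        work_queue.task_done()

for _ in range(NUMBER_OF_THREADS):
    t = threading.Thread(target=worker)
    t.daemon = True  # daemon threads die when the main thread exits
    t.start()

for link in ['http://example.com/a', 'http://example.com/b']:
    work_queue.put(link)

work_queue.join()  # block until every queued URL has been handled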
I also had to comment out the project name part, because I ran into a problem where my directory was not being created when the program ran.
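For reference, this is the behaviour I expected from the directory step on its own ('HistoryForensics' is just the value I had before commenting it out):

import os

def create_project_dir(directory):
    # create the project directory if it does not exist yet
    if not os.path.exists(directory):
        print('Creating History directory ' + directory)
        os.makedirs(directory)

create_project_dir('HistoryForensics')  # I expected this folder to appear next to the scripts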
Scrape_Parse.py
import HTMLParser
from urllib2 import *
from scrapy.item import Item, Field
import mechanize

class GatherData():
    def __init__(self, base_url, page_url):
        self.base_url = base_url
        self.page_url = page_url

    """def get_Keywords(Page_words):
        common = open("Common_words.txt").read().split('\n')
        word_dict = {}
        word_list = Page_words.lower().split()
        for word in word_list:
            if word not in common and word.isalnum():
                if word not in word_dict:
                    word_dict[word] = 1
                if word in word_dict:
                    word_dict[word] += 1
        Words_listed = sorted(word_dict.items(), key=lambda (k, v): (v, k), reverse=True)
        for w in Words_listed:
            print w[0]"""
Spider.py
from Gen_info import *
from Scrape_Parse import *

class Spider:
    # Class variables are shared among all instances
    project_name = ''
    base_url = ''
    queue_file = ''
    crawled_file = ''
    queue = set()
    crawled = set()

    def __init__(self, project_name, base_url, domain_name):
        Spider.project_name = project_name
        Spider.base_url = base_url
        Spider.domain_name = domain_name
        Spider.queue_file = Spider.project_name + 'Chrome_Hist'
        Spider.crawled_file = Spider.project_name + 'CrawledUrls.txt'
        self.boot()
        self.crawl_page('Spider1', Spider.base_url)

    @staticmethod
    def boot():
        create_HistProjec_directory(Spider.project_name)
        Use_Hist_files(Spider.project_name, Spider.base_url)
        Spider.queue = Read_Hist_file(Spider.queue_file)
        Spider.crawled = Add_to_file(Spider.crawled_file)

    @staticmethod
    def crawl_page(thread_name, page_url):
        if page_url not in Spider.crawled:
            print(thread_name + ' crawling ' + page_url)
            print('Queue ' + str(len(Spider.queue)) + ' | crawled ' + str(len(Spider.crawled)))
            Spider.queue.remove(page_url)
            Spider.crawled.add(page_url)
            Spider.update_files()

    @staticmethod
    def gather_data(page_url):
        html_string = ''
        try:
            response = urlopen(page_url)
            if response.getheader('Content-Type') == 'text/html':
                html_bytes = response.read()
                html_string = html_bytes.decode("utf-8")
            finder = GatherData(Spider.base_url, page_url)
            finder.feed(html_string)
        except Exception as e:
            print(str(e))
        return set()

    @staticmethod
    def update_files():
        set_to_file(Spider.queue, Spider.queue_file)
        set_to_file(Spider.crawled, Spider.crawled_file)
Gen_info.py
import os

# Creates directories for scraping files
def create_project_dir(directory):
    if not os.path.exists(directory):
        print('Creating History directory ' + directory)
        os.makedirs(directory)

def create_keyword_directory(directory):
    if not os.path.exists(directory):
        print('Creating directory ' + directory)
        os.makedirs(directory)

# Queue and crawled URL files
def Use_Hist_files(project_name, base_url):
    queue = project_name + '/Chrome_Hist'
    crawled = project_name + '/CrawledUrls.txt'
    if not os.path.isfile(queue):
        write_file(queue, base_url)
    if not os.path.isfile(crawled):
        write_file(crawled, '')

# Creates a new file
def write_file(path, data):
    f = open(path, 'w')
    f.write(data)
    f.close()

# Adds data to an existing file
def Add_to_file(path, data):
    with open(path, 'a') as file:
        file.write(data + '\n')

# Delete the contents of a file
def delete_file_contents(path):
    with open(path, 'w'):
        pass

# Links to read from, i.e. Chrome history
def Read_Hist_file(file_name):
    with open(file_name, 'rt') as f:
        f.read()

def set_to_file(links, file):
    delete_file_contents(file)
    for link in sorted(links):
        Add_to_file(file, link)

def file_to_set(file_name):
    results = set()
    with open(file_name, 'rt') as f:
        for line in f:
            results.add(line.replace('\n', ''))
    return results
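Finally, this is the round trip I expect between the two files when a single URL gets crawled. demo_Chrome_Hist and demo_CrawledUrls.txt are throwaway file names just for this snippet, and the helpers here are simplified copies so it stays self-contained:

# Throwaway demonstration of the behaviour I expect: a URL read from the
# queue file should end up in the crawled file after it has been crawled.
demo_queue_file = 'demo_Chrome_Hist'
demo_crawled_file = 'demo_CrawledUrls.txt'

def file_to_set(file_name):
    results = set()
    with open(file_name, 'rt') as f:
        for line in f:
            results.add(line.replace('\n', ''))
    return results

def set_to_file(links, file_name):
    # simplified: rewrite the whole file from the set
    with open(file_name, 'w') as f:
        for link in sorted(links):
            f.write(link + '\n')

# start with one queued URL and an empty crawled file
set_to_file({'http://example.com'}, demo_queue_file)
set_to_file(set(), demo_crawled_file)

queue = file_to_set(demo_queue_file)
crawled = file_to_set(demo_crawled_file)

url = queue.pop()      # pretend this URL was just crawled
crawled.add(url)
set_to_file(queue, demo_queue_file)
set_to_file(crawled, demo_crawled_file)

print(open(demo_crawled_file).read())  # I expect to see http://example.com here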
Thanks in advance.