I have been following a tutorial to build my web scraper. Everything is written, but when I run the scraper I get multiple errors, mostly around the threading library I'm using. If anyone has an idea why this is happening, please let me know.
There is also a second problem: when the spider reads the URLs from my hist file, it never puts them into crawled.txt. Once the code is all in place, the intended end result is that it visits each page, collects the keywords, and saves them as a list in a separate document. Here is the code for my spider.
main.py
import threading
from Queue import Queue
from Spider import *
from Gen_info import *

#PROJECT_NAME = 'HistoryForensics'
QUEUE_FILE = 'Chrome_Hist'
CRAWLED_FILE = 'Lewis_Hist/CrawledUrls.txt'
NUMBER_OF_THREADS = 5
ThreadQueue = Queue()

# Create Spiders for Threads (Spiders will die on exit)
def create_spiders():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()

# process next job in the queue
def work():
    while True:
        url = ThreadQueue.get()
        Spider.crawl_page(threading.currentThread().name, url)
        ThreadQueue.task_done()

# each queued link is a new job
def create_jobs():
    for link in file_to_set(QUEUE_FILE):
        ThreadQueue.put(link)
    ThreadQueue.join()
    crawl()

# Crawl queued URLs
def crawl():
    queued_links = file_to_set(QUEUE_FILE)
    if len(queued_links) > 0:
        print(str(len(queued_links)) + ' links in the queue')
        create_jobs()

create_spiders()
crawl()
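To make it clearer what I think the threading part is supposed to do, here is a stripped-down sketch of the same queue/worker pattern on its own. process_url is just a stand-in for Spider.crawl_page, and the two example URLs are made up:

import threading
from Queue import Queue  # Python 2, same import as in my main.py

NUMBER_OF_THREADS = 5
work_queue = Queue()

def process_url(thread_name, url):
    # stand-in for Spider.crawl_page
    print(thread_name + ' processing ' + url)

def worker():
    # each thread keeps pulling URLs off the queue until the program exits
    while True:
        url = work_queue.get()
        process_url(threading.currentThread().name, url)
        work_queue.task_done()

for _ in range(NUMBER_OF_THREADS):
    t = threading.Thread(target=worker)
    t.daemon = True  # daemon threads die when the main thread exits
    t.start()

for link in ['http://example.com/a', 'http://example.com/b']:
    work_queue.put(link)

work_queue.join()  # block until every queued URL has been handled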
I also had to comment out the project name part, because I ran into a problem where my directory was not being created when the program ran.
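For reference, this is the behaviour I expected from the directory step on its own ('HistoryForensics' is just the value I had before commenting it out):

import os

def create_project_dir(directory):
    # create the project directory if it does not exist yet
    if not os.path.exists(directory):
        print('Creating History directory ' + directory)
        os.makedirs(directory)

create_project_dir('HistoryForensics')  # I expected this folder to appear next to the scripts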
Scrape_Parse.py
import HTMLParser
from urllib2 import *
from scrapy.item import Item, Field
import mechanize

class GatherData():
    def __init__(self, base_url, page_url):
        self.base_url = base_url
        self.page_url = page_url

    """def get_Keywords(Page_words):
        common = open("Common_words.txt").read().split('\n')
        word_dict = {}
        word_list = Page_words.lower().split()
        for word in word_list:
            if word not in common and word.isalnum():
                if word not in word_dict:
                    word_dict[word] = 1
                if word in word_dict:
                    word_dict[word] += 1
        Words_listed = sorted(word_dict.items(), key=lambda (k, v): (v, k), reverse=True)
        for w in Words_listed:
            print w[0]"""
Spider.py
from Gen_info import *
from Scrape_Parse import *

class Spider:
    # Class variables are shared among all instances
    project_name = ''
    base_url = ''
    queue_file = ''
    crawled_file = ''
    queue = set()
    crawled = set()

    def __init__(self, project_name, base_url, domain_name):
        Spider.project_name = project_name
        Spider.base_url = base_url
        Spider.domain_name = domain_name
        Spider.queue_file = Spider.project_name + 'Chrome_Hist'
        Spider.crawled_file = Spider.project_name + 'CrawledUrls.txt'
        self.boot()
        self.crawl_page('Spider1', Spider.base_url)

    @staticmethod
    def boot():
        create_HistProjec_directory(Spider.project_name)
        Use_Hist_files(Spider.project_name, Spider.base_url)
        Spider.queue = Read_Hist_file(Spider.queue_file)
        Spider.crawled = Add_to_file(Spider.crawled_file)

    @staticmethod
    def crawl_page(thread_name, page_url):
        if page_url not in Spider.crawled:
            print(thread_name + ' crawling ' + page_url)
            print('Queue ' + str(len(Spider.queue)) + ' | crawled ' + str(len(Spider.crawled)))
            Spider.queue.remove(page_url)
            Spider.crawled.add(page_url)
            Spider.update_files()

    @staticmethod
    def gather_data(page_url):
        html_string = ''
        try:
            response = urlopen(page_url)
            if response.getheader('Content-Type') == 'text/html':
                html_bytes = response.read()
                html_string = html_bytes.decode("utf-8")
            finder = GatherData(Spider.base_url, page_url)
            finder.feed(html_string)
        except Exception as e:
            print(str(e))
        return set()

    @staticmethod
    def update_files():
        set_to_file(Spider.queue, Spider.queue_file)
        set_to_file(Spider.crawled, Spider.crawled_file)
Gen_info.py
import os

# Creates directories for scraping files
def create_project_dir(directory):
    if not os.path.exists(directory):
        print('Creating History directory ' + directory)
        os.makedirs(directory)

def create_keyword_directory(directory):
    if not os.path.exists(directory):
        print('Creating directory ' + directory)
        os.makedirs(directory)

# Queue and crawled URL files
def Use_Hist_files(project_name, base_url):
    queue = project_name + '/Chrome_Hist'
    crawled = project_name + '/CrawledUrls.txt'
    if not os.path.isfile(queue):
        write_file(queue, base_url)
    if not os.path.isfile(crawled):
        write_file(crawled, '')

# Creates a new file
def write_file(path, data):
    f = open(path, 'w')
    f.write(data)
    f.close()

# Adds data to an existing file
def Add_to_file(path, data):
    with open(path, 'a') as file:
        file.write(data + '\n')

# Delete the contents of a file
def delete_file_contents(path):
    with open(path, 'w'):
        pass

# Links to read from, i.e. Chrome history
def Read_Hist_file(file_name):
    with open(file_name, 'rt') as f:
        f.read()

def set_to_file(links, file):
    delete_file_contents(file)
    for link in sorted(links):
        Add_to_file(file, link)

def file_to_set(file_name):
    results = set()
    with open(file_name, 'rt') as f:
        for line in f:
            results.add(line.replace('\n', ''))
    return results
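Finally, this is the round trip I expect between the two files when a single URL gets crawled. demo_Chrome_Hist and demo_CrawledUrls.txt are throwaway file names just for this snippet, and the helpers here are simplified copies so it stays self-contained:

# Throwaway demonstration of the behaviour I expect: a URL read from the
# queue file should end up in the crawled file after it has been crawled.
demo_queue_file = 'demo_Chrome_Hist'
demo_crawled_file = 'demo_CrawledUrls.txt'

def file_to_set(file_name):
    results = set()
    with open(file_name, 'rt') as f:
        for line in f:
            results.add(line.replace('\n', ''))
    return results

def set_to_file(links, file_name):
    # simplified: rewrite the whole file from the set
    with open(file_name, 'w') as f:
        for link in sorted(links):
            f.write(link + '\n')

# start with one queued URL and an empty crawled file
set_to_file({'http://example.com'}, demo_queue_file)
set_to_file(set(), demo_crawled_file)

queue = file_to_set(demo_queue_file)
crawled = file_to_set(demo_crawled_file)

url = queue.pop()      # pretend this URL was just crawled
crawled.add(url)
set_to_file(queue, demo_queue_file)
set_to_file(crawled, demo_crawled_file)

print(open(demo_crawled_file).read())  # I expect to see http://example.com here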
Thanks in advance.