Question

I got bored today so I started coding up this little web crawler. Not knowing much about the standard library's HTMLParser I figured a web crawler would be a good way to learn the API. I was surprised how easy it was to get the content from the pages that I wanted. The only major issue I had with HTMLParser was making it fault tolerant. The fix was actually very simple. I had to overwrite the error method of HTMLParser. I'm not sure but I think it stops parsing the page on broken html. Before overwriting the error method it would just break by way of an exception that would be very hard and hacky to catch.
The problem I'm taking on now is recording the 404 links. I thought this would be trivial but for some reason I can't get it to work. No exceptions are raised but the set that's supposed to contain the bad links stays empty.

#-*-coding:utf8;-*-
#qpy:3
#qpy:console

from html.parser import HTMLParser as _HTMLParser
from urllib.request import urlopen
import urllib.robotparser
from urllib.parse import urlparse, urljoin
from json import dumps as parse_json
import cProfile
import mimetypes
import time
import math
import random


mimetypes.init()

PROFILE = cProfile.Profile()


ENABLE_PROFILING = False
if ENABLE_PROFILING:
    PROFILE.enable()

DEFAULT_ENCODING = 'latin-1'

PARANT = '../'

def profile(func):
    def wrap(*args, **kw):
        PROFILE.enable()
        return func(*args, **kw)
    PROFILE.disable()
    return wrap

def average(lst):
    return float(sum(lst) /  len(lst))

def random_wait(mini, maxi):
    time.sleep(random.randint(mini, maxi))

USER_AGENTS = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11', 
    'Opera/9.25 (Windows NT 5.1; U; en)', 
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)', 
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9' 
] 

def random_ua():
    return random.choice(USER_AGENTS)

class CrawlerClient(object):
    def __init__(self, **kw):
        self.robotparser = urllib.robotparser.RobotFileParser()
        self.ua = random_ua()
        self.referer = kw.get('referer', 'www.google.com')
        self.not_found = set()

    def can_fetch(self, url):
        return self.robotparser.can_fetch(self.ua, url)

    def get(self, url):
        self.ua = random_ua()
        req = urllib.request.Request(url)
        req.add_header('User-Agent', self.ua)
        req.add_header('Connection', 'keep-alive')
        req.add_header('Accept','text/html,xhtml,xml')
        req.add_header('Referer', self.referer)

        parsed_url = urlparse(url)
        robot_file_path = parsed_url.scheme + '://' + parsed_url.netloc + '/robots.txt'
        self.robotparser.set_url(robot_file_path)
        self.robotparser.read()
        if self.can_fetch(url):
            try:
                with urlopen(req) as res:
                    http_headers = res.headers
                    status_code = res.getcode()

                    if status_code == 404:
                        self.not_found.add(url)
                        return

                    if status_code == 500:
                        return

                    content_type, *charset = http_headers.get('content-type').split(';')
                    # Try to guess the charset.
                    if charset:
                        charset = charset[0].strip().split('=')[1]
                    # Use fallback encoding.
                    else:
                        charset = DEFAULT_ENCODING

                    # If the downloaded content is of type text/*
                    # feed the content too the parser.
                    if content_type.split('/')[0] == 'text':
                        return res.read().decode(charset)  
            except Exception as e:
                #print(e)
                pass


class HTMLParser(_HTMLParser):
    def __init__(self, url, strict=False):
        self.url = urlparse(url)
        self.size = 0
        self.client = CrawlerClient(referer='https://rickys-python-notes.blogspot.com')
        # If strict is True the parser will break
        # on broken html. Othewise it will ignore
        # broken html and keep on parsing.
        if not strict:
            self.error = self._do_nothing

        _HTMLParser.__init__(self)
        # Links holds all the links that parser finds.
        # The parser looks for links in anchor and link tags.        
        self.links = set()
        self.base_url = '{}://{}'.format(self.url.scheme, self.url.netloc)
        # title will hold the value of the pages title if
        # the page has a title
        self.title = None
        # This variable lets the handle_data method know
        # that we are curently reading the title data so
        # it can store it in self.title
        self.recording_title = False

        html = self.client.get(url)
        if html:
            self.feed(html)

    def handle_starttag(self, tag, attrs):
        try:
            key, val, *_ = attrs[0]
        except IndexError:
            key, val = 0, 0

        if key == 'href':

            if val.startswith('//'):
                val = self.url.scheme + ':' + val

            url = urlparse(val)
            if not url.netloc:
                url = urlparse(urljoin(self.base_url, url.path))
            self.links.add(url.geturl())

        if tag == 'title':
            self.recording_title = True

    def handle_endtag(self, tag):
        if tag == 'title':
            self.recording_title = False


    def handle_data(self, data):
        if self.recording_title:
            self.title = data.strip()

    def _do_nothing(self, *_,  **__):
        return



class CrawlerQueue(object):
    def __init__(self, seed, **kw):
        self.seed = seed
        self.tocrawl = [seed]
        self.crawled = list()
        self.non_html_links = list()
        self.domain = urlparse(seed).netloc
        self.same_domain = kw.get('same_domain', True)
        self.exclude_parant_links = kw.get('exclude_parant_links', True)


    def next(self):
        random.shuffle(self.tocrawl)
        link = self.tocrawl.pop()
        self.crawled.append(link)
        return link


    def is_same_domain(self, link):
        return urlparse(link).netloc == self.domain


    def add_link(self, link):
        guessed_type = mimetypes.guess_type(link)[0] or 'text/html'
        if not guessed_type == 'text/html':
            return
        else:
            self.non_html_links.append(link)

        if link in self.crawled:
            return


        if self.exclude_parant_links and PARANT in link:
            return

        if not self.same_domain:
            self.tocrawl.append(link)

        else:
            if self.is_same_domain(link):
                self.tocrawl.append(link)

    def add_links(self, links):
        [self.add_link(link) for link in links]

    @property
    def total_crawled(self):
        return len(self.crawled)

    @property
    def in_queue(self):
        return len(self.tocrawl)

    @property
    def total_non_html_links(self):
        return len(self.non_html_links)

    @property
    def has_links(self):
        return bool(self.tocrawl)

    @property
    def empty(self):
        return self.has_links is False


q = CrawlerQueue('http://reddit.com', same_domain=0)
not_found = set([])
while q.has_links:
    crawling = q.next()
    page = HTMLParser(crawling)
    [not_found.add(link) for link in page.client.not_found]    
    q.add_links(page.links)
    title = page.title
    if title:
        print(title, not_found)

Why is my crawler not recording 404s I think it may be an error in the logic

0 个答案: