I got bored today so I started coding up this little web crawler. Not knowing much about the standard library's HTMLParser I figured a web crawler would be a good way to learn the API. I was surprised
how easy it was to get the content from the pages that I wanted. The only major issue I had with HTMLParser
was making it fault tolerant. The fix was actually very simple: I had to override the error
method of HTMLParser.
I'm not sure, but I think the default implementation stops parsing the page when it hits broken HTML. Before overriding the error
method, the parser would simply break with an exception that was very hard and hacky to catch.
The problem I'm taking on now is recording the 404 links. I thought this would be trivial, but for some reason I can't get it to work: no exceptions are raised, yet the set
that's supposed to contain the bad links stays empty.
#-*-coding:utf8;-*-
#qpy:3
#qpy:console
import cProfile
import functools
import math
import mimetypes
import random
import time
import urllib.error
import urllib.request
import urllib.robotparser

from html.parser import HTMLParser as _HTMLParser
from json import dumps as parse_json
from urllib.parse import urljoin, urlparse
from urllib.request import urlopen
mimetypes.init()

# Optional cProfile instrumentation; flip ENABLE_PROFILING to collect stats
# for the whole run from import time onward.
PROFILE = cProfile.Profile()
ENABLE_PROFILING = False
if ENABLE_PROFILING:
    PROFILE.enable()

# Fallback charset used when an HTTP response does not declare one.
DEFAULT_ENCODING = 'latin-1'
# Substring marking a parent-relative ("../") link.  NOTE(review): name kept
# for backward compatibility; "PARENT" is the intended spelling.
PARANT = '../'
def profile(func):
    """Decorator that records calls to *func* in the module-level PROFILE.

    Bug fixed: the original placed ``PROFILE.disable()`` after ``return``,
    so it was unreachable and the profiler was never switched off again.
    The disable now runs in a ``finally`` clause on every call.
    """
    @functools.wraps(func)
    def wrap(*args, **kw):
        PROFILE.enable()
        try:
            return func(*args, **kw)
        finally:
            PROFILE.disable()
    return wrap
def average(lst):
    """Return the arithmetic mean of *lst* as a float.

    Returns 0.0 for an empty sequence instead of raising
    ZeroDivisionError (robustness fix).  The redundant ``float()``
    wrapper is gone: true division already yields a float in Python 3.
    """
    if not lst:
        return 0.0
    return sum(lst) / len(lst)
def random_wait(mini, maxi):
    """Sleep for a random whole number of seconds in [mini, maxi]."""
    delay = random.randint(mini, maxi)
    time.sleep(delay)
# Pool of real-world browser User-Agent strings; one is chosen at random
# per request so the crawler is harder to fingerprint as a bot.
USER_AGENTS = [
'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
'Opera/9.25 (Windows NT 5.1; U; en)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9'
]
def random_ua():
    """Pick one User-Agent string at random from the USER_AGENTS pool."""
    ua = random.choice(USER_AGENTS)
    return ua
class CrawlerClient(object):
    """Small HTTP client used by the crawler.

    Honors robots.txt, rotates User-Agent strings per request, and
    records every URL that answers 404 in ``self.not_found``.
    """

    def __init__(self, **kw):
        self.robotparser = urllib.robotparser.RobotFileParser()
        self.ua = random_ua()
        self.referer = kw.get('referer', 'www.google.com')
        # URLs that came back with HTTP 404.
        self.not_found = set()

    def can_fetch(self, url):
        """Return True if the last-read robots.txt allows fetching *url*."""
        return self.robotparser.can_fetch(self.ua, url)

    def get(self, url):
        """Fetch *url* and return its decoded body, or None.

        Returns None for non-text content, robots.txt-disallowed URLs,
        and any network/HTTP failure.  404 responses are recorded in
        ``self.not_found``.
        """
        self.ua = random_ua()
        req = urllib.request.Request(url)
        req.add_header('User-Agent', self.ua)
        req.add_header('Connection', 'keep-alive')
        req.add_header('Accept', 'text/html,xhtml,xml')
        req.add_header('Referer', self.referer)
        parsed_url = urlparse(url)
        robot_file_path = parsed_url.scheme + '://' + parsed_url.netloc + '/robots.txt'
        self.robotparser.set_url(robot_file_path)
        try:
            self.robotparser.read()
        except (urllib.error.URLError, OSError):
            # Unreachable robots.txt: RobotFileParser.read() only handles
            # HTTPError itself, so URLError would otherwise crash the crawl.
            # With nothing parsed, can_fetch() defaults to allowing the URL.
            pass
        if not self.can_fetch(url):
            return None
        try:
            with urlopen(req) as res:
                http_headers = res.headers
                content_type, *charset = http_headers.get('content-type').split(';')
                # Try to guess the charset, e.g. "charset=utf-8" -> "utf-8".
                if charset:
                    charset = charset[0].strip().split('=')[1]
                else:
                    # Use fallback encoding.
                    charset = DEFAULT_ENCODING
                # Only decode text/* payloads; anything else yields None.
                if content_type.split('/')[0] == 'text':
                    return res.read().decode(charset)
        except urllib.error.HTTPError as e:
            # BUG FIX: urlopen() raises HTTPError for 4xx/5xx responses, so
            # the old in-body checks for status 404/500 never executed and
            # self.not_found stayed empty.  Record 404s here instead.
            if e.code == 404:
                self.not_found.add(url)
        except Exception:
            # Best-effort crawl: swallow any other network/decoding failure
            # (bad URL, unreachable host, unknown charset, ...).
            pass
        return None
class HTMLParser(_HTMLParser):
    """Parse one page: collect its links and its title.

    Fetches *url* via CrawlerClient during construction and feeds the
    HTML straight to the parser, so the instance is fully populated
    (``links``, ``title``) once ``__init__`` returns.
    """

    def __init__(self, url, strict=False):
        self.url = urlparse(url)
        self.size = 0
        self.client = CrawlerClient(referer='https://rickys-python-notes.blogspot.com')
        # If strict is True the parser will break on broken html.
        # Otherwise the error() hook is replaced with a no-op so it
        # ignores broken html and keeps on parsing.
        if not strict:
            self.error = self._do_nothing
        _HTMLParser.__init__(self)
        # Links holds all the links the parser finds (href attributes),
        # absolutized against this page's base URL.
        self.links = set()
        self.base_url = '{}://{}'.format(self.url.scheme, self.url.netloc)
        # title will hold the value of the page's title, if it has one.
        self.title = None
        # Lets handle_data() know we are currently inside <title>...</title>
        # so it can store the text in self.title.
        self.recording_title = False
        html = self.client.get(url)
        if html:
            self.feed(html)

    def handle_starttag(self, tag, attrs):
        # BUG FIX: the original only examined attrs[0], so an href that was
        # not the FIRST attribute (e.g. <a class="x" href="...">) was
        # silently dropped.  Scan all attributes instead.
        val = dict(attrs).get('href')
        if val:
            # Protocol-relative link: inherit this page's scheme.
            if val.startswith('//'):
                val = self.url.scheme + ':' + val
            url = urlparse(val)
            if not url.netloc:
                # Relative link: resolve against this page's base URL.
                url = urlparse(urljoin(self.base_url, url.path))
            self.links.add(url.geturl())
        if tag == 'title':
            self.recording_title = True

    def handle_endtag(self, tag):
        if tag == 'title':
            self.recording_title = False

    def handle_data(self, data):
        if self.recording_title:
            self.title = data.strip()

    def _do_nothing(self, *_, **__):
        # Stand-in for _HTMLParser.error(): swallow malformed-HTML errors.
        return
class CrawlerQueue(object):
    """Frontier of URLs to crawl, plus bookkeeping about what was crawled."""

    def __init__(self, seed, **kw):
        self.seed = seed
        self.tocrawl = [seed]
        self.crawled = list()
        self.non_html_links = list()
        self.domain = urlparse(seed).netloc
        # Restrict the crawl to the seed's domain by default.
        self.same_domain = kw.get('same_domain', True)
        # Skip parent-relative ('../') links by default.
        self.exclude_parant_links = kw.get('exclude_parant_links', True)

    def next(self):
        """Pop a random pending link, mark it crawled, and return it."""
        random.shuffle(self.tocrawl)
        link = self.tocrawl.pop()
        self.crawled.append(link)
        return link

    def is_same_domain(self, link):
        """True if *link* is on the same domain as the seed URL."""
        return urlparse(link).netloc == self.domain

    def add_link(self, link):
        """Queue *link* for crawling if it looks like HTML and passes filters."""
        guessed_type = mimetypes.guess_type(link)[0] or 'text/html'
        if guessed_type != 'text/html':
            # BUG FIX: the original branches were swapped -- it returned on
            # non-HTML links (so they were never recorded) and appended the
            # HTML links to non_html_links before queueing them.
            self.non_html_links.append(link)
            return
        if link in self.crawled:
            return
        if self.exclude_parant_links and PARANT in link:
            return
        if self.same_domain and not self.is_same_domain(link):
            return
        self.tocrawl.append(link)

    def add_links(self, links):
        # Plain loop: the original used a list comprehension purely for
        # its side effects.
        for link in links:
            self.add_link(link)

    @property
    def total_crawled(self):
        return len(self.crawled)

    @property
    def in_queue(self):
        return len(self.tocrawl)

    @property
    def total_non_html_links(self):
        return len(self.non_html_links)

    @property
    def has_links(self):
        return bool(self.tocrawl)

    @property
    def empty(self):
        return not self.has_links
# Driver: crawl outward from reddit.com, printing each page title along
# with every 404 link discovered so far.
q = CrawlerQueue('http://reddit.com', same_domain=0)
not_found = set()
while q.has_links:
    crawling = q.next()
    page = HTMLParser(crawling)
    # Merge the 404s this page's client recorded (the original used a
    # list comprehension purely for its side effects).
    not_found.update(page.client.not_found)
    q.add_links(page.links)
    title = page.title
    if title:
        print(title, not_found)