I've been working on a single-threaded web crawler in Python that groups each page's assets and outputs a JSON array of the following form:
[
  {
    "url": "http://url.com/",
    "assets": [
      "http://url.com/imgs/img1.jpg",
      "http://url.com/css/style.css"
    ]
  },
  {
    "url": "http://url.com/services",
    "assets": [
      "http://url.com/imgs/services.jpg",
      "http://url.com/css/style.css"
    ]
  },
  ...
]
A quick summary of the functionality:

- Uses BeautifulSoup to parse the HTML and extract links.
- Uses urlparse to deal with absolute vs. relative URLs, netlocs and paths.
- Uses robotparser to check whether I'm allowed to crawl each page by looking at the robots.txt file.
- Invoked as ./crawl.py http://sitename.com/ (including the final slash).
- I've assumed that if a URL ends in .html, or if the resource path contains no '.', I can crawl it as an HTML web page (this heuristic is sketched just below).
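That last assumption, written out as a standalone check (just a sketch; is_probably_html is an illustrative name, not part of the crawler below):

from urllib.parse import urlparse

def is_probably_html(url):
    # Treat a URL as an HTML page when it ends in .html or when its
    # path carries no file extension at all.
    path = urlparse(url).path
    return url.endswith('.html') or '.' not in path

print(is_probably_html('http://url.com/services'))       # True
print(is_probably_html('http://url.com/imgs/img1.jpg'))  # False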
I've run into a few problems, including:

- Locales: is there a smart way to detect and avoid crawling the same page in different locales? I only want to crawl http://url.com/, but I end up also crawling http://url.com/en-us, http://url.com/en-au and so on, until I get "maximum recursion depth exceeded" messages. I tried checking whether the rel attribute contains "alternate" to avoid this, but that didn't seem to make a significant difference. (One normalisation idea is sketched after this list; the recursion itself is addressed in a sketch after the code.)
- angular / react: is it possible to crawl websites that use angular / react / similar frameworks? (A headless-browser sketch follows this list.)
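On the locale idea: one option is to normalise locale-prefixed paths before checking them against visited. This is only a sketch, and the regex assumes the site marks locales as a leading en-us / en-au-style path segment, which is a guess about this particular URL scheme rather than a general rule:

import re
from urllib.parse import urlparse

# Assumed URL scheme: a leading "xx-yy" segment marks the locale,
# e.g. /en-us/services and /en-au/services are the same page.
LOCALE_PREFIX = re.compile(r'^/[a-z]{2}-[a-z]{2}(?=/|$)')

def strip_locale(url):
    # Map every locale variant of a path onto one 'visited' key.
    path = urlparse(url).path
    return LOCALE_PREFIX.sub('', path) or '/'

print(strip_locale('http://url.com/en-us/services'))  # /services
print(strip_locale('http://url.com/en-au'))           # /
print(strip_locale('http://url.com/services'))        # /services

Pages also often declare their language variants with <link rel="alternate" hreflang="..."> tags, so recording those hrefs as visited when a page is first crawled would be a complementary signal.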
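On angular / react: urlopen() only returns the initial HTML payload, so markup that a framework renders client-side never reaches BeautifulSoup. The usual workaround is to have a headless browser execute the JavaScript first and then parse the resulting DOM. A minimal sketch with Selenium (it assumes the selenium package and a matching ChromeDriver are installed; neither appears in the code below):

from selenium import webdriver
from bs4 import BeautifulSoup

options = webdriver.ChromeOptions()
options.add_argument('--headless')        # no visible browser window
driver = webdriver.Chrome(options=options)

driver.get('http://url.com/')             # JavaScript executes here
soup = BeautifulSoup(driver.page_source, 'lxml')  # parse the post-render DOM
driver.quit()

From there, the same find_all() extraction as in the crawler below applies.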
Any information/feedback is greatly appreciated.
The code follows:
#!/usr/bin/python3
import sys
import json
from urllib.parse import urlparse, urljoin
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen
from urllib.error import HTTPError
from queue import Queue
from bs4 import BeautifulSoup


class Crawler:
    def gethtml(self, url):
        try:
            return urlopen(url)
        except HTTPError as e:
            print('We failed with error code - %s.' % e.code)
            if e.code == 404:
                print('404 File Not Found: ' + url)
            else:
                print('e code not 404')
            return None

    def __init__(self):
        url = sys.argv[1]
        # checkqueue() calls itself once per crawled page, so deep crawls
        # die with "maximum recursion depth exceeded" without this.
        sys.setrecursionlimit(100000)
        parsedurl = urlparse(url)
        print('Crawling from URL: ' + url)
        self.parser = RobotFileParser()
        self.parser.set_url(url + 'robots.txt')  # relies on the trailing slash
        self.parser.read()
        if parsedurl.netloc.startswith('www.'):  # compare netlocs without www.
            self.netloc = parsedurl.netloc[4:]
        else:
            self.netloc = parsedurl.netloc
        html = self.gethtml(url)
        if html is not None:
            self.visited = {}
            self.current = {}        # page record currently being built
            self.currentassets = {}  # per-page dedup of asset paths
            self.output = []
            self.queue = Queue()
            if len(parsedurl.path) < 1:
                # Bare domain: mark /index.html visited so links back to it are skipped.
                self.visited['/index.html'] = True
            self.crawlhtml(url, html)
        else:
            print("Sorry, couldn't find HTML at that URL!")

    def isabsolute(self, url):
        return bool(urlparse(url).netloc)

    def checkifhtml(self, url):
        # Heuristic: the URL is an HTML page if it ends in .html
        # or its path contains no file extension at all.
        path = urlparse(url).path
        if url.endswith('.html') or '.' not in path:
            if path not in self.visited:  # only queue unseen pages
                self.queue.put(url)
            return True
        return False

    def getasseturl(self, current_url, url):
        if not self.isabsolute(url):  # make our relative url absolute
            url = urljoin(current_url, url)
        parsedurl = urlparse(url)
        path = parsedurl.path
        netloc = parsedurl.netloc
        local = False
        if netloc.startswith('www.'):  # check if it is a local url
            netloc = netloc.replace('www.', '', 1)
        if netloc == self.netloc:
            local = True
        if self.currentassets.get(path) is None:  # not seen on this page yet
            self.currentassets[path] = True
            if local:
                if self.checkifhtml(url) is False:  # HTML goes to the queue,
                    self.current['assets'].append(url)  # everything else is an asset

    def checkqueue(self):
        print('Checking queue. Queue Size: ' + str(self.queue.qsize()))
        if self.queue.empty():
            # Queue drained: dump the collected pages and their assets.
            print('\n------------------------------------------------------\n')
            print(json.dumps(self.output, indent=4))
            print('\n------------------------------------------------------\n')
            print(self.visited)
        else:
            url = self.queue.get()
            path = urlparse(url).path
            if self.visited.get(path) is None:
                self.visited[path] = True
                html = self.gethtml(url)
                if html is not None:
                    self.crawlhtml(url, html)
                else:
                    self.checkqueue()
            else:
                self.checkqueue()

    def crawlhtml(self, url, html):
        print('---------------------------------------\nLooking at url: ' + url)
        if self.parser.can_fetch('*', url):  # honour robots.txt
            self.current['url'] = url
            self.current['assets'] = []
            parsedhtml = BeautifulSoup(html, 'lxml')  # use lxml for speed
            # Tags that reference resources via href.
            for link in parsedhtml.find_all(['a', 'link', 'area', 'base', 'image']):
                if link.get('href') is not None:
                    # Skip rel="alternate" links (locale and feed variants).
                    if link.get('rel') is None or 'alternate' not in link.get('rel'):
                        self.getasseturl(url, link.get('href'))
            # Tags that reference resources via src.
            for link in parsedhtml.find_all(['script', 'img', 'frame', 'iframe',
                                             'input', 'audio', 'embed', 'source',
                                             'video']):
                if link.get('src') is not None:
                    self.getasseturl(url, link.get('src'))
            self.output.append(self.current)
            self.current = {}
            self.currentassets = {}
        self.checkqueue()  # keep draining the queue even when robots.txt says no


c = Crawler()
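As for the "maximum recursion depth exceeded" messages: they come from crawlhtml() and checkqueue() calling each other once per crawled page, which is the only reason the setrecursionlimit() call exists. A loop-based driver along these lines would remove the recursion entirely (a sketch only; it assumes crawlhtml() is changed to no longer call checkqueue() at the end):

def run(self):
    # Drain the queue iteratively instead of recursing once per page.
    while not self.queue.empty():
        url = self.queue.get()
        path = urlparse(url).path
        if path in self.visited:
            continue  # already crawled via another link
        self.visited[path] = True
        html = self.gethtml(url)
        if html is not None:
            self.crawlhtml(url, html)  # enqueues new pages via getasseturl()
    print(json.dumps(self.output, indent=4))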