It keeps giving a "codec can't decode byte" error. I don't know what this means or how to fix it.
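From searching around, the message seems to come from a UnicodeDecodeError, which Python raises when a bytes object is decoded with a codec (UTF-8 by default) that cannot represent one of its bytes. A minimal reproduction of the same message, assuming the offending byte is something like 0xe9 (the exact byte in my traceback varies):

>>> b'\xe9abc'.decode()
Traceback (most recent call last):
  ...
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 0: invalid continuation byte

My crawler code is below.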
import re
import urllib.request
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
# Seed URLs
urls = ['http://www.imdb.com', 'http://www.pinterest.com']
z = urlparse(urls[0])
TopLevel = z.scheme + '://' + z.netloc
visited = list(urls)  # seed URLs count as visited; don't nest the list inside itself
robotsUrl = TopLevel + '/robots.txt'  # robots.txt of the first seed's domain only
print(robotsUrl)
imagesVisited = []
imageTitles = []
urlTitles = []
while len(urls) < 5000000:
    try:
        htmltext = urllib.request.urlopen(urls[0]).read()
        robots = urllib.request.urlopen(robotsUrl).read()
        disallowList = re.findall(
            rb'Disallow:\s*([a-zA-Z0-9*\-/_?.%:&]+)', robots, re.IGNORECASE)
    except Exception:
        # Skip URLs that fail to open; otherwise htmltext may be
        # undefined (or stale) when BeautifulSoup runs below.
        print(urls[0])
        urls.pop(0)
        continue
    sourceCode = BeautifulSoup(htmltext, "html.parser")
    urls.pop(0)
    print(len(urls))
    print('urls')
    for link in sourceCode.findAll('a', href=True):
        # Resolve relative links against the top-level domain.
        if "http://" not in link['href']:
            link['href'] = urljoin(TopLevel, link['href'])
        # Drop any link matching a robots.txt Disallow entry.
        in_disallow = False
        for i in range(len(disallowList)):
            if disallowList[i].upper().decode() in link['href'].upper():
                in_disallow = True
                break
        if not in_disallow:
            if link['href'] not in visited:
                urls.append(link['href'])
                visited.append(link['href'])
print('these are the urls crawled')
print(urls)
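If the failure comes from decoding fetched bytes (the page source, or the robots.txt entries), one workaround I am considering is decoding explicitly with the charset the server declares, with a replacement fallback. This is only a sketch, not a tested fix: fetch_text is my own helper name, and errors='replace' assumes it is acceptable to lose the occasional undecodable character.

import re
import urllib.request

def fetch_text(url):
    # Read the raw bytes, then decode with the charset declared in the
    # HTTP headers (falling back to UTF-8), replacing undecodable bytes.
    with urllib.request.urlopen(url) as resp:
        raw = resp.read()
        charset = resp.headers.get_content_charset() or 'utf-8'
    return raw.decode(charset, errors='replace')

robots_txt = fetch_text('http://www.imdb.com/robots.txt')
disallow_list = re.findall(r'Disallow:\s*(\S+)', robots_txt, re.IGNORECASE)

Decoding to str up front would also let BeautifulSoup receive text instead of raw bytes. I also noticed the standard library ships urllib.robotparser for exactly this robots.txt check, so perhaps the regex could be dropped entirely.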