Codec can't decode byte

Time: 2015-09-19 04:16:59

Tags: python parsing python-3.x beautifulsoup web-crawler

It keeps giving a "codec can't decode byte" error. I don't know what that means or how to fix it.

import urllib.request
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse

#SourceUrl
urls = ['http://www.imdb.com', 'http://www.pinterest.com']

z = urlparse(urls[0])

TopLevel = z.scheme+'://'+z.netloc

visited = list(urls)  # seed URLs count as already queued

robotsUrl = TopLevel +'/robots.txt'

print (robotsUrl)

imagesVisited = []

imageTitles = []

urlTitles = []

while urls and len(urls) < 5000000:
        try:
            htmltext = urllib.request.urlopen(urls[0]).read()
            robots = urllib.request.urlopen(robotsUrl).read()
            # case-insensitive match for "Disallow: <path>" lines in robots.txt
            disallowList = re.findall(rb'Disallow:\s*([a-zA-Z0-9*\-/_?.%:&]+)', robots, re.IGNORECASE)
        except Exception as err:
            # skip URLs that cannot be fetched instead of reusing stale htmltext
            print(urls[0], err)
            urls.pop(0)
            continue
        sourceCode = BeautifulSoup(htmltext, "html.parser")
        urls.pop(0)
        print(len(urls))
        print ('urls')
        for link in sourceCode.findAll('a', href=True):
            if "http://" not in link['href']:

                link['href'] = urllib.parse.urljoin(TopLevel,link['href'])
            in_disallow = False
            for rule in disallowList:
                if rule.upper().decode() in link['href'].upper():
                    in_disallow = True
                    break
            if not in_disallow:
                if link['href'] not in visited:
                    urls.append(link['href'])
                    visited.append(link['href'])

print ('these are the urls crawled')
print (urls)
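
The message "codec can't decode byte" is how Python reports a UnicodeDecodeError: raw bytes were decoded with a codec that does not match their actual encoding. Below is a minimal sketch of one defensive way to fetch and decode a page, assuming the failure happens while the downloaded HTML is decoded; fetch_html is a hypothetical helper and is not part of the script above.

import urllib.request

def fetch_html(page_url):
    # Hypothetical helper (an assumption, not from the original post):
    # decode the response body with the charset declared in the HTTP headers,
    # replacing undecodable bytes instead of raising UnicodeDecodeError.
    with urllib.request.urlopen(page_url) as response:
        raw = response.read()
        charset = response.headers.get_content_charset() or 'utf-8'
    return raw.decode(charset, errors='replace')

# usage: hand BeautifulSoup the decoded string instead of raw bytes
# htmltext = fetch_html(urls[0])
# sourceCode = BeautifulSoup(htmltext, "html.parser")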

0 answers:

No answers yet