I have a problem with my web scraper. It runs through any regular old website like a charm, but as soon as it hits an https URL it doesn't seem to work.
This is the error I get when I try to run an https URL through the scraper: name 'htmltext' is not defined.
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import re
re.IGNORECASE = True
from urllib.parse import urlparse
#SourceUrl
url = "https://en.wikipedia.org/wiki/Main_Page"
urls = [url]
z = urlparse(urls[0])
TopLevel = z.scheme+'://'+z.netloc
visited =[url]
robotsUrl = TopLevel +'/robots.txt'
while len(urls) < 100:
    try:
        htmltext = urllib.request.urlopen(urls[0]).read()
        robots = urllib.request.urlopen(robotsUrl).read()
        disallowList = re.findall(b'Disallow\:\s*([a-zA-Z0-9\*\-\/\_\?\.\%\:\&]+)', robots)
    except:
        print (urls[0])
    sourceCode = BeautifulSoup(htmltext, "html.parser")
    urls.pop(0)
    print(len(urls))
    for link in sourceCode.findAll('a', href=True):
        if "http://" not in link['href']:
            link['href'] = urllib.parse.urljoin(url,link['href'])
        in_disallow = False
        for i in range(len(disallowList)):
            if (disallowList[i]).upper().decode() in link['href'].upper():
                in_disallow = True
                break
        if not in_disallow:
            if link['href'] not in visited:
                urls.append(link['href'])
                visited.append(link['href'])
                print (visited)
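
For what it's worth, here is a minimal sketch of what I suspect is happening (an assumption, not my actual scraper; test_url is just a stand-in for the same Wikipedia URL): if urlopen raises on the https request, the bare except only prints the URL, htmltext never gets assigned, and the BeautifulSoup line then fails with the NameError shown above.

import urllib.request
from bs4 import BeautifulSoup

test_url = "https://en.wikipedia.org/wiki/Main_Page"  # same source URL as in my scraper
try:
    # if this raises (e.g. an SSL or URLError on the https request), htmltext is never set
    htmltext = urllib.request.urlopen(test_url).read()
except Exception as e:
    # printing the exception here would show the real underlying error instead of hiding it
    print(test_url, e)

# raises NameError: name 'htmltext' is not defined whenever the fetch above failed
sourceCode = BeautifulSoup(htmltext, "html.parser")
print(sourceCode.title)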