I have a problem with my web scraper. It runs through any regular old website like a charm, but as soon as it hits an https URL it doesn't seem to work.
This is the error I get when I try to run an https URL through the scraper: name 'htmltext' is not defined.
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import re
re.IGNORECASE = True
from urllib.parse import urlparse
#SourceUrl
url = "https://en.wikipedia.org/wiki/Main_Page"
urls = [url]
z = urlparse(urls[0])
TopLevel = z.scheme+'://'+z.netloc
visited =[url]
robotsUrl = TopLevel +'/robots.txt'
while len(urls) < 100:
    try:
        htmltext = urllib.request.urlopen(urls[0]).read()
        robots = urllib.request.urlopen(robotsUrl).read()
        disallowList = re.findall(b'Disallow\:\s*([a-zA-Z0-9\*\-\/\_\?\.\%\:\&]+)', robots)
    except:
        print (urls[0])
    sourceCode = BeautifulSoup(htmltext, "html.parser")
    urls.pop(0)
    print(len(urls))
    for link in sourceCode.findAll('a', href=True):
        if "http://" not in link['href']:
            link['href'] = urllib.parse.urljoin(url,link['href'])
        in_disallow = False
        for i in range(len(disallowList)):
            if (disallowList[i]).upper().decode() in link['href'].upper():
                in_disallow = True
                break
        if not in_disallow:
            if link['href'] not in visited:
                urls.append(link['href'])
                visited.append(link['href'])
                print (visited)
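
For what it's worth, here is a minimal sketch of what I suspect is happening (an assumption, not my actual scraper; test_url is just a stand-in for the same Wikipedia URL): if urlopen raises on the https request, the bare except only prints the URL, htmltext never gets assigned, and the BeautifulSoup line then fails with the NameError shown above.

import urllib.request
from bs4 import BeautifulSoup

test_url = "https://en.wikipedia.org/wiki/Main_Page"  # same source URL as in my scraper
try:
    # if this raises (e.g. an SSL or URLError on the https request), htmltext is never set
    htmltext = urllib.request.urlopen(test_url).read()
except Exception as e:
    # printing the exception here would show the real underlying error instead of hiding it
    print(test_url, e)

# raises NameError: name 'htmltext' is not defined whenever the fetch above failed
sourceCode = BeautifulSoup(htmltext, "html.parser")
print(sourceCode.title)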