import urllib
def get_page(url):
try:
import urllib
return urllib.urlopen(url).read()
except:
return ""
def get_next_target(seed):
start_link = seed.find('<a href=')
if start_link == -1:
return 0, 0
start_quote = seed.find('"', start_link)
end_quote = seed.find('"', start_quote + 1)
url = seed[start_quote + 1 : end_quote ]
return url, end_quote
def get_all_links(seed):
links = []
while True:
url, remaining_page = get_next_target(seed)
if url:
links.append(url)
seed = seed[remaining_page : ]
else:
break
return links
def crawl_web(seed):
tocrawl = [seed]
crawled = []
while tocrawl:
page = tocrawl.pop()
if page not in crawled:
crawled.append(page)
f = urllib.urlopen(page)
content = f.read()
tocrawl = tocrawl + get_all_links(content)
print crawled
crawl_web('https://en.wikipedia.org/wiki/Software_engineering')
我正在尝试构建一个网络抓取工具,但上面的代码会出现以下错误:
click here for viewing the error
请帮助!!