我实现了一个简单的网络爬虫,并用一个名为 crawled 的数组保存已爬取的 URL 链接。完整代码如下。代码可以运行,但打印出的链接并不都正确。
import re
import urllib2
def record_user_click(index, keyword, url):
    """Increment the click counter stored for *url* under *keyword*.

    The counter lives in entries shaped like ``[url, count]`` inside
    ``index[keyword]``; every entry whose first element equals *url* has its
    count increased by one.  Nothing happens when *keyword* is not in
    *index*.

    NOTE(review): ``add_to_index`` in this file stores bare URL strings,
    which carry no counter.  Such entries are skipped here rather than
    raising -- confirm which entry shape the index is meant to use.

    Args:
        index: Mapping of keyword -> list of entries.
        keyword: The search term whose result was clicked.
        url: The URL that was clicked.
    """
    # The entries must be looked up by keyword (the index key), not by url.
    if keyword not in index:
        return
    for entry in index[keyword]:
        # Only mutable [url, count] pairs can hold a click count.
        if isinstance(entry, list) and entry[0] == url:
            entry[1] = entry[1] + 1
def add_to_index(index, keyword, url):
    """Record *url* under *keyword* in *index*.

    A new list is created the first time a keyword is seen; afterwards the
    URL is appended to the existing list (duplicates are kept).
    """
    index.setdefault(keyword, []).append(url)
def get_page(url):
    """Fetch *url* and return the response body.

    This is a best-effort fetch: any error while importing ``urllib2``,
    opening the URL or reading the response yields an empty string instead
    of propagating.

    Args:
        url: The address to download.

    Returns:
        The raw page content, or ``""`` when the page could not be fetched.
    """
    try:
        import urllib2
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            # Always release the underlying connection.
            response.close()
    except Exception:
        # Deliberately broad, but unlike a bare ``except:`` this lets
        # KeyboardInterrupt and SystemExit stop a long-running crawl.
        return ""
def union(a, b):
    """Extend list *a* in place with the items of *b* it does not contain.

    Order of first appearance in *b* is preserved and nothing is returned.
    """
    for item in b:
        if item in a:
            continue
        a.append(item)
# Matches the href value of an anchor tag.  The href may be preceded by other
# attributes (e.g. <a class="x" href="...">) and may use either quote style.
_ANCHOR_HREF_RE = re.compile(
    r'<a\s+(?:[^>]*?\s)??href\s*=\s*(["\'])(.*?)\1',
    re.IGNORECASE | re.DOTALL
)


def get_next_target(page):
    """Find the first anchor link in *page*.

    The original expression ``'<a href=' or '" href='`` always evaluated to
    ``'<a href='``, so anchors with attributes before ``href`` were never
    found; the pattern above covers both forms.  Unquoted href values are
    not recognised and are skipped.

    Args:
        page: HTML text to scan.

    Returns:
        A ``(url, end_quote)`` tuple where ``end_quote`` is the index of the
        quote closing the href value, so ``page[end_quote:]`` is the text
        still to be scanned.  ``(None, 0)`` when no link is found.  ``url``
        may be an empty string for ``href=""``.
    """
    match = _ANCHOR_HREF_RE.search(page)
    if match is None:
        return None, 0
    return match.group(2), match.end(2)
def get_all_links(page):
    """Collect every link target that ``get_next_target`` finds in *page*.

    Scanning only stops when ``get_next_target`` reports no further link
    (``url is None``).  An empty href is skipped instead of ending the scan,
    so links that follow it are still collected.

    Args:
        page: HTML text to scan.

    Returns:
        A list of the non-empty link targets, in document order.
    """
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url is None:
            break
        if url:
            links.append(url)
        if endpos <= 0:
            # A non-positive position cannot advance through the page;
            # stop rather than rescanning the same text forever.
            break
        page = page[endpos:]
    return links
def crawl_web(seed, max_pages=200):
    """Crawl the web starting at *seed*.

    URLs are taken from the end of the pending list, so the most recently
    discovered link is visited next.  Crawling stops when no URLs are left
    or once *max_pages* pages have been crawled.

    Args:
        seed: URL to start from.
        max_pages: Upper bound on the number of pages to crawl.

    Returns:
        A ``(crawled, index, graph)`` tuple: the list of crawled URLs in
        visit order, the keyword index filled by ``add_page_to_index``, and
        a dict mapping each crawled URL to the links found on it.
    """
    frontier = [seed]
    crawled = []
    index = {}
    graph = {}
    while frontier and len(crawled) < max_pages:
        url = frontier.pop()
        if url in crawled:
            continue
        html = get_page(url)
        add_page_to_index(index, url, html)
        links = get_all_links(html)
        graph[url] = links
        union(frontier, links)
        crawled.append(url)
    return crawled, index, graph
def add_page_to_index(index, url, content):
    """Index every whitespace-separated word of *content* under *url*.

    Each word is passed to ``add_to_index`` together with *url*.
    """
    for token in content.split():
        add_to_index(index, token, url)
def lookup(index, keyword):
    """Return the entries stored for *keyword*, or ``None`` if it is absent."""
    return index[keyword] if keyword in index else None
# Crawl outward from the Wikipedia "Information" article and show every URL
# that was visited.
crawled, index, graph = crawl_web('http://en.wikipedia.org/wiki/Information')
print(crawled)
执行程序时,会显示链接。输出中的最后一个url是javascript:bgscro(3),但它不是有效的url。我该如何解决这个问题?
[...,'javascript:bgscro(3)']
答案 0 :(得分:0)
看起来它找到了一个实际上用来触发 JavaScript 函数的链接。如果你想忽略这类情况,可以把 get_all_links 函数改成这样:
def get_all_links(page):
    """Collect the navigable link targets that ``get_next_target`` finds.

    Empty hrefs and ``javascript:`` pseudo-links are skipped, but scanning
    continues past them; only ``url is None`` (no further link) ends the
    scan.  Breaking on the first ``javascript:`` link would silently drop
    every link that follows it.

    Args:
        page: HTML text to scan.

    Returns:
        A list of link targets in document order, without empty or
        ``javascript:`` entries.
    """
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url is None:
            break
        # The scheme is matched case-insensitively, ignoring leading spaces.
        if url and not url.strip().lower().startswith("javascript:"):
            links.append(url)
        if endpos <= 0:
            # A non-positive position cannot advance through the page;
            # stop rather than rescanning the same text forever.
            break
        page = page[endpos:]
    return links
或者,在处理之前先过滤链接列表:
# Drop links that start with "javascript:" from outlinks before they are
# processed any further.
outlinks = filter(lambda x: not x.startswith("javascript:"), outlinks)
你很可能还会遇到很多类似的边缘情况。