Web crawler URL address failure

Time: 2014-06-09 19:21:44

Tags: python url web-crawler

I have implemented a simple web crawler. I keep a list named crawled for the URLs that have already been crawled. You can see the full code below. The code runs fine, but it does not print the correct links.

import urllib2
def record_user_click(index,keyword,url):
    urls = lookup(index, url)
    if urls:
        for entry in index:
            if entry[0] == url:
                entry[1] = entry[1] + 1

def add_to_index(index, keyword, url):
    if keyword in index:
        index[keyword].append(url)
    else:
        index[keyword] = [url]


def get_page(url):
    try:
        return urllib2.urlopen(url).read()
    except Exception:  # network errors and unfetchable URL schemes both yield an empty page
        return ""

def union(a, b):
    for e in b:
        if e not in a:
            a.append(e)

def get_next_target(page):
    # note: ('<a href=' or '" href=') evaluates to just '<a href=', so only that pattern is searched
    start_link = page.find('<a href=' or '" href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links

def crawl_web(seed, max_pages=200):
    tocrawl = [seed]
    crawled = []
    graph = {}
    index = {}
    while tocrawl and len(crawled) < max_pages:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content)
            graph[page] = outlinks
            union(tocrawl, outlinks)
            crawled.append(page)
    #print crawled
    return crawled, index, graph



def add_page_to_index(index, url, content):
    words = content.split()
    for word in words:
         add_to_index(index, word, url)


def lookup(index, keyword):
    if keyword in index:
        return index[keyword]
    return None


crawled,index, graph = crawl_web('http://en.wikipedia.org/wiki/Information')
print crawled

When I run the program, it prints the links. The last URL in the output is javascript:bgscro(3), but that is not a valid URL. How can I fix this?

[...,'javascript:bgscro(3)']
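
For illustration, here is a minimal snippet showing why this happens (the HTML anchor is a hypothetical example): get_next_target returns the value of any href attribute, whether it is an http URL or a javascript: pseudo-link.

sample = '<a href="javascript:bgscro(3)">scroll</a>'
print get_next_target(sample)   # -> ('javascript:bgscro(3)', 29)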

1 Answer:

Answer 0 (score: 0)

It looks like the crawler is picking up a link that is actually meant to trigger a JavaScript function. If you want to ignore such cases, you could edit your get_all_links function like this:

def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            if not url.startswith("javascript:"):  # ignore javascript pseudo-links
                links.append(url)
            page = page[endpos:]  # keep scanning even when a javascript link is skipped
        else:
            break
    return links

Alternatively, filter your list of links before processing them:

outlinks = filter(lambda x: not x.startswith("javascript:"), outlinks)
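
In the question's crawl_web, the natural place to apply that filter is right after the links are extracted. A sketch, using the variable names from the question:

# inside crawl_web's main loop, after fetching the page content:
outlinks = get_all_links(content)
outlinks = filter(lambda x: not x.startswith("javascript:"), outlinks)  # drop javascript pseudo-links
graph[page] = outlinks
union(tocrawl, outlinks)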

You will most likely run into many edge cases like this.
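
More generally, instead of special-casing each bad prefix, one option is to keep only links whose scheme the crawler can actually fetch. A minimal sketch using the standard urlparse module (Python 2, to match the question's code; the allowed schemes and the is_crawlable name are illustrative choices):

from urlparse import urlparse, urljoin

def is_crawlable(url, base=None):
    # optionally resolve relative links against the current page, then check the scheme
    if base:
        url = urljoin(base, url)
    return urlparse(url).scheme in ("http", "https")

# example: outlinks = [link for link in get_all_links(content) if is_crawlable(link, page)]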