I'm having a problem with my web scraper and Python's urlparse module. The code essentially crawls a given domain, e.g. bloomberg, and downloads all of the HTML to my desktop. It is still at a fairly early stage, so I'm sure you will notice mistakes and so on (I'm new to Python).
The specific problem I'm currently running into is with the function reconstruct_url. I have tested urlparse.urljoin(a, b) on its own and it behaves the way I expect, but inside this class it just doesn't seem to like it. Could any of you help me spot the problem here?
If any other issues in my code are obvious to you, please feel free to point them out as well; this is my first attempt at writing a complete program, although as noted it is still at a relatively early stage. Many thanks for your help.
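For reference, the standalone check mentioned above was along these lines (a minimal sketch; the URLs are illustrative, not the exact values I tested with):

# Standalone check of urljoin outside the class -- behaves as expected.
from urlparse import urljoin

print urljoin('http://bloomberg.com/', '/news/economy')
# prints: http://bloomberg.com/news/economy

My code so far is below.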
# note: <meta content='Story' property='bb:resource_type'>

import urllib2
import os
from bs4 import BeautifulSoup
from urlparse import urljoin


class Spider:

    links_to_crawl = []
    crawled_links = []
    ignored_links = ['/']
    domain = 'http://bloomberg.com/'
    # meta type = ('meta', {'property','bb:resource_type'})['content']=='Story'

    # append the starting url to links_to_crawl
    def __init__(self, url):
        print 'Spider initialising...'
        self.links_to_crawl.append(url)

    # open input url and return html
    def grab_html(self, url):
        open_url = self.urllib2.urlopen(url)
        data = open_url.read()
        open_url.close()
        return data

    # return title from input html for file naming and ensure
    # no '/' present in title.
    def get_title(self, data=''):
        title_start = data.find('<title>')+7
        title_end = data.find('</title>')-1
        title = data[title_start:title_end]
        title = title.translate(None, '/')
        return title+".txt"

    # return date from input html for file saving structure
    def get_date(self, data=''):
        soup = self.BeautifulSoup(data)
        # try statement to avoid error when meta tag combinations
        # not found.
        try:
            date = soup.find('meta', {'name':'pubdate'})['content']
            return date[:12]  # !! only tested with bloomberg.com !!
        # if there is no published date, return 'Other'
        except TypeError:
            return 'Other'

    # if link is relative url return 'Rel' or
    # if url is allowed domain return 'Abs', else False.
    def url_type(self, url=''):
        if url[0:4] != 'http':
            return 'Rel'
        elif url.find(self.domain) != -1:
            return 'Abs'
        else:
            return False

    # reconstruct relative url
    def reconstruct_url(self, page='', rel=''):
        print page  # debug
        print rel  # debug
        print self.urljoin(page, rel)  # debug
        return self.urljoin(page, rel)

    # get all links in input html and append to links_to_crawl
    # unless in crawled_links or ignored_links
    # if link is relative url reconstruct url and append to
    # links_to_crawl, append relative url to ignored_links
    def get_links(self, data=''):
        soup = self.BeautifulSoup(data)
        for link in soup.find_all('a'):
            # try statement to avoid error when finding
            # <a> tags without 'href'
            try:
                if link['href'] in self.ignored_links or self.crawled_links:
                    pass
                else:
                    if self.url_type(link['href']) == 'Rel':
                        reconstructed_link = self.reconstruct_url(self.domain, link['href'])  # to change !!!!!!!!!!!!!!!!!
                        self.links_to_crawl.append(reconstructed_link)  # append reconstructed link to links_to_crawl
                        self.ignored_links.append(link['href'])  # append original link to ignored_links
                    else:
                        self.links_to_crawl.append(link['href'])
            except KeyError:
                pass

    # if directory exists do nothing
    # if directory does not exist write directory
    def ensure_dir(self, directory=''):
        if self.os.path.exists(directory):
            pass
        else:
            self.os.makedirs(directory)

    # ensure the html being saved is the type requested
    # currently only compatible with 1 meta type
    def ensure_meta_type(self, data=''):
        soup = self.BeautifulSoup(data)
        try:
            soup.find('meta', {'property':'bb:resource_type'})['content'] == 'Story'
            print 'True'
            return True
        except TypeError:
            print 'False'
            return False

    # save input html to txt file on mac os desktop and return
    # absolute path to file
    def save_html(self, data=''):
        if self.ensure_meta_type(data):
            print 'SAVING URL'
            # allocate save path for file and ensure save path exists
            save_path = self.os.path.abspath('/Users/sampeka/Desktop/Python Spider'+'/'+self.get_date(data))
            self.ensure_dir(save_path)
            # get file name and write file to absolute path
            file_name = self.get_title(data)
            absolute_path = save_path+'/'+file_name
            opened_file = open(absolute_path, 'w')
            opened_file.write(data)
            opened_file.close()
        else:
            pass

    # crawl links_to_crawl and pop to crawled_links list
    # if ValueError then pop to ignored_links
    # except urllib2.URLError to avoid web crawler crawling
    # non-url links
    def crawl_links(self):
        while len(self.links_to_crawl) > 0:
            url = self.links_to_crawl[0]
            print url
            try:
                data = self.grab_html(url)
                self.get_links(data)
                self.save_html(data)
                self.crawled_links.append(self.links_to_crawl.pop(0))
            except (ValueError, self.urllib2.URLError):
                self.ignored_links.append(self.links_to_crawl.pop(0))
        print 'Spider finished.'
        print 'Ignored links:'
        print self.ignored_links
        print 'Crawled links:'
        print self.crawled_links


spider = Spider('http://www.bloomberg.com/news')
spider.crawl_links()
Answer 0 (score: 1)
Your reconstruct_url() doesn't work because you are trying to call self.urljoin, which is not a method defined on Spider. Just use the function you imported from urlparse:
# reconstruct relative url
def reconstruct_url(self, page='', rel=''):
    print page  # debug
    print rel  # debug
    print urljoin(page, rel)  # debug
    return urljoin(page, rel)
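As a quick sanity check, the imported urljoin resolves relative hrefs against a base URL like this (a minimal sketch with made-up URLs, not part of the original answer):

from urlparse import urljoin

# an href with a leading slash replaces the base URL's path
print urljoin('http://bloomberg.com/', '/news/articles/some-story')
# prints: http://bloomberg.com/news/articles/some-story

# an href without a leading slash resolves relative to the base's directory
print urljoin('http://bloomberg.com/news/', 'some-story.html')
# prints: http://bloomberg.com/news/some-story.html

Since get_links passes self.domain ('http://bloomberg.com/') as the base, it is the first form that applies to hrefs such as '/news/...'.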
Answer 1 (score: 1)
As @twil mentioned, you are also using self for some other modules as well; here is a diff of your code with that fixed for all of the modules.