Saving PDF files via a Scrapy spider callback

Asked: 2013-04-22 05:40:14

Tags: python-2.7 web-crawler beautifulsoup scrapy

Hi, I have written the following code to extract the URLs that PDF files live at and to save each PDF through a callback, but my callback save_pdf is never called.

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from bs4 import BeautifulSoup

class spider_aicte(BaseSpider):
    name = "Colleges"
    allowed_domains = ["http://www.xyz.org"]
    start_urls = [
        "http://www.xyz/appwebsite.html",
        "http://www.xyz/misappengineering.htm",
        ]

    def parse(self, response):
        filename = response.url.split("/")[-2]
        soup = BeautifulSoup(response.body)
        for link in soup.find_all('a'):
            download_link = link.get('href')
            if '.pdf' in download_link:
                pdf_link = "http://www.xyz.org" + download_link
                print pdf_link
                print "pdf_link";
                for url in pdf_link:
                    print "inforLOOP-----pdflink"
                    yield Request(url, callback=self.save_pdf)

    def save_pdf(self, response):
        print "SAVING"
        path = self.get_path(response.url)
        with open(path, "wb") as f:
            f.write(response.body)
            print "SAVED"

0 Answers:

No answers yet.
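
Although no answer was posted, two details in the snippet would explain why save_pdf never fires. First, allowed_domains must contain bare domain names; a value with a scheme such as "http://www.xyz.org" never matches a request's hostname, so Scrapy's OffsiteMiddleware silently drops every request yielded from parse (the start_urls still load because start requests skip that filter). Second, `for url in pdf_link:` iterates over the characters of the URL string, so each Request is built from a single character rather than from the link. Below is a minimal corrected sketch, keeping the placeholder www.xyz.org URLs and the Scrapy 0.x / Python 2 API from the question:

from scrapy.spider import BaseSpider
from scrapy.http import Request
from bs4 import BeautifulSoup

class spider_aicte(BaseSpider):
    name = "Colleges"
    # Bare domain only -- including a scheme here makes the
    # OffsiteMiddleware treat every yielded request as off-site.
    allowed_domains = ["www.xyz.org"]
    start_urls = [
        "http://www.xyz/appwebsite.html",
        "http://www.xyz/misappengineering.htm",
    ]

    def parse(self, response):
        soup = BeautifulSoup(response.body)
        for link in soup.find_all('a'):
            download_link = link.get('href')
            # link.get('href') can return None, so guard the test
            if download_link and '.pdf' in download_link:
                pdf_link = "http://www.xyz.org" + download_link
                # Yield one Request per link; looping over the string
                # would yield its individual characters instead.
                yield Request(pdf_link, callback=self.save_pdf)

    def save_pdf(self, response):
        # The question's get_path helper is not shown, so this sketch
        # names the file after the last segment of the URL instead.
        path = response.url.split('/')[-1]
        with open(path, 'wb') as f:
            f.write(response.body)

With those two changes the requests reach the downloader and save_pdf runs once per PDF link; everything else in the original spider can stay as it was.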