您好我已经编写了以下代码来提取pdf文件所在的url并通过回调函数保存pdf。 但我的回调函数save_pdf永远不会被调用。
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from bs4 import BeautifulSoup
class spider_aicte(BaseSpider):
name = "Colleges"
allowed_domains = ["http://www.xyz.org"]
start_urls = [
"http://www.xyz/appwebsite.html",
"http://www.xyz/misappengineering.htm",
]
def parse(self, response):
filename = response.url.split("/")[-2]
soup = BeautifulSoup(response.body)
for link in soup.find_all('a'):
download_link = link.get('href')
if '.pdf' in download_link:
pdf_link = "http://www.xyz.org" + download_link
print pdf_link
print "pdf_link";
for url in pdf_link:
print "inforLOOP-----pdflink"
yield Request(url, callback=self.save_pdf)
def save_pdf(self, response):
print "SAVING";
path = self.get_path(response.url)
with open(path, "wb") as f:
f.write(response.body)
print "SAVED" ;