I'm currently working on a web scraper that is supposed to go through a list of websites from a directory, visit each website's CSS stylesheets, check them for an @media tag (a basic way of checking for responsive design, I know there are other cases to consider), and print out every website that does not use responsive design in any of its files.
I'm fairly certain that my method of actually checking the CSS for an @media tag works fine, but the spider is not visiting all of the CSS files before deciding whether it has found one with an @media tag. I have a test file that logs debug output as the program runs, and it shows strange patterns, such as finishing the check of every CSS file and only afterwards printing out what was found in them, which shouldn't be happening.
I was hoping someone could look over my code and help me figure out why this isn't happening in the order I want. For reference, the goal is to visit a website, check every CSS file linked from its header, and only after all of them have been checked decide whether the site uses an @media tag anywhere.
Here is my code (not everything works perfectly - for example, the program times out because I haven't put TimeoutError to use yet, but for the most part I feel it should do the job of evaluating the websites correctly, and it isn't doing so):
import scrapy
import re
import os.path
from scrapy.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from twisted.internet.error import TimeoutError
import time
class LCCISpider(CrawlSpider):
    name = "lcci"
    start_urls = ["http://www.lancasterchamber.com/busdirectory.aspx?mode=category"]

    #Calls parse_item for every category link on main page
    rules = (Rule(SgmlLinkExtractor(restrict_xpaths=('//div[@id="catListingResults"]/table/tr')),
                  callback = 'parse_item', follow = True),)

    website_list = []
    found_media = False
    #Called for each category
    def parse_item(self, response):
        #For each site on the page, calls parse_website
        sites = response.xpath('//div[@id="busListingResults"]/table/tr')
        for site in sites:
            urls = site.xpath('.//td/a[4]/@href').extract()
            for url in urls:
                if len(url) == 0:
                    continue
                else:
                    new_site = response.urljoin(url)
                    yield scrapy.Request(new_site, callback=self.parse_website,
                                         errback=self.errback_website)
    def parse_website(self, response):
        f = open('output2.txt', 'a')
        f.write("NOW VISITING")
        f.flush()
        f.write(response.url)
        f.flush()
        f.write("\n")
        f.flush()
        f.close()
        #reset found_media to false for each website
        self.found_media = False
        #for every link in the header, check potential css for @media tag
        for href in response.css("head > link::attr('href')"):
            url = response.urljoin(href.extract())
            #if @media tag has not been found, continue checking css
            if self.found_media == False:
                #Call check_css for the url of the css file
                yield scrapy.Request(url, callback=self.check_css,
                                     errback=self.errback_website)
                f = open('output2.txt', 'a')
                f.write("step\n")
                f.flush()
                f.close()
            else:
                break
        #if no @media tag is found in any link in the header, add the url to the website_list
        if self.found_media == False:
            #self.website_list.append(response.url)
            f = open('output2.txt', 'a')
            f.write("No @media tag in")
            f.flush()
            f.write(response.url)
            f.flush()
            f.write("\n")
            f.flush()
            f.close()
            f = open('outputfalse2.txt', 'a')
            f.write(response.url)
            f.write("\n")
            f.close()
        else:
            f = open('outputtrue.txt', 'a')
            f.write(response.url)
            f.write("\n")
            f.close()
    def check_css(self, response):
        #Just a way of converting url into a string, the ".txt" is otherwise meaningless
        string = str(response.url)
        f = open('output2.txt', 'a')
        f.write("Checking CSS in ")
        f.write(response.url)
        f.write("\n")
        f.flush()
        f.close()
        #only perform regex search if it's a .css file
        if (string[-4:] == ".css"):
            media_match = re.search(r'@media', response.body, flags=0)
            if media_match != None:
                f = open('output2.txt', 'a')
                f.write("found @media tag in " + response.url + "\n")
                f.flush()
                #If an @media tag is found, set found_media to True
                self.found_media = True
                f.close()
        else:
            f = open('output2.txt', 'a')
            f.write("not css")
            f.flush()
            f.close()
    def errback_website(self, failure):
        if failure.check(TimeoutError):
            request = failure.request
            self.logger.error('TimeoutError on %s', request.url)
Answer (score: 1)
I went through it and couldn't help cleaning it up. Here is the fully cleaned-up code; there are barely any changes as far as the logic goes. What it does now is:

1. go to the start url and extract the category urls
2. go to every category url and extract the website urls
3. go to every website and extract the .css links from its header
4. go to every .css link
5. if the @media regex matches, yield an item with the css url and the website url

The only problem here is that, because of scrapy's asynchronous nature, you end up with a lot of duplicates, since several .css files from the same website may be crawled at the same time. For that we can use a simple pipeline that detects and drops the duplicates.
For future reference, you shouldn't use file writes for debugging. Take a look at scrapy shell; you can even use inspect_response inside parse to open a shell in the middle of a crawl, like:
from scrapy.shell import inspect_response

def parse(self, response):
    inspect_response(response, self)
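You can also open the same shell straight from the command line to try selectors out before putting them in the spider. A quick sketch, using the start url from the question:

scrapy shell "http://www.lancasterchamber.com/busdirectory.aspx?mode=category"
>>> response.css("head > link::attr('href')").extract()
>>> response.xpath('//div[@id="catListingResults"]/table/tr')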
Here is the working spider:
import re
from scrapy.spiders import CrawlSpider, Rule
from scrapy.exceptions import DropItem
from scrapy.linkextractors import LinkExtractor
from twisted.internet.error import TimeoutError
from scrapy import Request
# Simple pipeline that drops duplicate items: several .css files from the same
# website can match, so only the first item seen per website is kept.
class DupePipeline(object):
    def __init__(self):
        self.known_websites = set()

    def process_item(self, item, spider):
        if item['website'] in self.known_websites:
            raise DropItem('duplicate')
        self.known_websites.add(item['website'])
        return item
class LCCISpider(CrawlSpider):
    name = "lcci"
    start_urls = ["http://www.lancasterchamber.com/busdirectory.aspx?mode=category"]

    custom_settings = {
        'ROBOTSTXT_OBEY': False,
        'ITEM_PIPELINES': {
            'myproject.spiders.spider.DupePipeline': 666,
        }
    }

    # Calls parse_item for every category link on main page
    rules = (Rule(LinkExtractor(restrict_xpaths=['//div[@id="catListingResults"]/table/tr']),
                  callback='parse_item', follow=True),)  # why follow?
    # Called for each category
    def parse_item(self, response):
        # For each site on the page, calls parse_website
        sites = response.xpath('//div[@id="busListingResults"]/table/tr')
        for site in sites:
            urls = site.xpath('.//td/a[4]/@href').extract()
            for url in urls:
                if not url:
                    continue
                new_site = response.urljoin(url)
                yield Request(new_site,
                              callback=self.parse_website,
                              errback=self.errback_website)
    def parse_website(self, response):
        # for every link in the header, check potential css for @media tag
        for href in response.css("head > link::attr('href')").extract():
            if not href.endswith('.css'):  # only css files
                continue
            yield Request(response.urljoin(href),
                          meta={'website': response.url},
                          callback=self.check_css,
                          errback=self.errback_website)
    def check_css(self, response):
        media_match = re.search(r'@media', response.body, flags=0)
        if media_match:
            # return item!
            yield {'url': response.url,
                   'website': response.meta['website']}
    def errback_website(self, failure):
        if failure.check(TimeoutError):
            request = failure.request
            self.logger.error('TimeoutError on %s', request.url)
Running it with scrapy crawl lcci -o test.json for a few minutes, I got these results: http://pastebin.com/raw/kfsTKqUY
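For reference, the exported items can be collapsed into the set of websites where an @media rule was found. A minimal post-processing sketch, assuming the test.json produced by the command above and the 'url'/'website' fields yielded by check_css:

import json

# test.json is the feed export written by "scrapy crawl lcci -o test.json";
# each item holds the matching css file ('url') and the page that linked it ('website')
with open('test.json') as f:
    items = json.load(f)

# websites where at least one linked .css file contained an @media rule
responsive_sites = sorted({item['website'] for item in items})
print('\n'.join(responsive_sites))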