我正在试图抓住一家知名英国零售商的网站,但我遇到了我的CrawlSpider问题 - 我收到以下错误消息:
AttributeError:'NlCrawlerSpider'对象没有属性'_rules'
我使用示例here将我的常规蜘蛛转换为爬行蜘蛛;我也使用了建议here的规则的语法,但最终得到了相同的错误消息。非常感谢您的所有帮助 - 提前谢谢您!
# Scrapy
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# Other Packages
import time
from datetime import date
from selenium import webdriver
class NlCrawlerSpider(CrawlSpider):
name = 'nl_crawler'
allowed_domains = ['newlook.com']
start_urls = ['http://www.newlook.com/uk/womens/clothing/c/uk-womens-clothing?comp=NavigationBar%7Cmn%7Cwomens%7Cclothing#/?q=:relevance&page=1&sort=relevance&content=false']
rules = (
Rule(LinkExtractor(allow=r'\?q=:relevance&page=[1-130]&sort=relevance&content=false', ), callback='parse_item', follow=True),
)
def __init__(self):
self.driver = webdriver.Safari()
self.driver.set_window_size(800,600)
time.sleep(2)
def parse_item(self, response):
driver = self.driver
driver.get(response.url)
time.sleep(2)
# Collect products
products = driver.find_elements_by_class_name('plp-item ng-scope')
# Iterate over products; extract data and append individual features to NlScrapeItem
for item in products:
# Pull features
desc = item.find_element_by_class_name('product-item__name link--nounderline ng-binding').text
href = item.find_element_by_class_name('plp-carousel__img-link ng-scope').get_attribute('href')
# Generate a product identifier
identifier = href.split('/p/')[1].split('?comp')[0]
identifier = int(identifier)
# datetime
dt = date.today()
dt = dt.isoformat()
# Price Symbol removal and integer conversion
try:
priceString = item.find_element_by_class_name('price ng-binding').text
except:
priceString = item.find_element_by_class_name('price price--previous-price product-item__price--previous-price ng-binding ng-scope').text
priceInt = priceString.split('£')[1]
originalPrice = float(priceInt)
# discountedPrice Logic
try:
discountedPriceString = item.find_element_by_class_name('price ng-binding price--marked-down').text
discountedPriceInt = discountedPriceString.split('£')[1]
discountedPrice = float(discountedPriceInt)
except:
discountedPrice = 'N/A'
# NlScrapeItem
item = NlScrapeItem()
# Append product to NlScrapeItem
item['identifier'] = identifier
item['href'] = href
item['description'] = desc
item['originalPrice'] = originalPrice
item['discountedPrice'] = discountedPrice
item['firstSighted'] = dt
item['lastSighted'] = dt
yield item
加法: 所以我试图忽略使用crawlSpider的想法并遵循@jabargas思考 - 见下文:
def __init__(self):
self.driver = webdriver.Safari()
self.driver.set_window_size(800,600)
def start_requests(self):
n = 5
urls= []
for pageNumber in range(1,n):
url = 'http://www.newlook.com/uk/womens/clothing/c/uk-womens-clothing?comp=NavigationBar%%7Cmn%%7Cwomens%%7Cclothing#/?q=:relevance&page=%d&sort=relevance&content=false' % pageNumber
urls.append(url)
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
driver = self.driver
driver.get(response.url)
time.sleep(2)
# Collect products
products = driver.find_elements_by_class_name('plp-item ng-scope')
# Iterate over products; extract data and append individual features to NlScrapeItem
for item in products:
# Pull features
desc = item.find_element_by_class_name('product-item__name link--nounderline ng-binding').text
href = item.find_element_by_class_name('plp-carousel__img-link ng-scope').get_attribute('href')
# Generate a product identifier
identifier = href.split('/p/')[1].split('?comp')[0]
identifier = int(identifier)
# datetime
dt = date.today()
dt = dt.isoformat()
# Price Symbol removal and integer conversion
try:
priceString = item.find_element_by_class_name('price ng-binding').text
except:
priceString = item.find_element_by_class_name('price price--previous-price product-item__price--previous-price ng-binding ng-scope').text
priceInt = priceString.split('£')[1]
originalPrice = float(priceInt)
# discountedPrice Logic
try:
discountedPriceString = item.find_element_by_class_name('price ng-binding price--marked-down').text
discountedPriceInt = discountedPriceString.split('£')[1]
discountedPrice = float(discountedPriceInt)
except:
discountedPrice = 'N/A'
# NlScrapeItem
item = NlScrapeItem()
# Append product to NlScrapeItem
item['identifier'] = identifier
item['href'] = href
item['description'] = desc
item['originalPrice'] = originalPrice
item['discountedPrice'] = discountedPrice
item['firstSighted'] = dt
item['lastSighted'] = dt
yield item
不幸的是没有运气:它提取了48项细节。
答案 0 :(得分:0)
你可以这样做,直到第n页:
start_urls = ['http://www.newlook.com/uk/womens/clothing/c/uk-womens-clothing?comp=NavigationBar%%7Cmn%%7Cwomens%%7Cclothing#/?q=:relevance&page=%d&sort=relevance&content=false' % page_number' for page_number in range(1,n)]
其中n是最后一页+ 1
或者你可以使用scrapy分页 - 获取下一页的链接并按照它查找here。
答案 1 :(得分:0)
另一个可能的问题是,您没有在 init 方法中添加超级构造函数。
为此添加“ super(MySpider,self)。初始化(* a,** kw)”。
我遇到了同样的问题,并由此解决了问题。
因此 init 应该如下所示
def __init__(self, *a, **kw):
super(MySpider, self).__init__(*a, **kw)
//your initializations