I'm trying to scrape data from multiple pages using Scrapy and Selenium. I can log in successfully with the Selenium driver, but when my spider starts crawling it doesn't use the session Selenium logged into, and it only scrapes the data that is available to any (unauthenticated) user.
import scrapy
import requests
from bs4 import BeautifulSoup
from scrapy import Spider
from scrapy.selector import Selector
from selenium import webdriver
from Equipe.items import TestItem  # assumed path to the project's item class

class Brother(Spider):
    name = "spiderbrother"
    allowed_domains = ["mywebsite"]
    start_urls = ['https://mywebsite../']
    custom_settings = {
        'ITEM_PIPELINES': {
            'Equipe.pipelines.Brother': 500
        },
        'COOKIES_ENABLED': True
    }

    def parse(self, response):
        # Log in through Selenium; this session lives in the browser,
        # not in Scrapy's cookie jar.
        driver = webdriver.Firefox()
        driver.get("https://mywebsite../login")
        username = driver.find_element_by_id("email")
        password = driver.find_element_by_id("passwd")
        username.send_keys("myEmail")
        password.send_keys("MyPWD")
        driver.find_element_by_name("SubmitLogin").click()
        # Follow every top-level category link
        categories = Selector(response).xpath('//*[@id="leo-top-menu"]/ul/li/a')
        for categorie in categories:
            page_url = categorie.xpath('@href').extract_first()
            next_page = response.urljoin(page_url)
            if next_page:
                yield scrapy.Request(url=next_page, callback=self.types)

    def types(self, response):
        # Follow every sub-category link
        sub_categories = Selector(response).xpath('//*[@id="subcategories"]/div/div/div/h5/a')
        for sub_categorie in sub_categories:
            page_url = sub_categorie.xpath('@href').extract_first()
            next_page = response.urljoin(page_url)
            if next_page:
                yield scrapy.Request(url=next_page, callback=self.products)

    def products(self, response):
        products = Selector(response).xpath('//div[@class="product-image-container image"]/a')
        for product in products:
            url = product.xpath('@href').extract_first()
            # requests.get() runs outside Scrapy, so no session cookies are sent
            page = requests.get(url).text
            soup = BeautifulSoup(page, 'html.parser')
            item = TestItem()
            item["title"] = soup.find("h1").text
            item['image_url'] = soup.find("div", {"id": "image-block"}).img["src"]
            item['price'] = soup.find("span", {"id": "our_price_display"}).text
            try:
                item['availability'] = soup.find("span", {"id": "availability_value"}).text
            except AttributeError:
                item['availability'] = "Available"
            try:
                item['description'] = soup.find("div", {"itemprop": "description"}).text.strip()
            except AttributeError:
                item['description'] = "no description found"
            yield item
        next_page = response.xpath('//li[@class="pagination_next"]/a/@href').extract_first()
        next_page = response.urljoin(next_page)
        if next_page:
            yield scrapy.Request(url=next_page, callback=self.products)
I get all of the data except the price, which is only shown to logged-in users.
Logging in with a FormRequest instead of Selenium still gives the same problem. I also tried retrieving the data (just the price) before visiting the product page and parsing it with BeautifulSoup, and that did work, so BeautifulSoup seems to be the problem here.
This is how I log in with FormRequest:
from scrapy import FormRequest, Request

def parse(self, response):
    return FormRequest.from_response(
        response,
        formxpath="//*[@id='login_form']",
        formdata={'email': 'MyEmail', 'passwd': 'myPWD'},
        callback=self.after_login)

def after_login(self, response):
    categories = Selector(response).xpath('//*[@id="leo-top-menu"]/ul/li/a')
    for categorie in categories:
        page_url = categorie.xpath('@href').extract_first()
        next_page = response.urljoin(page_url)
        if next_page:
            yield Request(url=next_page, callback=self.types)
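(For reference, one way to reuse a Selenium login in Scrapy is to copy the driver's cookies into the first Scrapy request, so the rest of the crawl runs inside the authenticated session. A minimal sketch, assuming the same spider as above; the landing URL is a placeholder:)

def parse(self, response):
    driver = webdriver.Firefox()
    driver.get("https://mywebsite../login")
    driver.find_element_by_id("email").send_keys("myEmail")
    driver.find_element_by_id("passwd").send_keys("MyPWD")
    driver.find_element_by_name("SubmitLogin").click()
    # get_cookies() returns a list of dicts with 'name'/'value' keys,
    # which scrapy.Request accepts directly via its cookies argument.
    cookies = [{'name': c['name'], 'value': c['value']} for c in driver.get_cookies()]
    driver.quit()
    yield scrapy.Request(url="https://mywebsite../", cookies=cookies,
                         callback=self.after_login, dont_filter=True)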
Answer 0 (score: 0)
It turned out that requests.get() was opening the URL in a fresh session that wasn't logged in. So I used a scrapy Request to fetch the URL instead, with a callback to a new method parse_item(), so that BeautifulSoup parses the Scrapy response, and that works.
Updated code:
def products(self, response):
    products = Selector(response).xpath('//div[@class="product-image-container image"]/a')
    for product in products:
        url = product.xpath('@href').extract_first()
        page = response.urljoin(url)
        # Let Scrapy fetch the product page so the logged-in cookies are sent
        yield Request(url=page, callback=self.parse_item)
    next_page = response.xpath('//li[@class="pagination_next"]/a/@href').extract_first()
    next_page = response.urljoin(next_page)
    if next_page:
        yield Request(url=next_page, callback=self.products)

def parse_item(self, response):
    # Parse the authenticated Scrapy response with BeautifulSoup
    soup = BeautifulSoup(response.text, 'lxml')
    item = TestItem()
    item["title"] = soup.find("h1").text
    item['image_url'] = soup.find("div", {"id": "image-block"}).img["src"]
    item['price'] = soup.find("span", {"id": "our_price_display"}).text
    try:
        item['availability'] = soup.find("span", {"id": "availability_value"}).text
    except AttributeError:
        item['availability'] = "Available"
    try:
        item['description'] = soup.find("div", {"itemprop": "description"}).text.strip().replace(u'\xa0', u' ')
    except AttributeError:
        print("no description found")
    yield item
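As a side note, once the product page arrives as a Scrapy response, BeautifulSoup is optional: the same fields can be read with the response's built-in selectors. A rough equivalent of parse_item(), assuming the selectors match the same markup:

def parse_item(self, response):
    item = TestItem()
    item['title'] = response.xpath('//h1/text()').extract_first()
    item['image_url'] = response.xpath('//div[@id="image-block"]//img/@src').extract_first()
    item['price'] = response.xpath('//span[@id="our_price_display"]/text()').extract_first()
    item['availability'] = response.xpath('//span[@id="availability_value"]/text()').extract_first(default="Available")
    item['description'] = response.xpath('normalize-space(//div[@itemprop="description"])').extract_first()
    yield item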