# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
class InfoSpider(scrapy.Spider):
    """Spider that follows every item link on the pageitems index page.

    Each link found under an <h3> heading is requested and handed to
    ``parse_page`` (not implemented yet).
    """
    name = 'info'
    allowed_domains = ['womenonlyconnected.com']
    start_urls = ['http://www.womenonlyconnected.com/socialengine/pageitems/index']

    def parse(self, response):
        # Follow each item link found under an <h3> heading.
        for href in response.xpath('//h3/a/@href').extract():
            yield Request(response.urljoin(href), callback=self.parse_page)

    def parse_page(self, response):
        # Item-page parsing not implemented yet.
        pass
以上是我的代码。使用这段代码我只能抓取到第一页的 24 个链接。我需要帮助：如何抓取页面上“查看更多”之后的所有链接？分页 URL 如下：http://www.womenonlyconnected.com/socialengine/pageitems/index
答案 0（得分：0）：
经过一些调查后发现,您可以使用以下网址进行分页:
http://www.womenonlyconnected.com/socialengine/pageitems/index?page=N
其中 N 以1开头为第一页等。所以我会像这样修改你的蜘蛛:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
class InfoSpider(scrapy.Spider):
    """Spider that paginates through the pageitems index and follows item links.

    Pagination uses ``?page=N`` where N starts at 1. The crawl is hard-capped
    at 100 pages because the site's "view more" element is always present in
    the HTML (it is only hidden via JavaScript), so Scrapy cannot reliably
    detect when the results run out.
    """
    name = 'info'
    allowed_domains = ['womenonlyconnected.com']
    start_urls = ['http://www.womenonlyconnected.com/socialengine/pageitems/index']
    page_num = 1  # index page currently being requested

    def parse(self, response):
        # Follow each item link found under an <h3> heading.
        for url in response.xpath('//h3/a/@href').extract():
            yield Request(response.urljoin(url), callback=self.parse_page)
        # Queue the next index page (hard cap at 100 — see class docstring).
        if self.page_num < 100:
            self.page_num += 1
            # BUG FIX: the original used bare `start_urls[0]`, which raises
            # NameError inside a method; it must be accessed via `self`.
            yield Request(self.start_urls[0] + '?page={}'.format(self.page_num),
                          callback=self.parse)

    def parse_page(self, response):
        # Item-page parsing not implemented yet.
        pass
我在第 100 页停止的原因是：想可靠地判断是否还有更多结果（从而决定是否继续翻到下一页）并不容易。理论上，你可以检查页面上是否存在“查看更多”元素。问题在于该元素始终存在于 HTML 中，只是在没有更多结果时被隐藏了。而这个隐藏是通过 JavaScript 实现的，所以在 Scrapy 看来它永远是可见的。要可靠地判断是否还有更多页面，你需要借助例如 Splash 这样的 JavaScript 渲染工具。