我尝试了几次但是如果我使用parse作为define方法在start_urls中插入链接,CrawlSpider工作正常:
start_urls = ["https://rent.591.com.tw/home/search/rsList?is_new_list=1&type=1&kind=0&searchtype=1®ion=1&firstRow=0&totalRows=9672",
"https://rent.591.com.tw/home/search/rsList?is_new_list=1&type=1&kind=0&searchtype=1®ion=1&firstRow=30&totalRows=9672",
"https://rent.591.com.tw/home/search/rsList?is_new_list=1&type=1&kind=0&searchtype=1®ion=1&firstRow=60&totalRows=9672",
"https://rent.591.com.tw/home/search/rsList?is_new_list=1&type=1&kind=0&searchtype=1®ion=1&firstRow=90&totalRows=9672", ]
但是在使用LinkExtractor时,爬虫一直都没有引用
> 2018-02-04 22:36:18 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
>2018-02-04 22:36:18 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023
>2018-02-04 22:36:19 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://rent.591.com.tw/robots.txt> (referer: None)
>2018-02-04 22:36:24 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://rent.591.com.tw/home/search/rsList?is_new_list=1&type=1&kind=0&searchtype=1®ion=1&firstRow=0&totalRows=9672> (referer: None)
具体问题可能是什么?我已经整天都在努力了。 LinkExtractor是否不适用于JSON响应和Java请求?或者规则设置是否适合以下情况?
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup
import json
class crawler(CrawlSpider):
name = "test1"
allowed_domains = ["rent.591.com.tw"]
start_urls = ["https://rent.591.com.tw/home/search/rsList?is_new_list=1&type=1&kind=0&searchtype=1®ion=1&firstRow=0&totalRows=9672"]
rules = [Rule(LinkExtractor(allow=('/search/rsList?is_new_list=1&type=1&kind=0&searchtype=1®ion=1&firstRow=([30]{0,4})&totalRows=9672'),),
callback='parse_list', follow = True)]
def parse_list (self, response):
jsondata = json.loads(response.text)
house_detail_domain = "https://rent.591.com.tw/rent-detail-{0}.html"
for i in range(99):
print ("========================test===================")
print (jsondata.keys())
print (jsondata["status"])
print (jsondata["records"])
print (jsondata["is_recom"])
#print (jsondata["data"])
print (jsondata["data"].keys())
#print (jsondata["data"]["biddings"]
#print (jsondata["data"]["page"]
print (jsondata["data"]["data"][0])
#print (jsondata["data"]["topData"]
print ("========================topdata===================")
print ("========================test===================")
print (jsondata["data"]["topData"][0].keys())
print (jsondata["data"]["topData"][0].values())
print (jsondata["data"]["topData"][0].items())
print (jsondata["data"]["topData"][0])
print ("========================test===================")
print ("========================test===================")
for topdata_house in jsondata["data"]["topData"]:
print (topdata_house.items())
for each_data in topdata_house.items():
print (each_data)
#print (jsondata["data"]["data"])
print ("========================test===================")
print ("========================data===================")
print (jsondata["data"]["data"][0].keys())
print (jsondata["data"]["data"][0].values())
print (jsondata["data"]["data"][0].items())
print (jsondata["data"]["topData"][0])
print ("========================test===================")
for data_house in jsondata["data"]["data"]:
print (data_house.items())
for each_data in data_house.items():
print (each_data)
for i in range(59):
print ("========================topdata_house_link===================")
for topdata_house in jsondata["data"]["topData"]:
# for each_data in topdata_house.items():
print (topdata_house["post_id"])
print (topdata_house["detail_url"])
yield scrapy.Request(house_detail_domain.format(topdata_house["post_id"]), self.parse_house_detail)
print (topdata_house["post_id"])
print (topdata_house["detail_url"])
for i in range(59):
print ("========================data_house_link===================")
for data_house in jsondata["data"]["data"]:
# for each_data in data_house.items():
print (data_house["post_id"])
# print (data_house["detail_url"])
yield scrapy.Request(house_detail_domain.format(data_house["post_id"]), self.parse_house_detail)
print (data_house["post_id"])
print ("========================test===================")
def parse_house_detail (self, response):
house_detail = BeautifulSoup(response.text, "lxml")
print ("===========================================")
print (house_detail.select(".houseInfoTitle")[0].text)
print ("===========================================")
print (house_detail.select(".pageView")[0].text)
print ("===========================================")
print (house_detail.select(".detailInfo")[0].text)
print ("===========================================")
print (house_detail.select(".houseIntro")[0].text)
print ("===========================================")
print (house_detail.select(".lifeBox")[0].text)
print ("===========================================")
print (house_detail.select(".labelList")[0].text)
print ("===========================================")
print (house_detail.select(".facility")[0].text)
print ("===========================================")
print (house_detail.select(".userInfo"))
print ("===========================================")
print (house_detail.select(".banner"))
print ("===========================================")
print (house_detail.select("#show"))