Scrapy:Crawlspider没有处理规则linkextractor

时间:2018-02-05 04:22:23

标签: python regex scrapy web-crawler python-requests

  

我尝试了几次但是如果我使用parse作为define方法在start_urls中插入链接,CrawlSpider工作正常:

    start_urls = ["https://rent.591.com.tw/home/search/rsList?is_new_list=1&type=1&kind=0&searchtype=1&region=1&firstRow=0&totalRows=9672",
"https://rent.591.com.tw/home/search/rsList?is_new_list=1&type=1&kind=0&searchtype=1&region=1&firstRow=30&totalRows=9672",
"https://rent.591.com.tw/home/search/rsList?is_new_list=1&type=1&kind=0&searchtype=1&region=1&firstRow=60&totalRows=9672",
"https://rent.591.com.tw/home/search/rsList?is_new_list=1&type=1&kind=0&searchtype=1&region=1&firstRow=90&totalRows=9672",    ]
  

但是在使用LinkExtractor时,爬虫一直都没有引用

> 2018-02-04 22:36:18 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)

>2018-02-04 22:36:18 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6023

>2018-02-04 22:36:19 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://rent.591.com.tw/robots.txt> (referer: None)

>2018-02-04 22:36:24 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://rent.591.com.tw/home/search/rsList?is_new_list=1&type=1&kind=0&searchtype=1&region=1&firstRow=0&totalRows=9672> (referer: None)
  

具体问题可能是什么?我已经整天都在努力了。   LinkExtractor是否不适用于JSON响应和Java请求?或者规则设置是否适合以下情况?

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup
import json


class crawler(CrawlSpider):
name = "test1"
allowed_domains = ["rent.591.com.tw"]
start_urls = ["https://rent.591.com.tw/home/search/rsList?is_new_list=1&type=1&kind=0&searchtype=1&region=1&firstRow=0&totalRows=9672"]


rules = [Rule(LinkExtractor(allow=('/search/rsList?is_new_list=1&type=1&kind=0&searchtype=1&region=1&firstRow=([30]{0,4})&totalRows=9672'),),
                                callback='parse_list', follow = True)]



def parse_list (self, response):
    jsondata = json.loads(response.text)
    house_detail_domain = "https://rent.591.com.tw/rent-detail-{0}.html"

    for i in range(99):
        print ("========================test===================")


    print (jsondata.keys())

    print (jsondata["status"])
    print (jsondata["records"])
    print (jsondata["is_recom"])

    #print (jsondata["data"])
    print (jsondata["data"].keys())

    #print (jsondata["data"]["biddings"]
    #print (jsondata["data"]["page"]
    print (jsondata["data"]["data"][0])


    #print (jsondata["data"]["topData"]

    print ("========================topdata===================")
    print ("========================test===================")

    print (jsondata["data"]["topData"][0].keys())
    print (jsondata["data"]["topData"][0].values())
    print (jsondata["data"]["topData"][0].items())
    print (jsondata["data"]["topData"][0])
    print ("========================test===================")
    print ("========================test===================")

    for topdata_house in jsondata["data"]["topData"]:
        print (topdata_house.items())
        for each_data in topdata_house.items():
            print (each_data)


    #print (jsondata["data"]["data"])
    print ("========================test===================")
    print ("========================data===================")



    print (jsondata["data"]["data"][0].keys())
    print (jsondata["data"]["data"][0].values())
    print (jsondata["data"]["data"][0].items())
    print (jsondata["data"]["topData"][0])

    print ("========================test===================")


    for data_house in jsondata["data"]["data"]:
        print (data_house.items())
        for each_data in data_house.items():
            print (each_data)

    for i in range(59):
        print ("========================topdata_house_link===================")


    for topdata_house in jsondata["data"]["topData"]:
    #            for each_data in topdata_house.items():
        print (topdata_house["post_id"])
        print (topdata_house["detail_url"])
        yield scrapy.Request(house_detail_domain.format(topdata_house["post_id"]), self.parse_house_detail)
        print (topdata_house["post_id"])
        print (topdata_house["detail_url"])

    for i in range(59):
        print ("========================data_house_link===================")

    for data_house in jsondata["data"]["data"]:
    #            for each_data in data_house.items():
        print (data_house["post_id"])
    #            print (data_house["detail_url"])
        yield scrapy.Request(house_detail_domain.format(data_house["post_id"]), self.parse_house_detail)
        print (data_house["post_id"])

    print ("========================test===================")

def parse_house_detail (self, response):
    house_detail = BeautifulSoup(response.text, "lxml")
    print ("===========================================")
    print (house_detail.select(".houseInfoTitle")[0].text)
    print ("===========================================")
    print (house_detail.select(".pageView")[0].text)
    print ("===========================================")
    print (house_detail.select(".detailInfo")[0].text)
    print ("===========================================")
    print (house_detail.select(".houseIntro")[0].text)
    print ("===========================================")
    print (house_detail.select(".lifeBox")[0].text)
    print ("===========================================")
    print (house_detail.select(".labelList")[0].text)
    print ("===========================================")
    print (house_detail.select(".facility")[0].text)
    print ("===========================================")
    print (house_detail.select(".userInfo"))
    print ("===========================================")
    print (house_detail.select(".banner"))
    print ("===========================================")
    print (house_detail.select("#show"))

0 个答案:

没有答案