我正在尝试从希腊的一个房地产网站抓取雅典多个地区的房源数据。该网站使用了某种反爬保护,所以我每次请求都会带上 cookie 和请求头(我知道还需要轮换 User-Agent 才不会被封,这是我下一步要做的;另外我还必须不断更换 IP 才能继续抓取——我在爬虫方面还是个新手)。
该网站每个页面显示 10 套房源,循环对前 4 套的迭代是正确的,之后就只输出 [scrapy.core.engine] DEBUG: Crawled (200)。
那么我到底哪里做错了,为什么不能像前几套那样正确地抓取剩余的房源?
import scrapy
class MainprojectSpider(scrapy.Spider):
    """Crawl one spitogatos.gr search-results page and visit each listing.

    ``start_requests`` fetches a single results page, ``parse`` follows the
    link found under every listing title, and ``parse_housing`` emits one
    item per listing page.  Every request carries the class-level
    ``headers`` and ``cookies``.
    """

    name = 'mainProject'
    allowed_domains = ['www.spitogatos.gr']
    download_delay = 5.0

    # Request headers sent with every request.
    # NOTE(review): presumably copied from a real Chrome session -- confirm.
    headers = {
        "authority": "www.spitogatos.gr",
        "pragma": "no-cache",
        "cache-control": "no-cache",
        "sec-ch-ua": "\" Not;A Brand\";v=\"99\", \"Google Chrome\";v=\"91\", \"Chromium\";v=\"91\"",
        "sec-ch-ua-mobile": "?0",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "sec-fetch-site": "same-origin",
        "sec-fetch-mode": "navigate",
        "sec-fetch-user": "?1",
        "sec-fetch-dest": "document",
        "accept-language": "en-GB,en;q=0.9,el-GR;q=0.8,el;q=0.7,en-US;q=0.6",
    }

    # Cookies sent with every request.
    # NOTE(review): these look like values captured from a browser session
    # and may expire -- confirm they are still valid before relying on them.
    cookies = {
        "PHPSESSID": "946hl5lpbcf6jalm271g2450u8",
        "spitogatosHomepageMap": "0",
        "currentCurrency": "EUR",
        "_ga": "GA1.2.1080968683.1626031679",
        "_gid": "GA1.2.1871094347.1627582033",
        "_hjTLDTest": "1",
        "_hjid": "66a31a57-9c80-47f1-8767-9bca91d47b1f",
        "_fbp": "fb.1.1626031679317.910543448",
        "__qca": "P0-1840509725-1625417320200",
        "_hjAbsoluteSessionInProgress": "0",
        "openedTabs": "1",
        "_gat_UA-3455846-10": "1",
        "_gat_UA-3455846-2": "1",
        "_hjIncludedInSessionSample": "1",
        "reese84": "3:5Stu7C2tUWBSIdWSdzMOSQ==:ppmbagk94sf6IvvS66908AApyTMfCE+K7i7PgkeyRs6C9VGkCcqBSz8ZsgbOx56c46ktjL+1iyfp8zL1PuiT7AUsmA9XLdcmMoDQm30MnEPgcbQl/dMQV1PgqtVJgWVwGabZlMhGM+T6D8zf5ENVuGhLJ81U74a+gr+GySA5Xx/CqUPcGa/YG2zNICEMnZN7D4bRwJq6vxEvOU+wbSfAE6OquI4ipeHR3dz8jBwY961ka2PfY7MoLCLeGdzPUu07yOxv41lvdcZbaj9/peyxLnLSFqD9QnV5MXsXy7mKE3eNoT46F/ITB8/GAVpc/zqW792F+7HuUkWJD/pWaNOsr6+rc75kpKw15xtN5oCw9Qh3Fw9SYUtfbFMTRXBrUt0Ow/Lv2C3oOLBQyVex80cr76c4ibxS/niuNvKA87f7XZc=:THUtE26ivNhlKtaznqNuX7swpAf4x5S8pF+xoBg5KwE=",
    }

    def start_requests(self):
        """Yield the single request for the search-results page."""
        url = 'https://www.spitogatos.gr/pwliseis-katoikies/athina-notia-proastia/'
        yield scrapy.Request(
            url=url,
            method='GET',
            cookies=self.cookies,
            headers=self.headers,
            callback=self.parse,
        )

    def parse(self, response):
        """Follow the link of every listing title on a results page.

        Yields one request per listing, handled by ``parse_housing``.
        """
        print(response.xpath('//span[@class="tile-v5-sr__details-price"]/text()').extract())
        houses = response.xpath('//h4[@class="tile-v5-sr__title"]')
        print("houses" + str(len(houses)))
        for house in houses:
            link = house.xpath('.//@href').get()
            print(link)
            if not link:
                # A title without an href cannot be followed; passing None
                # to response.follow would raise instead of skipping it.
                continue
            yield response.follow(
                url=link,
                callback=self.parse_housing,
                meta={'description': "name"},
                headers=self.headers,
                cookies=self.cookies,
            )
        # TODO: pagination is not enabled yet.  The earlier draft used the
        # malformed XPath "//li[@class='next']/a@href"; the attribute step
        # needs a slash: "//li[@class='next']/a/@href".

    def parse_housing(self, response):
        """Extract the detail lines of one listing page and yield an item.

        Scrapy callbacks may only yield requests, items (e.g. dicts) or
        ``None``; a bare list is rejected, so the extracted values are
        wrapped in a dict.
        """
        description = response.meta['description']
        details = response.xpath('.//h6[@class="line__text nowrap"]/text()').extract()
        # Placeholder debug output kept from the original draft.
        print("hello")
        print(details)
        print(response.xpath('.//div[@class="line"]/h6/text()').extract())  # square metres
        yield {
            'description': description,
            'details': details,
        }