我正尝试抓取此网站上的数据:https://scrapingclub.com/exercise/detail_cookie/ ,但数据实际存储在这个网址:https://scrapingclub.com/exercise/ajaxdetail_cookie/?token=YQ00FY4B4D
每次访问时令牌都会更改
我不知道如何从第二个网址提取数据
import json
class FindgoldSpider(scrapy.Spider):
    """Request the AJAX detail endpoint and print the decoded JSON payload."""

    name = 'findgold'

    def start_requests(self):
        """Yield the single request to the AJAX endpoint.

        NOTE(review): the ``token`` query parameter is left empty here; the
        endpoint presumably needs the per-visit token issued by the detail
        page -- confirm and append it before relying on the response.
        """
        url = 'https://scrapingclub.com/exercise/ajaxdetail_cookie/?token='
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'X-Requested-With': 'XMLHttpRequest',
        }
        cookies = {
            '__cfduid': 'd6ec4f3842fa1b47ec88a035af6773fd11599773899',
        }
        # The cookies dict was previously built but never attached to the
        # request; pass it so it is actually sent.
        yield scrapy.http.Request(url, headers=headers, cookies=cookies)

    def parse(self, response):
        """Decode the JSON body of ``response`` and print it."""
        # ``json`` is a method on Scrapy text responses; it must be called,
        # otherwise the bound method object is printed instead of the data.
        data = response.json()
        print(data)
答案 0 :(得分:0)
如果我没有理解错您的问题,希望下面的代码对您有所帮助。
import scrapy
import json
from ..items import JsonscrapyItem
class PatelcoSpider(scrapy.Spider):
    """Fetch the detail_cookie exercise page and print two XPath extractions."""

    name = 'patelco'
    # Must match the host of start_urls: Scrapy's offsite filtering drops
    # requests whose host is not covered by allowed_domains, and the previous
    # value ('patelco.org') did not cover scrapingclub.com.
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/detail_cookie/']

    def start_requests(self):
        """Yield one request for the first start URL with browser-like headers."""
        headers = {
            'authority': 'scrapingclub.com',
            'accept': '*/*',
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'cors',
            'sec-fetch-dest': 'empty',
            'referer': 'https://scrapingclub.com/exercise/detail_cookie/',
            'accept-language': 'en-US,en;q=0.9',
            'cookie': '__cfduid=dadbc0660498959aca2ad988813bede171600011839; _ga=GA1.2.1127524216.1600011846; _gid=GA1.2.685369613.1600011846; _gat_gtag_UA_39890589_8=1; token=14Z022XFYS',
        }
        # Carried along in request.meta under 'data'; parse() below does not
        # read it.
        params = (
            ('token', '14Z022XFYS'),
        )
        yield scrapy.Request(
            url=self.start_urls[0],
            meta={'data': params},
            headers=headers,
            callback=self.parse,
        )

    def parse(self, response):
        """Print the link href and the paragraph text selected by XPath."""
        hreflink = response.xpath('/html/body/div/div/div[2]/div/div[2]/p[1]/a/@href').extract()
        data = response.xpath('/html/body/div/div/div[2]/div/div[2]/p[2]/text()').extract()
        print('\n', hreflink, '\n')
        print('\n', data, '\n')
答案 1 :(得分:0)
感谢您的回答,它对我有很大帮助,但我仍然遇到问题:
import scrapy
import json
import re
class GoldSpider(scrapy.Spider):
    """Read the per-visit token from the detail page, then fetch the AJAX JSON.

    Flow: ``parse`` handles the detail page response, pulls the ``token``
    cookie out of its ``Set-Cookie`` headers and requests the AJAX endpoint
    with that token; ``parse_json`` prints the decoded JSON payload.
    """

    name = 'gold'
    # Without a start URL the spider issues no request and parse() never runs.
    start_urls = ['https://scrapingclub.com/exercise/detail_cookie/']

    def parse(self, response):
        """Extract the token cookie and yield the follow-up AJAX request."""
        token = None
        # A response may carry several Set-Cookie headers; scan all of them
        # instead of relying on a single value being the token cookie.
        for raw_cookie in response.headers.getlist('Set-Cookie'):
            # \b keeps names such as "csrftoken" from matching.
            match = re.search(r'\btoken=([^;]*)', raw_cookie.decode('utf-8'))
            if match:
                token = match.group(1)
                break
        if token is None:
            # Previously this case crashed with AttributeError/IndexError.
            self.logger.error('No token cookie found in response from %s', response.url)
            return

        header = {
            'authority': 'scrapingclub.com',
            'accept': '*/*',
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'cors',
            'sec-fetch-dest': 'empty',
            'referer': 'https://scrapingclub.com/exercise/detail_cookie/',
            'accept-language': 'en-US,en;q=0.9',
        }
        # The '_ga' / '_gid' entries were dropped: they only held placeholder
        # text, which would have been sent to the server as literal cookie
        # values. NOTE(review): presumably the endpoint only checks 'token'
        # -- confirm.
        cookie = {
            '__cfduid': 'd95301af9f316c3263fffa2e373424e8f1600024418',
            'token': token,
        }
        yield scrapy.Request(
            url='https://scrapingclub.com/exercise/ajaxdetail_cookie/?token=' + str(token),
            cookies=cookie,
            headers=header,
            callback=self.parse_json,
        )

    def parse_json(self, response):
        """Decode the JSON body of the AJAX response and print it."""
        # ``json`` is a method and must be called to obtain the parsed data.
        data = response.json()
        print(data)
The data is stored in https://scrapingclub.com/exercise/ajaxdetail_cookie/?token=(the token)