I want to make some POST requests to this URL: http://31.193.137.143/whill/locator2/public/results

I am using Python requests and it works fine: the response to the POST request has status code 200, the same status code I get from the site itself (http://31.193.137.143/whill/locator2/public). However, I later want to merge this into a codebase that uses Scrapy, and there the responses come back with status code 301 or 302 and essentially no content. I have also noticed that the request body is now empty, instead of containing the key-value pairs I was passing originally.

What could be the problem?
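For context, this is roughly how the difference shows up on the requests side (a minimal sketch; the coordinates are placeholders, and the proxy/header arguments from my real call below are omitted):

import requests

# Minimal sketch with placeholder coordinates: check whether requests followed
# any redirects and what request body it actually sent.
r = requests.post(
    'http://31.193.137.143/whill/locator2/public/results',
    data={'lat': 51.5, 'lng': -0.12},
)
print(r.status_code)    # 200 in the working case
print(r.history)        # redirect responses requests followed along the way, if any
print(r.request.body)   # the urlencoded body that was sent, e.g. 'lat=51.5&lng=-0.12'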
Code using Python requests:
import os

import pandas as pd
import requests
from lxml import html


def get_location(lat, lng, p={'http': os.environ['http_proxy'], 'https': os.environ['https_proxy']}):
    ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko)'
    try:
        # POST the coordinates as form data and parse the marker data out of the returned HTML
        r = requests.post('http://31.193.137.143/whill/locator2/public/results',
                          data={'lat': lat, 'lng': lng},
                          verify=False, proxies=p, headers={'User-Agent': ua})
        df = pd.DataFrame(columns=['address', 'name', 'id'])
        tree = html.fromstring(r.text)
        shops = tree.xpath('//div[@class="inner-marker-data"]')
        df['address'] = [''.join(s.xpath('./span[@class="marker-address"]/text()')) for s in shops]
        df['name'] = [''.join(s.xpath('./h3/text()')) for s in shops]
        df['id'] = [s.xpath('./a/@data-item-id')[0] for s in shops]
        return df
    except Exception:
        print(lat, lng)
        return pd.DataFrame()
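I call it along these lines (placeholder coordinates; the default value of p assumes http_proxy/https_proxy are set in the environment):

# Example call with placeholder coordinates.
df = get_location(51.5074, -0.1278)
print(df.head())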
Code using Scrapy:
from scrapy import Spider, Request
import pandas as pd
import json
from shapely import geometry
import time


class WHSpider(Spider):
    name = 'WilliamHill'
    url = 'http://31.193.137.143/whill/locator2/public/results'
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_IP': 5
    }
    handle_httpstatus_list = [301, 302]
    headers = {
        'Proxy-Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Origin': 'http://31.193.137.143',
        'Upgrade-Insecure-Requests': '1',
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Referer': 'http://31.193.137.143/whill/locator2/public/',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    def __init__(self):
        super().__init__()
        self.data = pd.DataFrame()

    def start_requests(self):
        geo = json.load(open('*.json'))
        geo = [(g['town'], float(g['latitude']), float(g['longitude']), g['postcode']) for g in geo]
        for i in range(len(geo)):
            name, lat, lng, code = geo[i]
            # generate a ring of points around each location and POST each coordinate pair
            for latitude, longitude in set(list(geometry.Point(lat, lng).buffer(.1, resolution=1).exterior.coords)):
                request_body = json.dumps({'lat': latitude, 'lng': longitude, '_token': 'eKz2cHSu5EX7zKZGy3w226nCQpBUYHti1QFf99bO'})
                yield Request(self.url, method='POST', body=request_body, callback=self.parse,
                              headers=self.headers, dont_filter=True)

    def parse(self, response):
        # <do something>
        pass
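For completeness, a minimal sketch of the kind of check that can go in the callback to confirm what I described above (self.logger is Scrapy's built-in spider logger; this is illustrative, not my actual callback body):

# Illustrative callback body, assuming the spider above.
def parse(self, response):
    self.logger.info('status: %s', response.status)                     # 301 or 302, as described
    self.logger.info('sent body: %r', response.request.body)            # empty, not the posted key-value pairs
    self.logger.info('location: %s', response.headers.get('Location'))  # redirect target, if any
    self.logger.info('received %d bytes of content', len(response.body))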