我已经编写了代码来擦除weedmaps.com并获取提供医用大麻的分娩,医生和药房的数据。每个州的列表页面URL的数量取决于其拥有的提供程序的数量而有所不同。每个页面只有100个列表,所以我写了一个循环,告诉抓取工具抓取第n个页面的网址(即,科罗拉多州有882个列表,应该循环到page = 9(请参阅下面的代码以供参考)。>
运行代码时,它可以抓取第1页,但是只要它到达第2页,我就会收到403错误,尽管有标头(我的最佳猜测是该网站内置了强大的防御功能来与网络作战-爬网;我必须通过使用浏览器开发人员工具找到隐藏的API来找出代码)。我认为问题不在于速率限制,因为我之前一直在尝试通过time.sleep更改两次API调用之间的秒数,但我仍然遇到了这个问题。我认为还需要包含其他隐藏参数,以便我可以实际运行网络爬虫,但是不确定什么是识别这些参数的最佳方法。我该如何解决?
import csv
import requests
import json
import urllib3
import urllib.request
import pandas as pd
import time
from time import sleep
from urllib.request import Request, urlopen
state_list = ['colorado']
'''state_list = ['alabama', 'alaska', 'arizona', 'arkansas', 'california', 'colorado', 'connecticut', 'delaware', 'florida',
'georgia', 'hawaii', 'idaho', 'illinois', 'indiana', 'iowa', 'kansas', 'kentucky', 'louisiana', 'maine', 'maryland',
'massachusetts', 'michigan', 'minnesota', 'mississippi', 'missouri', 'montana', 'nebraska', 'nevada', 'new-hampshire',
'new-jersey', 'new-mexico', 'new-york', 'north-carolina', 'north-dakota', 'ohio', 'oklahoma', 'oregon', 'pennsylvania',
'rhode-island', 'south-carolina', 'south-dakota', 'tennessee', 'texas', 'utah', 'vermont', 'virginia', 'washington',
'west-virginia', 'wisconsin', 'wyoming', 'puerto-rico']'''
json_data = []
for state in state_list:
print("Starting on", state)
page1url = "https://api-g.weedmaps.com/discovery/v1/listings?page_size=100&size=100&filter%5Bregion_slug%5Bdoctors%5D%5D={0}&filter%5Bregion_slug%5Bdispensaries%5D%5D={0}&filter%5Bregion_slug%5Bdeliveries%5D%5D={0}&filter%5Bplural_types%5D%5B%5D=doctors&filter%5Bplural_types%5D%5B%5D=dispensaries&filter%5Bplural_types%5D%5B%5D=deliveries&page=1".format(state)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'}
req1 = Request(url=page1url, headers=headers)
data1 = urlopen(req1).read()
js1 = json.loads(data1)
num_listings = js1["meta"]["total_listings"]
print(num_listings)
if num_listings % 100 >= 1:
state_pages = (num_listings//100) + 1
remainder = num_listings % 100
print(state_pages, "pages to scrape through and", remainder, "listings on the last page")
else:
state_pages = num_listings / 100
print(state_pages, "pages to scrape through and no remainders on the last page")
for x in range(1, state_pages):
starturl = "https://api-g.weedmaps.com/discovery/v1/listings?page_size=100&size=100&filter%5Bregion_slug%5Bdoctors%5D%5D={0}&filter%5Bregion_slug%5Bdispensaries%5D%5D={0}&filter%5Bregion_slug%5Bdeliveries%5D%5D={0}&filter%5Bplural_types%5D%5B%5D=doctors&filter%5Bplural_types%5D%5B%5D=dispensaries&filter%5Bplural_types%5D%5B%5D=deliveries&page=".format(state)
url = starturl + str(x)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
time.sleep(5)
req = Request(url=url, headers=headers)
print(url)
data = urlopen(req).read()
try:
js = json.loads(data)
except:
js = None
print(state,"fail")
break
for numb in range(100):
weedmaps_dict={"id_":"NaN","wmid":"NaN","business":"NaN", "state":"NaN", "city":"NaN", "type_desc":"NaN", "web_url":"NaN","license_type":"NaN", "address":"NaN", "zip_code":"NaN", "timezone":"NaN"}
id_ = js["data"]["listings"][numb]["id"]
wmid = js["data"]["listings"][numb]["wmid"]
business = js["data"]["listings"][numb]["name"].encode("utf-8")
state = js["data"]["listings"][numb]["state"].encode("utf-8")
city = js["data"]["listings"][numb]["city"].encode("utf-8")
type_desc = js["data"]["listings"][numb]["type"].encode("utf-8")
web_url = js["data"]["listings"][numb]["web_url"].encode("utf-8")
license_type = js["data"]["listings"][numb]["license_type"].encode("utf-8")
address = js["data"]["listings"][numb]["address"].encode("utf-8")
zip_code = js["data"]["listings"][numb]["zip_code"].encode("utf-8")
timezone = js["data"]["listings"][numb]["timezone"].encode("utf-8")
if id_ is not None:
weedmaps_dict["id_"] = id_
if wmid is not None:
weedmaps_dict["wmid"] = wmid
if business is not None:
weedmaps_dict["business"] = business
if state is not None:
weedmaps_dict["state"] = state
if city is not None:
weedmaps_dict["city"] = city
if type_desc is not None:
weedmaps_dict["type_desc"]=type_desc
if web_url is not None:
weedmaps_dict["web_url"] = web_url
print(business, city, web_url)
if license_type is not None:
weedmaps_dict["license_type"] = license_type
if address is not None:
weedmaps_dict["address"]=address
if zip_code is not None:
weedmaps_dict["zip_code"] = zip_code
if timezone is not None:
weedmaps_dict["timezone"] = timezone
json_data.append(weedmaps_dict)
print(state, "page", x, " done")
df = pd.DataFrame(json_data)
df.to_csv("weedmaps2020q2.csv")
print("Saved")