我正在尝试使用以下代码抓取网站:
import requests
import pandas as pd
# Fetch the latest Decision Tracker updates from gmatclub.com and print every
# entry of the "statistics" list in the JSON response.
with requests.Session() as connection:
    # Send a referer and a browser-like user agent with every request made
    # through this session.
    connection.headers.update(
        {
            "referer": "https://gmatclub.com/forum/decision-tracker.html",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.86 YaBrowser/21.3.0.740 Yowser/2.5 Safari/537.36",
        }
    )
    # Load the tracker page first; the response itself is discarded.
    # NOTE(review): presumably this primes session cookies for the API call
    # below -- confirm against the site's behaviour.
    _ = connection.get(
        "https://gmatclub.com/forum/decision-tracker.html",
        timeout=30,
    )
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = connection.get(
        "https://gmatclub.com/api/schools/v1/forum/app-tracker-latest-updates?limit=50&year=all",
        timeout=30,
    )
    # Fail with a clear HTTP error instead of a confusing JSON decode error
    # when the API answers with a 4xx/5xx page.
    response.raise_for_status()
    endpoint = response.json()
    for item in endpoint["statistics"]:
        print(item)
我不确定如何获取“决策跟踪器 - 实时更新”(Decision Tracker - Live Updates)栏目下的录取状态。
答案 0 :(得分:1)
勾号、叉号和圆圈分别表示申请已被录取、已被拒绝,或因某种原因仍待决定。此信息位于 status_id 字段中。
在网页源代码中可以找到这些数字的映射表。将其转换为 Python 字典后,我们就可以获得对应的状态,还可以重建勾号等图标:
import requests
# Rows of (numeric status id, CSS class, display name). The spellings are kept
# exactly as given (including 'admited' and the trailing space in the last
# name) because they are runtime data. There is no entry for id 2.
_STATUS_ROWS = (
    (1, 'mainApplicationSubmitted', 'Application Submitted'),
    (3, 'mainInterviewed', 'interviewed'),
    (4, 'mainAdmitted', 'admited'),
    (5, 'mainDenied', 'denied'),
    (6, 'mainDenied', 'denied'),
    (7, 'mainWaitListed', 'waitlisted'),
    (8, 'mainWaitListed', 'waitlisted'),
    (9, 'mainMatriculating', 'matriculating'),
    (10, 'mainWlAdmited', 'admitted From WL'),
    (11, 'mainResearching', 'researching Or Writing Essays'),
    (12, 'mainInvitedToInterview', 'invited To Interview'),
    (13, 'mainWithdrawn', 'withdrawn Application '),
)

# Maps a numeric status id to a dict with the keys 'id', 'class' and 'name'.
status_mapping = {
    status_id: {'id': status_id, 'class': css_class, 'name': label}
    for status_id, css_class, label in _STATUS_ROWS
}
# Fetch the latest Decision Tracker updates and print, for every entry, the
# status name from status_mapping together with a colour:
# 'green' for status id 4, 'red' for ids 5 and 6, 'grey' for everything else.
with requests.Session() as connection:
    # Send a referer and a browser-like user agent with every request made
    # through this session.
    connection.headers.update(
        {
            "referer": "https://gmatclub.com/forum/decision-tracker.html",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.86 YaBrowser/21.3.0.740 Yowser/2.5 Safari/537.36",
        }
    )
    # Load the tracker page first; the response itself is discarded.
    # NOTE(review): presumably this primes session cookies for the API call
    # below -- confirm against the site's behaviour.
    _ = connection.get(
        "https://gmatclub.com/forum/decision-tracker.html",
        timeout=30,
    )
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = connection.get(
        "https://gmatclub.com/api/schools/v1/forum/app-tracker-latest-updates?limit=50&year=all",
        timeout=30,
    )
    # Fail with a clear HTTP error instead of a confusing JSON decode error
    # when the API answers with a 4xx/5xx page.
    response.raise_for_status()
    endpoint = response.json()
    for item in endpoint["statistics"]:
        # Read the raw id once; .get() keeps the fallback message below from
        # raising a second error when the field is absent.
        raw_status_id = item.get("status_id")
        try:
            # Convert once and reuse for both the lookup and the colour.
            status_id = int(raw_status_id)
            status = status_mapping[status_id]['name']
        except (KeyError, TypeError, ValueError):
            # Only lookup/conversion failures are reported here; anything
            # else (e.g. KeyboardInterrupt) is no longer swallowed.
            print(f"Key {raw_status_id} is missing from status_mapping. Check the entry at {item.get('date')} to see what this key represents and add it to status_mapping.")
            continue
        if status_id == 4:
            status_short = 'green'
        elif status_id in (5, 6):
            status_short = 'red'
        else:
            status_short = 'grey'
        print(status, status_short)