我正在使用 pushshift api 在 Reddit subreddit 中收集帖子。但是请求返回的一些数据抛出了一个错误:
"None of [Index(['author', 'id', 'title', 'score', 'created_utc', 'permalink',\n 'num_comments'],\n dtype='object')] are in the [columns]"
我知道这意味着某些列为空(缺失),但问题是我针对 112 个不同的日期运行了这个请求,其中 94 次都成功了。既然理想情况下请求每次都应返回相同格式的数据,我很难弄清楚该如何解决这个问题,因为我唯一更改的是 before 和 after 的天数范围。
我无法在这里贴出返回的整个 JSON 对象,但您可以通过运行以下代码自行检查:
import requests
from pprint import pprint
# Fetch the submissions for a single one-day window and dump the raw payload
# so its structure can be inspected by hand.
link = "https://api.pushshift.io/reddit/search/submission/?size=100&before=111d&after=112d&sort_type=num_comments&sort=desc&subreddit=wallstreetbets"
# A timeout keeps the call from hanging indefinitely if the API stalls.
response = requests.get(link, timeout=30)
# Surface HTTP failures as a clear HTTPError instead of a confusing JSON
# decoding error (or an error payload mistaken for real data).
response.raise_for_status()
sample = response.json()
pprint(sample)
import pandas as pd
import requests
# I only need these columns from the entire df
COLUMNS = ['author', 'id', 'title', 'score', 'created_utc', 'permalink', 'num_comments']

# Walk the one-day windows from 112 days ago up to 1 day ago and write one CSV
# per window.
for i in reversed(range(2, 113)):
    after_ = i
    before_ = i - 1
    link = f"https://api.pushshift.io/reddit/search/submission/?size=100&before={before_}d&after={after_}d&sort_type=num_comments&sort=desc&subreddit=wallstreetbets"
    path = f"/content/drive/MyDrive/UsersWSB/Posts/days_after_{after_}.csv"

    try:
        # The timeout prevents one stalled request from blocking the whole run.
        response = requests.get(link, timeout=30)
        # Turn non-2xx answers into an exception instead of parsing an error body.
        response.raise_for_status()
        sample = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON on older versions of
        # requests. Report which window failed and carry on with the next one.
        print(f"days_after_{after_}: request failed: {e}")
        continue

    records = sample.get("data") if isinstance(sample, dict) else None
    if not records:
        # A DataFrame built from an empty list has no columns at all, so
        # selecting COLUMNS from it raises
        # "None of [Index([...])] are in the [columns]".
        print(f"days_after_{after_}: no submissions returned, skipping")
        continue

    # reindex() keeps the column order and fills any field missing from the
    # records with NaN rather than raising KeyError.
    df = pd.DataFrame(records).reindex(columns=COLUMNS)
    df.to_csv(path)
答案 0 :(得分:0)
在您的完整代码中,您没有检查一种可以预见的情况:请求失败,即 status_code != 200(或者返回的数据为空)。下面的代码在使用返回的数据之前先检查了这些条件:
import pandas as pd
import requests
# Start from an empty frame.
# NOTE(review): the original indentation was lost. Presumably this guard is
# flipped to False when re-running (e.g. in a notebook) so that rows already
# downloaded are kept and the "already downloaded" check below can skip them.
# Confirm the intended scope.
if True:
    df = pd.DataFrame()

link = "https://api.pushshift.io/reddit/search/submission/"
# Fields kept from every submission record.
wanted = ['author', 'id', 'title', 'score', 'created_utc', 'permalink', 'num_comments']

for i in reversed(range(2, 113)):
    after_ = i
    before_ = i - 1
    params = {
        "size": 100,
        "before": f"{before_}d",
        "after": f"{after_}d",
        "sort_type": "num_comments",
        "sort": "desc",
        "subreddit": "wallstreetbets",
    }

    # DL is very slow, only DL if haven't done so already
    if not df.empty and (df.before.eq(before_) & df.after.eq(after_)).any():
        continue

    try:
        # The timeout keeps a stalled connection from blocking the whole run.
        req = requests.get(link, params=params, timeout=60)
    except requests.RequestException as exc:
        # Network failure for this window: report it and try the next one.
        print(f"request failed {params} {exc}")
        continue

    # Parse the body exactly once; every call to req.json() re-decodes it.
    payload = None
    if req.status_code == 200:
        try:
            payload = req.json()
        except ValueError:
            # A 200 answer whose body is not JSON is treated as an error.
            payload = None

    rows = payload.get("data") if isinstance(payload, dict) else None

    # check request succeeded and it returned some data...
    if rows:
        # reindex() fills a field missing from the records with NaN instead of
        # raising KeyError; before/after tag the rows with their window.
        day = pd.json_normalize(rows).reindex(columns=wanted).assign(before=before_, after=after_)
        df = pd.concat([df, day])
    else:
        print(f"{req.status_code} {params} {'error' if payload is None else payload}")

print(f"***DONE*** downloaded: {len(df)}")