我一直在尝试抓取某个特定主题标签下的Instagram帖子,以获取以下几个键(字段):display_url、taken_at_timestamp、text、edge_liked_by。最初的几百条帖子都能正常抓取,但之后唯独“text”字段无法再提取,其他三个字段仍然可以成功获取。我不确定为什么会这样。
我解析的是以下地址返回的JSON:https://www.instagram.com/explore/tags/something/?__a=1 。
# Walk every result page of an Instagram hashtag feed and collect, per post,
# the image URL, like count, timestamp and caption text.  After each page the
# rows gathered so far are written to ``instagram.xlsx``.
#
# NOTE(review): ``output`` (the list of rows), ``url_req``, ``json`` and ``pd``
# are not defined in this snippet; they are assumed to be set up earlier
# (presumably ``output = []`` and ``import urllib.request as url_req``).
base_url = "https://www.instagram.com/explore/tags/salonedelmobile/?__a=1"
url = base_url

while True:
    response = url_req.urlopen(url)
    try:
        json_file = json.load(response)
    finally:
        # Release the connection even if the payload is not valid JSON.
        response.close()

    media = json_file['graphql']['hashtag']['edge_hashtag_to_media']

    for edge in media['edges']:
        node = edge['node']

        # A post without a caption has an empty ``edges`` list here.  Record
        # None for it rather than an exception object, so the row contains
        # only plain values when it is written to the spreadsheet.
        try:
            post_text = node['edge_media_to_caption']['edges'][0]['node']['text']
        except (KeyError, IndexError, TypeError):
            post_text = None

        display_url = node.get('display_url')

        try:
            like_count = node['edge_liked_by']['count']
        except (KeyError, TypeError):
            like_count = None

        time_stamp = node.get('taken_at_timestamp')

        output.append([display_url, like_count, time_stamp, post_text])

    # Rebuild the frame and save after every page so that progress survives
    # a failure on a later request.
    df = pd.DataFrame(output, columns=['URL', 'Like Count', 'Time', 'Text'])
    try:
        df.to_excel('instagram.xlsx')
    except Exception as exc:
        # Saving is best-effort: keep scraping, but report the failure
        # instead of hiding it.
        print("Could not write instagram.xlsx: {}".format(exc))

    page_info = media['page_info']
    end_cursor = page_info.get('end_cursor')
    # Stop when there is no further page, or no cursor to request it with.
    if not (page_info.get('has_next_page') and end_cursor):
        break
    url = base_url + '&max_id=' + end_cursor