我试图从例如以下链接中抓取数据: https://i.instagram.com/api/v1/users/6862425230/info/
这是我的代码:
import requests
from bs4 import BeautifulSoup
# Private-API endpoint from the question; the browser output quoted below shows
# that it responds with a JSON document.
url = 'https://i.instagram.com/api/v1/users/6862425230/info/'
# timeout=5 keeps requests from waiting on the server indefinitely.
page_response = requests.get(url, timeout=5)
# NOTE(review): the response body is handed to an HTML parser although the
# browser output shown in the question is JSON -- presumably this is why fields
# such as address_street appear to be missing from page_content; confirm.
page_content = BeautifulSoup(page_response.content, 'html.parser')
但是,当我查看page_content时,某些数据丢失了。
这是我在浏览器中看到的:
{"user": {"pk": 6862425230, "username": "ukskinlaserclinics", "full_name": "UK Skin + Laser Clinics", "is_private": false, "profile_pic_url": "https://scontent-sjc3-1.cdninstagram.com/vp/f8fa9418e6ceaa806679b6f87a08b0fc/5CECF960/t51.2885-19/s150x150/35403653_2099249220343854_7002585735337345024_n.jpg?_nc_ht=scontent-sjc3-1.cdninstagram.com", "profile_pic_id": "1812637506760726849_6862425230", "is_verified": false, "has_anonymous_profile_picture": false, "media_count": 472, "follower_count": 1080, "following_count": 204, "following_tag_count": 2, "biography": "Trusted and Affordable! \u2728\n- Medical Grade Laser Hair Removal\n- Clinical Skin Treatments \n- Cosmetic Injectables\nOur new skin packages\ud83d\udc47\ud83c\udffc", "external_url": "https://abd.com/2AVrSP8", "external_lynx_url": "https://l.instagram.com/?u=https%3A%2F%2Fbit.ly%2F2AVrSP8\u0026e=ATOlMuSNxIZdNOf8PZWn78JsdfXQaVaPK9cQx7fk_dbUqe6myf59NPNAgsbUU6gsrvsJpPK1O4Ap0quX", "total_igtv_videos": 0, "total_ar_effects": 0, "reel_auto_archive": "on", "usertags_count": 12, "is_favorite": false, "is_interest_account": true, "hd_profile_pic_versions": [{"width": 320, "height": 320, "url": "https://scontent-sjc3-1.cdninstagram.com/vp/1640062d27e1a983de093fa502caabed/5CEE8618/t51.2885-19/s320x320/35403653_2099249220343854_7002585735337345024_n.jpg?_nc_ht=scontent-sjc3-1.cdninstagram.com"}, {"width": 640, "height": 640, "url": "https://scontent-sjc3-1.cdninstagram.com/vp/7606a820992b811ea4c02bf504eae678/5CE4B5A3/t51.2885-19/s640x640/35403653_2099249220343854_7002585735337345024_n.jpg?_nc_ht=scontent-sjc3-1.cdninstagram.com"}], "hd_profile_pic_url_info": {"url": "https://scontent-sjc3-1.cdninstagram.com/vp/82af6acb7b88a9b998b74398570eda14/5D266818/t51.2885-19/35403653_2099249220343854_7002585735337345024_n.jpg?_nc_ht=scontent-sjc3-1.cdninstagram.com", "width": 1042, "height": 1042}, "mutual_followers_count": 0, "has_highlight_reels": true, "school": {}, "is_eligible_for_school": false, "can_be_reported_as_fraud": 
false, "direct_messaging": "UNKNOWN", "fb_page_call_to_action_id": "", "address_street": "59 St John's Rd", "business_contact_method": "CALL", "category": "Beauty, Cosmetic \u0026 Personal Care", "city_id": 106078429431815, "city_name": "London, United Kingdom", "contact_phone_number": "+442034750661", "is_call_to_action_enabled": false, "latitude": 51.4618874, "longitude": -0.1673537, "public_email": "hello@ukskinlaser.com", "public_phone_country_code": "44", "public_phone_number": "2034750661", "zip": "SW11 1QW", "instagram_location_id": "", "is_business": true, "account_type": 2, "can_hide_category": false, "can_hide_public_contacts": false, "should_show_category": true, "should_show_public_contacts": true, "include_direct_blacklist_status": true, "is_potential_business": true, "is_bestie": false, "has_unseen_besties_media": false, "show_account_transparency_details": true, "auto_expand_chaining": false, "highlight_reshare_disabled": false}, "status": "ok"}
以下是我在Chrome浏览器中看到的屏幕截图:
但是在page_content中,我看不到例如address_street这样的字段。
如何抓取这些数据?
答案 0 :(得分:1)
正如我在评论中所说,您可以解析真实Instagram页面(instagram.com/ukskinlaserclinics)的页脚。
所有数据都在<script type="text/javascript">标签中。
要获取街道地址,您必须再次调用json.loads(),因为它在用户JSON中的父级字段(business_address_json)是一个str。
from bs4 import BeautifulSoup as soup
import re
import json
import requests
def _get_json_footer(html):
s = str(html)
r = re.compile('"entry_data":(.*?),"gatekeepers"')
m = r.search(s)
if m:
result = m.group(1)
return json.loads(result)
url = 'https://www.instagram.com/ukskinlaserclinics/'
# requests waits indefinitely unless a timeout is given; also fail fast on an
# HTTP error status instead of trying to parse an error page.
page = requests.get(url, timeout=5)
page.raise_for_status()
html = soup(page.text, 'html.parser')
# _get_json_footer may return None when the embedded JSON is not found.
json_footer = _get_json_footer(html) or {}
# 'ProfilePage' is indexed as a list; fall back to one empty entry so the
# chained lookups below yield "no address" rather than raising.
profile = json_footer.get('ProfilePage') or [{}]
user = profile[0].get('graphql', {}).get('user', {})
# business_address_json is itself a JSON-encoded string, so it needs a second
# json.loads() before street_address can be read.
business_address_json = user.get('business_address_json')
street_address = None
if business_address_json:
    street_address = json.loads(business_address_json).get('street_address')
print(street_address)
输出:
59 St John's Rd