我想知道如何从 HTML 请求返回内容的标签之间提取可用数据。假设某段 HTML 中包含以下内容,我该如何提取其中的字典以供使用?
<script type="text/javascript">window._sharedData = {"static_root":"\/\/d36xtkk24g8jdx.cloudfront.net\/bluebar\/a1968ef","platform":{"is_touch":false,"app_platform":"web"},"hostname":"instagram.com","entry_data":{"DesktopPPage":[{"canSeePrerelease":false,"viewer":null,"media":{"caption_is_edited":false,"code":"vF25LwCnL8","date":1415348305.0,"video_url":"http:\/\/videos-h-12.ak.instagram.com\/hphotos-ak-xap1\/10753251_876245142395032_328159772_n.mp4","caption":"2014 season teaser! Just a taste of some of the \ud83d\udd28\ud83d\udd28\ud83d\udd28 that got fumbled on \ud83d\udcf9 this season. Edit dropping fall 2017 @m.wilkie @sturhyssmith #snowboarding #springshred #bdpproteam #turoaparks #turoa #mtruapehu #seasonedit #wouldyouratherfightagoatwithahumanheadorahumanwithagoathead?","secure_video_url":"https:\/\/igcdn-videos-h-12-a.akamaihd.net\/hphotos-ak-xap1\/10753251_876245142395032_328159772_n.mp4","usertags":{"nodes":[]},"comments":{"nodes":[{"text":"Where do I buy tickets to the London premiere? #fanboy","viewer_can_delete":false,"id":"848487057151652334","user":{"username":"jamesbutchernz","profile_pic_url":"https:\/\/instagramimages-a.akamaihd.net\/profiles\/profile_1052126311_75sq_1391324963.jpg"}},{"text":"It's invites only @jamesbutchernz @m.wilkie is choosing too so chances are slim unless your smoking hot with low self esteem.","viewer_can_delete":false,"id":"849353938720944684","user":{"username":"bobeykrebner","profile_pic_url":"https:\/\/igcdn-photos-g-a.akamaihd.net\/hphotos-ak-xpf1\/10584664_742398385822158_510451676_a.jpg"}},{"text":"It's lucky we all know I'm both of those. 
#easy","viewer_can_delete":false,"id":"849403857951420829","user":{"username":"jamesbutchernz","profile_pic_url":"https:\/\/instagramimages-a.akamaihd.net\/profiles\/profile_1052126311_75sq_1391324963.jpg"}},{"text":"Last I heard you were smoking hot and had the self esteem of Kanye West @jamesbutchernz what changed?","viewer_can_delete":false,"id":"849671858500038887","user":{"username":"bobeykrebner","profile_pic_url":"https:\/\/igcdn-photos-g-a.akamaihd.net\/hphotos-ak-xpf1\/10584664_742398385822158_510451676_a.jpg"}},{"text":"You know what they say @bobeykrebner. Treat yourself like Kayne treats Kayne.","viewer_can_delete":false,"id":"849966794608898266","user":{"username":"jamesbutchernz","profile_pic_url":"https:\/\/instagramimages-a.akamaihd.net\/profiles\/profile_1052126311_75sq_1391324963.jpg"}}]},"shared_by_author":true,"likes":{"count":41,"viewer_has_liked":false,"nodes":[{"user":{"username":"claytonbenson","profile_pic_url":"https:\/\/instagramimages-a.akamaihd.net\/profiles\/profile_52633025_75sq_1359351765.jpg"}},{"user":{"username":"snowrev","profile_pic_url":"https:\/\/igcdn-photos-d-a.akamaihd.net\/hphotos-ak-xaf1\/10735284_1474262932842435_1018554144_a.jpg"}},{"user":{"username":"shayning_","profile_pic_url":"https:\/\/igcdn-photos-f-a.akamaihd.net\/hphotos-ak-xaf1\/10817775_319647074907693_836092401_a.jpg"}},{"user":{"username":"paused_future","profile_pic_url":"https:\/\/igcdn-photos-f-a.akamaihd.net\/hphotos-ak-xpa1\/10809941_1580815445475533_469492417_a.jpg"}},{"user":{"username":"kris_tayl0r","profile_pic_url":"https:\/\/igcdn-photos-e-a.akamaihd.net\/hphotos-ak-xaf1\/10802916_384369668395220_1244229274_a.jpg"}},{"user":{"username":"crazyshuz","profile_pic_url":"https:\/\/igcdn-photos-h-a.akamaihd.net\/hphotos-ak-xfp1\/10787707_905860216092359_425635869_a.jpg"}},{"user":{"username":"titstatertots","profile_pic_url":"https:\/\/igcdn-photos-b-a.akamaihd.net\/hphotos-ak-xpf1\/10554089_855164584513369_706239607_a.jpg"}}]},"owner":{"username":"bobe
ykrebner","requested_by_viewer":false,"followed_by_viewer":false,"profile_pic_url":"https:\/\/igcdn-photos-g-a.akamaihd.net\/hphotos-ak-xpf1\/10584664_742398385822158_510451676_a.jpg","has_blocked_viewer":false,"id":"1459690667","is_private":false},"is_video":true,"id":"848325528968131324","display_src":"http:\/\/photos-e.ak.instagram.com\/hphotos-ak-xfp1\/10748245_307748359428196_942078105_n.jpg"},"__get_params":{},"staticRoot":"\/\/d36xtkk24g8jdx.cloudfront.net\/bluebar\/a1968ef","__query_string":"?","prerelease":false,"__path":"\/p\/vF25LwCnL8\/","shortcode":"vF25LwCnL8"}]},"country_code":"AU","config":{"viewer":null,"csrf_token":"0bfa16595bdacb5bcfcb94441d0fb7ab"}};</script>
基本上,我想知道如何从 script 标签中获取可用数据,也就是位于 "window._sharedData =" 之后的那部分内容。
答案 0 :(得分:2)
您需要结合使用 HTML 解析和文本处理。
BeautifulSoup 可以完成解析,之后您可以提取 <script> 标签的文本内容,并从中分离出 JavaScript 对象定义:
from bs4 import BeautifulSoup
import re

# Name the parser explicitly so the result does not depend on which optional
# parser libraries happen to be installed alongside Beautiful Soup.
soup = BeautifulSoup(html_page_source, 'html.parser')
# Locate the <script> element whose text mentions window._sharedData; the raw
# string keeps the regex escape `\.` from being read as a string escape.
script_tag = soup.find('script', text=re.compile(r'window\._sharedData'))
# Keep everything after the first '=' and trim surrounding whitespace and
# semicolons so that only the object literal remains.
shared_data = script_tag.string.partition('=')[-1].strip(' \t\r\n;')
最后一行获取标签的字符串内容,去掉第一个 = 及其之前的所有内容,然后删除首尾的所有空格和分号。
演示,包括将结果字符串加载为JSON:
>>> from bs4 import BeautifulSoup
>>> import re
>>> soup = BeautifulSoup('''\
... <script type="text/javascript">window._sharedData = {"static_root":"\/\/d36xtkk24g8jdx.cloudfront.net\/bluebar\/a1968ef","platform":{"is_touch":false,"app_platform":"web"},"hostname":"instagram.com","entry_data":{"DesktopPPage":[{"canSeePrerelease":false,"viewer":null,"media":{"caption_is_edited":false,"code":"vF25LwCnL8","date":1415348305.0,"video_url":"http:\/\/videos-h-12.ak.instagram.com\/hphotos-ak-xap1\/10753251_876245142395032_328159772_n.mp4","caption":"2014 season teaser! Just a taste of some of the \ud83d\udd28\ud83d\udd28\ud83d\udd28 that got fumbled on \ud83d\udcf9 this season. Edit dropping fall 2017 @m.wilkie @sturhyssmith #snowboarding #springshred #bdpproteam #turoaparks #turoa #mtruapehu #seasonedit #wouldyouratherfightagoatwithahumanheadorahumanwithagoathead?","secure_video_url":"https:\/\/igcdn-videos-h-12-a.akamaihd.net\/hphotos-ak-xap1\/10753251_876245142395032_328159772_n.mp4","usertags":{"nodes":[]},"comments":{"nodes":[{"text":"Where do I buy tickets to the London premiere? #fanboy","viewer_can_delete":false,"id":"848487057151652334","user":{"username":"jamesbutchernz","profile_pic_url":"https:\/\/instagramimages-a.akamaihd.net\/profiles\/profile_1052126311_75sq_1391324963.jpg"}},{"text":"It's invites only @jamesbutchernz @m.wilkie is choosing too so chances are slim unless your smoking hot with low self esteem.","viewer_can_delete":false,"id":"849353938720944684","user":{"username":"bobeykrebner","profile_pic_url":"https:\/\/igcdn-photos-g-a.akamaihd.net\/hphotos-ak-xpf1\/10584664_742398385822158_510451676_a.jpg"}},{"text":"It's lucky we all know I'm both of those. 
#easy","viewer_can_delete":false,"id":"849403857951420829","user":{"username":"jamesbutchernz","profile_pic_url":"https:\/\/instagramimages-a.akamaihd.net\/profiles\/profile_1052126311_75sq_1391324963.jpg"}},{"text":"Last I heard you were smoking hot and had the self esteem of Kanye West @jamesbutchernz what changed?","viewer_can_delete":false,"id":"849671858500038887","user":{"username":"bobeykrebner","profile_pic_url":"https:\/\/igcdn-photos-g-a.akamaihd.net\/hphotos-ak-xpf1\/10584664_742398385822158_510451676_a.jpg"}},{"text":"You know what they say @bobeykrebner. Treat yourself like Kayne treats Kayne.","viewer_can_delete":false,"id":"849966794608898266","user":{"username":"jamesbutchernz","profile_pic_url":"https:\/\/instagramimages-a.akamaihd.net\/profiles\/profile_1052126311_75sq_1391324963.jpg"}}]},"shared_by_author":true,"likes":{"count":41,"viewer_has_liked":false,"nodes":[{"user":{"username":"claytonbenson","profile_pic_url":"https:\/\/instagramimages-a.akamaihd.net\/profiles\/profile_52633025_75sq_1359351765.jpg"}},{"user":{"username":"snowrev","profile_pic_url":"https:\/\/igcdn-photos-d-a.akamaihd.net\/hphotos-ak-xaf1\/10735284_1474262932842435_1018554144_a.jpg"}},{"user":{"username":"shayning_","profile_pic_url":"https:\/\/igcdn-photos-f-a.akamaihd.net\/hphotos-ak-xaf1\/10817775_319647074907693_836092401_a.jpg"}},{"user":{"username":"paused_future","profile_pic_url":"https:\/\/igcdn-photos-f-a.akamaihd.net\/hphotos-ak-xpa1\/10809941_1580815445475533_469492417_a.jpg"}},{"user":{"username":"kris_tayl0r","profile_pic_url":"https:\/\/igcdn-photos-e-a.akamaihd.net\/hphotos-ak-xaf1\/10802916_384369668395220_1244229274_a.jpg"}},{"user":{"username":"crazyshuz","profile_pic_url":"https:\/\/igcdn-photos-h-a.akamaihd.net\/hphotos-ak-xfp1\/10787707_905860216092359_425635869_a.jpg"}},{"user":{"username":"titstatertots","profile_pic_url":"https:\/\/igcdn-photos-b-a.akamaihd.net\/hphotos-ak-xpf1\/10554089_855164584513369_706239607_a.jpg"}}]},"owner":{"username":"bobe
ykrebner","requested_by_viewer":false,"followed_by_viewer":false,"profile_pic_url":"https:\/\/igcdn-photos-g-a.akamaihd.net\/hphotos-ak-xpf1\/10584664_742398385822158_510451676_a.jpg","has_blocked_viewer":false,"id":"1459690667","is_private":false},"is_video":true,"id":"848325528968131324","display_src":"http:\/\/photos-e.ak.instagram.com\/hphotos-ak-xfp1\/10748245_307748359428196_942078105_n.jpg"},"__get_params":{},"staticRoot":"\/\/d36xtkk24g8jdx.cloudfront.net\/bluebar\/a1968ef","__query_string":"?","prerelease":false,"__path":"\/p\/vF25LwCnL8\/","shortcode":"vF25LwCnL8"}]},"country_code":"AU","config":{"viewer":null,"csrf_token":"0bfa16595bdacb5bcfcb94441d0fb7ab"}};</script>
... ''')
>>> script_tag = soup.find('script', text=re.compile('window\._sharedData'))
>>> shared_data = script_tag.string.partition('=')[-1].strip(' ;')
>>> import json
>>> result = json.loads(shared_data)
>>> from pprint import pprint
>>> pprint(result)
{u'config': {u'csrf_token': u'0bfa16595bdacb5bcfcb94441d0fb7ab',
u'viewer': None},
u'country_code': u'AU',
u'entry_data': {u'DesktopPPage': [{u'__get_params': {},
u'__path': u'/p/vF25LwCnL8/',
u'__query_string': u'?',
u'canSeePrerelease': False,
u'media': {u'caption': u'2014 season teaser! Just a taste of some of the \U0001f528\U0001f528\U0001f528 that got fumbled on \U0001f4f9 this season. Edit dropping fall 2017 @m.wilkie @sturhyssmith #snowboarding #springshred #bdpproteam #turoaparks #turoa #mtruapehu #seasonedit #wouldyouratherfightagoatwithahumanheadorahumanwithagoathead?',
u'caption_is_edited': False,
u'code': u'vF25LwCnL8',
u'comments': {u'nodes': [{u'id': u'848487057151652334',
u'text': u'Where do I buy tickets to the London premiere? #fanboy',
u'user': {u'profile_pic_url': u'https://instagramimages-a.akamaihd.net/profiles/profile_1052126311_75sq_1391324963.jpg',
u'username': u'jamesbutchernz'},
u'viewer_can_delete': False},
{u'id': u'849353938720944684',
u'text': u"It's invites only @jamesbutchernz @m.wilkie is choosing too so chances are slim unless your smoking hot with low self esteem.",
u'user': {u'profile_pic_url': u'https://igcdn-photos-g-a.akamaihd.net/hphotos-ak-xpf1/10584664_742398385822158_510451676_a.jpg',
u'username': u'bobeykrebner'},
u'viewer_can_delete': False},
{u'id': u'849403857951420829',
u'text': u"It's lucky we all know I'm both of those. #easy",
u'user': {u'profile_pic_url': u'https://instagramimages-a.akamaihd.net/profiles/profile_1052126311_75sq_1391324963.jpg',
u'username': u'jamesbutchernz'},
u'viewer_can_delete': False},
{u'id': u'849671858500038887',
u'text': u'Last I heard you were smoking hot and had the self esteem of Kanye West @jamesbutchernz what changed?',
u'user': {u'profile_pic_url': u'https://igcdn-photos-g-a.akamaihd.net/hphotos-ak-xpf1/10584664_742398385822158_510451676_a.jpg',
u'username': u'bobeykrebner'},
u'viewer_can_delete': False},
{u'id': u'849966794608898266',
u'text': u'You know what they say @bobeykrebner. Treat yourself like Kayne treats Kayne.',
u'user': {u'profile_pic_url': u'https://instagramimages-a.akamaihd.net/profiles/profile_1052126311_75sq_1391324963.jpg',
u'username': u'jamesbutchernz'},
u'viewer_can_delete': False}]},
u'date': 1415348305.0,
u'display_src': u'http://photos-e.ak.instagram.com/hphotos-ak-xfp1/10748245_307748359428196_942078105_n.jpg',
u'id': u'848325528968131324',
u'is_video': True,
u'likes': {u'count': 41,
u'nodes': [{u'user': {u'profile_pic_url': u'https://instagramimages-a.akamaihd.net/profiles/profile_52633025_75sq_1359351765.jpg',
u'username': u'claytonbenson'}},
{u'user': {u'profile_pic_url': u'https://igcdn-photos-d-a.akamaihd.net/hphotos-ak-xaf1/10735284_1474262932842435_1018554144_a.jpg',
u'username': u'snowrev'}},
{u'user': {u'profile_pic_url': u'https://igcdn-photos-f-a.akamaihd.net/hphotos-ak-xaf1/10817775_319647074907693_836092401_a.jpg',
u'username': u'shayning_'}},
{u'user': {u'profile_pic_url': u'https://igcdn-photos-f-a.akamaihd.net/hphotos-ak-xpa1/10809941_1580815445475533_469492417_a.jpg',
u'username': u'paused_future'}},
{u'user': {u'profile_pic_url': u'https://igcdn-photos-e-a.akamaihd.net/hphotos-ak-xaf1/10802916_384369668395220_1244229274_a.jpg',
u'username': u'kris_tayl0r'}},
{u'user': {u'profile_pic_url': u'https://igcdn-photos-h-a.akamaihd.net/hphotos-ak-xfp1/10787707_905860216092359_425635869_a.jpg',
u'username': u'crazyshuz'}},
{u'user': {u'profile_pic_url': u'https://igcdn-photos-b-a.akamaihd.net/hphotos-ak-xpf1/10554089_855164584513369_706239607_a.jpg',
u'username': u'titstatertots'}}],
u'viewer_has_liked': False},
u'owner': {u'followed_by_viewer': False,
u'has_blocked_viewer': False,
u'id': u'1459690667',
u'is_private': False,
u'profile_pic_url': u'https://igcdn-photos-g-a.akamaihd.net/hphotos-ak-xpf1/10584664_742398385822158_510451676_a.jpg',
u'requested_by_viewer': False,
u'username': u'bobeykrebner'},
u'secure_video_url': u'https://igcdn-videos-h-12-a.akamaihd.net/hphotos-ak-xap1/10753251_876245142395032_328159772_n.mp4',
u'shared_by_author': True,
u'usertags': {u'nodes': []},
u'video_url': u'http://videos-h-12.ak.instagram.com/hphotos-ak-xap1/10753251_876245142395032_328159772_n.mp4'},
u'prerelease': False,
u'shortcode': u'vF25LwCnL8',
u'staticRoot': u'//d36xtkk24g8jdx.cloudfront.net/bluebar/a1968ef',
u'viewer': None}]},
u'hostname': u'instagram.com',
u'platform': {u'app_platform': u'web', u'is_touch': False},
u'static_root': u'//d36xtkk24g8jdx.cloudfront.net/bluebar/a1968ef'}