如何使用BeautifulSoup从JS代码中抓取url标签?

时间:2016-03-20 19:25:34

标签: javascript python beautifulsoup url-parsing

我试图使用BeautifulSoup从网页上的JS脚本中抓取所有网址,脚本如下所示(pastebin链接):

<script>/*
 * Inline bootstrap for the embedded Vimeo player (minified), invoked
 * immediately with `document` as `e` and the element with id "player" as `a`.
 *
 * - `t` is the player configuration object literal. The video file URLs are
 *   the "url" values under t.request.files (dash, hls and each entry of
 *   progressive); t.request.urls holds player asset URLs (js, css, swf).
 * - Bails out early when t.request is missing or carries an "error" key.
 * - `u` selects the JavaScript player (loads t.request.urls.js and .css, then
 *   constructs VimeoPlayer); otherwise `i` (Flash detected) embeds an <object>
 *   built from t.request.urls.moog; otherwise an <iframe> fallback is written.
 * - Unless t.request.flags.dnt is set, the Google Analytics and vuid scripts
 *   are injected as well.
 *
 * NOTE(review): the physical line breaks in this listing fall inside string
 * literals and between tokens, so they look like wrapping artifacts of the
 * paste rather than part of the script -- TODO confirm against the page.
 */(function(e,a){var t={"cdn_url":"https://f.vimeocdn.com","view":1,"request":{"files":{"dash":{"origin":"gcs","url":"https://skyfire.vimeocdn.com/1458528314-0x41f9db4bb4ec1a10ff0f95af0416acde0acbe440/143512140/video/431745862,431745863,431745861/master.json?base64_init=1","cdn":"fastly_skyfire","streams":[{"profile":112,"quality":"360p","id":431745862,"fps":25},{"profile":119,"quality":"1080p","id":431745863,"fps":25},{"profile":113,"quality":"720p","id":431745861,"fps":25}]},"hls":{"url":"https://skyfire.vimeocdn.com/1458528314-0x41f9db4bb4ec1a10ff0f95af0416acde0acbe440/143512140/video/431745862,431745863,431745861/master.m3u8","cdn":"fastly_skyfire"},"progressive":[{"profile":119,"width":1920,"mime":"video/mp4","fps":25,"url":"https://fpdl.vimeocdn.com/vimeo-prod-skyfire-std-us/01/3702/5/143512140/431745863.mp4?token=56ef603a_0x9aadd2d9ee5d6efd3c0b34db826227d2d01423c6","cdn":"fastly","quality":"1080p","id":431745863,"origin":"gcs","height":1080},{"profile":112,"width":640,"mime":"video/mp4","fps":25,"url":"https://fpdl.vimeocdn.com/vimeo-prod-skyfire-std-us/01/3702/5/143512140/431745862.mp4?token=56ef603a_0x828889dabd8ec15e0d2d4cb9c8e23809b7217052","cdn":"fastly","quality":"360p","id":431745862,"origin":"gcs","height":360},{"profile":113,"width":1280,"mime":"video/mp4","fps":25,"url":"https://fpdl.vimeocdn.com/vimeo-prod-skyfire-std-us/01/3702/5/143512140/431745861.mp4?token=56ef603a_0xd248e4a1bd2a9700a92aacf727d5a3b68186a75f","cdn":"fastly","quality":"720p","id":431745861,"origin":"gcs","height":720}]},"ga_account":"UA-76641-35","expires":28792,"timestamp":1458499222,"signature":"7cef4747eae9195685e18285f78a7b1e","currency":"EUR","session":"c1f8c8ee9b746ae6b1ee6a454a2e311da606fa041458499222","cookie":{"scaling":1,"volume":1.0,"quality":"720p","hd":1,"captions":null},"cookie_domain":".vimeo.com","referrer":"http://racing4everyone.eu/2015/10/25/formula-e-201516formula-e-201516-round01-china-race/","comscore_id":"14640914","flags":{"dnt":0,"preload_video":"me
tadata_on_hover","plays":1,"webp":1,"flash_hls":1,"login":1,"partials":1,"blurr":0},"build":{"player":"67209230","js":"2.20.3"},"urls":{"zeroclip_swf":"https://f.vimeocdn.com/p/external/zeroclipboard/ZeroClipboard.swf","js":"https://f.vimeocdn.com/p/2.20.3/js/player.js","proxy":"https://f.vimeocdn.com/p/2.20.3/proxy.html","flideo":"https://f.vimeocdn.com/p/flash/flideo/1.0.3b10/flideo.swf","moog":"https://f.vimeocdn.com/p/flash/moogaloop/6.3.4/moogaloop.swf?clip_id=143512140","comscore_js":"https://f.vimeocdn.com/p/external/streamsense.4.1408.29.min.js","blurr":"https://fresnel.vimeocdn.com/add/player-stats","chromeless_css":"https://f.vimeocdn.com/p/2.20.3/css/chromeless.css","vuid_js":"https://f.vimeocdn.com/js_opt/modules/utils/vuid.min.js","chromeless_js":"https://f.vimeocdn.com/p/2.20.3/js/chromeless.js","moog_js":"https://f.vimeocdn.com/p/2.20.3/js/moogaloop.js","zeroclip_js":"https://f.vimeocdn.com/p/external/zeroclipboard/ZeroClipboard-patch.js","css":"https://f.vimeocdn.com/p/2.20.3/css/player.css"},"country":"NL"},"player_url":"player.vimeo.com","video":{"allow_hd":1,"height":1080,"owner":{"account_type":"pro","name":"Mike Beging","img":"https://secure.gravatar.com/avatar/bc83adfd0d41055c020aa37a5dba653b?d=https%3A%2F%2Fi.vimeocdn.com%2Fportrait%2Fdefault-red_60x60.png&s=60","url":"https://vimeo.com/user48874148","img_2x":"https://secure.gravatar.com/avatar/bc83adfd0d41055c020aa37a5dba653b?d=https%3A%2F%2Fi.vimeocdn.com%2Fportrait%2Fdefault-red_120x120.png&s=120","id":48874148},"thumbs":{"1280":"https://i.vimeocdn.com/video/541116322_1280.jpg","960":"https://i.vimeocdn.com/video/541116322_960.jpg","640":"https://i.vimeocdn.com/video/541116322_640.jpg","base":"https://i.vimeocdn.com/video/541116322"},"duration":7198,"id":143512140,"hd":1,"embed_code":"<iframe src=\"https://player.vimeo.com/video/143512140\" width=\"500\" height=\"281\" frameborder=\"0\" webkitallowfullscreen mozallowfullscreen 
allowfullscreen></iframe>","default_to_hd":1,"title":"FormulaE.2015-16.Round01.China","url":null,"privacy":"disable","share_url":"https://vimeo.com/143512140","width":1920,"embed_permission":"whitelist","fps":25.0},"build":{"player":"67209230","rpc":"dev"},"embed":{"autopause":1,"color":"00adef","on_site":0,"outro":"nothing","api":3,"player_id":"","quality":null,"settings":{"fullscreen":1,"byline":0,"like":0,"playbar":1,"title":0,"color":0,"branding":0,"share":0,"scaling":1,"logo":0,"collections":0,"info_on_pause":0,"watch_later":0,"portrait":0,"embed":0,"badge":0,"volume":1},"context":"embed.main","time":0,"loop":0,"autoplay":0},"vimeo_url":"vimeo.com","user":{"liked":0,"account_type":"none","progress":0,"owner":0,"watch_later":0,"logged_in":0,"id":0,"mod":0}};if(!t.request){return}if(typeof t.request==="object"&&"error"in t.request){if("html"in t.request){e.documentElement.innerHTML=t.request.html.replace(/&lt;/g,"<").replace(/&gt;/g,">")}return}var r=function(){try{return window.self!==window.top}catch(e){return true}}();if(!r&&/twitter/i.test(navigator.userAgent)&&t.video.url){window.location=t.video.url}var n="exitFullscreen"in e||"webkitExitFullscreen"in e||"webkitCancelFullScreen"in e||"mozCancelFullScreen"in e||"msExitFullscreen"in e||"webkitEnterFullScreen"in e.createElement("video");var i=function(){var e=navigator;var a="Shockwave Flash";var t="application/x-shockwave-flash";var r="ShockwaveFlash.ShockwaveFlash";if(typeof e.plugins!=="undefined"&&typeof e.plugins[a]==="object"){if(e.plugins[a].description&&!(typeof e.mimeTypes!=="undefined"&&e.mimeTypes[t]&&!e.mimeTypes[t].enabledPlugin)){return true}}try{if(window.ActiveXObject&&new ActiveXObject(r)){return true}}catch(n){}return false}();var o=function(){var a=e.createElement("video");return{h264:"canPlayType"in a&&a.canPlayType("video/mp4")!=="",textTracks:typeof TextTrackList!=="undefined"&&typeof a.textTracks!=="undefined"&&a.textTracks instanceof TextTrackList}}();var s=function(){var 
a=e.createElement("div");a.innerHTML="<svg/>";return(a.firstChild&&a.firstChild.namespaceURI)==="http://www.w3.org/2000/svg"}();var l=/MSIE 9/.test(navigator.userAgent)&&/Windows Phone/.test(navigator.userAgent);var c=/IE 10/.test(navigator.userAgent);window._gaq=[["player._setAccount",'UA-76641-35'],["player._setDomainName","player.vimeo.com"],["player._set","_anonymizeIp",true],["player._trackPageview"]];var u=n||c||l;var d=e.getElementsByTagName("script")[0];var m=e.createElement("script");var p=false;var f=false;if("text_tracks"in t.request&&(!o.textTracks||o.textTracks&&!o.h264)&&i){u=false}if(!s){u=false}var v="vod"in t.video&&t.video.vod.is_feature;if(v&&i&&("files"in t.request&&!t.request.files.dash)){u=false}if(u){a.className="player loading";var g=(new Date).getTime();m.src=t.request.urls.js;d.parentNode.insertBefore(m,d);m["onreadystatechange"in m?"onreadystatechange":"onload"]=function(){if(!p&&(!this.readyState||this.readyState==="loaded"||this.readyState==="complete")){p=true;var e=(new Date).getTime()-g;window._gaq.push(["player._trackTiming","Player","JavaScript Load",e]);f=new VimeoPlayer(a,t,y||{link:w,startTime:g})}};var y=false;var w=e.createElement("link");w.rel="stylesheet";w.href=t.request.urls.css+(typeof cacheBuster==="undefined"?"":cacheBuster);e.getElementsByTagName("head")[0].appendChild(w);w.onload=function(){y=true;var e=(new Date).getTime()-g;window._gaq.push(["player._trackTiming","Player","CSS Load",e])}}else if(i){a.innerHTML='<object id="flash-object" name="flash-object" type="application/x-shockwave-flash" width="100%" height="100%" data="'+t.request.urls.moog+"&z="+(new Date).getTime()+'"><param name="flashvars" value="clip_id='+t.video.id+"&amp;embed_location="+encodeURIComponent(t.request.referrer)+'&amp;js_getConfig=getConfig&amp;js_setConfig=setConfig&amp;js_onLoad=onMoogaloopLoaded&amp;api=1&amp;moogaloop_type=moogaloop"><param name="movie" value="'+t.request.urls.moog+"&z="+(new Date).getTime()+'"><param 
name="allowfullscreen" value="true"><param name="allowscriptaccess" value="always"><param name="bgcolor" value="#000000"><param name="wmode" value="opaque"><param name="quality" value="high"><param name="scalemode" value="noscale"></object>';var h=false;var T=e.getElementById("flash-object");window.onMoogaloopLoaded=function(){h=true;if(!f&&p){a.className="player";f=new VimeoPlayer(T,t)}};window.getConfig=function(){return t};window.setConfig=function(e){t=e};m.src=t.request.urls.moog_js;d.parentNode.insertBefore(m,d);m["onreadystatechange"in m?"onreadystatechange":"onload"]=function(){if(!p&&(!this.readyState||this.readyState==="loaded"||this.readyState==="complete")){p=true;if(!f&&h){f=new VimeoPlayer(T,t)}}}}else{a.innerHTML='<div class="fallback"><iframe src="/video/143512140/fallback?js&amp;referrer='+encodeURIComponent(t.request.referrer)+'" frameborder="0"></iframe></div>'}if(!t.request.flags.dnt){var _=e.createElement("script");_.async=true;_.src='https://ssl.google-analytics.com/ga.js';d.parentNode.insertBefore(_,d);window._vuid=[["pid",t.request.session]];var k=e.createElement("script");k.async=true;k.src=t.request.urls.vuid_js;d.parentNode.insertBefore(k,d)}})(document,document.getElementById("player"));</script>

我想从此JS脚本中抓取所有视频的"url"链接。我非常确信仅靠BeautifulSoup无法做到这一点,那么我可以用什么来代替(最好能给出可运行的代码)?

每次加载页面时URL也会更新(并在设定的时间后过期),那么是否可以编写一些始终获得新URL的代码?

1 个答案:

答案 0 :(得分:0)

我不确定您希望如何把视频链接与其他链接区分开,但您可以使用以下函数从文档中提取所有网址:

import re


def all_links(content):
    """Return every double-quoted http(s) URL found in ``content``.

    Args:
        content: Text to scan, e.g. the source of a ``<script>`` element.

    Returns:
        A list of URL strings in order of appearance, without the
        surrounding quotes. Empty when nothing matches.
    """
    links = []
    # The lazy body plus an optional backslash before the closing quote keeps
    # the same matches as a plain [^"]* body, but drops the stray backslash
    # captured when the URL sits inside an escaped-quote string such as
    # src=\"https://...\" in an embedded HTML snippet.
    url_re = re.compile(r'"(https?://[^"]*?)\\?"')
    for m in url_re.finditer(content):
        links.append(m.group(1))
    return links

(这可以写成一行列表推导式,但这种写法更具可读性。)

要根据JS对象中的键名提取特定链接,例如“cdn_url”对应的链接,您可以使用更具体的正则表达式,如下所示:

# Requires `import re`; `content` is the text being searched (the same string
# that would be passed to all_links).
# re.search takes the pattern AND the string to scan; the call previously
# omitted the string, which raises TypeError.
m = re.search(r'"cdn_url": *"(https?://[^"]*)"', content)
# Default to None so `link` is always defined, even when nothing matches.
link = m.group(1) if m else None

这应该足以让您上手了。