我使用 Python requests 模块的 get() 函数向此页面(this page)的服务器发出请求。当我访问响应的内容时,我得到了这个(样本):
b'\n\n\n\n<!DOCTYPE html>\n<html\nxmlns:og="http://ogp.me/ns#"\nxmlns:fb="http://www.facebook.com/2008/fbml">\n <head>\n <meta charset="utf-8">\n <meta http-equiv="X-UA-Compatible" content="IE=edge">\n\n <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///?src=mdot">\n \n \n <script type="text/javascript">var ue_t0=window.ue_t0||+new Date();</script>\n <script type="text/javascript">\n var ue_mid = "A1EVAM02EL8SFB"; \n var ue_sn = "www.imdb.com"; \n var ue_furl = "fls-na.amazon.com";\n var ue_sid = "000-0000000-0000000";\n var ue_id = "03N6Z2NEAF09T9H26QYE";\n (function(e){var c=e;var a=c.ue||{};a.main_scope="mainscopecsm";a.q=[];a.t0=c.ue_t0||+new Date();a.d=g;function g(h){return +new Date()-(h?0:a.t0)}function d(h){return function(){a.q.push({n:h,a:arguments,t:a.d()})}}function b(m,l,h,j,i){var k={m:m,f:l,l:h,c:""+j,err:i,fromOnError:1,args:arguments};c.ueLogError(k);return false}b.skipTrace=1;e.onerror=b;function f(){c.uex("ld")}if(e.addEventListener){e.addEventListener("load",f,false)}else{if(e.attachEvent){e.attachEvent("onload",f)}}a.tag=d("tag");a.log=d("log");a.reset=d("rst");c.ue_csm=c;c.ue=a;c.ueLogError=d("err");c.ues=d("ues");c.uet=d("uet");c.uex=d("uex");c.uet("ue")})(window);(function(e,d){var a=e.ue||{};function c(g){if(!g){return}var f=d.head||d.getElementsByTagName("head")[0]||d.documentElement,h=d.createElement("script");h.async="async";h.src=g;f.insertBefore(h,f.firstChild)}function b(){var k=e.ue_cdn||"z-ecx.images-amazon.com",g=e.ue_cdns||"images-na.ssl-images-amazon.com",j="/images/G/01/csminstrumentation/",h=e.ue_file||"ue-full-11e51f253e8ad9d145f4ed644b40f692._V1_.js",f,i;if(h.indexOf("NSTRUMENTATION_FIL")>=0){return}if("ue_https" in e){f=e.ue_https}else{f=e.location&&e.location.protocol=="https:"?1:0}i=f?"https://":"http://";i+=f?g:k;i+=j;i+=h;c(i)}if(!e.ue_inline){if(a.loadUEFull){a.loadUEFull()}else{b()}}a.uels=c;e.ue=a})(window,document);\n if (!(\'CS\' in window)) { window.CS = {}; }\n window.CS.loginLocation 
= "https://www.imdb.com/registration/signin?u=%2Fsearch%2Ftitle%3Frelease_date%3D2017%26sort%3Dnum_votes%2Cdesc%26page%3D1";\n </script>\n \n\n \n <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:\'java\'};</script>\n \n <script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>\n <title>IMDb: Most Voted Titles Released 2017-01-01 to 2017-12-31 - IMDb</title>\n <script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>\n \n <link rel="canonical" href="http://www.imdb.com/search/title" />\n <meta property="og:url" content="http://www.imdb.com/search/title" />\n \n <script>(function(t){ (t.events = t.events || {})["csm_head_pre_icon"] = new Date().getTime(); })(IMDbTimer);</script>\n <link href="http://ia.media-imdb.com/images/G/01/imdb/images/safari-favicon-517611381._CB522736552_.svg" mask rel="icon" sizes="any">\n <link rel="icon" type="image/ico" href="http://ia.media-imdb.com/images/G/01/imdb/images/favicon-2165806970._CB522736556_.ico" />\n <meta name="theme-color" content="#000000" />\n <link rel="shortcut icon" type="image/x-icon" href="http://ia.media-imdb.com/images/G/01/imdb/images/desktop-favicon-2165806970._CB522736561_.ico" />\n <link href="http://ia.media-imdb.com/images/G/01/imdb/images/mobile/apple-touch-icon-web-4151659188._CB522736129_.png" rel="apple-touch-icon"> \n <link href="http://ia.media-imdb.com/images/G/01/imdb/images/mobile/apple-touch-icon-web-76x76-53536248._CB522736233_.png" rel="apple-touch-icon" sizes="76x76"> \n <link href="http://ia.media-imdb.com/images/G/01/imdb/images/mobile/apple-touch-icon-web-120x120-2442878471._CB522736253_.png" rel="apple-touch-icon" sizes="120x120"> \n <link href="http://ia.media-imdb.com/images/G/01/imdb/images/mobile/apple-touch-icon-web-152x152-1475823641._CB522736557_.png" rel="apple-touch-icon" sizes="152x152"> \n <link rel="search" 
type="application/opensearchdescription+xml" href="http://ia.media-imdb.com/images/G/01/imdb/images/imdbsearch-3349468880._CB522736605_.xml" title="IMDb" />\n <script>(function(t){ (t.events = t.events || {})["csm_head_post_icon"] = new Date().getTime(); })(IMDbTimer);</script>\n \n <meta property="pageType" content="search" />\n <meta property="subpageType" content="title" />\n\n\n <link rel=\'image_src\' href="http://ia.media-imdb.com/images/G/01/imdb/images/logos/imdb_fb_logo-1730868325._CB522736557_.png">\n <meta property=\'og:image\' content="http://ia.media-imdb.com/images/G/01/imdb/images/logos/imdb_fb_logo-1730868325._CB522736557_.png" />\n\n <meta property=\'fb:app_id\' content=\'115109575169727\' />\n\n <meta property=\'og:title\' content="IMDb: Most Voted Titles Released 2017-01-01 to 2017-12-31" />\n <meta property=\'og:site_name\' content=\'IMDb\' />\n <meta name="title" content="IMDb: Most Voted Titles Released 2017-01-01 to 2017-12-31 - IMDb" />\n <meta name="description" content="IMDb\'s advanced search allows you to run extremely powerful queries over all people and titles in the database. Find exactly what you\'re looking for!" />\n <meta property="og:description" content="IMDb\'s advanced search allows you to run extremely powerful queries over all people and titles in the database. Find exactly what you\'re looking for!" 
/>\n <meta name="request_id" content="03N6Z2NEAF09T9H26QYE" />\n \n <script>(function(t){ (t.events = t.events || {})["csm_head_pre_css"] = new Date().getTime(); })(IMDbTimer);</script>\n<link rel="stylesheet" type="text/css" href="http://ia.media-imdb.com/images/G/01/imdb/css/collections/consumersite-4100637360._CB530008524_.css" />\n<!-- h=ics-1e-c4-2xl-4b098b82.us-east-1 -->\n<link rel="stylesheet" type="text/css" href="http://ia.media-imdb.com/images/G/01/imdb/css/site/consumer-navbar-mega-238568768._CB532297092_.css" />\n<!--[if IE]><link rel="stylesheet" type="text/css" href="http://ia.media-imdb.com/images/G/01/imdb/css/collections/ie-1170868033._CB522736261_.css" /><![endif]-->\n\n <link rel="stylesheet" type="text/css" href="http://ia.media-imdb.com/images/G/01/imdb/css/collections/other-3780135229._CB530008515_.css" />\n <link rel="stylesheet" type="text/css" href="http://ia.media-imdb.com/images/G/01/imdb/css/collections/starbarwidget-2454701167._CB522736579_.css" />\n <link rel="stylesheet" type="text/css" href="http://ia.media-imdb.com/images/G/01/imdb/css/collections/watchlistButton-3806422028._CB531876201_.css" />\n <noscript>\n <link rel="stylesheet" type="text/css" href="http://ia.media-imdb.com/images/G/01/imdb/css/wheel/nojs-2827156349._CB522739048_.css">\n </noscript>\n <script>(function(t){ (t.events = t.events || {})["csm_head_post_css"] = new Date().getTime(); })(IMDbTimer);</script>\n \n <script>(function(t){ (t.events = t.events || {})["csm_head_pre_ads"] = new Date().getTime(); })(IMDbTimer);</script>\n \n <script type="text/javascript">\n // ensures js doesn\'t die if ads service fails. \n // Note that we need to define the js here, since ad js is being rendered inline after this.\n (function(f) {\n // Fallback javascript, when the ad Service call fails. 
\n \n if((window.csm == null || window.generic == null || window.consoleLog == null)) {\n if (window.console && console.log) {\n console.log("one or more of window.csm, window.generic or window.consoleLog has been stubbed...");\n }\n }\n \n window.csm = window.csm || { measure:f, record:f, duration:f, listen:f, metrics:{} };\n window.generic = window.generic || { monitoring: { start_timing: f, stop_timing: f } };\n window.consoleLog = window.consoleLog || f;\n })(function() {});\n </script>\n <script>\n if (\'csm\' in window) {\n csm.measure(\'csm_head_delivery_finished\');\n }\n </script>\n
这是什么格式?有什么方法可以快速识别它的语法特征?
答案 0 :(得分:2)
这主要是带有一些内联脚本的HTML……(开头的)\n 是标记文本中的换行符,因为该页面以若干空行开头。
有什么问题?你期望得到什么?
答案 1 :(得分:2)
您获得的是字节形式的响应 b'....',您可以在这个回答(answer)中了解更多相关信息。
要从页面获取完整文本,请使用以下示例:
import requests as r
url = 'your_url_here'
content = r.get(url).text
print(content)
UPD:对于解析,您可以使用Scrapy或Beautiful Soup工具。
答案 2 :(得分:1)
这是HTML,我是根据文档类型声明(DOCTYPE)判断出来的:<!DOCTYPE html>
您可以使用 BeautifulSoup 对其进行解析。
由于我还不能发表评论,我想在这里向@Alex补充说明:那些多余的代码很可能来自 script 标签,这些标签允许 ECMAScript(即 JavaScript)在页面中直接执行。
希望这可以帮助,
BoxTechy
答案 3 :(得分:1)
你似乎要问的是
if(!('CS' in window)) { window.CS = {}; }
这是JavaScript,一种可以嵌入到HTML中的常见编程语言(如果仔细查看,您一定会发现它位于 <script> 和 </script> 标签之间)。
具体来说,您看到的是 'in' 运算符(https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/in)和一个空对象({} 部分)。如果 'window' 对象上没有 'CS' 属性,就给它赋值一个空对象。事实上,许多程序员会把它写成:
window.CS=window.CS || {};