Question

我正在尝试获取大量状态信息，这些信息编码在网页中，主要位于 "<head></head>" 元素内。

我知道我可以使用wget或curl或python来获取整个页面。但我不想给服务器施加太多不必要的压力（页面本身相当大/复杂）。

是否有任何方法只获取头元素？

我原以为某些代理服务器除了检查 html-headers 之外，也能做类似的事情。

澄清一下：我要找的不是 html-headers，而只是 HTML 的 <head> 元素。

Answer 1

无法仅加载<head>标记之间的数据，因为服务器必须在发送之前解析所请求的页面。

可能的解决方案是读取几个字节，直到找到</head>标记。

以下内容从源读取n个字节，并检查是否包含字符串</head>。如果是，则将字节转换为string并进行修剪，使得结果包含标记<head>和</head>以及它们之间的数据。否则，它将继续读取n个字节，直到找到</head>。

import urllib.request


def get_head_tag_data(url, n=512):
    """Return the ``<head> ... </head>`` section of the page at *url*.

    The response is read in chunks of *n* bytes and reading stops as soon as
    the closing ``</head>`` tag has been received, so the rest of the
    document is not read.

    Args:
        url: Address of the resource, in any form accepted by
            ``urllib.request.urlopen``.
        n: Number of bytes requested per read.

    Returns:
        The text from the opening ``<head ...>`` tag up to and including
        ``</head>``, decoded with the charset announced in the response
        headers (UTF-8 when none is announced or it is unknown).

    Raises:
        AttributeError: If the response ends before a closing ``</head>``
            tag is seen, or if no opening ``<head>`` tag precedes it.
    """
    import re

    # Tag names are case-insensitive in HTML and the opening tag may carry
    # attributes; the ``\s`` guard keeps ``<header>`` from matching.
    closing_tag = re.compile(rb'</head>', re.IGNORECASE)
    opening_tag = re.compile(rb'<head(?:\s[^>]*)?>', re.IGNORECASE)
    # A closing tag may be split across two reads, so each search restarts
    # this many bytes before the newly appended chunk.
    overlap = len(b'</head>') - 1

    with urllib.request.urlopen(url) as site:
        charset = site.headers.get_content_charset() or 'utf-8'
        data = bytearray()
        while True:
            buff = site.read(n)
            if not buff:
                # End of stream without ever seeing the closing tag.
                raise AttributeError('Not head-tag found.')
            search_from = max(0, len(data) - overlap)
            data += buff
            closing = closing_tag.search(data, search_from)
            if closing:
                break

    print('{} bytes read'.format(len(data)))

    # Only look for the opening tag in front of the closing one.
    opening = opening_tag.search(data, 0, closing.end())
    if opening is None:
        raise AttributeError('Not head-tag found.')

    head = bytes(data[opening.start():closing.end()])
    try:
        return head.decode(charset, errors='replace')
    except LookupError:
        # The server announced a charset Python does not know.
        return head.decode('utf-8', errors='replace')


# Example: request the <head> section of the Stack Overflow front page,
# reading the response 256 bytes at a time.
tag_data = get_head_tag_data('https://stackoverflow.com', n=256)

请注意，此函数不会检查可能的错误，例如，如果没有</head>标记。

Answer 2

你可以尝试 BeautifulSoup。下面是一个示例 Python 脚本。

import urllib.request as urllib2
from bs4 import BeautifulSoup as bs
# Placeholder: replace with the page to inspect, e.g.
# https://stackoverflow.com/questions/48262523/get-only-html-head-element-with-a-script-or-tool
url = 'change_wiht_your_desired_url'
# Open the page in a ``with`` block so the HTTP response is closed once the
# document has been handed to the parser.
with urllib2.urlopen(url) as page:
    soup = bs(page, 'html.parser')
# The parsed <head> element. As a bare expression this only displays a value
# in an interactive session; wrap it in print() when running as a script.
soup.head

您可以使用 soup 对象获取不同的标记。完整用法请参阅 BeautifulSoup 的官方文档。希望这会有所帮助。

修改

print(soup.head) 的输出：

<head> <title>Memory Leak with import_array() for numpy Python3.5 - Stack Overflow</title> <link href="https://cdn.sstatic.net/Sites/stackoverflow/img/favicon.ico?v=4f32ecc8f43d" rel="shortcut icon"/> <link href="https://cdn.sstatic.net/Sites/stackoverflow/img/apple-touch-icon.png?v=c78bd457575a" rel="apple-touch-icon image_src"/> <link href="/opensearch.xml" rel="search" title="Stack Overflow" type="application/opensearchdescription+xml"/> <meta content="website" property="og:type"/> <meta content="https://stackoverflow.com/questions/48200892/memory-leak-with-import-array-for-numpy-python3-5" property="og:url"/> <meta content="https://cdn.sstatic.net/Sites/stackoverflow/img/apple-touch-icon@2.png?v=73d79a89bded" itemprop="image primaryImageOfPage" property="og:image"/> <meta content="summary" name="twitter:card"/> <meta content="stackoverflow.com" name="twitter:domain"/> <meta content="Memory Leak with import_array() for numpy Python3.5" itemprop="title name" name="twitter:title" property="og:title"/> <meta content="Could someone suggest a fix for this problem? When I use import_array(), Valgrind reports memory leak of 157528 bytes. Here is the small piece of code to replicate the problem on Ubuntu16.04 and P..." 
itemprop="description" name="twitter:description" property="og:description"/> <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js"></script> <script src="https://cdn.sstatic.net/Js/stub.en.js?v=10cafd98c67a"></script> <link href="https://cdn.sstatic.net/Sites/stackoverflow/all-primary.css?v=44b77c3d4e1c" rel="stylesheet" type="text/css"/> <link href="/feeds/question/48200892" rel="alternate" title="Feed for question 'Memory Leak with import_array() for numpy Python3.5'" type="application/atom+xml"/> <meta content="US" name="twitter:app:country"/> <meta content="Stack Exchange iOS" name="twitter:app:name:iphone"/> <meta content="871299723" name="twitter:app:id:iphone"/> <meta content="se-zaphod://stackoverflow.com/questions/48200892/memory-leak-with-import-array-for-numpy-python3-5" name="twitter:app:url:iphone"/> <meta content="Stack Exchange iOS" name="twitter:app:name:ipad"/> <meta content="871299723" name="twitter:app:id:ipad"/> <meta content="se-zaphod://stackoverflow.com/questions/48200892/memory-leak-with-import-array-for-numpy-python3-5" name="twitter:app:url:ipad"/> <meta content="Stack Exchange Android" name="twitter:app:name:googleplay"/> <meta content="http://stackoverflow.com/questions/48200892/memory-leak-with-import-array-for-numpy-python3-5" name="twitter:app:url:googleplay"/> <meta content="com.stackexchange.marvin" name="twitter:app:id:googleplay"/> <script> StackExchange.ready(function () { StackExchange.using("snippets", function () { StackExchange.snippets.initSnippetRenderer(); }); StackExchange.using("postValidation", function () { StackExchange.postValidation.initOnBlurAndSubmit($('#post-form'), 2, 'answer'); }); StackExchange.question.init({showAnswerHelp:true,totalCommentCount:2,shownCommentCount:2,highlightColor:'#F4A83D',backgroundColor:'#FFF',questionId:48200892}); styleCode(); StackExchange.realtime.subscribeToQuestion('1', '48200892'); StackExchange.using("gps", function () { 
StackExchange.gps.trackOutboundClicks('#content', '.post-text'); }); }); </script> <script> StackExchange.init({"locale":"en","serverTime":1516020520,"routeName":"Questions/Show","stackAuthUrl":"https://stackauth.com","networkMetaHostname":"meta.stackexchange.com","site":{"name":"Stack Overflow","description":"Q&A for professional and enthusiast programmers","isNoticesTabEnabled":true,"recaptchaPublicKey":"6LdchgIAAAAAAJwGpIzRQSOFaO0pU6s44Xt8aTwc","recaptchaAudioLang":"en","enableNewTagCreationWarning":true,"insertSpaceAfterNameTabCompletion":false,"id":1,"childUrl":"https://meta.stackoverflow.com","enableSocialMediaInSharePopup":true,"protocol":"https"},"user":{"fkey":"b01c42fe577b568489e86923067aebbd","tid":"5b158809-5378-6383-c1bd-bdf22c97f475","rep":0,"isAnonymous":true,"isAnonymousNetworkWide":true,"canSeeNewHeaderDesign":true},"events":{"postType":{"question":1},"postEditionSection":{"title":1,"body":2,"tags":3}},"story":{"minCompleteBodyLength":75,"likedTagsMaxLength":300,"dislikedTagsMaxLength":300},"jobPreferences":{"maxNumDeveloperRoles":2,"maxNumIndustries":4}}, {"site":{"allowImageUploads":true,"enableUserHovercards":true,"styleCode":true,"enableImgurHttps":true,"forceHttpsImages":true},"comments":{},"userProfile":{"openGraphAPIKey":"58740831ad23540e00c58987"},"tags":{},"accounts":{"currentPasswordRequiredForChangingStackIdPassword":true},"flags":{"allowRetractingFlags":true},"topBar":{"showNewFeatureNotice":true},"snippets":{"snippetsEnabled":true,"renderDomain":"stacksnippets.net"},"paths":{},"markdown":{"asteriskIntraWordEmphasis":true},"monitoring":{"clientTimingsAbsoluteTimeout":30000,"clientTimingsDebounceTimeout":1000}}); 
StackExchange.using.setCacheBreakers({"js/prettify-full.en.js":"653f3a9edf23","js/moderator.en.js":"22b640565fb8","js/full-anon.en.js":"448b407c0535","js/full.en.js":"b5454c77884f","js/wmd.en.js":"70a0e707c944","js/third-party/jquery.autocomplete.min.js":"d3b8fa7fdf74","js/third-party/jquery.autocomplete.min.en.js":"","js/mobile.en.js":"8e20e188854d","js/help.en.js":"890f7bf1827b","js/tageditor.en.js":"68e773dc21b3","js/tageditornew.en.js":"c77ac7fa331f","js/inline-tag-editing.en.js":"681a5e3ebd00","js/revisions.en.js":"2faaeaae2529","js/review.en.js":"1cbc9c06f708","js/tagsuggestions.en.js":"b278f9a0b23b","js/post-validation.en.js":"d8d9b527c3ea","js/explore-qlist.en.js":"88f824a42b1a","js/events.en.js":"9c2e85f6190f","js/keyboard-shortcuts.en.js":"a6f8e6251bbd","js/external-editor.en.js":"7a4d6f43f0bf","js/adops.en.js":"22a9bd59b1e9","js/external-editor.en.js":"7a4d6f43f0bf","js/snippet-javascript.en.js":"a3fb7827a7b4","js/snippet-javascript-codemirror.en.js":"72e55eacc0ed"}); StackExchange.using("gps", function() { StackExchange.gps.init(true); }); </script> <noscript id="noscript-css"><style>body,.top-bar{margin-top:1.9em}</style></noscript> </head>

使用脚本或工具仅获取 HTML 的 head 元素

2 个答案: