使用Python中的BeautifulSoup下载url内容

时间:2016-01-15 04:10:29

标签: python-3.x beautifulsoup

  

运行下面的代码时报错:AttributeError: 'NoneType' object has no attribute 'text'(出错位置是代码的第16行,即获取 date 的那一行)。

from bs4 import BeautifulSoup, SoupStrainer
try:
    import urllib.request as urllib2
except ImportError:
    import urllib2
import re

def _node_text(node, default=''):
    """Return ``node.text``, or *default* when the lookup found no element.

    ``soup.find`` returns ``None`` for a missing element, so reading
    ``.text`` directly on its result raises ``AttributeError``.
    """
    return default if node is None else node.text


def main():
    """Fetch a fixed CNN article and print its URL, date, title and text.

    A field whose markup is not present in the downloaded page is printed
    as an empty string instead of raising ``AttributeError``.
    """
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    url = 'http://www.cnn.com/2013/10/29/us/florida-shooting-cell-phone-blocks-bullet/index.html?hpt=ju_c2'

    response = opener.open(url)
    try:
        # Name the parser explicitly so the parse tree does not depend on
        # which optional parsers happen to be installed.
        soup = BeautifulSoup(response, 'html.parser')
    finally:
        # try/finally (not ``with``) keeps this working with Python 2's
        # urllib2, whose response object is not a context manager.
        response.close()

    # 1) Link to the website: the URL itself, printed below.

    # 2) Date article published
    date = _node_text(soup.find("div", {"class": "cnn_strytmstmp"}))

    # 3) Title of article
    container = soup.find("div", {"id": "cnnContentContainer"})
    heading = container.find('h1') if container is not None else None
    title = _node_text(heading)

    # 4) Text of the article
    body = soup.find('div', {"class": "cnn_strycntntlft"})
    paragraphs = body.find_all('p') if body is not None else []
    # Keep everything as text: on Python 3, encoding each paragraph to bytes
    # makes the str.join below raise TypeError and makes print show b'...'.
    text = " ".join(paragraph.text for paragraph in paragraphs)

    print(url)
    print(date)
    print(title)
    print(text)

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

1 个答案:

答案 0 :(得分:0)

这是一个代码段,可以从页面中获取您想要的详细信息。

from bs4 import BeautifulSoup
import requests


def get_new_cnn(url, timeout=30):
    """Download a CNN article and print its URL, date, title and body text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout the request can block indefinitely.

    A field whose markup is not present in the page is printed as an empty
    line instead of raising ``AttributeError`` on ``None``.
    """
    response = requests.get(
        url,
        headers={'User-agent': 'Mozilla/5.0'},
        timeout=timeout,
    )
    soup = BeautifulSoup(response.text, 'html.parser')

    # 1) Link to the website
    print(url)

    # 2) Date article published (``find`` returns None when nothing matches)
    date = soup.find("p", attrs={"class": "update-time"})
    print(date.text.replace('Updated ', '') if date is not None else '')

    # 3) Title of article
    # Print text, not ``.encode('UTF-8')``: on Python 3 printing bytes shows
    # the b'...' repr with escaped non-ASCII characters.
    title = soup.find("h1", attrs={"class": "pg-headline"})
    print(title.text if title is not None else '')

    # 4) Text of the article
    paragraphs = soup.find_all('p', attrs={"class": "zn-body__paragraph"})
    print(" ".join(paragraph.text for paragraph in paragraphs))

# Example run: scrape one fixed CNN article as soon as the module is loaded.
# The adjacent literals are joined at compile time into the original URL.
src_url = (
    'http://www.cnn.com/2013/10/29/us/'
    'florida-shooting-cell-phone-blocks-bullet/index.html'
    '?hpt=ju_c2'
)
get_new_cnn(src_url)