运行时报错:AttributeError: 'NoneType' object has no attribute 'text'(第16行)。
from bs4 import BeautifulSoup, SoupStrainer
try:
import urllib.request as urllib2
except ImportError:
import urllib2
import re
def _element_text(element):
    """Return the text of a BeautifulSoup element, or '' when it is None.

    ``find()`` yields None when nothing matches, so reading ``.text``
    directly on its result raises AttributeError on pages whose markup
    differs from the expected selectors.
    """
    return element.text if element is not None else ''


def _printable(value):
    """Return *value* in a form ``print`` can emit on Python 2 and 3.

    Native ``str`` values are returned unchanged. Anything else (Python 2
    ``unicode``) is encoded to UTF-8 once, here at the output boundary.
    """
    if isinstance(value, str):
        return value
    return value.encode('utf-8')


def main():
    """Download one CNN article and print its URL, date, title and body.

    Each field is looked up with a fixed CSS class/id selector. A field
    whose selector matches nothing is printed as an empty line instead of
    aborting the whole run with an AttributeError.
    """
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    url = 'http://www.cnn.com/2013/10/29/us/florida-shooting-cell-phone-blocks-bullet/index.html?hpt=ju_c2'
    # Name the parser explicitly so the parse tree does not depend on
    # which optional parsers happen to be installed.
    soup = BeautifulSoup(opener.open(url), 'html.parser')

    # 1) Link to the website
    # (the URL itself, printed below)

    # 2) Date article published
    date = _element_text(soup.find("div", {"class": "cnn_strytmstmp"}))

    # 3) Title of article
    container = soup.find("div", {"id": "cnnContentContainer"})
    headline = container.find('h1') if container is not None else None
    title = _element_text(headline)

    # 4) Text of the article
    body = soup.find('div', {"class": "cnn_strycntntlft"})
    paragraphs = body.find_all('p') if body is not None else []
    # Join while the pieces are still text: joining already-encoded bytes
    # with a str separator raises TypeError on Python 3.
    text = " ".join(paragraph.text for paragraph in paragraphs)

    for field in (url, date, title, text):
        print(_printable(field))
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
答案 0(得分:0)
这是一个代码段,可以从页面中获取您想要的详细信息。
from bs4 import BeautifulSoup
import requests
def get_new_cnn(url, timeout=10):
    """Fetch a CNN article page and print its URL, date, title and body.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before ``requests``
            gives up; without it the request can block indefinitely.

    A field whose selector matches nothing is printed as an empty line
    rather than raising AttributeError on a None lookup result.
    """
    response = requests.get(
        url,
        headers={'User-agent': 'Mozilla/5.0'},
        timeout=timeout,
    )
    soup = BeautifulSoup(response.text, 'html.parser')

    # 1) Link to the website
    print(url)

    # 2) Date article published
    date = soup.find("p", attrs={"class": "update-time"})
    print(date.text.replace('Updated ', '') if date is not None else '')

    # 3) Title of article
    title = soup.find("h1", attrs={"class": "pg-headline"})
    # Printed as text, like the date above; printing the encoded bytes
    # shows a b'...' repr on Python 3.
    print(title.text if title is not None else '')

    # 4) Text of the article
    paragraphs = soup.find_all('p', attrs={"class": "zn-body__paragraph"})
    print(" ".join(paragraph.text for paragraph in paragraphs))
# Example run: fetch and print the article at this URL. These statements are
# at module level, so they execute whenever this code is run or imported.
src_url = 'http://www.cnn.com/2013/10/29/us/florida-shooting-cell-phone-blocks-bullet/index.html?hpt=ju_c2'
get_new_cnn(src_url)