from lxml import html
import requests

# First attempt: download the page with requests and parse the HTML with lxml.
obama_4427 = requests.get('http://millercenter.org/president/obama/speech-4427')
# requests does not raise on 4xx/5xx responses by itself; fail here instead of
# silently parsing an error page and ending up with an empty result.
obama_4427.raise_for_status()
obama_4427_tree = html.fromstring(obama_4427.text)
# The speech is expected in the <p> children of the element with
# id="transcript" (assumed to be a <div>, as the XPath below requires).
obama_4427_text = obama_4427_tree.xpath('//div[@id="transcript"]/p')
# xpath() returns a list of Element objects; print the paragraphs' text
# rather than the elements' reprs.
print('\n\n'.join(p.text_content() for p in obama_4427_text))
import sys
import urllib2

from bs4 import BeautifulSoup, NavigableString

obama_4427_url = 'http://millercenter.org/president/obama/speech-4427'
# Plain urllib2 fetch: no custom headers are sent, and urlopen() raises
# urllib2.HTTPError when the server answers with an error status.
_response = urllib2.urlopen(obama_4427_url)
try:
    obama_4427_html = _response.read()
finally:
    # Python 2 response objects are not context managers, so close explicitly.
    _response.close()
# Second attempt: identify the client with an explicit User-Agent header.
import httplib
httplib.HTTPConnection.debuglevel = 1
import urllib2
# The header has to be attached to the Request itself; without it urllib2
# sends its default "Python-urllib/x.y" agent, which some sites reject.
# NOTE(review): if the server still answers 404 with a browser-like agent,
# the URL itself has probably moved -- verify it in a browser.
request = urllib2.Request(
    obama_4427_url,
    headers={'User-Agent': 'Mozilla/5.0 (compatible; speech-scraper)'}
)
# urllib2's HTTP handler sets the debug level on every connection it opens,
# so request the header trace on the handler as well as on the class above.
opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=1))
_response = opener.open(request)
try:
    feeddata = _response.read()
finally:
    # Python 2 response objects are not context managers, so close explicitly.
    _response.close()
我最终收到以下错误信息：
HTTPError: Not Found
在第二次尝试中，我想通过设置 User-Agent 请求头来表明自己的身份，以便获得抓取这篇演讲的权限，但没有成功。我遗漏了什么？
顺便说一句,我在Anaconda Spyder中运行Python 2.7。