Using someone's Wikipedia parser, I want to grab the p tag elements of a page on Wikipedia, one after another.
import sys
import urllib
import urllib2
import re
from bs4 import BeautifulSoup
article = sys.argv[1]
while article!="Philosophy" and count<MAX_HOPS:
    articleURL = urllib.quote(article)
    #print "Article URL: %s" %(articleURL)
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    resource = opener.open("http://en.wikipedia.org/wiki/" + articleURL)
    data = resource.read()
    resource.close()
    soup = BeautifulSoup(data)
    articleTemp = ""
    #('div',id="bodyContent").find_all('p')
    for soupContent in soup.find_all('p')
This is where I call find_all('p') on the soup. But
soupContent = soup.find('div',id="bodyContent").p
gives me only the first paragraph, and the problem is that I need it to go through the second paragraph (and the rest) as well.
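What I am after is something like the sketch below (untested; it assumes the soup object built in the code above), which would scope the search to the bodyContent div and walk every p tag instead of only the first:
# Untested sketch: collect the text of every paragraph inside bodyContent,
# not just the first one returned by .p
articleTemp = ""
for soupContent in soup.find('div', id="bodyContent").find_all('p'):
    articleTemp += soupContent.text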
Answer 0 (score: 0)
As Totem said above, the for loop in your code is not formed correctly. I don't think there is anything wrong with the find_all method itself. For example, the code below runs fine for me:
import sys
import urllib
import urllib2
import re
from bs4 import BeautifulSoup
article = "Stack_Overflow_(website)"
articleURL = urllib.quote(article)
#print "Article URL: %s" %(articleURL)
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
resource = opener.open("http://en.wikipedia.org/wiki/" + articleURL)
data = resource.read()
resource.close()
soup = BeautifulSoup(data)
articleTemp = ""
#('div',id="bodyContent").find_all('p')
for soupContent in soup.find_all('p'):
    print soupContent.text
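If you only want the paragraphs of the article body rather than every p on the page (as your commented-out line suggests), you could scope find_all to the bodyContent div. A rough sketch, reusing the soup object from above:
# Rough sketch: restrict the paragraphs to the article body only.
bodyContent = soup.find('div', id="bodyContent")
if bodyContent is not None:
    for paragraph in bodyContent.find_all('p'):
        print paragraph.text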