答案 0 :(得分:7)
使用BeautifulSoup的findAll()方法提取所有class为"comments"的span标签,因为它们包含你需要的信息。之后你可以根据需要对它们进行任何处理。
# Parse the document, then gather the text of every <span> whose class
# attribute is "comments".
soup = BeautifulSoup(html, "html.parser")
comment_spans = soup.findAll("span", attrs={"class": "comments"})
numbers = [span.text for span in comment_spans]
这是输出:
[u'100', u'97', u'87', u'86', u'86', u'78', u'75', u'74', u'72', u'72', u'72', u'70', u'70', u'66', u'66', u'65', u'65', u'63', u'61', u'60', u'60', u'59', u'59', u'57', u'56', u'54', u'52', u'52', u'51', u'47', u'47', u'41', u'41', u'41', u'38', u'35', u'32', u'31', u'24', u'19', u'19', u'18', u'17', u'16', u'13', u'8', u'7', u'1', u'1', u'1']
答案 1 :(得分:1)
我也在学习Coursera上的同一门课程。你可以试试下面的解决方案吗?我觉得它只用到了我们在这道题之前学过的内容。它对我来说完全可行。
# Sum every number that appears in the text of the <span> tags of a page.
import re

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

from bs4 import BeautifulSoup

url = 'http://python-data.dr-chuck.net/comments_216543.html'
html = urlopen(url).read()
soup = BeautifulSoup(html, "html.parser")

# Retrieve all of the span tags
tags = soup('span')

# Named ``total`` rather than ``sum`` so the builtin is not shadowed.
total = 0
for tag in tags:
    # Search only the tag's text; scanning the full markup would also
    # count digits that happen to appear inside attribute values.
    for digits in re.findall("[0-9]+", tag.get_text()):
        total += int(digits)

# A single-argument print call behaves the same on Python 2 and Python 3.
print(total)
答案 2 :(得分:0)
@Learner的解决方案是完全正确的!但是如果你想用名字和评论做更多的事情,你可以这样做,返回名字和评论列表:
import re
import urllib

from BeautifulSoup import BeautifulSoup

# Download the sample page and parse it.
url = 'http://python-data.dr-chuck.net/comments_42.html'
page = urllib.urlopen(url)
html = page.read()
soup = BeautifulSoup(html)

# NOTE(review): this pattern allows zero digits, so it can match the empty
# string and does not by itself restrict the result to numbers; the sample
# output that follows contains names as well as counts.
pattern = re.compile(r'[0-9]{0,4}')
matches = soup.findAll('span', {'class': 'comments'}, text=pattern)

# Drop bare newline entries, then skip the first four remaining items.
cleaned = [item for item in matches if item != u'\n'][4:]
In [18]: cleaned
Out[18]:
[u'Leven',
u'100',
u'Mahdiya',
u'97',
u'Ajayraj',
u'87',
u'Lillian',
u'86',
u'Aon',
u'86',
u'Ruaraidh',
u'78',
u'Gursees',
u'75',
u'Emmanuel',
u'74',
u'Christy',
u'72',
u'Annoushka',
u'72',
u'Inara',
u'72',
u'Caite',
u'70',
u'Rosangel',
u'70',
u'Iana',
u'66',
u'Anise',
u'66',
u'Jaosha',
u'65',
u'Cadyn',
u'65',
u'Edward',
u'63',
u'Charlotte',
u'61',
u'Sammy',
u'60',
u'Zarran',
u'60',.....] #
答案 3 :(得分:0)
以基本方式做到......
# Count the <span> tags in ``soup`` and add up their numeric contents.

# Retrieve all of the span tags
tags = soup('span')

total = 0  # running sum; not named ``sum`` so the builtin stays usable
count = 0
for tag in tags:
    # The first child of each span is converted to a number.
    num = float(tag.contents[0])
    total += num
    count += 1

# %-formatted strings print identically under Python 2 and Python 3.
print('count: %s' % count)
print('sum: %s' % total)
答案 4 :(得分:0)
我在Coursera上做了这道题,它给出的答案全部正确。希望对你有所帮助;)
# Prompt for a URL, then report how many <span> tags the page has and the
# sum of their numeric contents.
import ssl
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Ignore SSL certificate errors.
# NOTE(review): this turns off hostname and certificate verification, so the
# connection is open to interception; only use it against trusted hosts.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
html = urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, "html.parser")

# Retrieve all of the span tags
tags = soup('span')

total = 0  # running sum; not named ``sum`` so the builtin stays usable
count = 0
for tag in tags:
    # The first child of each span is converted to a number.
    num = float(tag.contents[0])
    total += num
    count += 1

print('count:', count)
print('sum:', total)
答案 5 :(得分:0)
# Prompt for a URL and print the sum of the first number found in each
# <span> tag of the page.
import re
import ssl
from urllib.request import urlopen

from bs4 import BeautifulSoup

# NOTE(review): hostname and certificate verification are disabled here;
# only use this against trusted hosts.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
html = urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, "html.parser")

total = 0  # not named ``sum`` so the builtin stays usable
for tag in soup('span'):
    # Digits only: the previous character class also accepted a literal
    # '+', which could yield a match that int() cannot convert.
    found = re.findall('[0-9]+', tag.get_text())
    # A span without any digits contributes nothing instead of raising.
    if found:
        total += int(found[0])

print(total)
答案 6 :(得分:0)
# Prompt for a URL, a tag name and an optional attribute filter, then print
# the sum of every number found in the text of the matching tags.
import re
import urllib.request

from bs4 import BeautifulSoup

url = input('Enter: ')
tag = input("input the html tag to search: ")
parameter = input("Enter the html parameter of the tag for better selection (optional): ")
p_value = input("Enter the parameter value (optional): ")

html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')

# Filter by attribute only when both the attribute name and value were given.
if parameter and p_value:
    numbers = soup(tag, {parameter: p_value})
else:
    numbers = soup(tag)

total = 0
for number in numbers:
    # Search the tag's text only: scanning the full markup would also add
    # digits from attribute values (for example a class name like "item2").
    for item in re.findall('([0-9]+)', number.get_text()):
        total += int(item)

print(total)
Tag:将要搜索的html标签作为输入。
Parameter:接受html属性名,例如class、id等。
p_value:以类名或id值作为输入。
答案 7 :(得分:-1)
# Prompt for a URL and print the sum of every number found in the text of
# the page's <span> tags.
import re
import urllib.error
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

url = input('Enter - ')
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, "html.parser")

tags = soup('span')

total = 0  # not named ``sum`` so the builtin stays usable
for tag in tags:
    # re.findall needs a string, so search the tag's text rather than the
    # Tag object itself.
    for digits in re.findall("[0-9]+", tag.get_text()):
        # Add the converted integer, not the matched string.
        total += int(digits)

print(total)