Question

我正在尝试使用 Python / BeautifulSoup 解析一个网页。以下是我的代码。我想知道有没有更好的方法，可以在不使用两个列表的情况下得到排序后的字典？或者有没有更高效的做法？

import requests
from bs4 import BeautifulSoup
import operator

req = requests.get('https://www.firstchampionship.org/sponsorvideos')
soup = BeautifulSoup(req.text, "lxml")

# Scrape sponsor names and vote counts into two parallel lists, merge them
# into a dict keyed by company name, then print the pairs sorted by votes.
companies = []
votes = []
all_in_one = {}

# Collect the text of every title div, dropping any non-ASCII characters.
for company in soup.find_all("div", {"class": "views-field views-field-title"}):
    # print(company.text.encode('ascii','ignore'))
    companies.append(company.text.encode('ascii','ignore'))

# Collect the vote count: the first space-separated token of each rate-info
# div's text, converted to int.
for vote in soup.find_all("div", {"class": "rate-info"}):     
    vote_x = vote.text.split(" ")
    votes.append(int(vote_x[0]))

# Pair names with votes by position.
# NOTE(review): assumes the page yields title divs and rate-info divs in the
# same order and in equal numbers; otherwise votes[i] mismatches or raises
# IndexError. Duplicate company names overwrite earlier entries.
for i, x in enumerate(companies):
    all_in_one.update({x:votes[i]})

# dict.iteritems() exists only on Python 2 dicts.
for key, value in all_in_one.iteritems():
    print(key+"->"+str(value))

# Sort the (name, votes) pairs by vote count, highest first.
sorted_x = sorted(all_in_one.items(), key=operator.itemgetter(1), reverse=True)
print(sorted_x)

我的最终输出如下。公司名称 -> 票数

[('Analog Devices', 7227), ('Bechtel', 6797), ('NVIDIA', 436), ('Qualcomm Incorporated', 349), ('ViaSat', 292), ('BOSCH', 201), ('Nokia Bell Labs', 124), ('Walt Disney Imagineering', 119), (' Google, Inc.', 113), ('NI', 109), ('FedEx', 100), ('NASA', 97), ('波音公司', 86), ('美国空军', 83), ('第一', 74), ('3M Company', 73), ('Twitch', 73), ('Baxter', 70), (' Rockwell Automation', 68), ('Booz Allen Hamilton', 68), ('NRG', 66), ('Mouser Electronics', 63), ('IBM Corporation', 63), ('John Deere', 63), ('Motorola Solutions', 62), ('Delphi', 62), (' Boston Scientific', 60), ('Texas Instruments', 59), ('The Dow Chemical Co.', 59), ('PTC', 59), ('Xerox', 58), ('西南航空公司', 57), ('GM', 55), ('Vulcan Spring', 53), ('Rockwell Collins', 52), ('Festo', 52), ('Monsanto', 50), ('LEGO Education ', 39)]

Answer 1

您可以先获取公司标题，取其父节点，再找到下一个匹配投票节点的元素，把公司名称和票数组成一个字典追加到列表中，然后对列表排序。

import requests
from bs4 import BeautifulSoup
from operator import itemgetter

req = requests.get('https://www.firstchampionship.org/sponsorvideos')
soup = BeautifulSoup(req.text, "lxml")

# Build one {"Company": ..., "Votes": ...} record per sponsor title div.
companies = []

for title_div in soup.find_all("div", {"class": "views-field views-field-title"}):
    # Company name: stripped text with non-ASCII characters dropped.
    name = title_div.get_text(strip=True).encode('ascii', 'ignore')
    # The vote count lives in the first "rate-info" div that follows the
    # title's parent; only the leading token of its text is kept, as a string.
    rate_div = title_div.parent.find_next("div", class_="rate-info")
    vote_text = rate_div.get_text(strip=True).split(' ')[0]
    companies.append({"Company": name, "Votes": vote_text})

# Print the records ordered alphabetically by company name.
by_company = itemgetter('Company')
print(sorted(companies, key=by_company))

输出：

[{'Company': '3M Company', 'Votes': u'76'}, {'Company': 'Analog Devices', 'Votes': u'7282'} ...

Answer 2

您可以使用 collections 库中的 Counter。

# Scrape sponsor names and vote counts and print them from most to fewest
# votes, using a Counter so no separate sort step or parallel lists are kept.
# Counter lives in the standard-library module `collections` (plural).
from collections import Counter

from bs4 import BeautifulSoup
import requests

req = requests.get('https://www.firstchampionship.org/sponsorvideos')
soup = BeautifulSoup(req.text, "lxml")

# Maps company name -> vote count; most_common() yields entries by
# descending count.
counts = Counter()

companies = soup.find_all("div", {"class": "views-field views-field-title"})
votes = soup.find_all("div", {"class": "rate-info"})

# NOTE(review): zip pairs the two result lists by position, so this assumes
# the page lists title divs and rate-info divs in the same order; zip stops
# silently at the shorter list if the counts differ.
for company, vote in zip(companies, votes):
    # Drop non-ASCII characters from the name.
    company_name = company.text.encode('ascii','ignore')
    # The vote count is the first space-separated token of the div text.
    vote_count = int(vote.text.split(" ")[0])
    counts[company_name] = vote_count

for company_name, vote_count in counts.most_common():
    print(company_name, vote_count)

使用beautifulSoup获取列表

2 个答案: