How do I loop through each sub-page (fighter) to grab the data I need, keep track of the link so I can get back to the page above with all the fighter names, then move on to the next fighter (link), grab all of that fighter's data, and keep going until I reach the end of the list on that particular page?
import requests
from bs4 import BeautifulSoup

records = []
r = requests.get('http://www.espn.com/mma/fighters')
soup = BeautifulSoup(r.text, 'html.parser')
data = soup.find_all('tr', attrs={'class': ['oddrow', 'evenrow']})
for d in data:
    try:
        name = d.find('a').text
    except AttributeError:
        name = ""
    try:
        country = d.find('td').findNext('td').text
    except AttributeError:
        country = ""
    records.append([name, country])
The code above is for the page where all the fighter names live; I can loop over each row and collect the fighter's name and country.
import re

# data is the list of rows collected above; search each row for its anchor.
# The [1] at the end picks out the second fighter's link.
links = [f"http://www.espn.com{a['href']}"
         for row in data for a in row.find_all('a')
         if re.findall('^/mma/', a['href'])][1]
r1 = requests.get(links)
data1 = BeautifulSoup(r1.text, 'html.parser')
bio = data1.find('div', attrs={'class': 'mod-content'})
weightClass = data1.find('li', attrs={'class': 'first'}).text
trainingCenter = data1.find('li', attrs={'class': 'last'}).text
wins = data1.find('table', attrs={'class': 'header-stats'})('td')[0].text
loses = data1.find('table', attrs={'class': 'header-stats'})('td')[1].text
draws = data1.find('table', attrs={'class': 'header-stats'})('td')[2].text
tkos = data1.find_all('table', attrs={'class': 'header-stats'})[1]('td')[0].text
subs = data1.find_all('table', attrs={'class': 'header-stats'})[1]('td')[1].text
The code above currently goes into the second fighter's page (link) and collects all the data for that particular fighter.
records = []
r = requests.get('http://www.espn.com/mma/fighters')
soup = BeautifulSoup(r.text, 'html.parser')
data = soup.find_all('tr', attrs={'class': ['oddrow', 'evenrow']})
# this is the line that raises the error quoted below
links = [f"http://www.espn.com{i['href']}" for i in data.find_all('a') if re.findall('^/mma/', i['href'])]
for d in data:
    try:
        name = d.find('a').text
    except AttributeError:
        name = ""
    try:
        country = d.find('td').findNext('td').text
    except AttributeError:
        country = ""
    for l in links:
        r1 = requests.get(l)
        data1 = BeautifulSoup(r1.text, 'html.parser')
        bio = data1.find('div', attrs={'class': 'mod-content'})
        for b in bio:
            try:
                weightClass = data1.find('li', attrs={'class': 'first'}).text
            except AttributeError:
                weightClass = ""
            try:
                trainingCenter = data1.find('li', attrs={'class': 'last'}).text
            except AttributeError:
                trainingCenter = ""
            try:
                wins = data1.find('table', attrs={'class': 'header-stats'})('td')[0].text
            except AttributeError:
                wins = ""
            try:
                loses = data1.find('table', attrs={'class': 'header-stats'})('td')[1].text
            except AttributeError:
                loses = ""
            try:
                draws = data1.find('table', attrs={'class': 'header-stats'})('td')[2].text
            except AttributeError:
                draws = ""
            try:
                tkos = data1.find_all('table', attrs={'class': 'header-stats'})[1]('td')[0].text
            except AttributeError:
                tkos = ""
            try:
                subs = data1.find_all('table', attrs={'class': 'header-stats'})[1]('td')[1].text
            except AttributeError:
                subs = ""
            records.append([name, country, weightClass])
The code above is what I'm trying now, but I get this error: "ResultSet object has no attribute 'find_all'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?"
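From what I can tell, the error happens because find_all() returns a ResultSet, which is a list of row tags rather than a single tag, so it has no find_all() of its own. Searching row by row, something like the sketch below, seems to be what's needed, but I'm not sure how to fit it into my loops:

links = [f"http://www.espn.com{a['href']}"
         for row in data              # data is a ResultSet, i.e. a list of <tr> tags
         for a in row.find_all('a')   # so search each row individually
         if re.findall('^/mma/', a['href'])]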
How can I add this to my initial code, so that I collect each fighter's name and country on the original page, follow the fighter's link to collect the data shown above, and then do that for every fighter on the page?
Answer 0 (score: 1)
Check out this solution. I don't have much time right now, but I'll take another look as soon as I'm free. The code below does the main crawl; the only thing left for you to do is pull the data you want from each target page. The script grabs every fighter link from each paginated page (a through z) and then collects the name from each target page.
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "http://www.espn.com/mma/fighters?search={}"

for linknum in [chr(i) for i in range(ord('a'), ord('z') + 1)]:
    r = requests.get(url.format(linknum))
    soup = BeautifulSoup(r.text, 'html.parser')
    for links in soup.select(".tablehead a[href*='id']"):
        res = requests.get(urljoin(url, links.get("href")))
        sauce = BeautifulSoup(res.text, "lxml")
        title = sauce.select_one(".player-bio h1").text
        print(title)
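One detail worth noting in this script: urljoin() resolves each relative href against the base url, replacing the path and dropping the ?search={} part, so the listing url can double as the base. A quick illustration (the fighter id here is made up):

from urllib.parse import urljoin

base = "http://www.espn.com/mma/fighters?search={}"
# an absolute path in the href replaces the base url's path and query
print(urljoin(base, "/mma/fighter/_/id/12345"))
# -> http://www.espn.com/mma/fighter/_/id/12345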
Answer 1 (score: 0)
import requests, re
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import pandas as pd

url = "http://www.espn.com/mma/fighters?search={}"

titleList = []
countryList = []
stanceList = []
reachList = []
ageList = []
weightClassList = []
trainingCenterList = []
winsList = []
losesList = []
drawsList = []
tkosList = []
subsList = []

# steps through the paginated search pages, one letter (a-z) at a time
for linknum in [chr(i) for i in range(ord('a'), ord('z') + 1)]:
    r = requests.get(url.format(linknum))
    soup = BeautifulSoup(r.text, 'html.parser')
    # a[href*='id'] selects every anchor whose href contains 'id'
    for links in soup.select(".tablehead a[href*='id']"):
        # urljoin combines the base url with the relative href to build the fighter's url
        res = requests.get(urljoin(url, links.get("href")))
        sauce = BeautifulSoup(res.text, "lxml")
        try:
            title = sauce.select_one(".player-bio h1").text
        except AttributeError:
            title = ""
        try:
            country = sauce.find('span', text='Country').next_sibling
        except AttributeError:
            country = ""
        try:
            stance = sauce.find('span', text='Stance').next_sibling
        except AttributeError:
            stance = ""
        try:
            reach = sauce.find('span', text='Reach').next_sibling
        except AttributeError:
            reach = ""
        try:
            age = sauce.find('span', text='Birth Date').next_sibling[-3:-1]
        except AttributeError:
            age = ""
        try:
            weightClass = sauce.find('li', attrs={'class': 'first'}).text
        except AttributeError:
            weightClass = ""
        try:
            trainingCenter = sauce.find('li', attrs={'class': 'last'}).text
        except AttributeError:
            trainingCenter = ""
        try:
            wins = sauce.find('table', attrs={'class': 'header-stats'})('td')[0].text
        except AttributeError:
            wins = ""
        try:
            loses = sauce.find('table', attrs={'class': 'header-stats'})('td')[1].text
        except AttributeError:
            loses = ""
        try:
            draws = sauce.find('table', attrs={'class': 'header-stats'})('td')[2].text
        except AttributeError:
            draws = ""
        try:
            tkos = sauce.find_all('table', attrs={'class': 'header-stats'})[1]('td')[0].text
        except AttributeError:
            tkos = ""
        try:
            subs = sauce.find_all('table', attrs={'class': 'header-stats'})[1]('td')[1].text
        except AttributeError:
            subs = ""
        titleList.append(title)
        countryList.append(country)
        stanceList.append(stance)
        reachList.append(reach)
        ageList.append(age)
        weightClassList.append(weightClass)
        trainingCenterList.append(trainingCenter)
        winsList.append(wins)
        losesList.append(loses)
        drawsList.append(draws)
        tkosList.append(tkos)
        subsList.append(subs)

df = pd.DataFrame()
df['title'] = titleList
df['country'] = countryList
df['stance'] = stanceList
df['reach'] = reachList
df['age'] = ageList
df['weightClass'] = weightClassList
df['trainingCenter'] = trainingCenterList
df['wins'] = winsList
df['loses'] = losesList
df['draws'] = drawsList
df['tkos'] = tkosList
df['subs'] = subsList
df.to_csv('MMA Fighters', encoding='utf-8')
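A small design note on the script above: the twelve parallel lists can be collapsed into a single list of dicts, which keeps each fighter's fields together and builds the same DataFrame in one call. A minimal sketch, assuming the same selectors as above (only the first field is shown; the remaining try/except blocks follow the same pattern):

rows = []  # one dict per fighter instead of twelve parallel lists
for linknum in [chr(i) for i in range(ord('a'), ord('z') + 1)]:
    r = requests.get(url.format(linknum))
    soup = BeautifulSoup(r.text, 'html.parser')
    for link in soup.select(".tablehead a[href*='id']"):
        res = requests.get(urljoin(url, link.get("href")))
        sauce = BeautifulSoup(res.text, "lxml")
        row = {}
        try:
            row['title'] = sauce.select_one(".player-bio h1").text
        except AttributeError:
            row['title'] = ""
        # ...same try/except pattern for country, stance, reach, and the rest...
        rows.append(row)

df = pd.DataFrame(rows)  # column names come straight from the dict keys
df.to_csv('MMA Fighters', encoding='utf-8')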