Here is the code I am using:
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
}

leagues = ['L1', 'GB1', 'IT1', 'FR1', 'ES1']


def main(url):
    with requests.Session() as req:
        links = []
        for lea in leagues:
            print(f"Fetching Links from {lea}")
            r = req.get(url.format(lea), headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            link = [f"{url[:31]}{item.next_element.get('href')}" for item in soup.findAll(
                "td", class_="hauptlink no-border-links hide-for-small hide-for-pad")]
            links.extend(link)
        print(f"Collected {len(links)} Links")
        goals = []
        for num, link in enumerate(links):
            print(f"Extracting Page# {num + 1}")
            r = req.get(link, headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            target = soup.find("table", class_="items")
            pn = [pn.text for pn in target.select("div.rn_nummer")]
            pos = [pos.text for pos in target.findAll("td", class_=False)]
            name = [name.text for name in target.select("td.hide")]
            dob = [date.find_next("td").text for date in target.select("td.hide")]
            nat = [" / ".join([a.get("alt") for a in nat.find_all_next("td")[1] if a.get("alt")])
                   for nat in target.findAll("td", itemprop="athlete")]
            val = [val.get_text(strip=True) for val in target.select('td.rechts.hauptlink')]
            goal = zip(pn, pos, name, dob, nat, val)
            df = pd.DataFrame(goal, columns=[
                'position_number', 'position_description', 'name', 'dob', 'nationality', 'value'])
            goals.append(df)
        new = pd.concat(goals)
        new.to_csv("data.csv", index=False)


main("https://www.transfermarkt.co.uk/jumplist/startseite/wettbewerb/{}/plus/?saison_id=2019")
Answer (score: 1)
The problem is that some of your lists are empty, so your zip object is empty and creates an empty dataframe. This comes from not targeting the right tags directly (e.g. the name is under a <span> tag, not a <td> tag).
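To see why that silently produces an empty dataframe: zip stops at the shortest input, so a single empty list wipes out every row. A minimal sketch (the values and column names here are made up purely for illustration):

import pandas as pd

names = ['Player A', 'Player B']
values = []  # a selector that matched nothing leaves an empty list

# zip stops at the shortest input, so this yields no tuples at all
df = pd.DataFrame(zip(names, values), columns=['name', 'value'])
print(df)        # Empty DataFrame, Columns: [name, value]
print(len(df))   # 0

No error is raised, which is why the script appears to work but writes an empty CSV.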
Try this:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
}

leagues = ['L1', 'GB1', 'IT1', 'FR1', 'ES1']


def main(url):
    with requests.Session() as req:
        links = []
        for lea in leagues:
            print(f"Fetching Links from {lea}")
            r = req.get(url.format(lea), headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            link = [f"{url[:31]}{item.next_element.get('href')}" for item in soup.findAll(
                "td", class_="hauptlink no-border-links hide-for-small hide-for-pad")]
            links.extend(link)
        print(f"Collected {len(links)} Links")
        goals = []
        for num, link in enumerate(links):
            print(f"Extracting Page# {num + 1}")
            r = req.get(link, headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            target = soup.find("table", class_="items")
            # Get all the rows in the table
            rows = target.find_all('tr')
            # On this particular site the data we want is in every 3rd row.
            # This essentially skips the header row (<th>) and the 2nd and 3rd rows for each player.
            for row_num, row in enumerate(rows):
                if row.find('th') is None and row_num % 3 == 1:
                    pn = row.find('div', {'class': 'rn_nummer'}).text
                    pos = row.find('td', {'class': False}).text
                    # Find the <span> tag that has "hide" within its class attributes
                    name = row.find('span', {'class': re.compile('.*hide.*')}).text
                    # If the name is not found there, look in the <a> tag instead
                    if name == '':
                        name = row.find('a', {'class': re.compile('.*profil_tooltip.*')}).text
                    # Find the <span> tag with that class attribute, then grab the next <td> tag
                    dob = row.find('span', {'class': re.compile('.*hide.*')}).find_next('td', {'class': 'zentriert'}).text
                    # Pull out the number between the ( )
                    dob_num = dob[dob.find("(") + 1:dob.find(")")]
                    # Find the <span> tag with that class attribute, grab the next <td> tag,
                    # then the <img> tag whose title attribute contains the nationality
                    nat = row.find('span', {'class': re.compile('.*hide.*')}).find_next('td', {'class': 'zentriert'}).find_next('img')['title']
                    val = row.find('td', {'class': 'rechts hauptlink'}).text
                else:
                    continue
                # Place a dictionary of {column_name: value} into the list
                goals.append({'position_number': pn,
                              'position_description': pos,
                              'name': name,
                              'dob': dob,
                              'nationality': nat,
                              'value': val,
                              'dob_num': dob_num})
        # Create the dataframe from the list of rows (the list of {column_name: value} dictionaries)
        new = pd.DataFrame(goals)
        # Write to file
        new.to_csv("data.csv", index=False)


main("https://www.transfermarkt.co.uk/jumplist/startseite/wettbewerb/{}/plus/?saison_id=2019")
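Building a list of dictionaries and creating the dataframe once at the end also sidesteps the original zip problem: each row either has all seven values or is skipped entirely. After a run, a quick sanity check on the same data.csv the script writes might look like:

import pandas as pd

df = pd.read_csv("data.csv")
print(df.shape)          # (number_of_players, 7)
print(df.isna().sum())   # every column should report 0 missing values
print(df.head())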