No data in the CSV file after web scraping

Posted: 2021-03-19 16:05:58

Tags: python web-scraping beautifulsoup

Here is the code I am using:

import requests
from bs4 import BeautifulSoup
import pandas as pd


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
}

leagues = ['L1', 'GB1', 'IT1', 'FR1', 'ES1']


def main(url):
    with requests.Session() as req:
        links = []
        for lea in leagues:
            print(f"Fetching Links from {lea}")
            r = req.get(url.format(lea), headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            link = [f"{url[:31]}{item.next_element.get('href')}" for item in soup.findAll(
                "td", class_="hauptlink no-border-links hide-for-small hide-for-pad")]
            links.extend(link)

        print(f"Collected {len(links)} Links")
        goals = []
        for num, link in enumerate(links):
            print(f"Extracting Page# {num +1}")
            r = req.get(link, headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            target = soup.find("table", class_="items")
            pn = [pn.text for pn in target.select("div.rn_nummer")]
            pos = [pos.text for pos in target.findAll("td", class_=False)]
            name = [name.text for name in target.select("td.hide")]
            dob = [date.find_next(
                "td").text for date in target.select("td.hide")]
            nat = [" / ".join([a.get("alt") for a in nat.find_all_next("td")[1] if a.get("alt")]) for nat in target.findAll(
                "td", itemprop="athlete")]
            val = [val.get_text(strip=True)
                   for val in target.select('td.rechts.hauptlink')]
            goal = zip(pn, pos, name, dob, nat, val)
            df = pd.DataFrame(goal, columns=[
                              'position_number', 'position_description', 'name', 'dob', 'nationality', 'value'])
            goals.append(df)

        new = pd.concat(goals)
        new.to_csv("data.csv", index=False)


main("https://www.transfermarkt.co.uk/jumplist/startseite/wettbewerb/{}/plus/?saison_id=2019")

1 answer:

Answer 0 (score: 1):

The problem is that some of your lists are empty, so your zip object is empty and produces an empty DataFrame. This comes down to the fact that you are not selecting the right tags directly (e.g., the name sits inside a <span> tag, not directly in a <td>).
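
A quick way to see this failure mode, using placeholder lists rather than the site's real data: zip() stops at the shortest iterable, so a single empty list means zero rows, and to_csv() then writes nothing but the header.

import pandas as pd

pn = ['1', '2', '3']   # this selector found matches
name = []              # this selector found nothing, so the list is empty
goal = zip(pn, name)   # zip stops at the shortest iterable -> no rows
df = pd.DataFrame(goal, columns=['position_number', 'name'])
print(df.empty)        # True, so to_csv() would write only the header row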

Try this:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
}

leagues = ['L1', 'GB1', 'IT1', 'FR1', 'ES1']


def main(url):
    with requests.Session() as req:
        links = []
        for lea in leagues:
            print(f"Fetching Links from {lea}")
            r = req.get(url.format(lea), headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            link = [f"{url[:31]}{item.next_element.get('href')}" for item in soup.findAll(
                "td", class_="hauptlink no-border-links hide-for-small hide-for-pad")]
            links.extend(link)
    
        print(f"Collected {len(links)} Links")
        goals = []
        for num, link in enumerate(links):
            print(f"Extracting Page# {num +1}")
            r = req.get(link, headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            target = soup.find("table", class_="items")
            
            # Get all the rows in the table
            rows = target.find_all('tr')
            
            # On this particular site, the data we want is in every 3rd row.
            # This skips the header row (<th>) and the 2nd and 3rd rows that appear for each player
            for row_num, row in enumerate(rows):
                if row.find('th') is None and row_num % 3 == 1:
                    pn = row.find('div',{'class':'rn_nummer'}).text
                    pos = row.find('td', {'class':False}).text
                    
                    # Find the <span> tag that has "hide" within its class attributes
                    name = row.find('span',{'class':re.compile('.*hide.*')}).text
                    
                    # If the name not found there, we'll look in the <a> tag
                    if name == '':
                        name = row.find('a',{'class':re.compile('.*profil_tooltip.*')}).text
                    
                    # Finds the <span> tag with class attribute, then grab the next <td> tag
                    dob = row.find('span',{'class':re.compile('.*hide.*')}).find_next('td',{'class':'zentriert'}).text
                    
                    # Pull out the number between the parentheses ( )
                    dob_num = dob[dob.find("(")+1:dob.find(")")]
                    
                    # Finds the <span> tag with class attribute, then grab the next <td> tag, to get the <img> tag that contains the nationality in the title attribute
                    nat = row.find('span',{'class':re.compile('.*hide.*')}).find_next('td',{'class':'zentriert'}).find_next('img')['title']
                    val = row.find('td',{'class':'rechts hauptlink'}).text
                else:
                    continue
                
                # Append a dictionary of {column_name: value} pairs to the list
                goals.append({'position_number':pn, 
                              'position_description':pos, 
                              'name':name, 
                              'dob':dob, 
                              'nationality':nat,
                              'value':val,
                              'dob_num':dob_num})
                    
            
        # Create a dataframe from the list of rows (the list of {column_name: value} dictionaries)
        new = pd.DataFrame(goals)

        # Write to file
        new.to_csv("data.csv", index=False)
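
A note on the class filter used above: BeautifulSoup accepts a compiled regular expression as an attribute value, so re.compile('.*hide.*') matches any element whose class contains "hide". A toy illustration with made-up markup (not the site's real HTML):

from bs4 import BeautifulSoup
import re

html = '<td><span class="hide-for-small">Player Name</span></td>'
soup = BeautifulSoup(html, 'html.parser')
print(soup.find('span', {'class': re.compile('.*hide.*')}).text)  # Player Name

And as in the original script, the function still has to be called; reusing the URL template from the question:

main("https://www.transfermarkt.co.uk/jumplist/startseite/wettbewerb/{}/plus/?saison_id=2019")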