我想从这个起始链接的多个页面中抓取网址:https://www.transfermarkt.com/detailsuche/spielerdetail/suche/27403221/page/1
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
def extract(page):
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'}
url = f'https://www.transfermarkt.com/detailsuche/spielerdetail/suche/27403221/page/1'
r = requests.get(url, headers)
soup = BeautifulSoup(r.content, 'html.parser')
return soup
def trasform(soup):
links = soup.find_all('table', class_ = 'items')
for item in links.find(class_='spielprofil_tooltip'):
url = 'https://www.transfermarkt.com' + item.attrs['href']
urls = {
'url': url
}
urllist.append(urls)
return
urllist = []
for i in range(1,10,1):
print(f'Getting page, {i}')
c = extract(1)
trasform(c)
df = pd.DataFrame(urllist)
print(df.head())
df.to_csv('urlslist.csv', index=False)`
但是我得到了这个
AttributeError: ResultSet object has no attribute 'find'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?
答案 0 :(得分:0)
在第 14 行,而不是 find
for item in links.find(class_='spielprofil_tooltip'):
写find_all,
for item in links.find_all(class_='spielprofil_tooltip'):
这应该可以修复错误。