我有一个pandas数据框,其中一列名为artist。仅当新的艺术家名称不在此列中时,我才想添加新行。
我试过但没有成功:
# Quoted failed attempt, kept verbatim: the condition compares the entire
# DataFrame (every column) against `name.all(axis = 0)` instead of testing
# whether `name` already occurs in the 'artist' column.
# NOTE(review): `name` is presumably a plain string here, which has no
# `.all` method -- confirm against the scraping code.
if (all_data != name.all(axis = 0)):
all_data = all_data.append({'artist':str(name), 'netWorth':str(worth.strip())}, ignore_index = True)
这是我的所有代码:
def get_webpage(i, url, timeout=30):
    """Download one listing page and return it parsed as HTML.

    Args:
        i: Page number; its string form is appended to ``url``.
        url: Base URL that the page number is appended to.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. ``requests`` applies no timeout by default, so without this
            a stalled connection would block the whole scrape.

    Returns:
        A ``bs4.BeautifulSoup`` tree built from the response text using the
        ``'html.parser'`` backend.
    """
    response = requests.get(url + str(i), timeout=timeout)
    return bs4.BeautifulSoup(response.text, 'html.parser')
COLUMNS = ['artist', 'netWorth']
# Module-level accumulator: one row per distinct artist, filled by scrape().
all_data = pd.DataFrame(columns=COLUMNS)


def scrape(soup):
    """Add every not-yet-recorded artist found in ``soup`` to ``all_data``.

    Each ``<article class="thumb-wrap">`` element contributes one candidate
    row: the artist name is the text of its ``h3 > a`` link and the net worth
    is the stripped text of the ``div`` with class ``bc-networth``.

    A row is added only when the name is not already in the ``artist``
    column. The previous guard, ``(all_data['artist'] != name).any()``, was
    False for an empty frame (so nothing was ever added) and True as soon as
    any *other* artist was stored (so duplicates slipped through).

    Args:
        soup: Parsed page exposing ``find_all`` (e.g. a BeautifulSoup tree).

    Returns:
        None. ``all_data`` is extended in place.
    """
    for article in soup.find_all('article', class_='thumb-wrap'):
        artist_name = str(article.h3.a.text)
        net_worth = str(article.div.find('div', class_='bc-networth').text.strip())

        # Membership test: is there at least one stored row with this name?
        if all_data['artist'].eq(artist_name).any():
            continue

        # Rows are only ever added here with labels 0..n-1, so len(all_data)
        # is always the next unused label. Values follow the COLUMNS order.
        # In-place enlargement replaces DataFrame.append, which no longer
        # exists in pandas 2.x.
        all_data.loc[len(all_data)] = [artist_name, net_worth]
# (base URL, number of listing pages) for each category to scrape.
# Pages are numbered starting at 1.
CATEGORY_PAGES = [
    ('http://www.therichest.com/celebnetworth-category/celeb/singer/page/', 14),
    ('http://www.therichest.com/celebnetworth-category/celeb/musician/page/', 7),
]

for url, page_count in CATEGORY_PAGES:
    for i in range(1, page_count + 1):
        # scrape() records its results in all_data and returns nothing,
        # so there is no return value worth keeping.
        scrape(get_webpage(i, url))
答案 0 :(得分:1)
我认为只需要检查 `artist` 这一列:
if (all_data['artist'] != str(name)).all():
**示例**:
# Sample frame for the demonstration: two artists, 'a' and 'b'.
sample_rows = {
    'netWorth': [5, 3],
    'artist': ['a', 'b'],
}
all_data = pd.DataFrame(sample_rows)
print(all_data)
netWorth artist
0 5 a
1 3 b
name = 'a'
b = 10
# 'a' is already in the artist column, so "every stored name differs" is
# False and no row is added.
if (all_data['artist'] != str(name)).all():
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    all_data = pd.concat(
        [all_data, pd.DataFrame([{'artist': str(name), 'netWorth': b}])],
        ignore_index=True,
    )
print(all_data)
netWorth artist
0 5 a
1 3 b
name = 'd'
b = 10
# 'd' differs from every stored artist, so the condition holds and a new
# row is appended at the end.
if (all_data['artist'] != str(name)).all():
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    all_data = pd.concat(
        [all_data, pd.DataFrame([{'artist': str(name), 'netWorth': b}])],
        ignore_index=True,
    )
print(all_data)
netWorth artist
0 5 a
1 3 b
2 10 d