import csv
import requests
from bs4 import BeautifulSoup
url = 'http://www.showmeboone.com/sheriff/JailResidents/JailResidents.asp'
OUTPUT_PATH = "./prison-inmates.csv"
# Without a timeout, requests.get() can block forever on a stalled server.
REQUEST_TIMEOUT_SECONDS = 30


def clean_cell_text(text):
    """Return *text* with every space and newline character removed.

    NOTE(review): this removes spaces inside a value as well as padding
    around it (e.g. "a b" becomes "ab"); confirm that is intended.
    """
    return text.replace(' ', '').replace('\n', '')


def extract_rows(soup):
    """Collect the header row and the body rows of the table in *soup*.

    The header comes from the ``<th>`` cells of the ``tr.table-header``
    element; each body row comes from the ``<td>`` cells of a ``<tr>``
    inside ``tbody.stripe``. Cells whose cleaned text is exactly
    "Details" are dropped.

    Returns:
        A list of rows (header first), each row a list of strings.

    Raises:
        ValueError: if the header row or the table body is not found.
    """
    header = soup.find('tr', attrs={'class': 'table-header'})
    if header is None:
        raise ValueError("no <tr class='table-header'> found in the page")
    body = soup.find('tbody', attrs={'class': 'stripe'})
    if body is None:
        raise ValueError("no <tbody class='stripe'> found in the page")

    rows = [[clean_cell_text(cell.text) for cell in header.find_all('th')]]
    for row in body.find_all('tr'):
        cleaned = (clean_cell_text(cell.text) for cell in row.find_all('td'))
        rows.append([text for text in cleaned if text != "Details"])
    return rows


def main():
    """Download the page at ``url`` and write its table to OUTPUT_PATH as CSV."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    file_list = extract_rows(soup)
    # newline="" lets the csv module control line endings; the with-block
    # guarantees the file is flushed and closed.
    with open(OUTPUT_PATH, "w", newline="", encoding="utf-8") as outfile:
        csv.writer(outfile).writerows(file_list)


if __name__ == "__main__":
    main()
我有如下数据,想为每个 B 分组选择一行:
          A           B   C   D
0 01:00:00 2002-01-16 10 3
1 01:30:00 2002-01-16 10 -12
2 02:00:00 2002-01-16 10 7
3 01:00:00 2002-01-17 20 33
4 01:30:00 2002-01-17 20 -27
5 02:00:00 2002-01-17 20 12
# NOTE(review): stray assignment; `results` is not referenced anywhere else
# in the visible source — confirm whether it belongs to another snippet.
results = {}
每组取第一个满足下列条件之一的行(`D >= 0.5 * C` 或 `D <= -C`);若没有任何行满足条件,则取该组的最后一行:
输出应为:
A
我试过了:
A B C D
1 01:30:00 2002-01-16 10 -12
3 01:00:00 2002-01-17 20 33
答案 0 :(得分:2)
你的思路基本正确,只是需要改用 `groupby.apply`。另外,从你期望的输出来看,第一个条件似乎并不优先于第二个条件;在这种情况下,需要用"或"运算符 `|` 将两个条件合并:
def first_last(g):
    """Pick one row from the group *g*.

    A row qualifies when ``D >= 0.5 * C`` or ``D <= -C``. Neither
    condition takes priority over the other: the first row (in the
    group's order) that satisfies either one is returned.

    Args:
        g: DataFrame with columns ``C`` and ``D`` (one group of a groupby).

    Returns:
        The first qualifying row as a Series, or the last row of *g*
        when no row qualifies.
    """
    # Computed once because it is needed both for the any() check and
    # for the row selection.
    cond = g.D.ge(g.C.mul(0.5)) | g.D.le(g.C.mul(-1))
    if cond.any():
        return g[cond].iloc[0]
    else:
        return g.iloc[-1]
# Apply first_last to each group of `df` keyed by column B.
# NOTE(review): `df` is not defined in the visible source; it is presumably
# the DataFrame shown in the question — confirm.
df.groupby('B', as_index=False).apply(first_last)
# Expected output as given by the answer's author:
# A B C D
#0 01:30:00 2002-01-16 10 -12
#1 01:00:00 2002-01-17 20 33
或更短的版本:
def first_last(g):
    """Return the first row of *g* matching either condition, else its last row.

    A row matches when ``D >= 0.5 * C`` or ``D <= -C``; *g* is a DataFrame
    with columns ``C`` and ``D``.
    """
    cond = g.D.ge(g.C.mul(0.5)) | g.D.le(g.C.mul(-1))
    return g[cond].iloc[0] if cond.any() else g.iloc[-1]
# Apply the shorter first_last to each group of `df` keyed by column B.
# NOTE(review): `df` is not defined in the visible source — confirm its origin.
df.groupby('B', as_index=False).apply(first_last)