我有一个这样的数据框:
import requests
from bs4 import BeautifulSoup
import csv
def get_page(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``BeautifulSoup`` tree built with ``html.parser``, or ``None`` when
        the request fails or the server answers with a non-OK status.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report the failure the same way as a bad status instead of
        # aborting the whole run on one unreachable page.
        print('request failed:', exc)
        return None

    if not response.ok:
        print('server responded:', response.status_code)
        # Explicit None: previously no value was bound on this path.
        return None

    return BeautifulSoup(response.text, 'html.parser')  # 1. html, 2. parser
# (output key, id of the metadata <td>, placeholder used when it is missing)
# NOTE(review): 'Date_original' reads the "metadata_contri" cell and 'Format'
# reads "metadata_source"; kept exactly as before -- confirm these ids against
# the site's markup.
_DETAIL_CELLS = (
    ('Author', 'metadata_creato', 'Empty Author'),
    ('Abstract', 'metadata_descri', 'Empty Abstract'),
    ('Keywords', 'metadata_keywor', 'Empty Keywords'),
    ('Publishers', 'metadata_publis', 'Empty Publishers'),
    ('Date_original', 'metadata_contri', 'Empty Date original'),
    ('Date_digital', 'metadata_date', 'Empty Date digital'),
    ('Format', 'metadata_source', 'Empty Format'),
    ('Release-st', 'metadata_rights', 'Empty Realease Statement'),
    ('Library', 'metadata_librar', 'Empty Library'),
    ('Date_created', 'metadata_dmcreated', 'Empty date Created'),
)


def _node_text(find_node, default):
    """Return the stripped ``.text`` of the node from ``find_node()``, else ``default``."""
    try:
        return find_node().text.strip()
    except AttributeError:
        # A lookup returned None (element absent) or soup itself is None.
        return default


def get_detail_page(soup):
    """Extract the metadata fields of one item page.

    Args:
        soup: Parsed item page (as returned by ``get_page``). ``None`` is
            accepted and yields a record made only of placeholders.

    Returns:
        dict mapping the twelve field names ('Title', 'Collection', 'Author',
        'Abstract', 'Keywords', 'Publishers', 'Date_original', 'Date_digital',
        'Format', 'Release-st', 'Library', 'Date_created') to the stripped
        text found on the page, or to an "Empty ..." placeholder when the
        corresponding element is missing.
    """
    data = {
        # id=False is passed through unchanged -- presumably to pick the
        # heading that carries no id attribute; verify against bs4 docs.
        'Title': _node_text(
            lambda: soup.find('h1', class_="cdm_style", id=False),
            'Empty Title',
        ),
        # The collection name is the text of the link inside its cell.
        'Collection': _node_text(
            lambda: soup.find('td', id="metadata_collec").find('a'),
            'Empty Collection',
        ),
    }
    for key, cell_id, placeholder in _DETAIL_CELLS:
        # cell_id is bound as a default so each lambda keeps its own value.
        data[key] = _node_text(
            lambda cell_id=cell_id: soup.find('td', id=cell_id),
            placeholder,
        )
    return data
def get_index_data(soup):
    """Collect the item-page URLs linked from one search-result page.

    Args:
        soup: Parsed result page (as returned by ``get_page``). ``None`` is
            accepted and produces an empty list.

    Returns:
        list of absolute URLs (prefixed with ``http://cgsc.cdmhost.com``),
        one for every ``<a class="body_link_11">`` that has both an
        ``item_id`` attribute and an ``href``.
    """
    try:
        anchors = soup.find_all('a', class_="body_link_11")
    except AttributeError:
        # soup is None (download failed) or is not a parsed document.
        # Previously this path hit an unbound name at the return.
        return []

    detail_urls = []
    for link in anchors:
        attrs = getattr(link, 'attrs', None) or {}
        href = attrs.get('href')
        # All titles with valid links will have an item_id; a missing href
        # would otherwise produce the bogus URL "...comNone".
        if attrs.get('item_id') and href:
            detail_urls.append(f"http://cgsc.cdmhost.com{href}")
    return detail_urls
def write_csv(data, url, filename='1111_to_5555.csv'):
    """Append one scraped record to the output CSV file.

    Args:
        data: Record as produced by ``get_detail_page``; must contain the
            twelve field keys listed in ``columns`` below.
        url: Address of the item page; written as the last column.
        filename: Path of the CSV file to append to.

    Raises:
        KeyError: if ``data`` lacks one of the expected keys.
    """
    columns = (
        'Title', 'Collection', 'Author', 'Abstract', 'Keywords',
        'Publishers', 'Date_original', 'Date_digital', 'Format',
        'Release-st', 'Library', 'Date_created',
    )
    row = [data[column] for column in columns]
    row.append(url)
    # newline='' lets the csv module control line endings (avoids blank rows
    # on Windows and keeps embedded newlines intact); an explicit UTF-8
    # encoding keeps non-ASCII metadata from failing under a narrow locale.
    with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile).writerow(row)
def main():
    """Walk search-result pages 2 and 3 and store every linked item.

    For each result page the page number is printed, the item links are
    collected with ``get_index_data``, and each item page is fetched,
    parsed with ``get_detail_page`` and handed to ``write_csv``.
    """
    search_prefix = ("http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/")
    for page_number in (2, 3):
        print(page_number)
        index_soup = get_page(f"{search_prefix}{page_number}")
        for detail_url in get_index_data(index_soup):
            record = get_detail_page(get_page(detail_url))
            write_csv(record, detail_url)


if __name__ == '__main__':
    main()
我想按 `size` 列分组,然后用每个值减去其所在组的平均值。
# Sample frame: a group label per row plus the value to be centred.
df = pd.DataFrame({
    'size': list('AABBBCCCC'),
    'value': [2, 3, 1, 4, 5, 1, 0, 2, 3],
})
size value
A 2
A 3
B 1
B 4
B 5
C 1
C 0
C 2
C 3
我希望我的输出看起来像这样:
# Mean of 'value' within each 'size' group, returned as a one-column frame.
df.groupby('size')[['value']].mean()
mean value
size
A 2.5000
B 3.3333
C 1.5000
答案 0 :(得分:3)
您可以先用 `groupby` + `transform` 计算每组的平均值,然后再用 `sub` 做减法:
# Broadcast each group's mean back onto its rows, then centre the values.
group_mean = df.groupby('size')['value'].transform('mean')
df['value'] = df['value'] - group_mean
# Equivalent one-liner, as sammywemmy suggests:
#   df.groupby('size')['value'].transform(lambda x: x - x.mean())
print(df)
size value
0 A -0.500000
1 A 0.500000
2 B -2.333333
3 B 0.666667
4 B 1.666667
5 C -0.500000
6 C -1.500000
7 C 0.500000
8 C 1.500000