尽管如此,我找不到正确的方法来获得在pandas中运行的等效查询。
-- Fill in maxrating for every product that has none yet, using the highest
-- rating recorded for the same sku by a 'customer' source.
UPDATE product
   SET maxrating = (SELECT MAX(rating)
                      FROM rating
                     WHERE rating.sku = product.sku
                       AND source = 'customer'
                     GROUP BY sku)
 WHERE maxrating IS NULL;
pandas
# Products keyed by sku; a maxrating of 0 stands in for the SQL NULL.
product = pd.DataFrame({
    'sku': [1, 2, 3],
    'maxrating': [0, 0, 1],
})

# Individual ratings per sku, tagged with the source that issued them.
rating = pd.DataFrame({
    'sku': [1, 1, 2, 3, 3],
    'rating': [2, 5, 3, 5, 4],
    'source': ['retailer', 'customer', 'customer', 'retailer', 'customer'],
})

# Desired content of `product` once the placeholder values are filled in.
expected_result = pd.DataFrame({
    'sku': [1, 2, 3],
    'maxrating': [5, 3, 1],
})
SQL
-- Rebuild the product table with three rows; only the third has a maxrating.
-- NOTE(review): the insert omits sku, so this relies on the engine assigning
-- it automatically (e.g. SQLite's INTEGER PRIMARY KEY) - confirm the dialect.
DROP TABLE IF EXISTS product;
CREATE TABLE product (
    sku       INTEGER PRIMARY KEY,
    maxrating INT
);
INSERT INTO product (maxrating)
VALUES (NULL), (NULL), (1);

-- Rebuild the rating table: one row per (sku, rating, source).
DROP TABLE IF EXISTS rating;
CREATE TABLE rating (
    sku    INT,
    rating INT,
    source TEXT
);
INSERT INTO rating
VALUES (1, 2, 'retailer'),
       (1, 5, 'customer'),
       (2, 3, 'customer'),
       (2, 5, 'retailer'),
       (3, 3, 'retailer'),
       (3, 4, 'customer');

-- Fill in the missing maxrating values from the best 'customer' rating per sku.
UPDATE product
   SET maxrating = (SELECT MAX(rating)
                      FROM rating
                     WHERE rating.sku = product.sku
                       AND source = 'customer'
                     GROUP BY sku)
 WHERE maxrating IS NULL;

-- Show the result.
SELECT *
  FROM product;
怎么做?
答案 0 :(得分:4)
试试这个:
use std::cell::{Ref, RefCell};
/// Owns a buffer of values behind a `RefCell` and hands out shared borrows of it.
struct Retriever {
    // Own the data. The intent is for it to be dropped as soon as the references to it
    // go out of scope.
    data: RefCell<Vec<usize>>,
}

impl Retriever {
    /// Replaces the stored data with freshly created values and returns a shared borrow of it.
    ///
    /// `id` is accepted for the caller's benefit but is not used yet: the data is created
    /// "out of the blue" rather than looked up.
    ///
    /// # Panics
    ///
    /// Panics if a `Ref` returned by an earlier call is still alive, because storing the new
    /// data requires a mutable borrow of the cell.
    fn retrieve(&self, id: usize) -> Ref<'_, Vec<usize>> {
        // Placeholder until the data is actually looked up by id.
        let _ = id;
        // The mutable borrow is a temporary that is released at the end of this statement.
        // Keeping it in a local would leave it alive during `borrow()` below and make that
        // call panic with "already mutably borrowed".
        *self.data.borrow_mut() = vec![0, 1, 2, 3];
        self.data.borrow()
    }
}
/// Prints every item yielded by `iterator`, one per line.
fn consume_iterator<'a, TIterator>(iterator: TIterator)
where
    TIterator: Iterator<Item = &'a usize>,
{
    iterator.for_each(|item| println!("{}", item));
}
/// Retrieves the data for `id` through `ret` and returns an iterator over it.
///
/// NOTE(review): `IterWrapper` is not defined in the visible source, and the body returns
/// the result of `.iter()` called on the temporary `Ref` produced by `retrieve`. That is
/// the lifetime problem the comment below refers to - confirm the intended wrapper type
/// before relying on this function.
fn handler<'a>(ret: &'a Retriever, id: usize) -> IterWrapper<'a> {
// The handler now has a reference to the collection.
// So just call iter()? Nope. Lifetime issues.
ret.retrieve(id).iter()
}
/// Builds a `Retriever` with an empty buffer and prints whatever `handler` yields for id 0.
fn main() {
    let retriever = Retriever {
        data: RefCell::new(Vec::new()),
    };
    let items = handler(&retriever, 0);
    consume_iterator(items);
}
或者使用普通的布尔掩码(mask):
# `.ix` was removed in pandas 1.0, so `.loc` is used; only 'customer' ratings are considered, matching the SQL in the question.
In [220]: product.loc[product.maxrating == 0, 'maxrating'] = product.sku.map(rating[rating.source == 'customer'].groupby('sku')['rating'].max())
In [221]: product
Out[221]:
maxrating sku
0 5 1
1 3 2
2 1 3
答案 1 :(得分:2)
# Use NaN rather than 0 to mark products whose maxrating is still unknown.
product.maxrating = product.maxrating.replace(0, np.nan)
# Collect the sku of every product that has no maxrating.
missing = product.loc[product.maxrating.isnull(), 'sku']
# Compute the maximum rating per group and label the result 'maxrating'.
# NOTE(review): `missing` is a Series, so groupby presumably aligns it with `rating` by
# index label rather than matching on the sku column - verify the grouping is as intended.
# NOTE(review): passing a dict to SeriesGroupBy.agg to rename the output was deprecated in
# pandas 0.20 and reportedly raises SpecificationError in pandas >= 1.0 - confirm against
# the pandas version in use.
missingmax = rating.groupby(missing, as_index=False).rating.agg({'maxrating': 'max'})
# Write the computed values back into `product` in place.
# NOTE(review): DataFrame.update aligns on index labels, not on sku - verify the rows line up.
product.update(missingmax)
首先,让我们从空值而不是零开始
product.maxrating = product.maxrating.replace(0, np.nan)
product
然后找出缺失的 'sku',并在 groupby 中使用它们来计算 missingmax:
missing = product.loc[product.maxrating.isnull(), 'sku']
missingmax = rating.groupby(missing, as_index=False).rating.agg({'maxrating': 'max'})
missingmax
使用update
product.update(missingmax)
product
答案 2 :(得分:1)
您可以执行以下操作:
# Attach each product's current maxrating to every one of its rating rows.
In [127]: df = pd.merge(rating, product, on='sku')
# For products still at the placeholder value 0, take the highest rating per sku and label it 'maxrating'.
# NOTE(review): no `source == 'customer'` filter is applied here, unlike the SQL in the question.
In [128]: df1 = df[df['maxrating'] == 0].groupby('sku').agg({'rating': np.max}).reset_index().rename(columns={'rating': 'maxrating'})
# For products that already have a maxrating, keep a single (sku, maxrating) row each.
In [129]: df2 = df[df['maxrating'] != 0][['sku', 'maxrating']].drop_duplicates(keep='first')
# Stack both parts to obtain the final table.
In [131]: pd.concat([df1, df2])
Out[131]:
sku maxrating
0 1 5
1 2 3
3 3 1
In [132]: expected_result
Out[132]:
sku maxrating
0 1 5
1 2 3
2 3 1
基本上,我先合并两个数据帧,然后提取需要处理的行(即 maxrating 尚未设置的行),并计算它们实际的最高评分。
完成之后,我把结果与之前排除的行(已经有 maxrating 的行)拼接起来,就得到了预期结果。