Pandas 中与 SQL 的 update … where … group by 等价的写法

时间:2016-07-26 08:28:21

标签: python sql pandas dataframe

我找不到在 pandas 中实现下面这条 SQL 查询的等效写法。

-- For every product row whose maxrating is still NULL, set maxrating to the
-- highest rating with source = 'customer' recorded for the same sku.
update product
  set maxrating = (select max(rating)
                   from rating
                   where source = 'customer'
                     -- correlate the subquery with the outer product row
                     and product.sku = rating.sku
                   group by sku)
  -- rows that already have a maxrating are left untouched
  where maxrating is null;

Pandas

# Products keyed by sku. NOTE(review): a maxrating of 0 appears to stand in for
# the SQL NULL ("not yet set"), judging by expected_result below — confirm.
product = pd.DataFrame({'sku':[1,2,3],'maxrating':[0,0,1]})
# Individual ratings: one row per (sku, rating), tagged with the source of the rating.
rating = pd.DataFrame({'sku':[1,1,2,3,3],'rating':[2,5,3,5,4],'source':['retailer','customer','customer','retailer','customer']})
# Desired outcome: skus 1 and 2 receive their highest 'customer' rating (5 and 3),
# while sku 3 keeps its existing maxrating of 1.
expected_result = pd.DataFrame({'sku':[1,2,3],'maxrating':[5,3,1]})

SQL

-- Self-contained reproduction: rebuild both tables, seed them, run the update,
-- then show the resulting product rows.
drop table if exists product;
create table product(sku integer primary key, maxrating int);
-- NOTE(review): sku is not supplied here; this presumably relies on the integer
-- primary key being assigned automatically (1, 2, 3) — confirm for your engine.
insert into product(maxrating) values(null),(null),(1);
drop table if exists rating; create table rating(sku int, rating int, source text);
-- Columns are (sku, rating, source).
insert into rating values(1,2,'retailer'),(1,5,'customer'),(2,3,'customer'),(2,5,'retailer'),(3,3,'retailer'),(3,4,'customer');
-- Fill in maxrating only where it is NULL, using the highest 'customer' rating
-- for the same sku.
update product
  set maxrating = (select max(rating)
                   from rating
                   where source = 'customer'
                     and product.sku = rating.sku
                   group by sku)
  where maxrating is null;
-- Inspect the result.
select *
from product;

怎么做?

3 个答案:

答案 0 :(得分:4)

试试这个:

use std::cell::{Ref, RefCell};

/// Owns a buffer of retrieved values behind a `RefCell` and hands out shared
/// borrows of it.
struct Retriever {
    // Own the data. But I want it to be dropped as soon as the references to it go out of scope.
    data: RefCell<Vec<usize>>,
}

impl Retriever {
    /// Replaces the internal buffer with freshly "retrieved" data and returns a
    /// shared borrow of it.
    ///
    /// `id` is accepted but not used yet: the buffer is always filled with
    /// `[0, 1, 2, 3]`.
    ///
    /// # Panics
    ///
    /// Panics if a `Ref` or `RefMut` obtained from `data` earlier is still alive,
    /// because the buffer has to be borrowed mutably to be replaced.
    fn retrieve<'a>(&'a self, id: usize) -> Ref<'a, Vec<usize>> {
        // `id` does not select anything yet; silence the unused-variable lint.
        let _ = id;
        // Create data out of the blue (or disk, or memory, or network).
        // The `RefMut` guard is a temporary that is released at the end of this
        // statement. Keeping it in a named local would leave the cell mutably
        // borrowed and make the `borrow()` below panic.
        *self.data.borrow_mut() = vec![0, 1, 2, 3];
        // Now the data is stored internally and a reference to it can be supplied.
        self.data.borrow()
    }
}

/// Drains `iterator`, printing each yielded `usize` on its own line of
/// standard output.
fn consume_iterator<'a, TIterator>(iterator: TIterator)
where
    TIterator: Iterator<Item = &'a usize>,
{
    iterator.for_each(|value| println!("{}", value));
}


/// Intended to hand back an iterator over the data retrieved for `id`.
///
/// NOTE(review): this does not compile as written. `IterWrapper` is not defined
/// anywhere in the visible source, and `ret.retrieve(id)` produces a temporary
/// `Ref` guard that is dropped when this function returns, so an iterator
/// borrowing from that guard cannot be returned to the caller.
fn handler<'a>(ret: &'a Retriever, id: usize) -> IterWrapper<'a> {
    // handle_request now has a reference to the collection
    // So just call iter()? Nope. Lifetime issues.
    ret.retrieve(id).iter()        
}

/// Entry point: builds a `Retriever` around an empty buffer, asks `handler`
/// for the items of id 0 and prints them via `consume_iterator`.
fn main() {
    let store = Retriever {
        data: RefCell::new(Vec::new()),
    };
    let items = handler(&store, 0);
    consume_iterator(items)
}

或者使用普通的布尔掩码(mask):

In [220]: product.ix[product.maxrating == 0, 'maxrating'] = product.sku.map(rating.groupby('sku')['rating'].max())

In [221]: product
Out[221]:
   maxrating  sku
0          5    1
1          3    2
2          1    3

答案 1 :(得分:2)

完整代码如下:

# Start from nulls instead of zeros: turn every maxrating of 0 into NaN.
product.maxrating = product.maxrating.replace(0, np.nan)
# The 'sku' values of the product rows whose maxrating is null.
missing = product.loc[product.maxrating.isnull(), 'sku']
# Max of 'rating' per group, with the result column named 'maxrating'.
# NOTE(review): the grouping key is the `missing` Series itself rather than the
# 'sku' column, and there is no source == 'customer' filter as in the SQL —
# confirm that this matches the intended query.
missingmax = rating.groupby(missing, as_index=False).rating.agg({'maxrating': 'max'})
# Write the computed values back into product (modifies product in place).
product.update(missingmax)

首先,让我们从空值而不是零开始

product.maxrating = product.maxrating.replace(0, np.nan)
product

enter image description here

然后找出 maxrating 缺失的 'sku',并在 groupby 中使用它们来计算 missingmax

missing = product.loc[product.maxrating.isnull(), 'sku']
missingmax = rating.groupby(missing, as_index=False).rating.agg({'maxrating': 'max'})

missingmax

enter image description here

使用update

product.update(missingmax)
product

enter image description here

答案 2 :(得分:1)

您可以执行以下操作:

In [127]: df = pd.merge(rating, product, on='sku')

In [128]: df1 = df[df['maxrating'] == 0].groupby('sku').agg({'rating': np.max}).reset_index().rename(columns={'rating': 'maxrating'})

In [129]: df2 = df[df['maxrating'] != 0][['sku', 'maxrating']].drop_duplicates(keep='first')

In [131]: pd.concat([df1, df2])
Out[131]: 
   sku  maxrating
0    1          5
1    2          3
3    3          1

In [132]: expected_result
Out[132]: 
   sku  maxrating
0    1          5
1    2          3
2    3          1

基本上,我先合并两个数据帧,然后提取需要处理的行(即还没有 maxrating 的行),并求出它们实际的最高评分。

完成之后,我把结果与之前排除的行(即已有 maxrating 的行)拼接起来,就得到了预期结果。