我正在尝试使用 pandas 处理一个数据科学问题。我的数据集包含以下列:“country”、“conversion”、“test”、“userid”等。“country”列中大约有 10 个国家/地区。“test”列取值为 0 和 1,表示两种测试分组:0 为对照组,1 为实验组。“conversion”列同样取值为 0 和 1,表示该用户是否已转化。
我想按国家分组,并为每个组计算 p 值,以及 test == 0 和 test == 1 两部分各自的 conversion 均值。我尝试使用下面的函数,但它抛出了错误:“TypeError: cannot concatenate 'str' and 'float' objects”。有人能解释一下原因吗?
from jpype import *
# Start a JVM at the default JVM path with "-ea", a 1024m initial / 4096m
# maximum heap, and a class path listing the two jars under ./jars.
# NOTE(review): the class path entries are joined with ':' — presumably this
# targets a Unix-like platform; confirm if it must also run on Windows.
startJVM(getDefaultJVMPath(), "-ea", '-Xms1024m', '-Xmx4096m', '-Djava.class.path=./jars/bdms-chunkjava-lib-1.0.9-SNAPSHOT.jar:./jars/bdms-ldfjava-lib-1.0.9-SNAPSHOT.jar')
# Look up the Java class by its fully qualified name and bind it to a Python name.
LDF1File = JClass('aero.blue.bdms.ldf.stream.LDF1File')
# Shut the JVM down again; LDF1File is not used between lookup and shutdown here.
shutdownJVM()
我使用的代码如下(完整的错误消息见代码之后):
def f(x):
    """Summarise one group of rows: a t-test result plus per-arm conversion means.

    This is the version from the question; it is the code the traceback
    further down was produced from, so it is kept as posted.

    x: the rows of one group (it is indexed with ``.loc`` and has ``test``
       and ``conversion`` columns).
    Returns a Series with keys 'p_value', 'conversion_test' and
    'conversion_control'.
    """
    # Rows of the group with test == 0 (all columns are kept).
    control = x.loc[(x.test==0)]
    # Rows of the group with test == 1 (all columns are kept).
    test = x.loc[(x.test==1)]
    # The traceback's TypeError is raised from this call. Both arguments are
    # whole row slices rather than a single numeric column — presumably the
    # string columns (e.g. country) are what cannot be summed.
    # NOTE(review): per the scipy docs ttest_ind returns (statistic, pvalue),
    # so [0] is the t statistic, not the p-value — confirm which is wanted.
    p_value = stats.ttest_ind(control,test)[0]
    # Mean of the conversion column within each arm.
    control_mean = control['conversion'].mean()
    test_mean = test['conversion'].mean()
    return pd.Series({'p_value': p_value, 'conversion_test': test_mean, 'conversion_control': control_mean})
# Call f once per 'country' group of data1 and collect the results.
bycountry = data1.groupby('country').apply(f)
# NOTE(review): level='None' is the string 'None', not the None object, so it
# names an index level called "None" — presumably a plain reset_index() (or
# level='country') was intended; verify.
bycountry = bycountry.reset_index(level='None')
# Bare expression: shows the result when run as the last line of a notebook cell.
bycountry
完整的错误消息(原帖此处还标注了 df.dtypes 的输出,但该输出未包含在内):
TypeError Traceback (most recent call last)
<ipython-input-495-bd6227878520> in <module>()
7 return pd.Series({'p_value': p_value, 'conversion_test': test_mean, 'conversion_control': control_mean})
8
----> 9 bycountry = data1.groupby("country").apply(f)
10 bycountry = bycountry.reset_index(level='None')
11 bycountry
C:\Users\SnehaPriya\Anaconda2\lib\site-packages\pandas\core\groupby.pyc in apply(self, func, *args, **kwargs)
649 # ignore SettingWithCopy here in case the user mutates
650 with option_context('mode.chained_assignment', None):
--> 651 return self._python_apply_general(f)
652
653 def _python_apply_general(self, f):
C:\Users\SnehaPriya\Anaconda2\lib\site-packages\pandas\core\groupby.pyc in _python_apply_general(self, f)
653 def _python_apply_general(self, f):
654 keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 655 self.axis)
656
657 return self._wrap_applied_output(
C:\Users\SnehaPriya\Anaconda2\lib\site-packages\pandas\core\groupby.pyc in apply(self, f, data, axis)
1525 # group might be modified
1526 group_axes = _get_axes(group)
-> 1527 res = f(group)
1528 if not _is_indexed_like(res, group_axes):
1529 mutated = True
C:\Users\SnehaPriya\Anaconda2\lib\site-packages\pandas\core\groupby.pyc in f(g)
645 @wraps(func)
646 def f(g):
--> 647 return func(g, *args, **kwargs)
648
649 # ignore SettingWithCopy here in case the user mutates
<ipython-input-495-bd6227878520> in f(x)
2 control = x.loc[(x.test==0)]
3 test = x.loc[(x.test==1)]
----> 4 p_value = stats.ttest_ind(control,test)[0]
5 control_mean = control['conversion'].mean()
6 test_mean = test['conversion'].mean()
C:\Users\SnehaPriya\Anaconda2\lib\site-packages\scipy\stats\stats.pyc in ttest_ind(a, b, axis, equal_var, nan_policy)
3865 return Ttest_indResult(np.nan, np.nan)
3866
-> 3867 v1 = np.var(a, axis, ddof=1)
3868 v2 = np.var(b, axis, ddof=1)
3869 n1 = a.shape[axis]
C:\Users\SnehaPriya\Anaconda2\lib\site-packages\numpy\core\fromnumeric.pyc in var(a, axis, dtype, out, ddof, keepdims)
3098
3099 return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
-> 3100 keepdims=keepdims)
C:\Users\SnehaPriya\Anaconda2\lib\site-packages\numpy\core\_methods.pyc in _var(a, axis, dtype, out, ddof, keepdims)
89 # Note that if dtype is not of inexact type then arraymean will
90 # not be either.
---> 91 arrmean = umr_sum(arr, axis, dtype, keepdims=True)
92 if isinstance(arrmean, mu.ndarray):
93 arrmean = um.true_divide(
TypeError: cannot concatenate 'str' and 'float' objects
答案 0 :(得分:0)
def f(x):
    """Summarise one country's A/B-test rows.

    Runs an independent two-sample t-test on the ``conversion`` values of the
    control arm (``test == 0``) against the experiment arm (``test == 1``) and
    reports the p-value together with each arm's mean conversion.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of a single group; must contain ``test`` and ``conversion``
        columns. Other columns (including string columns) are ignored.

    Returns
    -------
    pandas.Series
        Keys ``'p_value'``, ``'conversion_test'`` and ``'conversion_control'``.
    """
    # Select only the numeric conversion column for each arm; passing whole
    # row slices to ttest_ind is what raised the str/float TypeError.
    control = x.loc[x.test == 0, 'conversion']
    test = x.loc[x.test == 1, 'conversion']
    # ttest_ind returns (statistic, pvalue); index 0 would be the t statistic,
    # so take the p-value explicitly.
    p_value = stats.ttest_ind(control, test).pvalue
    return pd.Series({
        'p_value': p_value,
        'conversion_test': test.mean(),
        'conversion_control': control.mean(),
    })
这样就解决了!再次感谢 @juanpa.arrivillaga!