我正在编写一个代码来查找6个向量的均值和标准差,每个向量包含8000个元素。我想知道我是否可以使用CUDA来加速操作。我可以想到如何使用CUDA找到平均值,但我无法理解如何使用CUDA计算标准偏差。有人可以帮我吗?
答案 0 :(得分:6)
下面是一个 Thrust 示例,它在一次归约中同时计算多个汇总统计量,包括均值(mean)和标准差(std. deviation)。
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/extrema.h>
#include <cmath>
#include <iostream>
#include <iterator>
#include <limits>
#include <string>
// This example computes several statistical properties of a data
// series in a single reduction. The algorithm is described in detail here:
// http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
//
// Thanks to Joseph Rhoads for contributing this example
// Accumulator for the moments and other statistical properties of the
// samples encountered so far.
//
//   n        number of samples (stored as T so it can be used directly in
//            the arithmetic below)
//   min/max  smallest / largest sample
//   mean     arithmetic mean of the samples
//   M2       accumulated second-moment sum (the variance is M2 / (n - 1))
//   M3, M4   accumulated third- and fourth-moment sums used by
//            skewness() and kurtosis()
template <typename T>
struct summary_stats_data
{
    T n;
    T min;
    T max;
    T mean;
    T M2;
    T M3;
    T M4;

    // Reset to the identity element of the reduction: no samples, zero
    // moments, and min/max set so that any real sample replaces them.
    void initialize()
    {
        n = mean = M2 = M3 = M4 = 0;
        min = std::numeric_limits<T>::max();
        // lowest(), not min(): for floating-point T, min() is the smallest
        // POSITIVE normalized value, so a series containing only negative
        // samples would otherwise report a wrong maximum.
        max = std::numeric_limits<T>::lowest();
    }

    // Sample variance (divides by n - 1); divides by zero when n == 1.
    T variance() const { return M2 / (n - 1); }

    // Population variance (divides by n).
    T variance_n() const { return M2 / n; }

    // Skewness computed as sqrt(n) * M3 / M2^1.5.
    T skewness() const { return std::sqrt(n) * M3 / std::pow(M2, (T) 1.5); }

    // Kurtosis computed as n * M4 / M2^2 (no "-3" excess correction).
    T kurtosis() const { return n * M4 / (M2 * M2); }
};
// summary_stats_unary_op is a functor that lifts a single value x into a
// summary_stats_data describing the one-element series {x}.
template <typename T>
struct summary_stats_unary_op
{
    __host__ __device__
    summary_stats_data<T> operator()(const T& x) const
    {
        summary_stats_data<T> single;

        // exactly one sample, which is its own minimum, maximum and mean
        single.n = 1;
        single.min = single.max = single.mean = x;

        // the higher-order moment sums all start at zero
        single.M2 = single.M3 = single.M4 = 0;

        return single;
    }
};
// summary_stats_binary_op is a functor that accepts two summary_stats_data
// structs, each summarizing one part of the data, and returns a new
// summary_stats_data that summarizes both parts together, i.e. all
// values that have been aggregated so far. Being floating-point
// arithmetic, the result is an approximation of the exact statistics.
template <typename T>
struct summary_stats_binary_op
: public thrust::binary_function<const summary_stats_data<T>&,
const summary_stats_data<T>&,
summary_stats_data<T> >
{
// Combine the partial summaries x and y into one summary.
__host__ __device__
summary_stats_data<T> operator()(const summary_stats_data<T>& x, const summary_stats_data <T>& y) const
{
summary_stats_data<T> result;
// precompute some common subexpressions:
// the combined sample count and its powers ...
T n = x.n + y.n;
T n2 = n * n;
T n3 = n2 * n;
// ... and the difference between the two partial means and its powers
T delta = y.mean - x.mean;
T delta2 = delta * delta;
T delta3 = delta2 * delta;
T delta4 = delta3 * delta;
// Basic number of samples (n), min, and max
result.n = n;
result.min = thrust::min(x.min, y.min);
result.max = thrust::max(x.max, y.max);
// combined mean: x's mean moved towards y's mean in proportion to y's
// share of the samples
result.mean = x.mean + delta * y.n / n;
// M2: both partial sums plus a term for the offset between the means
result.M2 = x.M2 + y.M2;
result.M2 += delta2 * x.n * y.n / n;
// M3: both partial sums, a delta^3 term, and a cross term built from
// the partial M2 values
result.M3 = x.M3 + y.M3;
result.M3 += delta3 * x.n * y.n * (x.n - y.n) / n2;
result.M3 += (T) 3.0 * delta * (x.n * y.M2 - y.n * x.M2) / n;
// M4: both partial sums, a delta^4 term, and cross terms built from
// the partial M2 and M3 values
result.M4 = x.M4 + y.M4;
result.M4 += delta4 * x.n * y.n * (x.n * x.n - x.n * y.n + y.n * y.n) / n3;
result.M4 += (T) 6.0 * delta2 * (x.n * x.n * y.M2 + y.n * y.n * x.M2) / n2;
result.M4 += (T) 4.0 * delta * (x.n * y.M3 - y.n * x.M3) / n;
return result;
}
};
// Writes "<name>: " to std::cout, then every element of [first, last)
// followed by a single space, then a newline.
template <typename Iterator>
void print_range(const std::string& name, Iterator first, Iterator last)
{
    typedef typename std::iterator_traits<Iterator>::value_type value_type;

    // output iterator that streams each element to std::cout plus a space
    std::ostream_iterator<value_type> sink(std::cout, " ");

    std::cout << name << ": ";
    thrust::copy(first, last, sink);
    std::cout << "\n";
}
int main(void)
{
typedef float T;
// initialize host array
T h_x[] = {4, 7, 13, 16};
// transfer to device
thrust::device_vector<T> d_x(h_x, h_x + sizeof(h_x) / sizeof(T));
// setup arguments
summary_stats_unary_op<T> unary_op;
summary_stats_binary_op<T> binary_op;
summary_stats_data<T> init;
init.initialize();
// compute summary statistics
summary_stats_data<T> result = thrust::transform_reduce(d_x.begin(), d_x.end(), unary_op, init, binary_op);
std::cout <<"******Summary Statistics Example*****"<<std::endl;
print_range("The data", d_x.begin(), d_x.end());
std::cout <<"Count : "<< result.n << std::endl;
std::cout <<"Minimum : "<< result.min <<std::endl;
std::cout <<"Maximum : "<< result.max <<std::endl;
std::cout <<"Mean : "<< result.mean << std::endl;
std::cout <<"Variance : "<< result.variance() << std::endl;
std::cout <<"Standard Deviation : "<< std::sqrt(result.variance_n()) << std::endl;
std::cout <<"Skewness : "<< result.skewness() << std::endl;
std::cout <<"Kurtosis : "<< result.kurtosis() << std::endl;
return 0;
}
答案 1 :(得分:2)
这超出了我的专业领域,但是存在用于计算标准差的单遍(single-pass)迭代算法,可以将其改写为归约(reduction)。特别是,我想到的是 Welford 算法,见 Knuth《TAOCP》第 2 卷。一个缺点是它在每一步都需要做一次除法,但这一开销很可能与必需的内存访问开销相互平衡。该算法可用的在线参考似乎是:
答案 2 :(得分:0)
我已经在CUDA中解决了这个用于数据挖掘的问题。我没有使用任何库。但是,它给了我很好的结果。问题是找到128 * 100万个样本的标准偏差和平均值。这就是我所做的。
我的设备有16KB的共享内存,而且我使用的是浮点数(float),因此共享内存可容纳4,000个元素。我的设备每个块的最大线程数是512,所以我可以有8个块。如果把16KB分给8个块,每个块得到约2KB(约2,000字节,即每个线程1个浮点数)。一般来说,这些数字不会恰好匹配。如果你有更好的设备,需要重新计算一遍。
要查找标准差,每个块有512个元素。你可以使用一个线程找到square(element-mean)。
接下来的挑战是添加这个并找到这些元素的总和。尝试使用相同的方法找到平均值。适用于512个元素。将结果复制到全局内存。
迭代。找到结果的平方根。
PS:相应地进行规划,以便全局内存调用最小化。均值和标准差经常从内存中调用数据。
答案 3 :(得分:0)
答案很晚,但我在代码中使用thrust::transform_reduce
解决了这个问题(在GTX 1070上使用100k浮点数进行测试):
#include <thrust/transform_reduce.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <functional>
#include <cmath>
/*
 * @struct varianceshifteop
 * @brief a unary function that shifts an input value by a fixed mean
 *        and returns the square of the shifted value, (data - mean)^2
 */
struct varianceshifteop
{
    // Typedefs formerly inherited from std::unary_function<float, float>.
    // That base class was deprecated in C++11 and removed in C++17, so
    // they are declared directly to keep the same nested names available.
    typedef float argument_type;
    typedef float result_type;

    // @param m mean that is subtracted from every input value
    varianceshifteop(float m)
        : mean(m)
    { /* no-op */ }

    const float mean;

    // @param data one input value
    // @return the squared distance of data from the stored mean
    __device__ float operator()(float data) const
    {
        // plain multiplication: squaring does not need the general pow()
        const float shifted = data - mean;
        return shifted * shifted;
    }
};
// Two-pass computation of the sample standard deviation: the first
// reduction yields the mean, the second sums the squared deviations.
// Needs at least two samples: with a single sample the divisor
// (data.size() - 1) is zero.
int main(int argc, char** argv)
{
    // placeholder from the original answer: fill with the input samples
    thrust::device_vector<float> data{ ... };

    // sum elements and divide by the number of elements
    float mean = thrust::reduce(
        data.cbegin(),
        data.cend(),
        0.0f,
        thrust::plus<float>()) / data.size();

    // shift elements by mean, square, and add them
    float variance = thrust::transform_reduce(
        data.cbegin(),
        data.cend(),
        varianceshifteop(mean),
        0.0f,
        thrust::plus<float>()) / (data.size() - 1);

    // standard dev is just a sqrt away; std::sqrt has a float overload,
    // whereas std::sqrtf is not declared by every standard library
    float stdv = std::sqrt(variance);

    return 0;
}