使用Altivec的SIMD:为什么两个向量相乘比两个向量相加更快?

时间:2017-05-30 23:37:18

标签: c performance simd altivec

我一直在用AltiVec实现基本的数学运算,以此为即将开始的项目学习SIMD。另外,为了了解它带来的性能收益,我记录了执行各个操作所需的时间,但遇到了一些奇怪的现象。

我做的第一件事是将两个向量相加以及相减,这都运行良好。接下来我做的是将两个向量相乘。但是,乘法竟然比加法更快——尽管根据我这款CPU的数据手册对所用指令的描述,加法所需的时钟周期比乘法更少。

我有两个数组,每个数组大10MB,并通过这两个例程运行它们:

import 'package:flutter/material.dart';


/// Application entry point: shows [MyApp] as the home screen of a [MaterialApp].
void main() {
  final app = MaterialApp(home: MyApp());
  runApp(app);
}



/// Demo screen: a scaffold whose app bar carries a two-tab [TabBar] in its
/// bottom slot.
class MyApp extends StatelessWidget {
  /// Wraps a text tab labelled [label] in a 200-high container.
  Widget _tabSlot(String label) {
    return Container(
      height: 200.0,
      child: Tab(text: label),
    );
  }

  @override
  Widget build(BuildContext context) {
    final tabBar = TabBar(
      tabs: [
        _tabSlot('hello'),
        _tabSlot('world'),
      ],
    );

    final appBar = AppBar(
      title: Text('Tabs Demo'),
      bottom: PreferredSize(
        preferredSize: Size(200.0, 200.0),
        child: Container(
          width: 200.0,
          child: tabBar,
        ),
      ),
    );

    // length must match the number of tabs handed to the TabBar above.
    return DefaultTabController(
      length: 2,
      child: Scaffold(
        appBar: appBar,
        // body: ...
      ),
    );
  }
}

在我的特定平台上,av_AddValues需要81毫秒才能处理完,av_MultiplyValues需要48毫秒才能处理完。(时间使用std::chrono::high_resolution_clock记录)

为什么乘法比加法需要更少的时间?

考虑到__vector类型始终处理16个字节的数据,我认为32位值相加与16位值相乘之间不应该产生差异。

我的第一个想法是,由于数字相加是一项非常简单的任务,CPU完成运算的速度快于从内存中取数据的速度;而在做乘法时,CPU一直忙于计算、不必等待那么久,因此这种取数延迟被掩盖了。

这是一个正确的假设吗?

完整代码:

// Adds two int32_t arrays element-wise with AltiVec and stores the sums in
// outputBuffer.
//
//   intArrayA, intArrayB  inputs, `size` elements each. They are read through
//                         vector pointers, so callers are expected to pass
//                         16-byte-aligned buffers (TestAdd uses memalign(64, ...)).
//   outputBuffer          receives `size` sums; written with vec_st, so it is
//                         expected to be 16-byte aligned as well.
//   size                  number of int32_t elements in each array.
//
// Elements left over when `size` is not a multiple of the vector width are
// processed by a scalar loop instead of being silently skipped.
void av_AddValues(int32_t* intArrayA, int32_t* intArrayB, int32_t* outputBuffer, int size)
{
  // Number of int32_t lanes held by one vector register.
  const int lanes = sizeof(__vector int32_t) / sizeof(int32_t);
  const int iterations = size / lanes;

  __vector int32_t* tempA = (__vector int32_t *) intArrayA;
  __vector int32_t* tempB = (__vector int32_t *) intArrayB;
  __vector int32_t* tempOut = (__vector int32_t *) outputBuffer;
  for(int i = 0; i < iterations; i++)
  {
    __vector int32_t sum = vec_add(*tempA, *tempB);
    vec_st(sum, 0, tempOut);

    tempA++;
    tempB++;
    tempOut++;
  }

  // Scalar tail for the last (size % lanes) elements. The addition is done in
  // unsigned arithmetic so that overflow wraps instead of being undefined.
  // NOTE(review): intended to match vec_add's wrap-around result - confirm.
  for(int k = iterations * lanes; k < size; k++)
  {
    outputBuffer[k] = static_cast<int32_t>(static_cast<uint32_t>(intArrayA[k]) +
                                           static_cast<uint32_t>(intArrayB[k]));
  }
}

  // Multiplies two int16_t arrays element-wise with AltiVec and stores the
  // 32-bit products in outputBuffer, in the same order as the inputs.
  //
  //   intArrayA, intArrayB  inputs, `size` elements each. They are read through
  //                         vector pointers, so callers are expected to pass
  //                         16-byte-aligned buffers (TestMultiply uses memalign(64, ...)).
  //   outputBuffer          receives `size` int32_t products; written with
  //                         vec_st, so it is expected to be 16-byte aligned.
  //   size                  number of int16_t elements in each input array.
  //
  // Elements left over when `size` is not a multiple of the vector width are
  // processed by a scalar loop instead of being silently skipped.
  void av_MultiplyValues(int16_t* intArrayA, int16_t* intArrayB, int32_t* outputBuffer, int size)
  {
    // Number of int16_t lanes held by one vector register.
    const int lanes = sizeof(__vector int16_t) / sizeof(int16_t);
    const int iterations = size / lanes;
    __vector int16_t* tempA = (__vector int16_t *) intArrayA;
    __vector int16_t* tempB = (__vector int16_t *) intArrayB;
    __vector int32_t* tempOut = (__vector int32_t *) outputBuffer;


    for(int i = 0; i < iterations; i++)
    {
      // Widening multiplies: one call covers the even-indexed element pairs,
      // the other the odd-indexed ones.
      __vector int32_t productEven = vec_mule(*tempA, *tempB);
      __vector int32_t productOdd = vec_mulo(*tempA, *tempB);

      // Interleave the even/odd products to restore the input element order.
      __vector int32_t mergedProductHigh = vec_mergeh(productEven, productOdd);
      __vector int32_t mergedProductLow = vec_mergel(productEven, productOdd);

      // One input vector yields two output vectors.
      vec_st(mergedProductHigh, 0, tempOut);
      tempOut++;
      vec_st(mergedProductLow, 0, tempOut);

      tempA++;
      tempB++;
      tempOut++;
    }

    // Scalar tail for the last (size % lanes) elements. A product of two
    // int16_t values always fits in int32_t, so this cannot overflow.
    for(int k = iterations * lanes; k < size; k++)
    {
      outputBuffer[k] = static_cast<int32_t>(intArrayA[k]) * static_cast<int32_t>(intArrayB[k]);
    }
  }

perf stat和perf记录的输出:

#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <limits>
#include <random>

#include <malloc.h>

#include <altivec.h>
#undef vector

// Forward declarations so that main() can appear before the definitions below.
void GenerateRandom16bitValues(int16_t* inputABuffer, int16_t* inputBBuffer, int32_t* outputBuffer, int size);
void GenerateRandom32bitValues(int32_t* inputABuffer, int32_t* inputBBuffer, int32_t* outputBuffer, int size);
void TestAdd();
void TestMultiply();
void av_AddValues(int32_t* intArrayA, int32_t* intArrayB, int32_t* outputBuffer, int size);
void av_MultiplyValues(int16_t* intArrayA, int16_t* intArrayB, int32_t* outputBuffer, int size);

// Program entry point: runs the addition benchmark, then the multiplication
// benchmark.
int main()
{
  TestAdd();
  TestMultiply();
  return 0;
}

// Fills the first `size` elements of both input arrays with random values
// drawn uniformly from the full int16_t range, and zeroes the first `size`
// elements of the output array. The generator is seeded from
// std::random_device, so the values differ from run to run.
void GenerateRandom16bitValues(int16_t* inputABuffer, int16_t* inputBBuffer, int32_t* outputBuffer, int size)
{
  std::random_device seedSource;
  std::mt19937 engine(seedSource());
  std::uniform_int_distribution<> fullRange(std::numeric_limits<int16_t>::min(),
                                            std::numeric_limits<int16_t>::max());

  int index = 0;
  while(index < size)
  {
    // A is drawn before B for every element.
    inputABuffer[index] = fullRange(engine);
    inputBBuffer[index] = fullRange(engine);
    outputBuffer[index] = 0;
    ++index;
  }
}

// Fills the first `size` elements of both input arrays with random values
// drawn uniformly from the full int32_t range, and zeroes the first `size`
// elements of the output array. The generator is seeded from
// std::random_device, so the values differ from run to run.
void GenerateRandom32bitValues(int32_t* inputABuffer, int32_t* inputBBuffer, int32_t* outputBuffer, int size)
{
  std::random_device seedSource;
  std::mt19937 engine(seedSource());
  std::uniform_int_distribution<> fullRange(std::numeric_limits<int32_t>::min(),
                                            std::numeric_limits<int32_t>::max());

  int index = 0;
  while(index < size)
  {
    // A is drawn before B for every element.
    inputABuffer[index] = fullRange(engine);
    inputBBuffer[index] = fullRange(engine);
    outputBuffer[index] = 0;
    ++index;
  }
}

// Benchmarks av_AddValues: allocates three 64-byte-aligned buffers of
// 10'485'760 int32_t elements, fills the inputs with random data, then runs
// the vector addition 20 times. Each run is timed, verified element by
// element, and reported on stdout in milliseconds.
void TestAdd()
{
    const int size = 10'485'760;
    const int bytes = size * sizeof(int32_t);

    int32_t* inputABuffer = static_cast<int32_t*>(memalign(64, bytes));
    int32_t* inputBBuffer = static_cast<int32_t*>(memalign(64, bytes));
    int32_t* outputBuffer = static_cast<int32_t*>(memalign(64, bytes));
    assert(inputABuffer != nullptr);
    assert(inputBBuffer != nullptr);
    assert(outputBuffer != nullptr);

    GenerateRandom32bitValues(inputABuffer, inputBBuffer, outputBuffer, size);

    for(int i = 0; i < 20; i++)
    {
      auto start = std::chrono::high_resolution_clock::now();
      av_AddValues(inputABuffer, inputBBuffer, outputBuffer, size);
      auto end = std::chrono::high_resolution_clock::now();
      auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);

      for(int k = 0; k < size; k++)
      {
        // The inputs span the whole int32_t range, so a signed reference sum
        // could overflow (undefined behaviour). Compare in unsigned
        // arithmetic, which wraps.
        assert(static_cast<uint32_t>(outputBuffer[k]) ==
               static_cast<uint32_t>(inputABuffer[k]) + static_cast<uint32_t>(inputBBuffer[k]));
      }

      std::cout << "Vector Sum - " << diff.count() << "ms\n";
      // Clear the whole output buffer (byte count, not element count) so a
      // later run cannot pass verification on stale results.
      memset(outputBuffer, 0, bytes);
    }

    // Release the benchmark buffers.
    std::free(inputABuffer);
    std::free(inputBBuffer);
    std::free(outputBuffer);
}

// Benchmarks av_MultiplyValues: allocates two 64-byte-aligned int16_t input
// buffers and one int32_t output buffer of 10'485'760 elements each, fills
// the inputs with random data, then runs the vector multiplication 20 times.
// Each run is timed, verified element by element, and reported on stdout in
// milliseconds.
void TestMultiply()
{
    const int size = 10'485'760;
    const int outputBytes = size * sizeof(int32_t);
    int16_t* inputABuffer = static_cast<int16_t*>(memalign(64, size * sizeof(int16_t)));
    int16_t* inputBBuffer = static_cast<int16_t*>(memalign(64, size * sizeof(int16_t)));
    int32_t* outputBuffer = static_cast<int32_t*>(memalign(64, outputBytes));
    assert(inputABuffer != nullptr);
    assert(inputBBuffer != nullptr);
    assert(outputBuffer != nullptr);

    GenerateRandom16bitValues(inputABuffer, inputBBuffer, outputBuffer, size);

    for(int i = 0; i < 20; i++)
    {
      auto start = std::chrono::high_resolution_clock::now();
      av_MultiplyValues(inputABuffer, inputBBuffer, outputBuffer, size);
      auto end = std::chrono::high_resolution_clock::now();
      auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);

      for(int k = 0; k < size; k++)
      {
        assert(outputBuffer[k] == (inputABuffer[k] * inputBBuffer[k]));
      }

      std::cout << "Vector product - " << diff.count() << "ms\n";
      // Clear the whole output buffer (byte count, not element count) so a
      // later run cannot pass verification on stale results.
      memset(outputBuffer, 0, outputBytes);
    }

    // Release the benchmark buffers.
    std::free(inputABuffer);
    std::free(inputBBuffer);
    std::free(outputBuffer);
}

// Adds two int32_t arrays element-wise with AltiVec and stores the sums in
// outputBuffer.
//
//   intArrayA, intArrayB  inputs, `size` elements each. They are read through
//                         vector pointers, so callers are expected to pass
//                         16-byte-aligned buffers (TestAdd uses memalign(64, ...)).
//   outputBuffer          receives `size` sums; written with vec_st, so it is
//                         expected to be 16-byte aligned as well.
//   size                  number of int32_t elements in each array.
//
// Elements left over when `size` is not a multiple of the vector width are
// processed by a scalar loop instead of being silently skipped.
void av_AddValues(int32_t* intArrayA, int32_t* intArrayB, int32_t* outputBuffer, int size)
{
  // Number of int32_t lanes held by one vector register.
  const int lanes = sizeof(__vector int32_t) / sizeof(int32_t);
  const int iterations = size / lanes;

  __vector int32_t* tempA = (__vector int32_t *) intArrayA;
  __vector int32_t* tempB = (__vector int32_t *) intArrayB;
  __vector int32_t* tempOut = (__vector int32_t *) outputBuffer;

  for(int i = 0; i < iterations; i++)
  {
    __vector int32_t sum = vec_add(*tempA, *tempB);
    vec_st(sum, 0, tempOut);

    tempA++;
    tempB++;
    tempOut++;
  }

  // Scalar tail for the last (size % lanes) elements. The addition is done in
  // unsigned arithmetic so that overflow wraps instead of being undefined.
  // NOTE(review): intended to match vec_add's wrap-around result - confirm.
  for(int k = iterations * lanes; k < size; k++)
  {
    outputBuffer[k] = static_cast<int32_t>(static_cast<uint32_t>(intArrayA[k]) +
                                           static_cast<uint32_t>(intArrayB[k]));
  }
}

// Multiplies two int16_t arrays element-wise with AltiVec and stores the
// 32-bit products in outputBuffer, in the same order as the inputs.
//
//   intArrayA, intArrayB  inputs, `size` elements each. They are read through
//                         vector pointers, so callers are expected to pass
//                         16-byte-aligned buffers (TestMultiply uses memalign(64, ...)).
//   outputBuffer          receives `size` int32_t products; written with
//                         vec_st, so it is expected to be 16-byte aligned.
//   size                  number of int16_t elements in each input array.
//
// Elements left over when `size` is not a multiple of the vector width are
// processed by a scalar loop instead of being silently skipped.
void av_MultiplyValues(int16_t* intArrayA, int16_t* intArrayB, int32_t* outputBuffer, int size)
{
  // Number of int16_t lanes held by one vector register.
  const int lanes = sizeof(__vector int16_t) / sizeof(int16_t);
  const int iterations = size / lanes;
  __vector int16_t* tempA = (__vector int16_t *) intArrayA;
  __vector int16_t* tempB = (__vector int16_t *) intArrayB;
  __vector int32_t* tempOut = (__vector int32_t *) outputBuffer;
  for(int i = 0; i < iterations; i++)
  {
    // Widening multiplies: one call covers the even-indexed element pairs,
    // the other the odd-indexed ones.
    __vector int32_t productEven = vec_mule(*tempA, *tempB);
    __vector int32_t productOdd = vec_mulo(*tempA, *tempB);

    // Interleave the even/odd products to restore the input element order.
    __vector int32_t mergedProductHigh = vec_mergeh(productEven, productOdd);
    __vector int32_t mergedProductLow = vec_mergel(productEven, productOdd);

    // One input vector yields two output vectors.
    vec_st(mergedProductHigh, 0, tempOut);
    tempOut++;
    vec_st(mergedProductLow, 0, tempOut);

    tempA++;
    tempB++;
    tempOut++;
  }

  // Scalar tail for the last (size % lanes) elements. A product of two
  // int16_t values always fits in int32_t, so this cannot overflow.
  for(int k = iterations * lanes; k < size; k++)
  {
    outputBuffer[k] = static_cast<int32_t>(intArrayA[k]) * static_cast<int32_t>(intArrayB[k]);
  }
}

1 个答案:

答案 0 :(得分:1)

它与输入缓冲区的大小有关。

在一个案例中(TestAdd):

// Excerpt from TestAdd: all three buffers hold `size` int32_t elements.
int size = 10'485'760;
int bytes = size * sizeof(int32_t);

int32_t* inputABuffer = (int32_t*) memalign(64, bytes);
int32_t* inputBBuffer = (int32_t*) memalign(64, bytes);
int32_t* outputBuffer = (int32_t*) memalign(64, bytes);

你分配3 * size * 4字节(sizeof(int32_t)= 4)

在另一个案例中(TestMultiply):

// Excerpt from TestMultiply: the two inputs hold int16_t elements; only the
// output holds int32_t elements.
int size = 10'485'760;
int16_t* inputABuffer = (int16_t*) memalign(64, size * sizeof(int16_t));
int16_t* inputBBuffer = (int16_t*) memalign(64, size * sizeof(int16_t));
int32_t* outputBuffer = (int32_t*) memalign(64, size * sizeof(int32_t));

你分配 size * 4 + 2 * size * 2 字节(sizeof(int16_t)= 2)

由于此代码完全受内存限制(memory-bound),因此第二段代码要快(3 * 4)/(4 + 2 * 2)= 1.5倍。

这与您的测量结果一致,因为2.15 / 1.5 = 1.43,接近1.58。