Random read IOPS on an NVMe SSD drop when requests are issued over a small region

Asked: 2016-04-28 16:52:10

Tags: linux intel ssd non-volatile

(TL;DR) On NVMe SSDs (an Intel P3600 as well as an Avant), I see lower IOPS if I issue random reads over a small portion of the disk instead of over the whole disk.

When repeatedly reading the same offset, IOPS for a 4k block size is around 36-40K. IOPS gradually increases as I widen the region over which random reads are issued. The program (shown below) uses asynchronous IO (libaio) on Linux to submit the read requests.

Disk Range (in 4k blocks), IOPS 
0, 38833 
1, 68596 
10, 76100 
30, 80381 
40, 113647 
50, 148205 
100, 170374 
200, 239798 
400, 270197 
800, 334767

OS: Linux 4.2.0-35-generic

SSD: Intel P3600 NVMe Flash

What could be causing this?

The program can be run as follows:

$ for i in 0 1 10 30 40 50 100 200 400 800
do 
<program_name> /dev/nvme0n1 10 $i 
done

and verify that you also see the IOPS growth pattern mentioned above.

/**
 * $ g++ <progname.cpp> -o progname -std=c++11 -lpthread -laio -O3
 * $ progname /dev/nvme0n1 10 100
 */
#include <random>
#include <libaio.h>
#include <stdlib.h>//malloc, exit
#include <future> //async
#include <unistd.h> //usleep
#include <iostream>
#include <sys/time.h> // gettimeofday
#include <vector>
#include <fcntl.h> // open
#include <errno.h>
#include <sys/types.h> // open
#include <sys/stat.h> // open
#include <cassert>
#include <semaphore.h>
#include <strings.h> // bzero

io_context_t ioctx;
std::vector<char*> buffers; // preallocated 4k-aligned read buffers
int fd = -1;
sem_t sem;                  // counts free queue-depth slots

constexpr int numPerRound = 20;   // reads submitted per io_submit batch
constexpr int numRounds  = 100000;

constexpr int MAXEVENT = 10;      // max completions reaped per io_getevents call
constexpr size_t BLKSIZE = 4096;  // 4k reads
constexpr int QDEPTH = 200;       // max in-flight requests

off_t startBlock = 0;
off_t numBlocks = 100;

const int numSubmitted = numRounds * numPerRound;

// Completion thread: reaps finished reads and releases queue-depth slots.
void DoGet()
{
  io_event eventsArray[MAXEVENT];
  int numCompleted = 0;
  while (numCompleted != numSubmitted)
  {
    bzero(eventsArray, MAXEVENT * sizeof(io_event));
    int numEvents;
    do {
      numEvents = io_getevents(ioctx, 1, MAXEVENT, eventsArray, nullptr);
    } while (numEvents == -EINTR);
    assert(numEvents > 0); // any other negative return is a real error

    for (int i = 0; i < numEvents; i++)
    {
      io_event* ev = &eventsArray[i];
      // ev->data carries the buffer pointer stashed in iocb::data at submit time
      assert(ev->res2 == 0);      // no IO error
      assert(ev->res == BLKSIZE); // full 4k block transferred
      sem_post(&sem);             // release one queue-depth slot
    }
    numCompleted += numEvents;
  }
  std::cout << "completed=" << numCompleted << std::endl;
}


int main(int argc, char* argv[])
{
  if (argc < 4) {
    std::cout << "usage: " << argv[0]
              << " <nvme_device_name> <start_4k_block> <num_4k_blocks>" << std::endl;
    exit(1);
  }

  char* deviceName = argv[1];
  startBlock = atoll(argv[2]);
  numBlocks = atoll(argv[3]);

  int ret = 0;
  ret = io_queue_init(QDEPTH, &ioctx);
  assert(ret == 0);
  ret = sem_init(&sem, 0, QDEPTH);
  assert(ret == 0);

  auto DoGetFut = std::async(std::launch::async, DoGet);

  // preallocate buffers
  for (int i = 0; i < QDEPTH; i++)
  {
    char* buf;
    ret = posix_memalign((void**)&buf, 4096, BLKSIZE); // alignment required for O_DIRECT
    assert(ret == 0);
    buffers.push_back(buf);
  }

  fd = open("/dev/nvme0n1", O_DIRECT | O_RDONLY);
  assert(fd >= 0);

  off_t offset = 0;

  struct timeval start;
  gettimeofday(&start, 0);

  std::mt19937 generator (getpid());
  // generate random block numbers in [startBlock, startBlock + numBlocks] (inclusive)
  std::uniform_int_distribution<off_t> offsetgen(startBlock, startBlock + numBlocks);

  for (int j = 0; j < numRounds; j++)
  {
    iocb mycb[numPerRound];
    iocb* posted[numPerRound];

    bzero(mycb, sizeof(iocb) * numPerRound);

    for (int i = 0; i < numPerRound; i++)
    {
      // the same buffer may be used by two in-flight async reads;
      // that's OK - this program does not validate buffer contents
      char* iobuf = buffers[i];
      iocb* cb = &mycb[i];

      offset = offsetgen(generator) * BLKSIZE;

      io_prep_pread(cb, fd, iobuf, BLKSIZE, offset);
      cb->data = iobuf;
      posted[i] = cb;
      sem_wait(&sem); // wait for a free queue-depth slot
    }

    int ret = 0;
    do {
      ret = io_submit(ioctx, numPerRound, posted);
    } while (ret == -EINTR);

    assert(ret == numPerRound);
  }

  DoGetFut.wait();

  struct timeval end;
  gettimeofday(&end, 0);

  uint64_t diff = ((end.tv_sec - start.tv_sec) * 1000000) + (end.tv_usec - start.tv_usec);

  io_queue_release(ioctx);

  std::cout
    << "ops=" << numRounds * numPerRound
    << " iops=" << (numRounds * numPerRound *(uint64_t)1000000)/diff
    << " region-size=" << (numBlocks * BLKSIZE)
    << std::endl;
}
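
As an independent cross-check (a sketch only: it assumes fio is installed, and the job parameters are my own approximation of the program's workload, not from the original post), roughly the same experiment can be expressed with fio, which also drives libaio:

$ fio --name=randread --filename=/dev/nvme0n1 --direct=1 \
      --rw=randread --bs=4k --ioengine=libaio --iodepth=200 \
      --offset=40k --size=400k --norandommap --time_based --runtime=30

--offset=40k and --size=400k correspond to start block 10 and a 100-block region; sweep --size to reproduce the table above.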

1 Answer:

Answer 0 (score: 2):

This is of course related to the structure of the memory. Internally, the drive is built from many flash memory chips and may have multiple internal memory buses. If you issue requests within a small range, all of them resolve to a single chip (or a few chips) and have to queue up behind one another. If you access the whole device, the requests are spread across many internal chips and buses and can run in parallel, yielding higher throughput.
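
A toy model makes the mechanism concrete. The sketch below is illustrative only: the 8-channel count and the per-4k-block round-robin striping are assumptions, not P3600 internals. It counts how many distinct channels a batch of random reads confined to a region would touch:

#include <iostream>
#include <random>
#include <set>

// Toy SSD with NCHAN independent flash channels; consecutive 4k blocks
// are striped round-robin across channels (an assumed layout).
int main()
{
  constexpr long NCHAN = 8;
  constexpr int SAMPLES = 10000; // random reads per region size

  std::mt19937 gen(42);
  for (long region : {0L, 1L, 10L, 50L, 100L, 800L})
  {
    std::uniform_int_distribution<long> blockgen(0, region); // inclusive, like the benchmark
    std::set<long> channels;
    for (int i = 0; i < SAMPLES; i++)
      channels.insert(blockgen(gen) % NCHAN); // block number -> channel
    std::cout << "region=" << region
              << " distinct-channels=" << channels.size() << std::endl;
  }
}

With a region of 0 every read queues on a single channel; once the region spans all channels, reads can be serviced in parallel. The measured curve keeps climbing well past a handful of blocks, which would be consistent with striping units larger than 4k and with per-die queueing, but the qualitative mechanism is the same.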