This is a follow-up to Why is TCP write latency worse when work is interleaved? In that question, we found that inserting CPU-intensive work between write calls to a TCP socket increased the write latency more than 5x. The explanation is that without the CPU-intensive work, outgoing bytes get batched before being committed to the device as a TCP packet. The CPU-intensive work gives the send buffer time to flush, so each new write triggers full packet construction, with its associated overhead. (As a side question: what exactly does this packet construction entail? A TCP header is only ~20 bytes, so I'm not sure where most of the overhead actually comes from.)
Given this, I'm looking for a way to "prepare" the next packet ahead of time. This is useful in latency-sensitive settings where you know you'll need to send a packet at some point in the future, so you want to pay the packet-construction cost early.
My first idea was to set the low-water mark SO_SNDLOWAT to 2, and then prepare a packet without sending it by writeing just a single byte. In theory, SO_SNDLOWAT should keep that one-byte packet from actually hitting the device, so when I then measure the latency of a subsequent write carrying the real payload, it should be fast. But this doesn't reduce the latency at all (and I have some doubts about whether SO_SNDLOWAT is actually doing what I expect).
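(One sanity check worth doing, as a minimal sketch assuming Linux semantics: socket(7) documents SO_SNDLOWAT as readable but not changeable on Linux, with setsockopt() failing with ENOPROTOOPT, so inspecting the return values should show immediately whether the option is being applied at all.)
// Minimal SO_SNDLOWAT sanity check. Assumption: Linux semantics per
// socket(7), where the option is not changeable and setsockopt() fails
// with ENOPROTOOPT.
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    int lowat = 2;
    if (setsockopt(fd, SOL_SOCKET, SO_SNDLOWAT, &lowat, sizeof(lowat)) != 0) {
        // On Linux this is expected to report ENOPROTOOPT.
        printf("setsockopt(SO_SNDLOWAT): %s\n", strerror(errno));
    }
    int readback = 0;
    socklen_t optlen = sizeof(readback);
    getsockopt(fd, SOL_SOCKET, SO_SNDLOWAT, &readback, &optlen);
    printf("SO_SNDLOWAT reads back as: %d\n", readback);
    return 0;
}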
Here is my server code:
// Server side C/C++ program to demonstrate socket programming
#include <boost/timer.hpp>
#include <ctime>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <time.h>
#include <unistd.h>
// Count clock cycles: serialize with cpuid, then read the timestamp counter.
__inline__ uint64_t rdtsc(void)
{
    uint32_t lo, hi;
    __asm__ __volatile__ (
        "xorl %%eax,%%eax \n cpuid"
        ::: "%rax", "%rbx", "%rcx", "%rdx");
    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
    return (uint64_t)hi << 32 | lo;
}
// Set up some blocking work.
bool isPrime(int n) {
    if (n < 2) {
        return false;
    }
    for (int i = 2; i < n; i++) {
        if (n % i == 0) {
            return false;
        }
    }
    return true;
}
// Compute the nth prime. Takes ~1 sec for n = 10,000.
int getPrime(int n) {
    int numPrimes = 0;
    int i = 0;
    while (true) {
        if (isPrime(i)) {
            numPrimes++;
            if (numPrimes >= n) {
                return i;
            }
        }
        i++;
    }
}
int main(int argc, char const *argv[])
{
    int server_fd, new_socket;
    struct sockaddr_in address;
    // Low-water mark for the socket.
    int lowat = 2;
    int lowat2 = 0;
    socklen_t optlen = sizeof(lowat2);
    int addrlen = sizeof(address);
    // Create socket for TCP server.
    server_fd = socket(AF_INET, SOCK_STREAM, 0);
    if (setsockopt(server_fd, SOL_SOCKET, SO_SNDLOWAT, &lowat, sizeof(lowat)) != 0) {
        perror("setsockopt(SO_SNDLOWAT) on listening socket");
    }
    address.sin_family = AF_INET;
    address.sin_addr.s_addr = INADDR_ANY;
    address.sin_port = htons(8080);
    bind(server_fd, (struct sockaddr *)&address, sizeof(address));
    listen(server_fd, 3);
    // Accept one client connection.
    new_socket = accept(server_fd, (struct sockaddr *)&address, (socklen_t*)&addrlen);
    if (setsockopt(new_socket, SOL_SOCKET, SO_SNDLOWAT, &lowat, sizeof(lowat)) != 0) {
        perror("setsockopt(SO_SNDLOWAT) on accepted socket");
    }
    // Check that SO_SNDLOWAT was updated.
    getsockopt(new_socket, SOL_SOCKET, SO_SNDLOWAT, &lowat2, &optlen);
    printf("New lowat value: %d\n", lowat2);
    char sendBuffer[1] = {0};
    int primes[20] = {0};
    int N = 10;
    for (int i = 0; i < N; i++) {
        sendBuffer[0] = 97 + i;  // 'a' + i
        boost::timer t;
        auto start = rdtsc();
        write(new_socket, sendBuffer, 1);
        auto end = rdtsc();
        printf("%d mics (%llu cycles) to write\n",
               int(1e6 * t.elapsed()), (unsigned long long)(end - start));
        // Inserting blocking work here slows down the `write` calls by a
        // factor of 5.
        primes[i] = getPrime(10000 + i);
        // Attempt to prep the next packet without sending it, by writing 'X'.
        sendBuffer[0] = 88;
        write(new_socket, sendBuffer, 1);
        primes[i] = getPrime(1000 + i);
    }
    // Prevent the compiler from optimizing away the prime computation.
    printf("prime: %d\n", primes[8]);
}
And the client code:
// Client side C/C++ program to demonstrate socket programming
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <unistd.h>
int main(int argc, char const *argv[])
{
    int sock;
    struct sockaddr_in address;
    int opt = 1;
    int addrlen = sizeof(address);
    // Single bytes arrive from the server; collect them here.
    unsigned char recv_buffer[1024] = {0};
    // Create socket for TCP client.
    sock = socket(AF_INET, SOCK_STREAM, 0);
    // Set TCP_NODELAY so that writes won't be batched. (Note: TCP_NODELAY
    // lives at the IPPROTO_TCP level, not SOL_SOCKET.)
    setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, &opt, sizeof(opt));
    address.sin_family = AF_INET;
    address.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    address.sin_port = htons(8080);
    // Connect to the server.
    if (connect(sock, (struct sockaddr *)&address, (socklen_t)addrlen) != 0) {
        perror("connect failed");
        exit(1);
    }
    int N = 10;
    // loc[i] records how far into recv_buffer we were after read call i,
    // so we can later show which bytes arrived together.
    int loc[N + 1];
    int nloc, curloc;
    for (nloc = curloc = 0; curloc < N; nloc++) {
        int n = read(sock, recv_buffer + curloc, sizeof(recv_buffer) - curloc);
        if (n <= 0) {
            break;
        }
        curloc += n;
        loc[nloc] = curloc;
        // usleep(100000);
    }
    // Print the bytes received by each read call, separated by spaces.
    int last = 0;
    for (int i = 0; i < nloc; i++) {
        printf("%*.*s ", loc[i] - last, loc[i] - last, recv_buffer + last);
        last = loc[i];
    }
    printf("\n");
}
Output:
New lowat value: 2
14 mics (31252 cycles) to write
25 mics (49088 cycles) to write
26 mics (55558 cycles) to write
26 mics (53618 cycles) to write
26 mics (54468 cycles) to write
28 mics (58382 cycles) to write
Omitting the prime computation entirely reduces the write latency to ~5,000 cycles (roughly 10x faster). I'd like to know whether my use of SO_SNDLOWAT is broken, or whether there is a cleaner way to prepare a packet. The client's output (where spaces separate the results of individual read calls) suggests that SO_SNDLOWAT failed: a X b X c X d X e X.
Update: Following Gil's suggestion, I tried passing the MSG_MORE flag when sending the X packet, as a signal to hold off the actual device write. This appears to work (provided the second chunk of blocking work takes less than 200ms), since the client's output becomes a Xb Xc Xd Xe Xf. But counterintuitively, the payload writes actually got slower: ~100,000 cycles, versus ~50,000 cycles without MSG_MORE, versus ~5,000 cycles with no blocking work at all. The MSG_MORE code:
// Attempt to prep the next packet without sending it, by writing 'X'.
sendBuffer[0] = 88;
send(new_socket, sendBuffer, 1, MSG_MORE);
primes[i] = getPrime(1000 + i + 1);
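For completeness, the per-socket equivalent of MSG_MORE is the TCP_CORK option (see tcp(7)), which also holds partial frames for at most 200ms. A minimal sketch of the same prep step using it (untested, reusing the variables from the server loop above):
// Sketch: hold the 'X' with TCP_CORK instead of MSG_MORE. Assumes Linux
// semantics per tcp(7): corked output is held until the socket is
// uncorked, or for at most 200ms.
int on = 1, off = 0;
setsockopt(new_socket, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));  // cork
sendBuffer[0] = 88;
write(new_socket, sendBuffer, 1);    // 'X' is queued, not transmitted
primes[i] = getPrime(1000 + i + 1);  // blocking work while corked
// ...later, after the next payload write, release everything at once:
setsockopt(new_socket, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));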