尝试使用mmap_packet创建基于套接字的原始程序以快速发送数据包。
此gist的示例中采用了以下代码。它确实发送数据包,但它不会快速发送。在我的1Gbps nic(r8169驱动程序)上,它只在我的corei7处理器(3.1GHz)上以大约95,000个数据包/秒的速率发送。我相信它可能会以更高的速度发送。
不确定瓶颈是什么。有任何想法吗?谢谢!
以下是代码段:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <arpa/inet.h>
#include <netinet/if_ether.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <linux/if.h>
#include <linux/if_packet.h>
#include <sys/time.h>
#define PACKET_QDISC_BYPASS 20
/// The number of frames in the ring
// This number is not set in stone. Nor are block_size, block_nr or frame_size
#define CONF_RING_FRAMES 1024
#define CONF_DEVICE "eth0"
/// Offset of data from start of frame
#define PKT_OFFSET (TPACKET_ALIGN(sizeof(struct tpacket_hdr)) + \
TPACKET_ALIGN(sizeof(struct sockaddr_ll)))
/// (unimportant) macro for loud failure
#define RETURN_ERROR(lvl, msg) \
do { \
fprintf(stderr, msg); \
return lvl; \
} while(0);
static struct sockaddr_ll txring_daddr;
double getTS() {
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec + tv.tv_usec/1000000.0;
}
/// create a linklayer destination address
// @param ringdev is a link layer device name, such as "eth0"
static int
init_ring_daddr(int fd, const char *ringdev)
{
struct ifreq ifreq;
// get device index
strcpy(ifreq.ifr_name, ringdev);
if (ioctl(fd, SIOCGIFINDEX, &ifreq)) {
perror("ioctl");
return -1;
}
txring_daddr.sll_family = AF_PACKET;
txring_daddr.sll_protocol = htons(ETH_P_IP);
txring_daddr.sll_ifindex = ifreq.ifr_ifindex;
// set the linklayer destination address
// NOTE: this should be a real address, not ff.ff....
txring_daddr.sll_halen = ETH_ALEN;
memset(&txring_daddr.sll_addr, 0xff, ETH_ALEN);
return 0;
}
/// Initialize a packet socket ring buffer
// @param ringtype is one of PACKET_RX_RING or PACKET_TX_RING
static char *
init_packetsock_ring(int fd, int ringtype)
{
struct tpacket_req tp;
char *ring;
// tell kernel to export data through mmap()ped ring
tp.tp_block_size = CONF_RING_FRAMES * getpagesize();
tp.tp_block_nr = 1;
tp.tp_frame_size = getpagesize();
tp.tp_frame_nr = CONF_RING_FRAMES;
if (setsockopt(fd, SOL_PACKET, ringtype, (void*) &tp, sizeof(tp))) {
perror("setting up ring");
RETURN_ERROR(NULL, "setsockopt() ring\n");
}
#ifdef TPACKET_V2
printf("it's TPACKET_V2\n");
val = TPACKET_V1;
setsockopt(fd, SOL_PACKET, PACKET_HDRLEN, &val, sizeof(val));
#endif
// open ring
ring = mmap(0, tp.tp_block_size * tp.tp_block_nr,
PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (!ring)
RETURN_ERROR(NULL, "mmap()\n");
if (init_ring_daddr(fd, CONF_DEVICE))
return NULL;
return ring;
}
/// Create a packet socket. If param ring is not NULL, the buffer is mapped
// @param ring will, if set, point to the mapped ring on return
// @return the socket fd
static int
init_packetsock(char **ring, int ringtype)
{
int fd;
// open packet socket
//fd = socket(PF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));
//fd = socket(AF_INET,SOCK_RAW,htons(ETH_P_ALL)); //ETH_P_ALL = 3
fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
if (fd < 0) {
perror("open socket");
RETURN_ERROR(-1, "Root priliveges are required\nsocket() rx. \n");
}
if (ring) {
*ring = init_packetsock_ring(fd, ringtype);
if (!*ring) {
close(fd);
return -1;
}
}
return fd;
}
static int
exit_packetsock(int fd, char *ring)
{
if (munmap(ring, CONF_RING_FRAMES * getpagesize())) {
perror("munmap");
return 1;
}
if (close(fd)) {
perror("close");
return 1;
}
return 0;
}
/// transmit a packet using packet ring
// NOTE: for high rate processing try to batch system calls,
// by writing multiple packets to the ring before calling send()
//
// @param pkt is a packet from the network layer up (e.g., IP)
// @return 0 on success, -1 on failure
static int process_tx(int fd, char *ring, const char *pkt, int pktlen)
{
static int ring_offset = 0;
struct tpacket_hdr *header;
struct pollfd pollset;
char *off;
int ret;
// fetch a frame
// like in the PACKET_RX_RING case, we define frames to be a page long,
// including their header. This explains the use of getpagesize().
header = (void *) ring + (ring_offset * getpagesize());
assert((((unsigned long) header) & (getpagesize() - 1)) == 0);
while (header->tp_status != TP_STATUS_AVAILABLE) {
// if none available: wait on more data
pollset.fd = fd;
pollset.events = POLLOUT;
pollset.revents = 0;
ret = poll(&pollset, 1, 1000 /* don't hang */);
if (ret < 0) {
if (errno != EINTR) {
perror("poll");
return -1;
}
//return 0;
}
}
// fill data
off = ((void *) header) + (TPACKET_HDRLEN - sizeof(struct sockaddr_ll));
memcpy(off, pkt, pktlen);
// fill header
header->tp_len = pktlen;
header->tp_status = TP_STATUS_SEND_REQUEST;
// increase consumer ring pointer
ring_offset = (ring_offset + 1) & (CONF_RING_FRAMES - 1);
// notify kernel
return 0;
}
/// Example application that opens a packet socket with rx_ring
int main(int argc, char **argv)
{
char *ring;
char pkt[125] = {0x00,0x0c,0x29,0xa4,0xff,0xbc,0x40,0x25,0xc2,0xd9,0xfb,0x8c,0x08,0x00,0x45,0x00,0x00,0x6f,0x24,0x1b,0x40,0x00,0x40,0x06,0x02,0x4b,0x0a,0x00,0x00,0x07,0x0a,0x00,0x00,0x1d,0xb8,0x64,0x01,0xbb,0x80,0x9e,0xaa,0x77,0x17,0x6d,0xa2,0x04,0x80,0x18,0x00,0x73,0x03,0xa0,0x00,0x00,0x01,0x01,0x08,0x0a,0x01,0x27,0x8e,0xaf,0x00,0x01,0xe8,0x71,0x16,0x03,0x01,0x00,0x36,0x01,0x00,0x00,0x32,0x03,0x02,0x55,0xf5,0x01,0xa9,0xc0,0xca,0xae,0xd6,0xd2,0x9b,0x6a,0x79,0x6d,0x9a,0xe8,0x9d,0x78,0xe2,0x64,0x98,0xf0,0xac,0xcb,0x2c,0x0d,0x51,0xa5,0xf8,0xc4,0x0f,0x93,0x87,0x00,0x00,0x04,0x00,0x35,0x00,0xff,0x01,0x00,0x00,0x05,0x00,0x0f,0x00,0x01,0x01};
int fd;
printf("page size %x\n", getpagesize());
fd = init_packetsock(&ring, PACKET_TX_RING);
if (fd < 0)
return 1;
// TODO: make correct IP packet out of pkt
int i;
double startTs = getTS();
double currentTs;
int pktCnt = 0;
int sendCnt = 0;
while (1) {
for (i=0; i<1000; i++) {
pkt[1] ++; pktCnt++;
process_tx(fd, ring, pkt, 125);
}
if (sendto(fd, NULL, 0, 0, (void *) &txring_daddr, sizeof(txring_daddr)) < 0) {
perror("sendto");
return -1;
}
sendCnt++;
usleep(300);
currentTs = getTS();
if ((currentTs - startTs) >= 1.0) {
startTs += 1.0;
printf("%7d %6d\n", pktCnt, sendCnt);
pktCnt = 0; sendCnt = 0;
}
}
if (exit_packetsock(fd, ring))
return 1;
printf("OK\n");
return 0;
}
UPDATE1
当前的NIC是RealTek RTL8111 / 8168/8411 NIC。将驱动程序升级到8.044之后的版本后,速率上升到135K /秒。
在英特尔82577LM千兆网卡上运行相同程序,速率约为430K /秒。