我正在尝试实施网络预取系统。像这样的系统的目的是“预测”未来的请求并预取它们。
系统根据Web导航日志(Squid access.log文件)构建预测模型。该模型是一个依赖图,其中表示URL A的节点对表示URL B的节点有一个弧,如果在URL A之后立即请求了URL B.
构建模型后,系统会收到用户请求的URL查询,并根据图形进行“预测”。预测是将来很可能要求的资源(URL)。因此,根据预测,系统会预取这些资源,以便在用户请求之前将它们存储在缓存中。
我正在使用以下测试方案:
stdbuf -o0 tshark -i eth1 tcp port 3128 and "tcp[((tcp[12:1] & 0xf0) >> 2):4] = 0x47455420" -T fields -e http.request.full_uri >> url_fifo
情况如下:有时从命名管道读取会冻结。即使tshark继续捕获URL并将其重定向到命名管道,也不会从管道中读取任何URL。一小时(或几个小时)后,所有“缓冲”的URL都会在不到10分钟的时间内被读取。在那之后,从管道读取再次保持正常。这种情况并不总是发生(50%的时候冻结,50%没有)。
似乎存在缓冲问题,因为tshark持续捕获URL并且所有请求都正确记录在Squid的access.log中。
一开始,我使用-l
选项运行tshark,以便其输出变为行缓冲。然后我开始使用stdbuf -o0
(没有缓冲)。无论如何,情况仍然发生。
在系统代码中,我还尝试打开并读取命名管道作为流(FILE *
),并将流设置为no buffered或line buffered(使用setvbuf()
函数)。情况仍然发生。
在某些周期中,请求比其他周期更快。无论如何,它似乎不是一个快速的生产者缓慢的消费者问题,因为在许多重复测试中,所有URL都被正确读取和处理而没有任何冻结。
我是否遗漏了与命名管道和缓冲相关的内容?我真的很感激一些指导。
假设网络(接口,路由,iptables,squid)没问题。我没有任何与之相关的问题。
代码(假设包含必要的头文件):
functions.c
#define BUFLEN 512
#define QUEUE_LEN 64
#define THREADS_LEN 2
pthread_mutex_t model_lock;
pthread_cond_t model_cond, listen_cond;
pthread_t queries_thread, listen_thread;
short int model_is_updating, model_can_update, program_shutdown;
/* Program execution statistics */
Status * program_status;
/* Thread pool */
threadpool_t *pool;
/* program execution */
int
run(void)
{
Graph_Adj_List * gr = NULL; /* Graph as an adjacency list */
char ** reports = NULL;
unsigned report_counter = 0;
/* Init program status */
program_status = status_init();
/* Load list of custom web navigation reports to be used to build the initial
* version of the predictive model */
reports = file_load_reports(program_config.reports_file);
if (!reports)
return 0;
/* Init mutex and cond */
pthread_mutex_init(&model_lock, NULL);
pthread_cond_init(&model_cond, NULL);
pthread_cond_init(&listen_cond, NULL);
/* Lock */
pthread_mutex_lock (&model_lock);
/* Start first cycle */
status_start_cycle(program_status);
/* Create initial version of the predictive model */
gr = create_model_from_files(reports, &report_counter, program_config.reports_limit);
if (!gr)
{
/* Unlock */
pthread_mutex_unlock (&model_lock);
return 0;
}
/* Unlock */
pthread_mutex_unlock (&model_lock);
/* Start threads */
if (pthread_create(&queries_thread, NULL, fifo_predictions_threaded, (void *)gr) ||
pthread_create(&listen_thread, NULL, listen_end_of_cycle, NULL))
program_shutdown = 1;
/* main loop */
while(!program_shutdown)
{
/* lock */
pthread_mutex_lock (&model_lock);
/* wait for clients' announcement of the end of requests from current cycle */
while (!model_can_update)
pthread_cond_wait(&model_cond, &model_lock);
/* set updating flag */
model_is_updating = 1;
/* Update predictive model, based on Squid's access.log from (about to finish)
* current cycle */
adj_list_update_access(gr, program_config.access_file);
/* Save statistics related to the current cycle and finish it */
status_finish_cycle(program_status);
/* Check if last custom report has been read */
if (!reports[report_counter])
{
program_shutdown = 1;
pthread_mutex_unlock (&model_lock);
break;
}
/* Start a new cycle */
status_start_cycle(program_status);
/* Read a new custom report and update the predictive model */
update_model(gr, reports[report_counter]);
report_counter++;
/* Updating is done */
model_is_updating = 0;
/* Model can't be updated until client announces the end of the cycle
* that has just started */
model_can_update = 0;
/* Tell client to start sending requests from the new cycle */
if (!signal_start_cycle())
{
program_shutdown = 1;
pthread_mutex_unlock (&model_lock);
break;
}
/* Signal listener thread that a new cycle has begin */
pthread_cond_signal(&listen_cond);
/* Unlock */
pthread_mutex_unlock (&model_lock);
}
/* Finish threads */
pthread_cancel(listen_thread);
pthread_cancel(queries_thread);
pthread_join(listen_thread, NULL);
pthread_join(queries_thread, NULL);
/* Free memory */
adj_list_free_all2(&gr);
file_reports_free_all(&reports);
pthread_cond_destroy(&model_cond);
pthread_cond_destroy(&listen_cond);
pthread_mutex_destroy(&model_lock);
status_free(&program_status);
return 1;
}
void *
fifo_predictions_threaded(void * data)
{
Graph_Adj_List * gr = (Graph_Adj_List *) data;
/* Set thread cancel type */
pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
while (!program_shutdown)
{
pthread_mutex_lock(&model_lock);
/* Pause reading from named pipe while the model is being updated */
while(model_is_updating)
pthread_cond_wait(&listen_cond, &model_lock);
pthread_mutex_unlock(&model_lock);
/* Read URLs from named pipe */
fifo_predictions(gr, program_config.fifo);
}
pthread_exit(NULL);
return NULL;
}
int
fifo_predictions(Graph_Adj_List * gr, const u8 * fifo)
{
u8 cad[BUFLEN] = { '\0' };
u8 * ini = NULL, * fin = NULL, * fullurl = NULL;
int i, fifo_descriptor, read_urls = 0, fullurl_len = 0, incomplete_url = 1;
FILE * fifo_file = NULL;
/* Open fifo in blocking mode */
fifo_descriptor = open(CHAR_CAST fifo, O_RDONLY);
/* Open fifo as a stream */
// fifo_file = fopen(fifo, "r");
// if (!fifo_file)
if (fifo_descriptor == -1)
return 0;
/* If fifo is opened as a stream, set it line buffered */
// setlinebuf(fifo_file);
do
{
if ((i = read(fifo_descriptor, cad, BUFLEN - 1)) == -1)
// if ( fgets(cad, BUFLEN-1, fifo_file) == NULL)
ERROR(__FILE__, __FUNCTION__, __LINE__, "Fifo read error");
else
{
// i = strlen(cad);
cad[i] = '\0';
read_urls = 0;
if (i > 0)
{
int j = 0;
for (j = 0, ini = cad, fin = NULL ; cad[j] != '\0'; j++)
{
if (cad[j] == '\n')
{
/* Save URL */
fin = &cad[j];
ini = (*ini == '\n' ? ini + 1 : ini);
/* Check if string is a continuation of the previously read URL */
read_urls = fin - ini;
read_urls = read_urls >= 0 ? read_urls : 0;
/* Save URL in fullurl string */
fullurl = realloc(fullurl, fullurl_len + read_urls + 1);
memcpy(&fullurl[fullurl_len], ini, read_urls);
fullurl[fullurl_len + read_urls] = '\0';
ini = fin;
incomplete_url = fullurl_len = 0;
/* Ask the model for predictions and fetch them */
fetch_url_predictions2(gr, fullurl);
u8_free(&fullurl);
} else
incomplete_url = 1;
}
if (incomplete_url)
{
ini = (*ini == '\n' ? ini + 1 : ini);
read_urls = &cad[j] - ini;
read_urls = read_urls >= 0 ? read_urls : 0;
fullurl = realloc(fullurl, fullurl_len + read_urls);
memcpy(&fullurl[fullurl_len], ini, read_urls);
fullurl_len += read_urls;
}
}
}
} while (i > 0);
close(fifo_descriptor);
// fclose (fifo_file);
return 1;
}
int
fetch_url_predictions2(Graph_Adj_List * gr, u8 * in_url)
{
String * string_url = NULL;
Headnode * head = NULL;
LinkedList * list = NULL;
LinkedListElem * elem = NULL;
/* Use custom string type */
string_url = string_create_no_len(in_url);
if (!string_url)
return 0;
pthread_mutex_lock(&model_lock);
/* Get URL node */
head = adj_list_get_node(gr, string_url);
if (head)
{
/* Get predictions (URLs) as a linked list */
list = adj_list_predictions_to_list(head);
if (!list)
{
string_free_all(&string_url);
return 0;
}
pthread_mutex_unlock(&model_lock);
/* Callback fetches URLs */
list->callback = &curl_callback_void;
if (!pool)
pool = threadpool_create(THREADS_LEN, QUEUE_LEN, 0);
/* Load URLs to be fetched to threadpool's task queue */
for (elem = list->first; elem; elem = elem->next)
{
CallbackArg arg;
arg.data = arg.copy(elem->data);
threadpool_add_copy_arg(pool, list->callback, &arg, 1, sizeof(arg), 0);
}
linked_list_free_all(&list);
}
pthread_mutex_unlock(&model_lock);
string_free_all(&string_url);
return 1;
}
fetch.c
void
curl_callback_void(void * data)
{
CallbackArg * arg = (CallbackArg *) data;
char * url = (char *) arg->data;
fetch_url(url);
}
static size_t
write_data(void *buffer, size_t size, size_t nmemb, void *userp)
{
return size * nmemb;
}
int
fetch_url(char * url)
{
CURL *curl;
CURLcode res;
struct timeval time;
char * time_string = NULL;
curl = curl_easy_init();
if (curl)
{
curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &write_data);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, NULL);
curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1);
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1);
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 15);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10);
/* Perform the request, res will get the return code */
res = curl_easy_perform(curl);
gettimeofday(&time, NULL);
time_string = timeval_to_str(&time);
/* Check for errors */
if (res != CURLE_OK)
{
fprintf(stderr, "\ntime %s curl_easy_perform() (url %s) failed: %s\n",
time_string, url, curl_easy_strerror(res));
}
else
{
fprintf(stderr, "\ntime %s curl_easy_perform() (url %s) fetched ok\n",
time_string, url);
}
fflush(stderr);
free (time_string);
curl_easy_cleanup(curl);
}
return 0;
}
network.c
/*
* Code based on Beej's Networking Guide
*/
#define MSG_LEN 5
#define QUEUE_SIZE 5
extern pthread_mutex_t model_lock;
extern pthread_cond_t model_cond;
extern short int model_can_update, program_shutdown;
extern Config program_config;
// get sockaddr, IPv4 or IPv6:
static void *
get_in_addr(struct sockaddr *sa) {
if (sa->sa_family == AF_INET) {
return &(((struct sockaddr_in*) sa)->sin_addr);
}
return &(((struct sockaddr_in6*) sa)->sin6_addr);
}
void *
listen_end_of_cycle(void * data)
{
int sockfd, new_fd; // listen on sock_fd, new connection on new_fd
struct addrinfo hints, *servinfo, *p;
struct sockaddr_storage their_addr; // connector's address information
socklen_t sin_size;
int yes = 1;
char s[INET_ADDRSTRLEN], msg[MSG_LEN], *str = NULL;
int rv;
int read_bytes;
struct timeval actual_time;
/* Set thread cancel type */
pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL );
memset(&hints, 0, sizeof hints);
hints.ai_family = AF_INET;
hints.ai_socktype = SOCK_STREAM;
hints.ai_flags = AI_PASSIVE; // use my IP
if ((rv = getaddrinfo(NULL, program_config.listen_port, &hints, &servinfo))
!= 0) {
fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(rv));
return "error";
}
// loop through all the results and bind to the first we can
for (p = servinfo; p != NULL ; p = p->ai_next) {
if ((sockfd = socket(p->ai_family, p->ai_socktype, p->ai_protocol))
== -1) {
perror("server: socket");
continue;
}
if (setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(int))
== -1) {
perror("setsockopt");
return "error";
}
if (bind(sockfd, p->ai_addr, p->ai_addrlen) == -1) {
close(sockfd);
perror("server: bind");
continue;
}
break;
}
if (p == NULL ) {
fprintf(stderr, "server: failed to bind\n");
return "error";
}
freeaddrinfo(servinfo); // all done with this structure
if (listen(sockfd, QUEUE_SIZE) == -1) {
perror("listen");
return "error";
}
while (!program_shutdown)
{
sin_size = sizeof their_addr;
new_fd = accept(sockfd, (struct sockaddr *) &their_addr, &sin_size);
if (new_fd == -1) {
perror("accept");
continue;
}
inet_ntop(their_addr.ss_family,
get_in_addr((struct sockaddr *) &their_addr), s, sizeof s);
if ((read_bytes = recv(new_fd, msg, MSG_LEN - 1, 0)) == -1) {
perror("recv");
continue;
}
close(new_fd);
msg[read_bytes] = '\0';
/* Check received message */
if (strcmp(msg, "DONE")) {
perror("Not valid message");
continue;
}
printf("\ngot \"DONE\" from %s\n", s);
fflush(stdout);
/* Lock */
pthread_mutex_lock(&model_lock);
/* Flag used by main thread to allow model update */
model_can_update = 1;
/* Signal model can be updated */
pthread_cond_signal(&model_cond);
/* Unlock */
pthread_mutex_unlock(&model_lock);
}
close(sockfd);
pthread_exit(NULL);
return "ok";
}
int signal_start_cycle(void) {
int sockfd;
struct addrinfo hints, *servinfo, *p;
int rv;
char s[INET6_ADDRSTRLEN], *str = NULL;
struct timeval actual_time, aux_time;
struct timeval connect_timeout = { 15, 0 }, max_connect_time = { 0, 0 };
short int connected = 0;
memset(&hints, 0, sizeof hints);
hints.ai_family = AF_INET;
hints.ai_socktype = SOCK_STREAM;
if ((rv = getaddrinfo(program_config.client_ip, program_config.client_port,
&hints, &servinfo)) != 0) {
fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(rv));
return 0;
}
gettimeofday(&aux_time, NULL);
timeval_add(aux_time, connect_timeout, &max_connect_time);
/* Try several times to connect to the remote side */
do {
// loop through all the results and connect to the first we can
for (p = servinfo; p != NULL ; p = p->ai_next) {
if ((sockfd = socket(p->ai_family, p->ai_socktype, p->ai_protocol))
== -1) {
perror("client: socket");
continue;
}
gettimeofday(&actual_time, NULL )
printf("\ntrying to connect %s\n", program_config.client_ip);
fflush(stdout);
if (connect(sockfd, p->ai_addr, p->ai_addrlen) == -1) {
close(sockfd);
perror("client: connect");
continue;
}
connected = 1;
break;
}
} while (!connected && !timeval_greater_than(actual_time, max_connect_time));
if (p == NULL ) {
fprintf(stderr, "client: failed to connect\n");
return 0;
}
inet_ntop(p->ai_family, get_in_addr((struct sockaddr *) p->ai_addr), s,
sizeof s);
printf("\nMAIN THREAD: connecting to %s\n", s);
fflush(stdout);
freeaddrinfo(servinfo); // all done with this structure
if (send(sockfd, "DONE", 4, 0) == -1)
{
perror("send");
return 0;
}
printf("\nsent \"DONE\" to %s\n", s);
fflush(stdout);
close(sockfd);
return 1;
}