我正在尝试使用以下代码将文件中的各行添加到哈希表中。目前它读取的是如下格式的文件:
289016164,279211721,462102225
341714666,132189021,299107290
362328497,466836829,47952622
也就是说,每行包含三个以逗号分隔的整数。我希望它能够读取如下格式的行:
289016164,279211721,462102225, some random text
341714666,132189021,299107290, some more random text
362328497,466836829,47952622, even more random text
用于保存每一行的结构体应该是:
/* One input line in the desired format: three unsigned numbers plus trailing text. */
typedef struct Row {
uint32_t a; /* first comma-separated number on the line */
uint32_t b; /* second comma-separated number on the line */
uint32_t t; /* third comma-separated number on the line */
char text[40]; /* free text that follows the third comma (40-byte buffer) */
} Row;
读入文件的例程名为 readAll(见下文),我在修改它时遇到了问题。
应该如何修改 readAll,使它能够处理这种新格式?
为了提供上下文,我附上了使用 readAll 的大部分代码。
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
// Number of slots in the hash table.
// Should be 37% occupied with 50m entries
#define TABLE_SIZE 0x8000000
// Bit mask that wraps a hash or probe index into [0, TABLE_SIZE);
// this works because TABLE_SIZE is a power of two.
#define MASK (TABLE_SIZE - 1)
// Number of bytes readChar() asks fread() for on each refill.
#define BUFFER_SIZE 16384
// Value readChar() returns once no more input is available.
#define END_OF_FILE (-1)
// Value of Row.a that insert() treats as "slot is unoccupied".
#define DEFAULT_VALUE (-1)
/* One hash-table entry: the three numbers parsed from a single input line. */
typedef struct Row {
uint32_t a; /* hash key; insert() treats a == DEFAULT_VALUE as an empty slot */
uint32_t b; /* second number on the line */
uint32_t t; /* third number on the line */
} Row;
/*
 * Multiplicative hash of a 32-bit key.
 *
 * The product is computed in unsigned arithmetic so that it wraps modulo 2^32;
 * multiplying two signed 32-bit values would overflow, which is undefined
 * behaviour in C. The wrapped result is converted back to int32_t, so the
 * function may return negative values; callers mask the result before using
 * it as an index.
 */
int32_t hash(int32_t a) {
    return (int32_t) ((uint32_t) a * UINT32_C(428916315));
}
/*
 * Store `row` in the first free slot of `table`, starting at the slot chosen
 * by hashing row.a and then advancing by a step that grows by one after each
 * occupied slot. A slot counts as free when its `a` field equals
 * DEFAULT_VALUE. If no free slot is found within TABLE_SIZE + 1 probes the
 * row is silently not stored.
 */
void insert(Row * table, Row row) {
    long slot = hash(row.a) & MASK; // Entries are hashed on a

    for (long step = 0; step <= TABLE_SIZE; step++) {
        slot = (slot + step) & MASK;
        if (table[slot].a != DEFAULT_VALUE) {
            continue; // occupied, keep probing
        }
        table[slot] = row;
        return;
    }
}
/*
 * Return the next byte of `input`, reading the file through `buffer` in
 * chunks of up to BUFFER_SIZE bytes.
 *
 * Note the parameter roles, which are the reverse of what the names suggest:
 *   *pos   - number of valid bytes currently held in `buffer`
 *   *limit - index of the next byte of `buffer` to hand out
 * Both must be 0 before the first call.
 *
 * Returns the byte as a value in 0..255, or END_OF_FILE when fread() yields
 * no more data. The byte is converted through unsigned char so that a 0xFF
 * byte can never be mistaken for END_OF_FILE (-1) on platforms where plain
 * char is signed.
 */
int readChar(FILE * input, char * buffer, int * pos, int * limit) {
    if (*limit >= *pos) {
        // Buffer exhausted: refill it and restart from its beginning.
        *limit = 0;
        *pos = (int) fread(buffer, sizeof(char), BUFFER_SIZE, input);
        if (*pos <= 0) {
            return END_OF_FILE;
        }
    }
    return (unsigned char) buffer[(*limit)++];
}
/*
 * Parse every "a,b,t" line of the file `fileName` and insert the resulting
 * rows into `table`.
 *
 * Parsing is a small state machine driven one byte at a time by readChar():
 * `currentElement` points at the field of `currentRow` that digits are
 * currently accumulated into, a comma advances it, and a newline commits the
 * row with insert() and resets the state.
 *
 * Compared with a naive version this one:
 *   - zero-initialises the row, so the first line does not accumulate onto
 *     indeterminate values;
 *   - reports and returns if the buffer cannot be allocated or the file
 *     cannot be opened, and always frees the buffer;
 *   - ignores bytes that are not decimal digits (e.g. the '\r' of CRLF line
 *     endings, since the file is opened in binary mode);
 *   - commits a final line that is not terminated by a newline.
 */
void readAll(char * fileName, Row * table) {
    char *buffer = malloc(BUFFER_SIZE);
    if (buffer == NULL) {
        perror("malloc");
        return;
    }

    FILE *input = fopen(fileName, "rb");
    if (input == NULL) {
        perror(fileName);
        free(buffer);
        return;
    }

    int limit = 0;
    int pos = 0;
    int lastRead;
    int rowHasData = 0; // set once the current line has contributed a digit or comma
    Row currentRow = {0};
    uint32_t *currentElement = &(currentRow.a);

    // We read rows with an FSM. We can
    // roll up some of the code using the `currentElement` pointer
    while (1) {
        switch (lastRead = readChar(input, buffer, &pos, &limit)) {
        case END_OF_FILE:
            // Commit a last line that had no terminating newline.
            if (rowHasData) {
                insert(table, currentRow);
            }
            fclose(input);
            free(buffer);
            return;
        case ',':
            rowHasData = 1;
            if (currentElement == &(currentRow.a))
                currentElement = &(currentRow.b);
            else
                currentElement = &(currentRow.t);
            break;
        case '\n':
            insert(table, currentRow);
            currentRow.a = 0;
            currentRow.b = 0;
            currentRow.t = 0;
            currentElement = &(currentRow.a);
            rowHasData = 0;
            break;
        default:
            // Only decimal digits contribute to the number being built.
            if (lastRead >= '0' && lastRead <= '9') {
                *currentElement = *currentElement * 10 + (uint32_t) (lastRead - '0');
                rowHasData = 1;
            }
            break;
        }
    }
}
/*
 * Entry point: allocates the hash table, marks every slot as empty and loads
 * the file named by the first command-line argument into it.
 *
 * Exits with EXIT_FAILURE when no file name is given or the table cannot be
 * allocated (it is sizeof(Row) * TABLE_SIZE bytes, so failure is plausible).
 */
int main(int argc, char** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <input-file>\n", argc > 0 ? argv[0] : "program");
        return EXIT_FAILURE;
    }

    Row *table = malloc(sizeof *table * TABLE_SIZE);
    if (table == NULL) {
        perror("malloc");
        return EXIT_FAILURE;
    }
    // Filling with 0xFF bytes makes every field 0xFFFFFFFF, which is the
    // value insert() compares against DEFAULT_VALUE to detect a free slot.
    memset(table, 255, sizeof *table * TABLE_SIZE);

    readAll(argv[1], table);
    //[...]

    free(table);
    return EXIT_SUCCESS;
}
答案 0 :(得分:2)
你需要识别出第三个逗号,并在遇到它时填充 .text 字段,大致如下:
case ',':
    /* First and second comma: just advance to the next numeric field. */
    if (currentElement == &(currentRow.a)) {
        currentElement = &(currentRow.b);
        break;
    }
    if (currentElement == &(currentRow.b)) {
        currentElement = &(currentRow.t);
        break;
    }
    /*
     * Third comma: consume the rest of the line into currentRow.text,
     * keeping at most sizeof(text) - 1 characters and discarding the rest.
     * currentRow is a struct, not a pointer, so its members are accessed
     * with '.'. The loop also stops on END_OF_FILE so that a last line
     * without a trailing newline cannot make it spin forever.
     */
    {
        int i = 0;
        int maxchars = sizeof(currentRow.text) - 1;
        while ((lastRead = readChar(input, buffer, &pos, &limit)) != '\n'
               && lastRead != END_OF_FILE) {
            if (i < maxchars) currentRow.text[i++] = lastRead;
        }
        currentRow.text[i] = '\0';
    }
    /* fallthrough: commit the row in the '\n' case that follows */
答案 1 :(得分:1)
这应该能满足你的需求。它与我自己会采用的写法有很大不同,但这是你的代码,所以我只追求改动最小的方案。
首先,为文本字段的长度定义一个宏:
/* Size in bytes of Row.text; readAll stores at most TEXT_LEN-1 characters plus a terminating 0. */
#define TEXT_LEN 40
并在你的结构中使用它:
/* One parsed input line: three unsigned numbers followed by trailing text. */
typedef struct Row {
uint32_t a; /* first comma-separated number on the line */
uint32_t b; /* second comma-separated number on the line */
uint32_t t; /* third comma-separated number on the line */
char text[TEXT_LEN]; /* text that follows the third comma */
} Row;
接下来,把你的 readAll 修改为如下形式:
/*
 * Parse every "a,b,t,text" line of the file `fileName` and insert the
 * resulting rows into `table`.
 *
 * Parsing is a state machine driven one byte at a time by readChar().
 * `currentElement` points at the numeric field digits are accumulated into;
 * after the third comma it becomes NULL, meaning the remaining bytes of the
 * line belong to the trailing text. At most TEXT_LEN-1 text bytes are kept,
 * the rest of an over-long text is discarded.
 *
 * Compared with a naive version this one:
 *   - zero-initialises the row, so the first line does not accumulate onto
 *     indeterminate values;
 *   - reports and returns if the buffer cannot be allocated or the file
 *     cannot be opened, and always frees the buffer;
 *   - keeps commas that occur inside the trailing text;
 *   - ignores non-digit bytes in the numeric fields and strips a trailing
 *     '\r' from the text (the file is opened in binary mode, so CRLF line
 *     endings are not translated);
 *   - commits a final line that is not terminated by a newline.
 */
void readAll(char * fileName, Row * table)
{
    char *buffer = malloc(BUFFER_SIZE);
    if (buffer == NULL)
    {
        perror("malloc");
        return;
    }

    FILE *input = fopen(fileName, "rb");
    if (input == NULL)
    {
        perror(fileName);
        free(buffer);
        return;
    }

    int limit = 0;
    int pos = 0;
    int lastRead;
    int rowHasData = 0; // set once the current line has contributed anything
    Row currentRow = {0};
    uint32_t *currentElement = &(currentRow.a);
    size_t txt_len = 0;

    while (1)
    {
        lastRead = readChar(input, buffer, &pos, &limit);

        if (lastRead == END_OF_FILE || lastRead == '\n')
        {
            // A newline always commits the row; end of input commits it
            // only when an unterminated last line is pending.
            if (lastRead == '\n' || rowHasData)
            {
                // drop the CR of a CRLF line ending
                if (txt_len > 0 && currentRow.text[txt_len - 1] == '\r')
                    txt_len--;
                // terminate text string
                currentRow.text[txt_len] = 0;
                insert(table, currentRow);
            }

            if (lastRead == END_OF_FILE)
            {
                fclose(input);
                free(buffer);
                return;
            }

            currentRow.a = 0;
            currentRow.b = 0;
            currentRow.t = 0;
            txt_len = 0;
            rowHasData = 0;
            currentElement = &(currentRow.a);
            continue;
        }

        rowHasData = 1;

        if (currentElement == NULL)
        {
            // parsing trailing text (commas included); once the field is
            // full we consume the char, as we have no space for it anyway
            if (txt_len < (TEXT_LEN-1))
                currentRow.text[txt_len++] = (char) lastRead;
        }
        else if (lastRead == ',')
        {
            // move from a to b
            if (currentElement == &(currentRow.a))
                currentElement = &(currentRow.b);
            // move from b to t
            else if (currentElement == &(currentRow.b))
                currentElement = &(currentRow.t);
            // move from t to NULL, begin trailing text
            else
                currentElement = NULL;
        }
        else if (lastRead >= '0' && lastRead <= '9')
        {
            // only decimal digits contribute to the current uint32_t field
            *currentElement = *currentElement * 10 + (uint32_t) (lastRead - '0');
        }
    }
}
备注:
值得一提的是,如果文件的最后一行没有以换行符结尾,你的代码会跳过这最后一个条目。解决这个问题并不简单,尤其是因为这里使用了双重缓冲。双重缓冲是一种浪费,人们这样做通常是为了避免 fgetc() 和 getc() 强制加锁所带来的开销。如果只有一个线程在读取该文件,你可以通过以下步骤避免这种开销并显著提高性能:
1. 打开文件后,调用 flockfile(input),为当前线程锁定该文件。
2. 使用 getc_unlocked(input) 逐个读取字符,直到它返回 EOF。
3. 遇到 EOF 后,调用 funlockfile(input),然后调用 fclose(input);
执行上述操作将完全消除对 readChar 的需求,并显著减少你的代码量。
祝你好运。