Question

我想用C语言统计文件中所有bigram（相邻字符对）的出现次数。Bigram计数统计的是字符对出现的频率，本质上属于字母频率（Letter Frequencies）分析。我已经实现了monogram（单个字符）的计数，但没有找到针对bigram的解决方案。

下面是我的代码，它做的是monogram（单字符）计数，我该怎样修改这个if语句？这段代码每次只统计一个字符，而我想统计的是“ac、dh、tx……”这样的字符对。我准备了一个由双字符条目（ar、la、an……等）组成的txt文件作为列表，并通过如下方式从该文件读取：FILE *plain = fopen("bigram.txt", "r"); 但是，该如何设计代码来统计文件中的多字符组合呢？非常感谢。

/*
 * Excerpt from the asker's single-letter (monogram) frequency counter.
 * The part that fills `string` and advances `c` is elided ("..."), and the
 * closing braces of the excerpt are incomplete, so this is not compilable
 * as shown.
 */
int main(){

    /* NOTE(review): requires <locale.h>; the include is not shown in the excerpt. */
    setlocale(LC_ALL, "en_US.UTF-8");
    /* Text buffer that is scanned; presumably filled from the input file in the elided part. */
    char string[9000];
    /* c: index into string; count[]: one tally per letter, indexed by letter - 'a'.
       Only indices 0..25 can be reached through the range check below. */
    int c = 0, count[30] = {0};
    /* Incremented once for every lowercase letter that is tallied. */
    int bahar = 0;    

    ...
        /* Tally only the bytes in the range 'a'..'z'. */
        if ( string[c] >= 'a' && string[c] <= 'z' ){
            count[string[c]-'a']++;
            bahar++;

}

Answer 1

您可以使用单个循环轻松解决此问题：

使用初始化为0的二维数组count[26][26]。
每次从文件中读取一个字节。
如果该字节是小写字母，且上一个字节也是小写字母，则递增相应的计数器。
循环直到文件结束。
打印统计数据。

以下是代码：

#include <stdio.h>

/*
 * Count bigrams (pairs of adjacent lowercase letters) in "bigram.txt" and
 * print every pair that occurs at least once as "xy: n", ordered by first
 * then second letter.
 *
 * Pairs overlap: "abc" contributes both "ab" and "bc". Any byte outside
 * 'a'..'z' breaks the chain, so "ab-cd" yields only "ab" and "cd".
 * Indexing with c - 'a' relies on 'a'..'z' being contiguous (true in ASCII).
 *
 * Returns 0 on success, 1 if the file cannot be opened or a read error occurs.
 */
int main(void) {
    int count['z' - 'a' + 1]['z' - 'a' + 1] = {{ 0 }};
    int c0 = EOF, c1;   /* c0: previous byte (EOF = none yet), c1: current byte */
    FILE *plain = fopen("bigram.txt", "r");

    if (plain == NULL) {
        /* Report the failure instead of silently exiting with status 0. */
        perror("bigram.txt");
        return 1;
    }

    while ((c1 = getc(plain)) != EOF) {
        if (c1 >= 'a' && c1 <= 'z' && c0 >= 'a' && c0 <= 'z') {
            count[c0 - 'a'][c1 - 'a']++;
        }
        c0 = c1;
    }

    /* getc() returns EOF for both end-of-file and read errors; tell them apart. */
    if (ferror(plain)) {
        perror("bigram.txt");
        fclose(plain);
        return 1;
    }
    fclose(plain);

    for (c0 = 'a'; c0 <= 'z'; c0++) {
        for (c1 = 'a'; c1 <= 'z'; c1++) {
            int n = count[c0 - 'a'][c1 - 'a'];
            if (n) {
                printf("%c%c: %d\n", c0, c1, n);
            }
        }
    }
    return 0;
}

这是一个更通用的版本，可以处理任何8位字符对：

#include <stdio.h>
#include <string.h>

/*
 * Count bigrams over a configurable single-byte character set in
 * "bigram.txt" and print every pair that occurs at least once as "xy: n",
 * in the order the characters appear in `set`.
 *
 * Pairs overlap, and any byte that is not in `set` breaks the chain.
 * Output bytes are written exactly as stored in `set`, i.e. in the same
 * single-byte encoding as the input.
 *
 * Returns 0 on success, 1 if the file cannot be opened or a read error occurs.
 */
int main(void) {
    /* character set: must use a single-byte encoding */
    /* the last 5 bytes are ı ş ç ö ü in ISO-8859-9: FD FE E7 F6 FC */
    const char set[] = "abcdefghijklmnopqrstuvwxyz\xFD\xFE\xE7\xF6\xFC";
    /* Compile-time length (without the terminating NUL): keeps count[][] a
       regular array instead of a VLA, so it can be zero-initialized directly. */
    enum { SETLEN = sizeof(set) - 1 };
    int count[SETLEN][SETLEN] = {{ 0 }};
    const char *p0 = NULL;   /* position of the previous byte in set, NULL if not in set */
    int c1;
    FILE *plain = fopen("bigram.txt", "r");

    if (plain == NULL) {
        /* Report the failure instead of silently exiting with status 0. */
        perror("bigram.txt");
        return 1;
    }

    while ((c1 = getc(plain)) != EOF) {
        /* memchr compares as unsigned char, matching the value returned by getc. */
        const char *p1 = memchr(set, c1, SETLEN);
        if (p1 != NULL && p0 != NULL) {
            count[p0 - set][p1 - set]++;
        }
        p0 = p1;
    }

    /* getc() returns EOF for both end-of-file and read errors; tell them apart. */
    if (ferror(plain)) {
        perror("bigram.txt");
        fclose(plain);
        return 1;
    }
    fclose(plain);

    for (size_t i = 0; i < SETLEN; i++) {
        for (size_t j = 0; j < SETLEN; j++) {
            int n = count[i][j];
            if (n > 0) {
                printf("%c%c: %d\n", set[i], set[j], n);
            }
        }
    }
    return 0;
}

在文件中计算bigrams（一对两个字符）

1 个答案: