如何比较两个行未排序且内容略有不同的文本文件

时间:2017-06-16 10:27:41

标签: linux windows diff

我有两个文件如下:

File1中

the quick brown fox jumps
jumps over the very lazy dog
brown fox jumps over the
lorem ipsum dolor

文件2

jumps over the very lazy *chicken*
brown fox jumps over the
the quick brown fox *swims*
an apple a day keeps the doctor away

我需要对这两个文件做 diff,并提取只出现在其中一个文件中的行。

但问题是:

  1. 两个文件中的所有行都是未排序的
  2. 行可能(或可能不)相同
  3. 比较行时,只有前四个单词重要,第五个单词属于"不关心"。在上面的示例中,带有 *chicken* 和 *swims* 的行被视为在另一个文件中"存在"(PRESENT)。
  4. 因此,根据上述条件,预期输出为:

    File1中

    lorem ipsum dolor
    

    文件2

    an apple a day keeps the doctor away
    

    有人知道快速有效地完成这种 diff 的方法吗?(最好是最简短的方案,输出易于阅读。)我尝试过用 Excel 目视比较两个文件,但我需要对大量(LOT)成对的日志文件执行此操作,那样做要花很长时间才能完成。

    任何更好的建议都表示赞赏。

    谢谢和最诚挚的问候。

2 个答案:

答案 0 :(得分:1)

为什么不为这项任务编写一个小程序,这样它在两个平台上都能使用?用一些与平台无关的 C 代码就可以轻松完成:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* One input line together with the leading words used as its comparison key. */
typedef struct Line
{
    char *line;          /* heap copy of the line exactly as read; this is what gets printed */
    char *tokens;        /* second heap copy that strtok() splits in place; backs `words` */
    size_t nwords;       /* number of valid entries in `words` */
    const char **words;  /* heap array of pointers into `tokens`, one per key word */
} Line;

/*
 * Return a freshly allocated duplicate of the NUL-terminated string `s`.
 * The caller owns the result and releases it with free().
 * Terminates the process with EXIT_FAILURE if the allocation fails.
 */
char *copyString(const char *s)
{
    size_t size = strlen(s) + 1; /* include the terminating NUL */
    char *copy = malloc(size);

    if (copy == NULL)
    {
        exit(EXIT_FAILURE);
    }
    memcpy(copy, s, size);
    return copy;
}

/*
 * qsort()-compatible ordering of two Line records by their key words.
 *
 * Words are compared pairwise with strcmp(); the first differing pair
 * decides the order. When one key is a prefix of the other, the record
 * with fewer words sorts first. Returns <0, 0 or >0.
 */
int compareLines(const void *a, const void *b)
{
    const Line *lhs = a;
    const Line *rhs = b;
    size_t shared = lhs->nwords < rhs->nwords ? lhs->nwords : rhs->nwords;
    size_t idx = 0;

    while (idx < shared)
    {
        int order = strcmp(lhs->words[idx], rhs->words[idx]);
        if (order != 0)
        {
            return order;
        }
        ++idx;
    }

    /* All shared words are equal: the longer key is the greater one. */
    if (lhs->nwords == rhs->nwords)
    {
        return 0;
    }
    return lhs->nwords > rhs->nwords ? 1 : -1;
}

/*
 * Read every line of `f` into a newly allocated array of Line records,
 * sorted with compareLines().
 *
 * linesptr   receives the array; it is never NULL on return, even for an
 *            empty file. Release it with freeLines().
 * f          stream opened for reading.
 * wordCount  maximum number of leading words stored per line; these words
 *            form the comparison key.
 *
 * Returns the number of records stored. Allocation failure terminates the
 * process with EXIT_FAILURE.
 *
 * Words are separated by blanks, tabs and line terminators, so the last key
 * word never carries a trailing "\n" or "\r\n". Lines that do not fit into
 * the 1024-byte read buffer are split by fgets() into several records.
 */
size_t readFile(Line **linesptr, FILE *f, size_t wordCount)
{
    static const char delims[] = " \t\r\n";
    size_t cap = 256;
    size_t n = 0;
    char buf[1024];

    /* A buffer of this size cannot contain more than sizeof buf / 2 words,
     * so capping the request keeps the allocation size from overflowing
     * for absurdly large word counts without changing any result. */
    size_t maxWords = sizeof buf / 2;
    if (wordCount < maxWords) maxWords = wordCount;

    Line *lines = malloc(cap * sizeof *lines);
    if (!lines) exit(EXIT_FAILURE);

    while (fgets(buf, sizeof buf, f))
    {
        if (n == cap)
        {
            cap *= 2;
            lines = realloc(lines, cap * sizeof *lines);
            if (!lines) exit(EXIT_FAILURE);
        }
        lines[n].line = copyString(buf);
        lines[n].tokens = copyString(buf);
        /* Never request zero bytes: malloc(0) may legitimately return NULL. */
        lines[n].words = malloc((maxWords ? maxWords : 1) * sizeof *lines[n].words);
        if (!lines[n].words) exit(EXIT_FAILURE);

        size_t c = 0;
        char *word = strtok(lines[n].tokens, delims);
        while (word && c < maxWords)
        {
            lines[n].words[c++] = word;
            if (c == maxWords) break;
            word = strtok(NULL, delims);
        }
        lines[n].nwords = c;

        /* Trim the word array to what is used. A zero-size realloc could
         * return NULL for a blank line, and a failed shrink is harmless,
         * so the old block is kept in both cases. */
        if (c > 0)
        {
            const char **shrunk = realloc(lines[n].words, c * sizeof *shrunk);
            if (shrunk) lines[n].words = shrunk;
        }
        ++n;
    }

    /* An empty file keeps its initial allocation so the caller still gets
     * a non-NULL array instead of a spurious out-of-memory exit. */
    if (n > 0)
    {
        Line *shrunk = realloc(lines, n * sizeof *shrunk);
        if (shrunk) lines = shrunk;
        qsort(lines, n, sizeof *lines, compareLines);
    }
    *linesptr = lines;
    return n;
}

/*
 * Release an array of `n` Line records: each record's word array and both
 * of its string copies, then the array itself.
 */
void freeLines(Line *lines, size_t n)
{
    while (n > 0)
    {
        Line *entry = &lines[--n];

        free(entry->line);
        free(entry->tokens);
        free(entry->words);
    }
    free(lines);
}

/* Print one unmatched line, prefixed with the name of the file it came from. */
static void printUnique(const char *fileName, const char *line)
{
    size_t len = strlen(line);

    /* The last line of a file may lack its newline; supply one so that
     * consecutive reports never run together on one output line. */
    printf("%s: %s%s", fileName, line,
           (len > 0 && line[len - 1] == '\n') ? "" : "\n");
}

/*
 * Usage: finduniq n file1 file2
 *
 * Reports every line whose first n words do not occur as the first n words
 * of any line in the other file. Each reported line is prefixed with the
 * name of the file that contains it. Returns EXIT_SUCCESS when both files
 * were processed, EXIT_FAILURE on bad arguments or unreadable files.
 */
int main(int argc, char **argv)
{
    if (argc != 4)
    {
        fprintf(stderr, "Usage: %s [n] [file1] [file2]\n", argv[0]);
        return EXIT_FAILURE;
    }

    /* strtol instead of atoi: reject garbage, zero and negative counts.
     * A negative value would otherwise turn into a huge size_t. */
    char *end = NULL;
    long nwords = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || nwords <= 0)
    {
        fprintf(stderr, "%s: word count must be a positive integer, got '%s'\n",
                argv[0], argv[1]);
        return EXIT_FAILURE;
    }

    FILE *f1 = fopen(argv[2], "r");
    if (!f1)
    {
        perror(argv[2]);
        return EXIT_FAILURE;
    }
    FILE *f2 = fopen(argv[3], "r");
    if (!f2)
    {
        perror(argv[3]);
        fclose(f1);
        return EXIT_FAILURE;
    }

    Line *f1lines = 0;
    size_t nf1lines = readFile(&f1lines, f1, (size_t)nwords);
    Line *f2lines = 0;
    size_t nf2lines = readFile(&f2lines, f2, (size_t)nwords);

    fclose(f1);
    fclose(f2);

    if (!f1lines || !f2lines)
    {
        freeLines(f1lines, nf1lines);
        freeLines(f2lines, nf2lines);
        return EXIT_FAILURE;
    }

    size_t f1pos = 0;
    size_t f2pos = 0;

    /* Both arrays are sorted by key, so walk them in step like a merge. */
    while (f1pos < nf1lines && f2pos < nf2lines)
    {
        int cmp = compareLines(f1lines + f1pos, f2lines + f2pos);
        if (cmp < 0)
        {
            printUnique(argv[2], f1lines[f1pos++].line);
        }
        else if (cmp > 0)
        {
            printUnique(argv[3], f2lines[f2pos++].line);
        }
        else
        {
            /* The key exists in both files. Skip every line carrying it on
             * both sides, so a key repeated within one file is not reported
             * as unique while it is present in the other file. */
            size_t f1match = f1pos;
            size_t f2match = f2pos;
            do
            {
                ++f1pos;
            } while (f1pos < nf1lines &&
                     compareLines(f1lines + f1pos, f2lines + f2match) == 0);
            do
            {
                ++f2pos;
            } while (f2pos < nf2lines &&
                     compareLines(f1lines + f1match, f2lines + f2pos) == 0);
        }
    }

    /* Whatever remains in either file has no counterpart in the other. */
    while (f1pos < nf1lines)
    {
        printUnique(argv[2], f1lines[f1pos++].line);
    }

    while (f2pos < nf2lines)
    {
        printUnique(argv[3], f2lines[f2pos++].line);
    }

    freeLines(f1lines, nf1lines);
    freeLines(f2lines, nf2lines);

    return EXIT_SUCCESS;
}

如果您使用 gcc,请用以下命令编译:
gcc -s -g0 -O2 -std=c11 -Wall -Wextra -pedantic -ofinduniq finduniq.c

演示:

$ ./finduniq 4 test1.txt test2.txt
test2.txt: an apple a day keeps the doctor away
test1.txt: lorem ipsum dolor

$ ./finduniq 6 test1.txt test2.txt
test2.txt: an apple a day keeps the doctor away
test2.txt: jumps over the very lazy *chicken*
test1.txt: jumps over the very lazy dog
test1.txt: lorem ipsum dolor
test2.txt: the quick brown fox *swims*
test1.txt: the quick brown fox jumps

答案 1 :(得分:0)

 $ diff file1 file2 | grep "<\|>" | sed -E 's/^(<|>) //g' | sort | uniq -w5 -u

diff - 比较文件 file1 和 file2

grep 和 sed 删除多余的行和符号,然后 sort 对这些字符串进行排序,

uniq输出唯一字符串(-w5比较行中的前5个字符,尝试解决问题列表中的问题#3)