C ++中的LZ78编码问题

时间:2016-12-12 01:21:50

标签: c++ algorithm io compression

我正在尝试用C ++实现LZ78压缩算法,我希望我的程序能够像这样工作:

  1. 打开文件并将内容读入字符串
  2. 压缩字符串,输出一个字符串,其中包含字典中存在的每个(key,char)的二进制表示。

  3. 将字符串的每个8个字符的块转换为其二进制数

  4. 将这些数字写入文件。
  5. 举个例子:

    1. string =“Kakashi。”
    2. 运行算法会产生 (0,K) (0,a) (0,k) (2,s) (0,h) (0,i) (0 ,.) (0,)

      所有对的二进制表示: 00000000010010110000000001100001000000000110101100000010011100110000000001101000000000000110100100000000001011100000000000001010

    3. 在我的代码中,我一次读取8个字符以上的字符串,将其转换为数字,以二进制模式将其写入文件。

    4. 但是,如果我要压缩的字符串有点长,当我解压缩它时,并非每个单词都是正确的,有些字母丢失而其他字母则无序。当我将压缩文本写入文件时,我相信我做错了。

      这是我的完整代码:

      #include <bits/stdc++.h>
      #include <list>
      #include <bitset>
      #include <iostream>
      #include <fstream>
      #include <string>
      #include <algorithm>
      #include <queue>
      
      using namespace std;
      
      string encode_int(int in)
      {
        return bitset<8>(in).to_string();
      }
      
      int decode_int(char out)
      {
        return bitset<8>(out).to_ulong();
      }
      
      string encode_char(string in)
      {
        return bitset<8>(in[0]).to_string();
      }
      
      char decode_char(string out)
      {
        return (char)bitset<8>(out).to_ulong();
      }
      
      struct Dict
      {
        string label; // dictionary entry string
        char output;  // first non-matching symbol
        int entry;    // longest matching dictionary entry
      
        Dict(string label, int entry, char output) // constructor
        {
          this->label = label;
          this->entry = entry;
          this->output = output;
        }
      };
      
      int find(string l, list<Dict> enc_list)
      { // change list to map
      
        list<Dict> temp = enc_list;
        int i = 1;
      
        while(!temp.empty())
        {
            if(!(l.compare(temp.front().label)))
            {
                return i;
            }
            temp.pop_front();
            i++;
        }
        return -1;
      }
      
      void write_file(string input, string output_filename)
      {
        string one_byte;
        unsigned long bin_number;
        unsigned char chr;
        int i, len = input.length();
      
        FILE *fp;
        fp = fopen(output_filename.c_str(), "wb");
      
        if(fp == NULL)
        {
          printf("Unable to open output file!\n");
          return;
        }
      
        for (i=0; i<len; i+= 8)
        {
          one_byte = input.substr(i, 8);
          bin_number = strtol(one_byte.c_str(), NULL, 2);
      
          chr = bin_number;
          fprintf(fp, "%c", bin_number);
        }
      
        fclose(fp);
      }
      
      void LZ78_Compress(string txt, string output_filename)
      {
        list <Dict> Dictionary;
        string Preffix = "", Char, compressed;
      
        int CodeWord, IndexForPreffix = 1, len, i;
      
        len = txt.length();
      
      
        for(i=0; i<len; i++){
      
           Char = string(1, txt[i]);
      
           IndexForPreffix = find((Preffix + Char), Dictionary);  // if it equals to -1, it means (Preffix + Char) is not in the dictionary
           if(IndexForPreffix != -1)
           {    
               Preffix = Preffix + Char; // if  Preffix + Char already exists, append Char
           }
      
           else
           {
              if(Preffix.empty())
              {
                CodeWord = 0;           // if Preffix is empty, a new letter was processed
                compressed += "00000000";
              }
              else
              {
                CodeWord = find(Preffix, Dictionary);     // search Preffix index
                compressed += encode_int(CodeWord);       // encode index
              }
      
              compressed += encode_char(Char);                                // encode char
              Dictionary.push_back(Dict((Preffix + Char), CodeWord, txt[i])); // add new entry to the dictionary
              Preffix.clear();      
           }
        } 
      
        write_file(compressed, output_filename);
      }
      
      void LZ78_Decompress(string input_filename, string output_filename)
      {
        // Decompression Variables
        string dict = "";
        string decompressed_text;      // the the decomressed string
        string compressed_text;        // the compressed input
        string character;              // the character immediately after the current codeword
        string temp;                   
      
        unsigned char ch;
        unsigned int codeword, l = 0, i, len;           // the current dictionary entry being processed
      
        FILE *fp;
        fp = fopen(input_filename.c_str(), "rb");
      
        if(fp == NULL)
        {
          printf("Unable to open compressed file!\n");
          return;
        }
      
      
        while(fscanf(fp, "%c", &ch) == 1)
        {
          compressed_text += ch;
        }
        len = compressed_text.length();
      
        fclose(fp);
      
        ofstream outfile(output_filename.c_str(), ios::binary);
      
        int *idx = new int[len]; // used for storing the index of the i-th dictionary entry
      
        for (i=0;i<len;i+=2)
        {
          codeword = compressed_text[i];                      // longest matching dictionary entry
          character = compressed_text.substr(i + 1, 1);       // first non-matching symbol
          dict += character;                           
          idx[l] = codeword;
          l++; // idx size
      
          // let's say l = 0
          // then (idx[0], dict[0]) represents the first dictionary entry
      
          if(codeword == 0)
          {
              decompressed_text += character; // new letter, just append
          }
      
          else
          {      
             while(codeword > 0)  // go back in the dictionary string, adding each letter until you get one with codeword = 0
             {
              temp += dict[codeword-1];
              codeword = idx[codeword-1];
             }
             reverse(temp.begin(), temp.end()); // restore correct order
             decompressed_text += temp;         // append string and char
             decompressed_text += character;
             temp.clear();
          }
        }
        outfile << decompressed_text;
        outfile.close();
      }
      
      
      void Compress(string input_filename, string output_filename)
      {
        ifstream in(input_filename.c_str());
        string line, txt;
      
        while(getline(in, line))
        {
          txt += line;
          txt += "\n";
        }
        in.close();
      
        LZ78_Compress(txt, output_filename);
      }
      
      int main()
      {
        Compress("FullText.txt", "Compressed.out");
        LZ78_Decompress("Compressed.out", "Decompressed.out");
      }
      

      如果我以“Kakashi”作为输入运行它,它可以正常工作。但如果我用

      运行

      “Lorem ipsum dolor sit amet,consectetuer adipiscing elit.Aenean commodo ligula eget dolor.Aenean massa.Cum sociis natoque penatibus et magnis dis parturient montes,nascetur ridiculus mus.Donec quam felis,ultricies nec,pellentesque eu,pretium quis, sem.Nulla consequat massa quis enim.Donec pede justo,fringilla vel,aliquet nec, vulputate eget,arcu。

      程序几乎输出整个字符串,而不是输出

      [vulputate eget,arcu。]

      它显示了这个[vulputatricéeget,arcu。]

0 个答案:

没有答案