我正在尝试用C ++实现LZ78压缩算法,我希望我的程序能够像这样工作:
压缩字符串,输出一个字符串,其中包含字典中存在的每个(key,char)的二进制表示。
将字符串的每个8个字符的块转换为其二进制数
举个例子:
运行算法会产生 (0,K) (0,a) (0,k) (2,s) (0,h) (0,i) (0 ,.) (0,)
所有对的二进制表示: 00000000010010110000000001100001000000000110101100000010011100110000000001101000000000000110100100000000001011100000000000001010
在我的代码中,我一次读取8个字符以上的字符串,将其转换为数字,以二进制模式将其写入文件。
但是,如果我要压缩的字符串有点长,当我解压缩它时,并非每个单词都是正确的,有些字母丢失而其他字母则无序。当我将压缩文本写入文件时,我相信我做错了。
这是我的完整代码:
#include <bits/stdc++.h>
#include <list>
#include <bitset>
#include <iostream>
#include <fstream>
#include <string>
#include <algorithm>
#include <queue>
using namespace std;
string encode_int(int in)
{
return bitset<8>(in).to_string();
}
int decode_int(char out)
{
return bitset<8>(out).to_ulong();
}
string encode_char(string in)
{
return bitset<8>(in[0]).to_string();
}
char decode_char(string out)
{
return (char)bitset<8>(out).to_ulong();
}
struct Dict
{
string label; // dictionary entry string
char output; // first non-matching symbol
int entry; // longest matching dictionary entry
Dict(string label, int entry, char output) // constructor
{
this->label = label;
this->entry = entry;
this->output = output;
}
};
int find(string l, list<Dict> enc_list)
{ // change list to map
list<Dict> temp = enc_list;
int i = 1;
while(!temp.empty())
{
if(!(l.compare(temp.front().label)))
{
return i;
}
temp.pop_front();
i++;
}
return -1;
}
void write_file(string input, string output_filename)
{
string one_byte;
unsigned long bin_number;
unsigned char chr;
int i, len = input.length();
FILE *fp;
fp = fopen(output_filename.c_str(), "wb");
if(fp == NULL)
{
printf("Unable to open output file!\n");
return;
}
for (i=0; i<len; i+= 8)
{
one_byte = input.substr(i, 8);
bin_number = strtol(one_byte.c_str(), NULL, 2);
chr = bin_number;
fprintf(fp, "%c", bin_number);
}
fclose(fp);
}
void LZ78_Compress(string txt, string output_filename)
{
list <Dict> Dictionary;
string Preffix = "", Char, compressed;
int CodeWord, IndexForPreffix = 1, len, i;
len = txt.length();
for(i=0; i<len; i++){
Char = string(1, txt[i]);
IndexForPreffix = find((Preffix + Char), Dictionary); // if it equals to -1, it means (Preffix + Char) is not in the dictionary
if(IndexForPreffix != -1)
{
Preffix = Preffix + Char; // if Preffix + Char already exists, append Char
}
else
{
if(Preffix.empty())
{
CodeWord = 0; // if Preffix is empty, a new letter was processed
compressed += "00000000";
}
else
{
CodeWord = find(Preffix, Dictionary); // search Preffix index
compressed += encode_int(CodeWord); // encode index
}
compressed += encode_char(Char); // encode char
Dictionary.push_back(Dict((Preffix + Char), CodeWord, txt[i])); // add new entry to the dictionary
Preffix.clear();
}
}
write_file(compressed, output_filename);
}
void LZ78_Decompress(string input_filename, string output_filename)
{
// Decompression Variables
string dict = "";
string decompressed_text; // the the decomressed string
string compressed_text; // the compressed input
string character; // the character immediately after the current codeword
string temp;
unsigned char ch;
unsigned int codeword, l = 0, i, len; // the current dictionary entry being processed
FILE *fp;
fp = fopen(input_filename.c_str(), "rb");
if(fp == NULL)
{
printf("Unable to open compressed file!\n");
return;
}
while(fscanf(fp, "%c", &ch) == 1)
{
compressed_text += ch;
}
len = compressed_text.length();
fclose(fp);
ofstream outfile(output_filename.c_str(), ios::binary);
int *idx = new int[len]; // used for storing the index of the i-th dictionary entry
for (i=0;i<len;i+=2)
{
codeword = compressed_text[i]; // longest matching dictionary entry
character = compressed_text.substr(i + 1, 1); // first non-matching symbol
dict += character;
idx[l] = codeword;
l++; // idx size
// let's say l = 0
// then (idx[0], dict[0]) represents the first dictionary entry
if(codeword == 0)
{
decompressed_text += character; // new letter, just append
}
else
{
while(codeword > 0) // go back in the dictionary string, adding each letter until you get one with codeword = 0
{
temp += dict[codeword-1];
codeword = idx[codeword-1];
}
reverse(temp.begin(), temp.end()); // restore correct order
decompressed_text += temp; // append string and char
decompressed_text += character;
temp.clear();
}
}
outfile << decompressed_text;
outfile.close();
}
void Compress(string input_filename, string output_filename)
{
ifstream in(input_filename.c_str());
string line, txt;
while(getline(in, line))
{
txt += line;
txt += "\n";
}
in.close();
LZ78_Compress(txt, output_filename);
}
int main()
{
Compress("FullText.txt", "Compressed.out");
LZ78_Decompress("Compressed.out", "Decompressed.out");
}
如果我以“Kakashi”作为输入运行它,它可以正常工作。但如果我用
运行“Lorem ipsum dolor sit amet,consectetuer adipiscing elit.Aenean commodo ligula eget dolor.Aenean massa.Cum sociis natoque penatibus et magnis dis parturient montes,nascetur ridiculus mus.Donec quam felis,ultricies nec,pellentesque eu,pretium quis, sem.Nulla consequat massa quis enim.Donec pede justo,fringilla vel,aliquet nec, vulputate eget,arcu。“
程序几乎输出整个字符串,而不是输出
[vulputate eget,arcu。]
它显示了这个[vulputatricéeget,arcu。]