霍夫曼后缀代码

时间:2017-02-07 09:35:01

标签: compression huffman-code

我想为一组给定的字符及其概率高效地构造一个二进制无后缀码(suffix-free code,即一组码字,其中任何一个码字都不是其他码字的后缀)。

我的基本想法是先用霍夫曼算法的实现构造前缀码,再把每个码字反转,从而得到一个无后缀码。这个方案虽然可行,但似乎不是最优的,因为我必须反转长度可变的码字(因此需要一个查找表并配合移位操作)。

有没有办法修改Huffman算法以便更有效地创建后缀代码?

2 个答案:

答案 0 :(得分:1)

我会将HuffmanNode实现为

class HuffmanNode implements Comparable<HuffmanNode>
{
    // data
    private String text;
    private double frequency;

    // linkage
    private HuffmanNode left;
    private HuffmanNode right;
    private HuffmanNode parent;

    public HuffmanNode(String text, double frequency)
    {
        this.text = text;
        this.frequency = frequency;
    }
    public HuffmanNode(HuffmanNode n0, HuffmanNode n1)
    {
        if(n0.frequency < n1.frequency)
        {
            left = n0;
            right = n1;
        }else if(n0.frequency > n1.frequency)
        {
            left = n1;
            right = n0;
        }else
        {
            if(n0.text.compareTo(n1.text) < 0)
            {
                left = n0;
               right = n1;
            }else
            {
                left = n1;
                right = n0;
            }
        }
        left.parent = this;
        right.parent = this;
        text = left.text + right.text;
        frequency = left.frequency + right.frequency;
    }

    public HuffmanNode getParent() {
        return parent;
    }

    public HuffmanNode getLeft() {
       return left;
    }

    public HuffmanNode getRight() {
        return right;
    }

    public String getText()
    {
        return text;
    }

    @Override
    public int compareTo(HuffmanNode o) {
        if(frequency < o.frequency)
            return -1;
        else if(frequency > o.frequency)
            return 1;
        else
            return text.compareTo(o.text);
    }

    public Collection<HuffmanNode> leaves()
    {
        if(left == null && right == null)
        {
            Set<HuffmanNode> retval = new HashSet<>();
            retval.add(this);
            return retval;
        }
        else if(left == null || right == null)
        {
            Set<HuffmanNode> retval = new HashSet<>();
            if(left != null)
                retval.addAll(left.leaves());
            if(right != null)
                retval.addAll(right.leaves());
            retval.add(this);
            return retval;
        }
        else
        {
            Set<HuffmanNode> retval = new HashSet<>();
            retval.addAll(left.leaves());
            retval.addAll(right.leaves());
            return retval;
        }
    }

    public String toString()
    {
         return "{" + text + " -> " + frequency + "}";
    }
}

此类表示霍夫曼树中的单个节点。它提供了一个方便的方法,可以获取某个(子)树中的所有叶子节点。

然后,您可以轻松构建树:

/**
 * Builds a Huffman code table for the symbols of {@code text}.
 *
 * One leaf is created per entry returned by {@code frequency(text)}. The two
 * smallest nodes are merged repeatedly until a single root remains, then the
 * code of every leaf is read off by walking from the leaf up to the root
 * ('0' for a left child, '1' for a right child).
 *
 * @param text the input whose symbol frequencies drive the tree
 * @return a map from symbol text to its code; empty when there are no
 *         symbols. NOTE(review): with exactly one symbol the tree is a single
 *         leaf and its code is the empty string, as before.
 */
private Map<String,String> buildTree(String text)
{
    // A priority queue always hands out the smallest node, so no manual
    // re-sorting or insertion-point arithmetic is needed (the latter produced
    // a negative index whenever an equal node was already present).
    java.util.PriorityQueue<HuffmanNode> pending = new java.util.PriorityQueue<>();
    for(Map.Entry<String,Double> en : frequency(text).entrySet())
    {
        pending.add(new HuffmanNode(en.getKey(), en.getValue()));
    }

    Map<String, String> lookupTable = new HashMap<>();
    if(pending.isEmpty())
    {
        // nothing to encode; previously this case threw IndexOutOfBoundsException
        return lookupTable;
    }

    while(pending.size() > 1)
    {
        HuffmanNode n0 = pending.poll();
        HuffmanNode n1 = pending.poll();

        // build merged node and put it back among the candidates
        pending.add(new HuffmanNode(n0, n1));
    }

    // build lookup table
    HuffmanNode root = pending.peek();
    for(HuffmanNode leaf : root.leaves())
    {
        // Bits are collected leaf-to-root and reversed once at the end, which
        // equals prepending each bit. Dropping the reverse() yields the
        // mirrored (suffix-free) code instead.
        StringBuilder code = new StringBuilder();
        for(HuffmanNode tmp = leaf; tmp.getParent() != null; tmp = tmp.getParent())
        {
            code.append(tmp.getParent().getLeft() == tmp ? '0' : '1');
        }
        lookupTable.put(leaf.getText(), code.reverse().toString());
    }
    return lookupTable;
}

通过更改构建码字的方式(例如把下一位追加到末尾(append),而不是插入到开头(prepend)),您可以改变生成的编码。

答案 1 :(得分:0)

我用 C++ 实现了霍夫曼编码树,如下所示:

enter image description here

为此,我创建了三个类-HuffmanTree,BinTree和BinNode。

您可以在我的GitHub上查看更多详细信息:https://github.com/MouChiaHung/DataStructures

请查看以下三个文件:bin_node.h、bin_tree.h 和 huffman_tree.h。它们读取源文件“source”,按霍夫曼方式编码后写入文件“encode”,然后解码文件“encode”并将结果存入输出文件“decode”。此外,霍夫曼表记录在文件“ ”中(此处文件名在原文中缺失)。

核心功能之一是 HuffmanTree::encode(),它从源文件读取字符。(注意:下面贴出的代码实际上是 HuffmanTree::grow() 的实现。)

/**
 * Builds the Huffman tree from a list of models and stores the result in
 * this->_root.
 *
 * The list is consumed two entries at a time. For each pair a new
 * CHAR_VERTEX model carrying the summed probability is allocated, a vertex
 * node is created with the pair as its children, and the new model is
 * inserted back into the list at the first position (from the current one
 * onwards) whose model is not smaller. Roots of sub-trees that are not yet
 * attached to a parent are kept in `subs`; a list entry is recognised as such
 * a sub-tree when it compares equal to the root's data.
 *
 * @param list models to merge; expected to be sorted ascending (see the
 *             trailing comment on the signature). The list is modified: every
 *             merged model allocated here is inserted into it.
 */
template<typename T> void amo::HuffmanTree<T>::grow(std::list<Model*>& list) { //ascendantly sorted list
    Model* l;
    Model* r;
    Model* m;
    BinNode<T>* lchild;
    BinNode<T>* rchild;
    BinNode<T>* vertex;
    std::list<Model*>::iterator it = list.begin();
    std::vector<BinNode<T>*> subs; //roots of sub-trees
    typename std::vector<BinNode<T>*>::iterator it_subs;
    int i = 0;
    while (it!=list.end()) {
        lchild = nullptr;
        rchild = nullptr;
        vertex = nullptr;
        cout << YELLOW << "while-loop:" << ++i << WHITE << endl;
        if (std::next(it,1) == list.end()) { //met the last and single leaf or sub-tree
            if (subs.size() > 1) {
                cout << RED << "size of sub-tree is more than 1:" << subs.size() << WHITE << endl;
                this->_root = subs.back();
                subs.pop_back();
                break;
            }
            else if (subs.size() == 1) {
                if (**it == subs.back()->data) { //met the last sub-tree
                    cout << GREEN << "going to attach the last sub-tree" << WHITE << endl;
                    vertex = subs.back();
                    subs.pop_back();
                }
                else { //met the last leaf
                    cout << GREEN << "going to attach the last leaf" << WHITE << endl;
                    r = *it;
                    lchild = subs.back();
                    subs.pop_back();
                    cout << CYAN << "lchild points to the root of the last sub-tree:" << *lchild;
                    rchild = new BinNode<T>(*r);
                    cout << CYAN << "rchild points to a new node:" << *rchild;
                    m = new Model(CHAR_VERTEX, (lchild->data.prob)+(r->prob));
                    vertex = new BinNode<T>(*m);
                    lchild->parent = vertex;
                    rchild->parent = vertex;
                    vertex->lchild = lchild;
                    vertex->rchild = rchild;
                }
                this->_root = vertex;
                cout << CYAN << "root:" << *this->_root <<  WHITE << endl;
                break;
            }
            else {
                // No sub-tree exists, i.e. the list holds one single model.
                // Calling back()/pop_back() on the empty vector would be
                // undefined behaviour, so the lone model becomes the root.
                cout << RED << "size of sub-tree is less than 1:" << subs.size() << WHITE << endl;
                this->_root = new BinNode<T>(**it);
                break;
            }
        }
        else {
            l = *it;
            ++it;
            r = *it;
            m = new Model(CHAR_VERTEX, l->prob+r->prob);

            for (it_subs=subs.begin(); it_subs!=subs.end(); ++it_subs) { //set lchild if any sub-tree corresponds with this l model iterated currently
                if (*l == (*it_subs)->data) {
                    cout << CYAN << "lchild points to the root of sub-tree:" << **it_subs;
                    lchild = *it_subs;
                    subs.erase(it_subs); //the iterator is not reused: only the first match is taken
                    break;
                }
            }
            for (it_subs=subs.begin(); it_subs!=subs.end(); ++it_subs) { //set rchild if any sub-tree corresponds with this r model iterated currently
                if (*r == (*it_subs)->data) {
                    cout << CYAN << "rchild points to the root of sub-tree:" << **it_subs;
                    rchild = *it_subs;
                    subs.erase(it_subs); //the iterator is not reused: only the first match is taken
                    break;
                }
            }
            if (lchild == nullptr) { //no sub-tree corresponds with this l model, which means meeting a raw leaf
                lchild = new BinNode<T>(*l);
                cout << CYAN << "lchild points to a new node:" << *lchild;
            }
            if (rchild == nullptr) { //no sub-tree corresponds with this r model, which means meeting a raw leaf
                rchild = new BinNode<T>(*r);
                cout << CYAN << "rchild points to a new node:" << *rchild;
            }

            vertex = new BinNode<T>(*m);
            std::cout << GREEN << "growing..." << WHITE << endl;
            std::cout << CYAN << "lchild" << *lchild << WHITE;
            std::cout << CYAN << "rchild" << *rchild << WHITE;
            std::cout << CYAN << "vertex" << *vertex << WHITE;
            lchild->parent = vertex;
            rchild->parent = vertex;
            vertex->lchild = lchild;
            vertex->rchild = rchild;
            subs.push_back(vertex);
            //put the merged model back into the list: before the first model that is not smaller, or at the very end
            for (std::list<Model*>::iterator itt=it;itt!=list.end();++itt) {
                if ((*m < **itt) || (*m == **itt)) {
                    list.insert(itt, m);
                    break;
                }
                else if (std::next(itt,1) == list.end()) {
                    list.push_back(m);
                    break;
                }
            }
            ++it;
        }
    }

    this->updateHeightAll();
    cout << GREEN << "-*-*-*-*-*-*-*-* Huffman tree top -*-*-*-*-*-*-*-*" << WHITE << endl;
    this->traverseLevel();
    cout << GREEN << "-*-*-*-*-*-*-*-* Huffman tree bottom -*-*-*-*-*-*-*-*" << WHITE << endl;

    subs.clear();
}

另一个核心功能是 HuffmanTree::grow(),它为 PFC(前缀无关码)编码构建一棵二叉树。

/**
 * Builds the Huffman tree from a list of models and stores the result in
 * this->_root.
 *
 * The list is consumed two entries at a time. For each pair a new
 * CHAR_VERTEX model carrying the summed probability is allocated, a vertex
 * node is created with the pair as its children, and the new model is
 * inserted back into the list at the first position (from the current one
 * onwards) whose model is not smaller. Roots of sub-trees that are not yet
 * attached to a parent are kept in `subs`; a list entry is recognised as such
 * a sub-tree when it compares equal to the root's data.
 *
 * @param list models to merge; expected to be sorted ascending (see the
 *             trailing comment on the signature). The list is modified: every
 *             merged model allocated here is inserted into it.
 */
template<typename T> void amo::HuffmanTree<T>::grow(std::list<Model*>& list) { //ascendantly sorted list
    Model* l;
    Model* r;
    Model* m;
    BinNode<T>* lchild;
    BinNode<T>* rchild;
    BinNode<T>* vertex;
    std::list<Model*>::iterator it = list.begin();
    std::vector<BinNode<T>*> subs; //roots of sub-trees
    typename std::vector<BinNode<T>*>::iterator it_subs;
    int i = 0;
    while (it!=list.end()) {
        lchild = nullptr;
        rchild = nullptr;
        vertex = nullptr;
        cout << YELLOW << "while-loop:" << ++i << WHITE << endl;
        if (std::next(it,1) == list.end()) { //met the last and single leaf or sub-tree
            if (subs.size() > 1) {
                cout << RED << "size of sub-tree is more than 1:" << subs.size() << WHITE << endl;
                this->_root = subs.back();
                subs.pop_back();
                break;
            }
            else if (subs.size() == 1) {
                if (**it == subs.back()->data) { //met the last sub-tree
                    cout << GREEN << "going to attach the last sub-tree" << WHITE << endl;
                    vertex = subs.back();
                    subs.pop_back();
                }
                else { //met the last leaf
                    cout << GREEN << "going to attach the last leaf" << WHITE << endl;
                    r = *it;
                    lchild = subs.back();
                    subs.pop_back();
                    cout << CYAN << "lchild points to the root of the last sub-tree:" << *lchild;
                    rchild = new BinNode<T>(*r);
                    cout << CYAN << "rchild points to a new node:" << *rchild;
                    m = new Model(CHAR_VERTEX, (lchild->data.prob)+(r->prob));
                    vertex = new BinNode<T>(*m);
                    lchild->parent = vertex;
                    rchild->parent = vertex;
                    vertex->lchild = lchild;
                    vertex->rchild = rchild;
                }
                this->_root = vertex;
                cout << CYAN << "root:" << *this->_root <<  WHITE << endl;
                break;
            }
            else {
                // No sub-tree exists, i.e. the list holds one single model.
                // Calling back()/pop_back() on the empty vector would be
                // undefined behaviour, so the lone model becomes the root.
                cout << RED << "size of sub-tree is less than 1:" << subs.size() << WHITE << endl;
                this->_root = new BinNode<T>(**it);
                break;
            }
        }
        else {
            l = *it;
            ++it;
            r = *it;
            m = new Model(CHAR_VERTEX, l->prob+r->prob);

            for (it_subs=subs.begin(); it_subs!=subs.end(); ++it_subs) { //set lchild if any sub-tree corresponds with this l model iterated currently
                if (*l == (*it_subs)->data) {
                    cout << CYAN << "lchild points to the root of sub-tree:" << **it_subs;
                    lchild = *it_subs;
                    subs.erase(it_subs); //the iterator is not reused: only the first match is taken
                    break;
                }
            }
            for (it_subs=subs.begin(); it_subs!=subs.end(); ++it_subs) { //set rchild if any sub-tree corresponds with this r model iterated currently
                if (*r == (*it_subs)->data) {
                    cout << CYAN << "rchild points to the root of sub-tree:" << **it_subs;
                    rchild = *it_subs;
                    subs.erase(it_subs); //the iterator is not reused: only the first match is taken
                    break;
                }
            }
            if (lchild == nullptr) { //no sub-tree corresponds with this l model, which means meeting a raw leaf
                lchild = new BinNode<T>(*l);
                cout << CYAN << "lchild points to a new node:" << *lchild;
            }
            if (rchild == nullptr) { //no sub-tree corresponds with this r model, which means meeting a raw leaf
                rchild = new BinNode<T>(*r);
                cout << CYAN << "rchild points to a new node:" << *rchild;
            }

            vertex = new BinNode<T>(*m);
            std::cout << GREEN << "growing..." << WHITE << endl;
            std::cout << CYAN << "lchild" << *lchild << WHITE;
            std::cout << CYAN << "rchild" << *rchild << WHITE;
            std::cout << CYAN << "vertex" << *vertex << WHITE;
            lchild->parent = vertex;
            rchild->parent = vertex;
            vertex->lchild = lchild;
            vertex->rchild = rchild;
            subs.push_back(vertex);
            //put the merged model back into the list: before the first model that is not smaller, or at the very end
            for (std::list<Model*>::iterator itt=it;itt!=list.end();++itt) {
                if ((*m < **itt) || (*m == **itt)) {
                    list.insert(itt, m);
                    break;
                }
                else if (std::next(itt,1) == list.end()) {
                    list.push_back(m);
                    break;
                }
            }
            ++it;
        }
    }

    this->updateHeightAll();
    cout << GREEN << "-*-*-*-*-*-*-*-* Huffman tree top -*-*-*-*-*-*-*-*" << WHITE << endl;
    this->traverseLevel();
    cout << GREEN << "-*-*-*-*-*-*-*-* Huffman tree bottom -*-*-*-*-*-*-*-*" << WHITE << endl;

    subs.clear();
}

然后 HuffmanTree::generate() 创建用于对内容进行编码的码表。

/**
 * Fills `codes` with the bit string of every symbol node in the tree and
 * prints the resulting table.
 *
 * The tree is visited level by level starting at this->_root. For every node
 * whose character is not CHAR_VERTEX, the code is assembled by walking the
 * parent links up to the root and prepending "0" for a left child and "1" for
 * a right child. Nodes that are neither (the root) contribute no bit.
 *
 * An empty tree (null root) leaves `codes` untouched; the table is still
 * printed.
 */
template<typename T> void amo::HuffmanTree<T>::generate() {
    std::queue<BinNode<T>*> queue;
    if (this->_root != nullptr) queue.push(this->_root); //a null root must not be dereferenced below
    int i = 0;
    while (!queue.empty()) {
        BinNode<T>* node = queue.front();
        queue.pop();
        cout << YELLOW << "while-loop:" << ++i << ", node:" << *node << WHITE << endl;

        if (node->data.c != CHAR_VERTEX) {
            //start from an empty code for every symbol so that no bits leak over from a previously visited node
            std::string code;
            for (BinNode<T>* tmp = node; tmp != nullptr; tmp = tmp->parent) {
                if (tmp->isLeftChild()) code.insert(0, "0");
                else if (tmp->isRightChild()) code.insert(0, "1");
            }
            codes[node->data.c] = code;
        }

        if (node->hasLeftChild()) queue.push(node->lchild);
        if (node->hasRightChild()) queue.push(node->rchild);
    }

    for (std::map<char,string>::iterator it=codes.begin();it!=codes.end();it++) {
        cout << YELLOW << "codes[" << distance(codes.begin(),it) << "]:" << " key:" << it->first << " => value:" << it->second << WHITE << endl; 
    }
}

谢谢,欢迎提出任何建议。