香农虽然提出了香农码,但是香农码很多情况下离最优码还差得不少。比如:K=2,$p(a_1) = 0.9999,p(a_2) = 0.0001$,这种偏差非常大的情况下,香农码给$a_1,a_2$的编码长度分别为:1,14.而实际上两个值,我们可以仅用一个bit就能区分开来。在这里介绍一个大名鼎鼎的最优前缀编码:Huffman编码。
霍夫曼编码就不介绍了。因为这个东西我也实现过,知道它是怎么做的。不过如何提出来这个编码是非常有意思的。
霍夫曼码提出之前,人们一直在追求最优前缀编码。霍夫曼在MIT读博士的时候,老师给了一个作业题目:找到最优的二进制编码。霍夫曼想,想要证明已有编码是否是最优的太难了,所以他就想着自己找个编码方式。然后,诞生了霍夫曼编码。
唉,这就是大佬啊。
需要注意的是k叉霍夫曼树每个结点要么有k个孩子,要么没有孩子。确定了第一个,每次编码需要减少(k-1)个叶子结点。这意味着,信源的种类个数必须满足:1+n(k-1)。因此有时候需要填充0概率的字符来保证编码过程顺利。
最优性如何证明?
留个坑吧。看书上这个证明也挺长的。有时间了看完了再来写(也可能一直没有写)。
最后发一下多年前实现的huffman二进制编码:
Node.java:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
| package hfm_compress; import edu.princeton.cs.algs4.*; import java.util.ArrayList; import java.util.PriorityQueue; import java.util.Iterator;
class Code { boolean used; short code; int size; } public class Node implements Comparable<Node> { private Node left; private Node right; private int frep; private char symbol; public Node left() { return left; } public Node right() { return right; } public Node(Node l,Node r,int f,char s) { left = l; right = r; frep = f; symbol = s; } public int compareTo(Node n) { return this.frep - n.frep; } public int frep() { return frep; } public char symbol() { return symbol; } public boolean hasLeft() { return !(left == null); } public boolean hasRight() { return !(right == null); } public static void main(String []args) { System.out.println("wocao"); } }
|
HuffmanCompress.java
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
| package hfm_compress; import java.io.*; import edu.princeton.cs.algs4.*; import java.util.ArrayList; import java.util.PriorityQueue; import java.util.Iterator;
public class HuffmanCompress { public static Node BuildTree(int []frep) { Node n = null; Node left = null,right = null; PriorityQueue<Node> pq = new PriorityQueue<Node> (); for(int i = 0;i!=256;++i) { if(frep[i]!=0) { n = new Node(null,null,frep[i],(char)i); pq.add(n); } } while(pq.size()>1) { left = pq.poll(); right = pq.poll(); n = new Node(left,right,left.frep()+right.frep(),(char)0); pq.add(n); } n = pq.poll(); return n; } public static Code [] BuildTable(Node tree) { Code []table = new Code[256]; for(int i = 0;i!=256;++i) table[i] = new Code(); BuildTable(tree,table,(short)0,0); return table; } private static void BuildTable(Node tree,Code []table,short code,int size) { if(tree.hasLeft()) BuildTable(tree.left(),table,(short)(code<<1),size+1); if(tree.hasRight()) BuildTable(tree.right(),table,(short)((code<<1)|1),size+1); if(!tree.hasRight()&&!tree.hasLeft()) { table[tree.symbol()].size = size; table[tree.symbol()].code = code; table[tree.symbol()].used = true; } } public static void compress(BinaryIn in,BinaryOut out) { int frep[] = new int [256]; for(int i = 0;i!=256;++i) { frep[i] = 0; } int max = 255; ArrayList<Character> data = new ArrayList<Character>(); while(!in.isEmpty()) { char sym = in.readChar(); ++frep[sym]; if(frep[sym]>max) max = frep[sym]; data.add(sym); } for(int i = 0;i!=256;++i) { int a = frep[i]/(max/255); if(a == 0&&frep[i]>0) frep[i] = 1; else frep[i] = a; } Node tree = BuildTree(frep); Code []table = BuildTable(tree); Iterator<Character> ii = data.iterator(); out.write(data.size()); for(int i = 0;i!=256;++i) out.write(frep[i],8); while(ii.hasNext()) { char b = ii.next(); if(table[b].used) out.write(table[b].code,table[b].size); } out.close(); } public static void expand(BinaryIn in,BinaryOut out) { int size = in.readInt(); int []frep = new int[256]; for(int i = 0;i!=256;++i) frep[i] =(int)in.readChar(); Node tree = BuildTree(frep); Node node = tree; int ipos = 0; while(size>0) { boolean t = in.readBoolean(); if(t) node = node.right(); else node = node.left(); if(node == null) return; if(!node.hasLeft()&&!node.hasRight()) { --size; out.write(node.symbol()); node = tree; } } out.close();
} public static void main(String []args) throws IOException { BufferedReader reader = new BufferedReader(new InputStreamReader(System.in)); String s1 = reader.readLine(); String s2 = reader.readLine(); String s3 = reader.readLine(); if(s1.equals("compress")) { compress(new BinaryIn(s2),new BinaryOut(s3)); System.out.println("the file has been compressed ! "); File old = new File(s2); File now = new File(s3); System.out.println("Old : "+old.length()+" new : "+now.length()+"\nthe compression ratio is "+(double)now.length()/old.length()); } else { expand(new BinaryIn(s2),new BinaryOut(s3)); System.out.println("the file has been expanded ! "); File old = new File(s2); File now = new File(s3); System.out.println("Old : "+old.length()+" new : "+now.length()); } } }
|
这两个文件放在一个名为hfm_compress的package里。