代码改变世界

PriorityQueue和Huffman编码

2012-12-14 15:47  康杜  阅读(1159)  评论(0编辑  收藏  举报

前言

本文不描述Huffman编码的算法和理论知识。Huffman编码算法的描述请看酷壳上的文章《Huffman压缩算法》。本文只考虑Huffman编码的Java实现,代码中有些地方没有做性能优化。

PriorityQueue

PriorityQueue是一个排序的队列,是构建Huffman树的基础。

package art.programming.huffman;

//Frequency用来抽象每个字符出现频率
public class Frequency implements Comparable<Frequency>{
	private int frequency;
	private Character value;
	private TreeNode treeNodeRef;
	
	public Frequency(Character value, int frequency){
		this.value = value;
		this.frequency = frequency;
	}
	
	public int getFrequency() {
		return frequency;
	}

	public Character getValue() {
		return value;
	}
	
	public void increase(){
		++frequency;
	}

	@Override
	public int compareTo(Frequency another) {
		if (frequency != another.getFrequency()){
			return frequency - another.getFrequency();
		}
		if (value == null && another.getValue()!=null){
			return -1;
		}
		
		if (value != null && another.getValue()==null){
			return 1;
		}
		
		if (value == null & another.getValue() ==null){
			int i=0,j=0;
			TreeNode treeNode = treeNodeRef;
			TreeNode anotherTreeNode = another.getTreeNodeRef();
			while(treeNode.getLeft()!=null){
				i ++;
				treeNode = treeNode.getLeft();
			}
			while(anotherTreeNode.getLeft()!=null){
				j ++;
				anotherTreeNode = anotherTreeNode.getLeft();
			}
			return i - j;
		}
		return frequency - another.getFrequency();
	}

	public TreeNode getTreeNodeRef() {
		return treeNodeRef;
	}

	public void setTreeNodeRef(TreeNode treeNodeRef) {
		this.treeNodeRef = treeNodeRef;
	}
	
	
	public String toString(){
		return value + " : " + frequency;
	}
	
}

计算字符出现的频率放到PriorityQueue中

package art.programming.huffman;

import java.util.HashMap;
import java.util.Map;
import java.util.PriorityQueue;

public class FrequencyCalculator {
	private Map<Character, Frequency> frequencyMap;
	
	public FrequencyCalculator(){
		frequencyMap = new HashMap<Character, Frequency>();
	}
	
	public PriorityQueue<Frequency> calculate(String c){
		int len = c.length();
		for (int i=0; i<len; i++){
			Character character = c.charAt(i);
			Frequency frequency = frequencyMap.get(character);
			if (frequency == null){
				frequency = new Frequency(character, 1);
				frequencyMap.put(character, frequency);
			}else{
				frequency.increase();
			}
		}
		PriorityQueue<Frequency> priorityQueue = new PriorityQueue<Frequency>();
		for (Frequency frequency : frequencyMap.values()){
			priorityQueue.add(frequency);
		}
		return priorityQueue;
	}
}

根据PriorityQueue构建Huffman树

package art.programming.huffman;

import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.PriorityQueue;

public class HuffmanTree {

	private TreeNode root;
	
	private Map<Character, Encoding> encodings = new HashMap<Character, Encoding>();
	

	public void build(PriorityQueue<Frequency> prioQueue) {
		while (prioQueue.size() >= 2) {
			//Poll the first element, then build the left node
			Frequency first = prioQueue.poll();
			TreeNode leftNode = buildNode(first);
			
			//Poll the second element, then build the right node
			Frequency second = prioQueue.poll();
			TreeNode rightNode = buildNode(second);
			
			//Build the sub-tree
			TreeNode node = new TreeNode();
			node.setLeft(leftNode);
			node.setRight(rightNode);
			node.setValue(null);
			
			//Put the calculated frequency back to the priority queue
			int freq = first.getFrequency() + second.getFrequency();
			Frequency frequency = new Frequency(null, freq);
			frequency.setTreeNodeRef(node);
			prioQueue.add(frequency);
			
			if (prioQueue.size()==1){
				//byte rootscore = 0xf;
				//rootscore = (byte) (rootscore << 1);
				
				root = node;
				//root.setScore(rootscore);
			}
		}
	}
	
	public Map<Character, Encoding> getEncodingTable(){
		traverse();
		Collection<Encoding> coll = encodings.values();
		for (Encoding encoding : coll){
			System.out.println(encoding.getBits() + " " + encoding.getValue());
		}
		return encodings;
	}
	
	private void traverse(TreeNode node){
		if (node != null){
				if (node.getLeft()!=null){
					byte i = node.getScore();
					i = (byte) (i << 1);
					node.getLeft().setScore(i);
				}
				if (node.getRight()!=null){
					byte i = node.getScore();
					i = (byte) ((i << 1) + 1);
					node.getRight().setScore(i);
				}
				
				traverse(node.getLeft());
				
				if (node.getValue() != null){
					System.out.println(node.getValue());
					System.out.println(node.getScore());
					Encoding encoding = new Encoding();
					encoding.setBits(node.getScore());
					encoding.setValue(node.getValue());
					encodings.put(node.getValue(), encoding);
					
				}
				traverse(node.getRight());
		}
	}
	
	
	private void traverse(){
		traverse(this.root);
	}
	
	
	
	private TreeNode buildNode(Frequency freq){
		if (freq.getTreeNodeRef() != null){
			return freq.getTreeNodeRef();
		}
		TreeNode node = new TreeNode();
		node.setValue(freq.getValue());
		return node;
	}
	
	public TreeNode getRoot(){
		return root;
	}
}

Huffman树节点结构

package art.programming.huffman;

public class TreeNode {
		
	private TreeNode left;
	
	private TreeNode right;
	
	private Character value;
	
	private byte score = 0xf;
	
	public void setValue(Character value){
		this.value = value;
	}
	
	public TreeNode getLeft() {
		return left;
	}

	public void setLeft(TreeNode left) {
		this.left = left;
	}

	public TreeNode getRight() {
		return right;
	}

	public void setRight(TreeNode right) {
		this.right = right;
	}

	public Character getValue() {
		return value;
	}

	public byte getScore() {
		return score;
	}

	public void setScore(byte score) {
		this.score = score;
	}
	
	public String toString(){
		return value + " : " + score;
	}
}

Huffman编码字段结构

package art.programming.huffman;

public class Encoding {
	private Character value;
	private byte bits;
	
	public Character getValue() {
		return value;
	}
	
	public void setValue(Character value) {
		this.value = value;
	}
	
	public byte getBits() {
		return bits;
	}
	
	public void setBits(byte bits) {
		this.bits = bits;
	}	
}

  

Huffman编码

package art.programming.huffman;

import java.util.Collection;
import java.util.PriorityQueue;

public class HuffmanEncoding {
	
	HuffmanTree huffmanTree = new HuffmanTree();
	
	public byte[] encode(String toEncode){
		//Calculate the frequency
		PriorityQueue<Frequency> frequencyPrioQueue 
			= new FrequencyCalculator().calculate(toEncode);
		huffmanTree.build(frequencyPrioQueue);
		byte[] bytes = new byte[toEncode.length()];
		
		for (int i=0; i<toEncode.length(); i++){
			Encoding encoding = huffmanTree.getEncodingTable().get(toEncode.charAt(i));
			bytes[i] = encoding.getBits();
		}
		return bytes;
	}
	
	public String decode(byte[] bytes){
		Collection<Encoding> encodings = huffmanTree.getEncodingTable().values();
		char[] chars = new char[bytes.length];
		for (int i = 0; i<bytes.length;i ++){
			byte b = bytes[i];
			for (Encoding encoding : encodings){
				if (encoding.getBits() == b){
					chars[i] = encoding.getValue().charValue(); 
				}
			}
		}
		return new String(chars);
	}
}

测试及验证

package art.programming.huffman;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

import org.junit.Test;

public class HuffmanEncodingTest {
	
	@Test
	public void encodeAndDecode() throws IOException{
		HuffmanEncoding encoding = new HuffmanEncoding();
		//byte[] bytes = encoding.encode("beep boop beer!");
		String toEncode = "歌舞青春 舞蹈 舞出我人生 好电影啊, 哈哈,我爱看电影";
		byte[] bytes = encoding.encode(toEncode);
		FileOutputStream fileOutputStream = new FileOutputStream(new File("/home/alex/odbytes"));
		FileOutputStream fileOutputStreamOrigin = new FileOutputStream(new File("/home/alex/original"));
		try {
			fileOutputStream.write(bytes);
			fileOutputStreamOrigin.write(toEncode.getBytes());
		} catch (IOException e) {
			e.printStackTrace();
		}finally{
			fileOutputStream.close();
			fileOutputStreamOrigin.close();
		}
		
		String original = encoding.decode(bytes);
		System.out.println(original);
	}
	
}

压缩前

alex@ubuntu:~$ od original 
0000000 126746 164214 117210 116751 163222 122630 164040 117210
0000020 134750 020210 104350 162636 135207 104346 162221 135272
0000040 112347 020237 122745 163675 132624 136745 162661 105225
0000060 136357 020214 111745 162610 104223 136357 163214 110610
0000100 104347 163661 105634 112347 162665 130675
0000114

压缩后

alex@ubuntu:~$ od odbytes 
0000000 075373 173346 075177 077764 175172 171366 077747 173765
0000020 174170 077770 174771 173370 174767 074367
0000034