一、赫夫曼树
赫夫曼树是一种带权二叉树:每个叶子节点都带有一个权重(weight)。把从根节点到每个叶子节点的路径长度乘以该叶子节点的权重,再全部求和,就得到这棵树的带权路径长度(WPL);在由同一组叶子节点构成的所有二叉树中,WPL最小的那棵树就是赫夫曼树,例如下面这个例子:
则右边这个树为赫夫曼树
二、通过赫夫曼树生成赫夫曼编码表
以左路径标识为0,右路径标识为1,直到叶子节点记为一个编码,权重为byte数组的词频,例如有一个字符串为abbcccdddd,则赫夫曼树为:
对应的编码是:a:110 b:111 c:10 d:0
三、通过赫夫曼编码表构建赫夫曼编码
还是以abbcccdddd为例,此时如果不使用赫夫曼编码,转换为字节数组为:[97, 98, 98, 99, 99, 99, 100, 100, 100, 100],长度为10,那么此时根据赫夫曼编码表构建新的二进制串可以得到:1101111111010100000,8位一分割,可以得到一个新的字节数组[-33, -44, 0, 3],可以看到数据部分由10个字节压缩为3个字节,整整压缩了7个空间;数组最后一位3不是数据,而是用来记录最后一段二进制的有效长度是多少(因此压缩结果总共占4个字节)
四、解码赫夫曼字节数组
-33 => 11111111111111111111111111011111 => 截取后8位 => 11011111
-44 => 11111111111111111111111111010100 => 截取后8位 => 11010100
0 => 00000000
由于前面记录了最后一段长度是3,那么这里要截取最后3个字符即000,
那么此时拼接就可以得到1101111111010100000,最后再通过之前的赫夫曼编码表反向查询出对应的字符,因为前缀都是唯一的,最后可以分割为 110 111 111 10 10 10 0 0 0 0则刚好对应的就是[97, 98, 98, 99, 99, 99, 100, 100, 100, 100],即abbcccdddd
五、具体实现
package com.gs.tree;
import org.junit.Before;
import org.junit.Test;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
/**
 * JUnit driver for Huffman-coding compression: one in-memory round trip and two
 * file-based runs (compress, then decompress).
 *
 * @author Pursuer
 * @version 1.0
 * @date 2023/11/2
 */
public class HuffmanCompressorTest {

    private HuffmanCompressor compressor;

    /** Gives every test its own compressor instance. */
    @Before
    public void before() {
        this.compressor = new HuffmanCompressor();
    }

    /** Compresses a short string, prints the encoded bytes, then prints the decoded text. */
    @Test
    public void test01() {
        final String text = "The harder, The luckier!";

        System.out.println("编码:");
        final byte[] encoded = compressor.zipContent(text);
        final String encodedDump = Arrays.toString(encoded);
        System.out.println(encodedDump);

        System.out.println("解码:");
        // Decoding needs the code table that the compressor built while encoding.
        final String decoded = compressor.unzipContent(encoded, compressor.huffmanCodeMap);
        System.out.println(decoded);
    }

    /** Compresses a document on the local desktop into an archive file. */
    @Test
    public void test02() {
        compressor.zipFile(
                "C:\\Users\\guosh\\Desktop\\a.doc",
                "C:\\Users\\guosh\\Desktop\\a.zip");
    }

    /** Restores the archive written by {@link #test02()} into a new document. */
    @Test
    public void test03() {
        compressor.unZipFile(
                "C:\\Users\\guosh\\Desktop\\a.zip",
                "C:\\Users\\guosh\\Desktop\\b.doc");
    }
}
/**
 * Huffman-coding compressor for byte arrays, strings and files.
 *
 * <p>Compressed layout: the concatenated Huffman codes are packed eight bits per byte,
 * most significant bit first. If the final group has fewer than eight bits it is stored
 * right-aligned (as the plain binary value of those bits). One extra trailing byte holds
 * the number of valid bits in the last data byte (1-8; 8 when there is no data at all).
 *
 * <p>The code table is not embedded in the byte array; callers must keep
 * {@link #huffmanCodeMap} (or the copy stored by {@link #zipFile}) to decode.
 */
class HuffmanCompressor {
    /**
     * Root of the Huffman tree built by the last {@link #initHuffmanTree(byte[])} call;
     * {@code null} when the input was empty.
     */
    HuffmanNode root;
    /**
     * Number of leaf nodes (distinct byte values) in the current tree.
     */
    int nodeCount;
    /**
     * Code table of the last compression: byte value to its bit string of '0'/'1' characters.
     */
    Map<Byte, String> huffmanCodeMap;
    /**
     * Code characters appended when descending to the left / right child.
     */
    private static final String LEFT_PATH_CODE = "0";
    private static final String RIGHT_PATH_CODE = "1";
    private static final int BITS_PER_BYTE = 8;

    /**
     * Compresses a string. The text is encoded as UTF-8 so the result does not depend on
     * the platform default charset.
     *
     * @param content text to compress
     * @return compressed bytes (see the class comment for the layout)
     */
    public byte[] zipContent(String content) {
        return zip(content.getBytes(StandardCharsets.UTF_8));
    }

    /**
     * Decompresses bytes produced by {@link #zipContent(String)} back into a string.
     *
     * @param bytes          compressed bytes
     * @param huffmanCodeMap code table that was used to compress them
     * @return the decoded text, interpreted as UTF-8
     */
    public String unzipContent(byte[] bytes, Map<Byte, String> huffmanCodeMap) {
        return new String(this.unZip(bytes, huffmanCodeMap), StandardCharsets.UTF_8);
    }

    /**
     * Compresses the file {@code src} into {@code target}. The target holds two
     * Java-serialized objects: the compressed {@code byte[]} followed by the code table.
     * Failures are reported with {@code printStackTrace()} and not rethrown.
     *
     * @param src    path of the file to compress
     * @param target path of the archive to write
     */
    public void zipFile(String src, String target) {
        try {
            // readAllBytes reads the complete file; InputStream.available() is only an
            // estimate and a single read() may return fewer bytes than requested.
            byte[] zipBytes = zip(Files.readAllBytes(Paths.get(src)));
            // Open the target only after compression succeeded.
            try (ObjectOutputStream os = new ObjectOutputStream(Files.newOutputStream(Paths.get(target)))) {
                os.writeObject(zipBytes);
                os.writeObject(huffmanCodeMap);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Restores a file written by {@link #zipFile(String, String)}.
     * Failures are reported with {@code printStackTrace()} and not rethrown.
     *
     * @param src    path of the archive
     * @param target path of the restored file
     */
    public void unZipFile(String src, String target) {
        try {
            byte[] content;
            Map<Byte, String> codes;
            // NOTE(review): this is Java native deserialization; only open archives from
            // trusted sources.
            try (ObjectInputStream is = new ObjectInputStream(Files.newInputStream(Paths.get(src)))) {
                content = (byte[]) is.readObject();
                @SuppressWarnings("unchecked") // zipFile writes a Map<Byte, String> second
                Map<Byte, String> storedCodes = (Map<Byte, String>) is.readObject();
                codes = storedCodes;
            }
            // Decode before touching the target so a corrupt archive does not truncate it.
            Files.write(Paths.get(target), unZip(content, codes));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds the tree and code table for {@code bytes} (replacing {@link #root},
     * {@link #nodeCount} and {@link #huffmanCodeMap}) and returns the packed codes.
     */
    private byte[] zip(byte[] bytes) {
        initHuffmanTree(bytes);
        createHuffmanCodeMap();
        return createHuffmanCode(bytes);
    }

    /**
     * Decodes compressed bytes with the given code table.
     *
     * @param bytes          compressed bytes; the last element is the bit count of the
     *                       last data byte
     * @param huffmanCodeMap byte value to bit string table used for compression
     * @return the original bytes
     * @throws IllegalArgumentException if {@code bytes} is empty, the bit count is outside
     *                                  0-8, or the bit stream ends in the middle of a code
     */
    public byte[] unZip(byte[] bytes, Map<Byte, String> huffmanCodeMap) {
        Objects.requireNonNull(bytes, "bytes");
        Objects.requireNonNull(huffmanCodeMap, "huffmanCodeMap");
        if (bytes.length == 0) {
            throw new IllegalArgumentException("compressed data is empty; the trailing bit-count byte is missing");
        }
        int endLength = bytes[bytes.length - 1];
        if (endLength < 0 || endLength > BITS_PER_BYTE) {
            throw new IllegalArgumentException("invalid bit count for last data byte: " + endLength);
        }

        // Invert the table: bit string -> byte value.
        Map<String, Byte> codeToByte = new HashMap<>();
        for (Map.Entry<Byte, String> entry : huffmanCodeMap.entrySet()) {
            codeToByte.put(entry.getValue(), entry.getKey());
        }

        ByteArrayOutputStream decoded = new ByteArrayOutputStream();
        StringBuilder pending = new StringBuilder();
        int lastDataIndex = bytes.length - 2;
        for (int i = 0; i <= lastDataIndex; i++) {
            // Every data byte carries 8 bits except the last one, whose valid bits are the
            // low-order endLength bits.
            int bitCount = (i == lastDataIndex) ? endLength : BITS_PER_BYTE;
            for (int bit = bitCount - 1; bit >= 0; bit--) {
                pending.append((bytes[i] >> bit) & 1);
                // Codes are prefix-free, so the first match is the only possible one.
                Byte symbol = codeToByte.get(pending.toString());
                if (symbol != null) {
                    decoded.write(symbol.byteValue());
                    pending.setLength(0);
                }
            }
        }
        if (pending.length() > 0) {
            throw new IllegalArgumentException("compressed data ends with bits that match no code: " + pending);
        }
        return decoded.toByteArray();
    }

    /**
     * Replaces every input byte by its code from {@link #huffmanCodeMap} and packs the
     * bits into bytes, followed by the bit count of the last data byte.
     *
     * @throws IllegalStateException if a byte has no entry in the code table
     */
    private byte[] createHuffmanCode(byte[] bytes) {
        ByteArrayOutputStream packed = new ByteArrayOutputStream();
        int current = 0;
        int filled = 0;
        for (byte b : bytes) {
            String code = this.huffmanCodeMap.get(b);
            if (code == null) {
                throw new IllegalStateException("no Huffman code for byte " + b);
            }
            for (int i = 0; i < code.length(); i++) {
                current = (current << 1) | (code.charAt(i) == '1' ? 1 : 0);
                if (++filled == BITS_PER_BYTE) {
                    packed.write(current);
                    current = 0;
                    filled = 0;
                }
            }
        }
        if (filled > 0) {
            // Partial last group: stored right-aligned, followed by its bit count.
            packed.write(current);
            packed.write(filled);
        } else {
            // The last data byte (if any) is complete.
            packed.write(BITS_PER_BYTE);
        }
        return packed.toByteArray();
    }

    /**
     * Fills {@link #huffmanCodeMap} from the current tree: "0" for each left edge and "1"
     * for each right edge on the path from the root to a leaf.
     */
    private void createHuffmanCodeMap() {
        this.huffmanCodeMap = new HashMap<>();
        if (this.root == null) {
            return;
        }
        if (this.root.getData() != null) {
            // Only one distinct byte: the root is itself a leaf and has no path, so give
            // it a one-bit code; otherwise the input could not be encoded at all.
            this.huffmanCodeMap.put(this.root.getData(), LEFT_PATH_CODE);
            return;
        }
        this.createLeafNodePath(root.getLeft(), LEFT_PATH_CODE, new StringBuilder());
        this.createLeafNodePath(root.getRight(), RIGHT_PATH_CODE, new StringBuilder());
    }

    /**
     * Depth-first walk that records the code of every leaf below {@code node}.
     *
     * @param node current node; {@code null} is ignored
     * @param code code character of the edge leading to {@code node}
     * @param sb   path from the root to the parent of {@code node}; restored to its
     *             original content before returning
     */
    private void createLeafNodePath(HuffmanNode node, String code, StringBuilder sb) {
        if (node == null) {
            return;
        }
        int mark = sb.length();
        sb.append(code);
        if (node.getData() != null) {
            // Leaf: the accumulated path is its code.
            huffmanCodeMap.put(node.getData(), sb.toString());
        } else {
            createLeafNodePath(node.getLeft(), LEFT_PATH_CODE, sb);
            createLeafNodePath(node.getRight(), RIGHT_PATH_CODE, sb);
        }
        sb.setLength(mark);
    }

    /**
     * Builds the Huffman tree for {@code bytes} and stores it in {@link #root}; also sets
     * {@link #nodeCount}. For empty input {@code root} becomes {@code null}.
     *
     * @param bytes data whose byte frequencies define the leaf weights
     */
    public void initHuffmanTree(byte[] bytes) {
        // A byte has only 256 possible values, so a fixed table is enough regardless of
        // the input size.
        int[] frequencies = new int[256];
        for (byte b : bytes) {
            frequencies[b & 0xFF]++;
        }
        List<HuffmanNode> nodes = new ArrayList<>();
        for (int value = 0; value < frequencies.length; value++) {
            if (frequencies[value] > 0) {
                nodes.add(new HuffmanNode((byte) value, frequencies[value]));
            }
        }
        this.nodeCount = nodes.size();

        // Order by weight with an explicit comparator so tree construction does not
        // depend on HuffmanNode's natural ordering. List.sort is stable, so equal
        // weights keep their list order.
        Comparator<HuffmanNode> byWeight = Comparator.comparingInt(HuffmanNode::getWeight);
        while (nodes.size() > 1) {
            nodes.sort(byWeight);
            // Merge the two lightest nodes under a new parent.
            HuffmanNode huffmanLeftNode = nodes.remove(0);
            HuffmanNode huffmanRightNode = nodes.remove(0);
            HuffmanNode parent = new HuffmanNode(huffmanLeftNode.getWeight() + huffmanRightNode.getWeight());
            parent.setLeft(huffmanLeftNode);
            parent.setRight(huffmanRightNode);
            nodes.add(parent);
        }
        this.root = nodes.isEmpty() ? null : nodes.get(0);
    }
}
/**
 * Node of a Huffman tree. Nodes are ordered by {@link #weight}, ascending.
 */
class HuffmanNode implements Comparable<HuffmanNode> {
    /** Byte value carried by this node; stays {@code null} when the weight-only constructor is used. */
    Byte data;
    /** Weight of this node. */
    int weight;
    /** Left child, or {@code null}. */
    HuffmanNode left;
    /** Right child, or {@code null}. */
    HuffmanNode right;

    /**
     * Creates a node without a byte value.
     *
     * @param weight weight of the node
     */
    public HuffmanNode(int weight) {
        this.weight = weight;
    }

    /**
     * Creates a node carrying a byte value.
     *
     * @param data   byte value
     * @param weight weight of the node
     */
    public HuffmanNode(byte data, int weight) {
        this.data = data;
        this.weight = weight;
    }

    public Byte getData() {
        return data;
    }

    public void setData(Byte data) {
        this.data = data;
    }

    public int getWeight() {
        return weight;
    }

    public void setWeight(int weight) {
        this.weight = weight;
    }

    public HuffmanNode getLeft() {
        return left;
    }

    public void setLeft(HuffmanNode left) {
        this.left = left;
    }

    public HuffmanNode getRight() {
        return right;
    }

    public void setRight(HuffmanNode right) {
        this.right = right;
    }

    @Override
    public String toString() {
        return "HuffmanNode{" +
                "data=" + data +
                ", weight=" + weight +
                '}';
    }

    /**
     * Compares two nodes by weight without modifying either of them.
     *
     * @param o node to compare with
     * @return a negative value, zero, or a positive value when this node's weight is
     *         less than, equal to, or greater than the weight of {@code o}
     */
    @Override
    public int compareTo(HuffmanNode o) {
        // Integer.compare avoids both the accidental assignment (this.weight = ...) and
        // the overflow risk of subtracting the weights.
        return Integer.compare(this.weight, o.getWeight());
    }
}
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。