头图

一、赫夫曼树

赫夫曼树是一种带权的树形结构(二叉树):每个叶子节点都带有一个权重(weight)。把从根节点到每个叶子节点的路径长度乘以该叶子节点的权重,再把所有结果相加,就得到这棵树的带权路径长度(WPL);在由同一组叶子节点构成的所有树中,WPL最小的那一棵就是赫夫曼树。例如下面这个例子:
image.png
则右边这个树为赫夫曼树

二、通过赫夫曼树生成赫夫曼编码表

规定走左路径记为0,走右路径记为1,从根节点走到某个叶子节点所经过的0/1序列就是该叶子节点的编码;叶子节点的权重是对应字节在byte数组中出现的次数(词频)。例如有一个字符串为abbcccdddd,则赫夫曼树为:
image.png
对应的编码是:a:110 b:111 c:10 d:0

三、通过赫夫曼编码表构建赫夫曼编码

还是以abbcccdddd为例,如果不使用赫夫曼编码,转换为字节数组为:[97, 98, 98, 99, 99, 99, 100, 100, 100, 100],长度为10。根据赫夫曼编码表把每个字节替换成对应的编码,可以得到新的二进制串:1101111111010100000(共19位)。按8位一组分割并转换成字节,再在末尾追加一个长度标记,就得到新的字节数组[-33, -44, 0, 3]:数据部分只占3个字节,比原来的10个字节整整少了7个;最后一个元素3不是数据,而是用来记录最后一段二进制串的有效位数(这里最后一段是000,共3位)。

四、解码赫夫曼字节数组

-33 => 11111111111111111111111111011111 => 截取后8位 => 11011111
-44 => 11111111111111111111111111010100 => 截取后8位 => 11010100
0 => 00000000
由于末尾记录了最后一段的长度是3,所以最后一个数据字节只截取后3位,即000。
把各段拼接起来就可以得到1101111111010100000,最后再通过之前的赫夫曼编码表反向查询出对应的字节。因为任何一个编码都不是另一个编码的前缀,所以从左到右的切分结果是唯一的:110 111 111 10 10 10 0 0 0 0,刚好对应的就是[97, 98, 98, 99, 99, 99, 100, 100, 100, 100],即abbcccdddd。

五、具体实现

package com.gs.tree;

import org.junit.Before;
import org.junit.Test;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;

/**
 * Round-trip tests for the Huffman compressor, covering both in-memory
 * strings and files.
 *
 * <p>The file tests work on temporary files so they run on any machine and do
 * not depend on each other's output. Checks throw {@link AssertionError}
 * directly, which JUnit reports as a test failure.
 *
 * @author Pursuer
 * @version 1.0
 * @date 2023/11/2
 */
public class HuffmanCompressorTest {

    /** Sample text used as the payload of every test. */
    private static final String SAMPLE = "The harder, The luckier!";

    private HuffmanCompressor compressor;

    @Before
    public void before() {
        compressor = new HuffmanCompressor();
    }

    /** Compresses a string in memory and verifies that decoding restores it. */
    @Test
    public void test01() {
        // The string to compress
        String str = SAMPLE;
        System.out.println("编码:");
        byte[] bytes = compressor.zipContent(str);
        System.out.println(Arrays.toString(bytes));
        System.out.println("解码:");
        String decoded = compressor.unzipContent(bytes, compressor.huffmanCodeMap);
        System.out.println(decoded);
        if (!str.equals(decoded)) {
            throw new AssertionError("Round trip mismatch: expected <" + str + "> but was <" + decoded + ">");
        }
    }

    /** Compresses a temporary file and verifies that an archive was written. */
    @Test
    public void test02() throws IOException {
        Path src = Files.createTempFile("huffman-src", ".doc");
        Path target = Files.createTempFile("huffman-out", ".zip");
        try {
            Files.write(src, SAMPLE.getBytes(StandardCharsets.UTF_8));
            compressor.zipFile(src.toString(), target.toString());
            if (Files.size(target) == 0) {
                throw new AssertionError("zipFile produced an empty archive: " + target);
            }
        } finally {
            Files.deleteIfExists(src);
            Files.deleteIfExists(target);
        }
    }

    /** Compresses a temporary file, decompresses it again and compares the bytes. */
    @Test
    public void test03() throws IOException {
        byte[] original = SAMPLE.getBytes(StandardCharsets.UTF_8);
        Path src = Files.createTempFile("huffman-src", ".doc");
        Path zipped = Files.createTempFile("huffman-out", ".zip");
        Path restored = Files.createTempFile("huffman-restored", ".doc");
        try {
            Files.write(src, original);
            compressor.zipFile(src.toString(), zipped.toString());
            // A fresh instance proves the archive carries everything needed to decode it.
            new HuffmanCompressor().unZipFile(zipped.toString(), restored.toString());
            byte[] roundTripped = Files.readAllBytes(restored);
            if (!Arrays.equals(original, roundTripped)) {
                throw new AssertionError("File round trip mismatch: expected "
                        + Arrays.toString(original) + " but was " + Arrays.toString(roundTripped));
            }
        } finally {
            Files.deleteIfExists(src);
            Files.deleteIfExists(zipped);
            Files.deleteIfExists(restored);
        }
    }
}

/**
 * Huffman-coding compressor for strings, byte arrays and files.
 *
 * <p>Compressed layout: the Huffman bit string packed 8 bits per byte, followed
 * by one trailing byte holding the number of significant bits (1-8) in the last
 * data byte. A short final group is stored right-aligned, i.e. as the numeric
 * value of its bits.
 *
 * <p>Every compression call overwrites {@link #root}, {@link #nodeCount} and
 * {@link #huffmanCodeMap}; the code table of the latest call is what a caller
 * must pass to the decode methods.
 */
class HuffmanCompressor {
    /**
     * Root of the Huffman tree built by the latest compression call;
     * {@code null} when the input was empty.
     */
    HuffmanNode root;
    /**
     * Number of leaf nodes (distinct byte values) in the latest tree.
     */
    int nodeCount;
    /**
     * Code table of the latest compression call: byte value to its code,
     * written as a string of '0' and '1' characters.
     */
    Map<Byte, String> huffmanCodeMap;
    /**
     * Code characters appended when descending to the left / right child.
     */
    private static final String LEFT_PATH_CODE = "0";
    private static final String RIGHT_PATH_CODE = "1";
    /**
     * Number of code bits packed into one output byte.
     */
    private static final int BITS_PER_BYTE = 8;

    /**
     * Compresses the UTF-8 bytes of a string.
     *
     * @param content text to compress
     * @return compressed bytes, including the trailing length byte
     */
    public byte[] zipContent(String content) {
        // Explicit charset: the platform default differs between machines.
        return zip(content.getBytes(StandardCharsets.UTF_8));
    }

    /**
     * Decompresses bytes produced by {@link #zipContent(String)} back into text.
     *
     * @param bytes          compressed bytes, including the trailing length byte
     * @param huffmanCodeMap code table that was used for compression
     * @return the decoded text, interpreted as UTF-8
     * @throws IllegalArgumentException if the bytes cannot be decoded with the table
     */
    public String unzipContent(byte[] bytes, Map<Byte, String> huffmanCodeMap) {
        return new String(this.unZip(bytes, huffmanCodeMap), StandardCharsets.UTF_8);
    }

    /**
     * Compresses a file. The target is written with Java object serialization:
     * first the compressed {@code byte[]}, then the code table.
     *
     * @param src    path of the file to compress
     * @param target path of the archive to write
     * @throws UncheckedIOException if reading the source or writing the target fails
     */
    public void zipFile(String src, String target) {
        try {
            // readAllBytes loops until EOF; available() plus a single read() may stop short.
            byte[] bytes = Files.readAllBytes(Paths.get(src));
            byte[] zipBytes = zip(bytes);
            try (ObjectOutputStream os = new ObjectOutputStream(Files.newOutputStream(Paths.get(target)))) {
                os.writeObject(zipBytes);
                os.writeObject(huffmanCodeMap);
            }
        } catch (IOException e) {
            throw new UncheckedIOException("Failed to compress " + src + " to " + target, e);
        }
    }

    /**
     * Restores a file from an archive written by {@link #zipFile(String, String)}.
     * The target is only written after the archive has been decoded completely.
     *
     * <p>NOTE(review): the archive is read with Java native deserialization, so
     * this must only be used on archives from a trusted source.
     *
     * @param src    path of the archive
     * @param target path of the file to write
     * @throws UncheckedIOException     if reading the archive or writing the target fails
     * @throws IllegalStateException    if a serialized class in the archive cannot be loaded
     * @throws IllegalArgumentException if the archive content cannot be decoded
     */
    public void unZipFile(String src, String target) {
        try {
            byte[] decoded;
            try (ObjectInputStream is = new ObjectInputStream(Files.newInputStream(Paths.get(src)))) {
                // Same order as written: payload first, then the code table.
                byte[] content = (byte[]) is.readObject();
                @SuppressWarnings("unchecked") // zipFile always writes a Map<Byte, String> here
                Map<Byte, String> codeMap = (Map<Byte, String>) is.readObject();
                decoded = unZip(content, codeMap);
            }
            Files.write(Paths.get(target), decoded);
        } catch (IOException e) {
            throw new UncheckedIOException("Failed to decompress " + src + " to " + target, e);
        } catch (ClassNotFoundException e) {
            throw new IllegalStateException("Archive " + src + " is not a valid Huffman archive", e);
        }
    }

    /**
     * Builds the tree and code table for the input, then encodes it.
     *
     * @param bytes raw data
     * @return compressed bytes, including the trailing length byte
     */
    private byte[] zip(byte[] bytes) {
        initHuffmanTree(bytes);
        createHuffmanCodeMap();
        return createHuffmanCode(bytes);
    }

    /**
     * Decodes compressed bytes with the given code table.
     *
     * @param bytes          data bytes followed by one byte holding the number of
     *                       significant bits in the last data byte
     * @param huffmanCodeMap code table that was used for compression
     * @return the original bytes
     * @throws IllegalArgumentException if the array is empty, the length byte is out
     *                                  of range, or the bits do not end on a complete code
     */
    public byte[] unZip(byte[] bytes, Map<Byte, String> huffmanCodeMap) {
        if (bytes.length == 0) {
            throw new IllegalArgumentException("Compressed data must contain at least the trailing length byte");
        }
        // Invert the table: bit string -> byte value.
        Map<String, Byte> codeToByte = new HashMap<>(huffmanCodeMap.size());
        for (Map.Entry<Byte, String> entry : huffmanCodeMap.entrySet()) {
            codeToByte.put(entry.getValue(), entry.getKey());
        }
        int dataLength = bytes.length - 1;
        int endLength = bytes[dataLength];
        if (endLength < 0 || endLength > BITS_PER_BYTE) {
            throw new IllegalArgumentException("Invalid trailing length byte: " + endLength);
        }
        // Rebuild the bit string from the data bytes only.
        StringBuilder bits = new StringBuilder(dataLength * BITS_PER_BYTE);
        for (int i = 0; i < dataLength; i++) {
            // Masking to 0..255 and setting bit 8 always yields exactly nine digits
            // ("1" followed by the eight data bits), so leading zeros are kept.
            String code = Integer.toBinaryString((bytes[i] & 0xFF) | 0x100);
            // Every byte carries 8 bits except possibly the last one. A short last
            // group was stored as a plain number, so its bits are the low-order
            // ones: "010" is stored as 2, read back as "00000010", and its last
            // three digits give "010" again.
            int len = (i == dataLength - 1) ? endLength : BITS_PER_BYTE;
            bits.append(code.substring(code.length() - len));
        }
        // No code is a prefix of another, so the first table hit while extending
        // the current window is the only possible match.
        ByteArrayOutputStream decoded = new ByteArrayOutputStream();
        int start = 0;
        for (int end = 1; end <= bits.length(); end++) {
            Byte value = codeToByte.get(bits.substring(start, end));
            if (value != null) {
                decoded.write(value);
                start = end;
            }
        }
        if (start != bits.length()) {
            throw new IllegalArgumentException("Compressed data does not end on a complete Huffman code");
        }
        return decoded.toByteArray();
    }

    /**
     * Encodes the input with the current code table and packs the bits into bytes.
     *
     * @param bytes raw data; every value must have an entry in the code table
     * @return the packed bits followed by one byte holding the number of significant
     *         bits in the last data byte (8 when the bit count is a multiple of 8,
     *         which includes empty input)
     */
    private byte[] createHuffmanCode(byte[] bytes) {
        StringBuilder bits = new StringBuilder();
        for (byte b : bytes) {
            bits.append(this.huffmanCodeMap.get(b));
        }
        // Adding 7 rounds up so a trailing partial group still gets a byte.
        int size = (bits.length() + BITS_PER_BYTE - 1) / BITS_PER_BYTE;
        // One extra slot at the end stores the length of the last group.
        byte[] huffmanBytes = new byte[size + 1];
        for (int i = 0; i < size; i++) {
            int from = i * BITS_PER_BYTE;
            int to = Math.min(from + BITS_PER_BYTE, bits.length());
            huffmanBytes[i] = (byte) Integer.parseInt(bits.substring(from, to), 2);
        }
        int tail = bits.length() % BITS_PER_BYTE;
        huffmanBytes[size] = (byte) (tail == 0 ? BITS_PER_BYTE : tail);
        return huffmanBytes;
    }

    /**
     * Rebuilds {@link #huffmanCodeMap} from the current tree. The table stays
     * empty when there is no tree.
     */
    private void createHuffmanCodeMap() {
        this.huffmanCodeMap = new HashMap<>(nodeCount);
        if (this.root == null) {
            return;
        }
        if (this.root.getData() != null) {
            // Only one distinct byte value: the root itself is the leaf and has no
            // path, so give it a one-bit code to keep the encoding decodable.
            huffmanCodeMap.put(root.getData(), LEFT_PATH_CODE);
            return;
        }
        StringBuilder path = new StringBuilder();
        this.createLeafNodePath(root.getLeft(), LEFT_PATH_CODE, path);
        this.createLeafNodePath(root.getRight(), RIGHT_PATH_CODE, path);
    }

    /**
     * Walks the subtree below {@code node} and records the code of every leaf.
     *
     * @param node subtree root; {@code null} is ignored
     * @param code code character of the edge leading to {@code node}
     * @param sb   path from the tree root to the parent of {@code node}; it holds
     *             the same content again when this method returns
     */
    private void createLeafNodePath(HuffmanNode node, String code, StringBuilder sb) {
        if (node == null) {
            return;
        }
        int mark = sb.length();
        sb.append(code);
        // A node carrying data is a leaf; internal nodes have no data.
        if (node.getData() != null) {
            huffmanCodeMap.put(node.getData(), sb.toString());
        } else {
            createLeafNodePath(node.getLeft(), LEFT_PATH_CODE, sb);
            createLeafNodePath(node.getRight(), RIGHT_PATH_CODE, sb);
        }
        // Backtrack so the caller's path is unchanged.
        sb.setLength(mark);
    }

    /**
     * Builds the Huffman tree for the input and stores it in {@link #root};
     * {@link #nodeCount} is set to the number of distinct byte values.
     *
     * @param bytes raw data; an empty array leaves {@link #root} as {@code null}
     */
    public void initHuffmanTree(byte[] bytes) {
        // Count how often each byte value occurs.
        Map<Byte, Integer> byteCountMap = new HashMap<>();
        for (byte b : bytes) {
            byteCountMap.merge(b, 1, Integer::sum);
        }
        this.nodeCount = byteCountMap.size();
        // Order explicitly by weight so the lightest nodes come out first.
        PriorityQueue<HuffmanNode> queue = new PriorityQueue<>(Comparator.comparingInt(HuffmanNode::getWeight));
        for (Map.Entry<Byte, Integer> entry : byteCountMap.entrySet()) {
            queue.add(new HuffmanNode(entry.getKey(), entry.getValue()));
        }
        // Repeatedly merge the two lightest nodes under a new parent.
        while (queue.size() > 1) {
            HuffmanNode huffmanLeftNode = queue.poll();
            HuffmanNode huffmanRightNode = queue.poll();
            HuffmanNode parent = new HuffmanNode(huffmanLeftNode.getWeight() + huffmanRightNode.getWeight());
            parent.setLeft(huffmanLeftNode);
            parent.setRight(huffmanRightNode);
            queue.add(parent);
        }
        // poll() returns null for empty input instead of throwing.
        this.root = queue.poll();
    }
}

/**
 * Node of a Huffman tree.
 *
 * <p>A node created with a byte value is a leaf; a node created with only a
 * weight has {@code null} data. Natural ordering is by weight alone, so two
 * different nodes with the same weight compare as equal even though
 * {@code equals} still distinguishes them.
 */
class HuffmanNode implements Comparable<HuffmanNode> {
    /** Byte value held by the node; {@code null} unless set. */
    Byte data;
    /** Weight of the node. */
    int weight;
    /** Left child, or {@code null}. */
    HuffmanNode left;
    /** Right child, or {@code null}. */
    HuffmanNode right;

    /**
     * Creates a node without data.
     *
     * @param weight weight of the node
     */
    public HuffmanNode(int weight) {
        this.weight = weight;
    }

    /**
     * Creates a node holding a byte value.
     *
     * @param data   byte value held by the node
     * @param weight weight of the node
     */
    public HuffmanNode(byte data, int weight) {
        this.data = data;
        this.weight = weight;
    }

    public Byte getData() {
        return data;
    }

    public void setData(Byte data) {
        this.data = data;
    }

    public int getWeight() {
        return weight;
    }

    public void setWeight(int weight) {
        this.weight = weight;
    }

    public HuffmanNode getLeft() {
        return left;
    }

    public void setLeft(HuffmanNode left) {
        this.left = left;
    }

    public HuffmanNode getRight() {
        return right;
    }

    public void setRight(HuffmanNode right) {
        this.right = right;
    }

    @Override
    public String toString() {
        return "HuffmanNode{" +
                "data=" + data +
                ", weight=" + weight +
                '}';
    }

    /**
     * Orders nodes by ascending weight without modifying either node.
     *
     * @param o node to compare with
     * @return a negative value, zero or a positive value when this node is
     *         lighter than, as heavy as, or heavier than {@code o}
     */
    @Override
    public int compareTo(HuffmanNode o) {
        // Integer.compare avoids both the former accidental assignment and the
        // overflow risk of subtracting the weights.
        return Integer.compare(this.weight, o.getWeight());
    }
}

Pursuer丶
7 声望4 粉丝

对技术抱有热情,对工作保持严谨!