java敏感词过滤

java敏感词过滤

敏感词:“美元”,“中国”,“北京大学”,“北大”,“南京大学”

DFAUtils

`import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;

public class DFAUtils {
    /**
     * 添加敏感词到算法树
     */
    public static void addSensitiveWord(String sensitiveWord) {
        if (null == sensitiveWord || sensitiveWord.length() == 0) {
            return;
        }
        char[] chars = sensitiveWord.toCharArray();
        Map<Character, Map> parentMap = sensitiveWordsMap;
        Map<Character, Map> current = null;
        synchronized (lock) {
            for (int i = 0; i < chars.length; i++) {
                if (i == 0) {
                    if (sensitiveWordsMap.size() == 0) {
                        /* 添加第一个敏感词的第一个字符执行此code */
                        if (chars.length == 1) {
                            Map<Character, Map> endMap = new HashMap<>(1);
                            endMap.put(null, null);
                            sensitiveWordsMap.put(chars[0], endMap);
                        } else {
                            sensitiveWordsMap.put(chars[0], null);
                        }
                    } else {
                        current = parentMap.get(chars[0]);
                        if (null == current) {
                            if (chars.length == 1) {
                                Map<Character, Map> endMap = new HashMap<>(1);
                                endMap.put(null, null);
                                sensitiveWordsMap.put(chars[0], endMap);
                                break;
                            } else {
                                sensitiveWordsMap.put(chars[0], null);
                            }
                        } else {
                            if (chars.length == 1) {
                                current.put(null, null);
                                break;
                            }
                        }
                    }
                } else {
                    if (null == current) {
                        Map<Character, Map> childMap = new HashMap<Character, Map>();
                        if (i == chars.length - 1) {
                            Map<Character, Map> endMap = new HashMap<>(1);
                            endMap.put(null, null);
                            childMap.put(chars[i], endMap);
                            parentMap.put(chars[i - 1], childMap);
                            break;
                        } else {
                            childMap.put(chars[i], null);
                            parentMap.put(chars[i - 1], childMap);
                            parentMap = childMap;
                            current = null;
                        }
                    } else {
                        Map<Character, Map> childMap = current.get(chars[i]);
                        if (null == childMap) {
                            if (i == chars.length - 1) {
                                Map<Character, Map> endMap = new HashMap<>(1);
                                endMap.put(null, null);
                                current.put(chars[i], endMap);
                            } else {
                                current.put(chars[i], null);
                                parentMap = current;
                                current = null;
                            }
                        } else {
                            if (i == chars.length - 1) {
                                childMap.put(null, null);
                            } else {
                                parentMap = current;
                                current = childMap;
                            }
                        }
                    }
                }
            }
        }
    }

    /**
     * 检查敏感词(找到符合敏感词则返回--单个字符敏感词前后不是中文字符才算敏感词)
     */
    public static String checkSensitiveWord(String content) {
        if (null == content || content.length() == 0 || sensitiveWordsMap.size() == 0) {
            return null;
        }
        char[] chars = content.toCharArray();
        boolean isContain = Boolean.FALSE;
        StringBuilder sbResult = new StringBuilder();
        for (int i = 0; i < chars.length; i++) {
            if (sensitiveWordsMap.containsKey(chars[i])) {
                Map<Character, Map> currentMap = sensitiveWordsMap.get(chars[i]);
                sbResult.append(chars[i]);
                if (null == currentMap) {
                    break;
                } else {
                    if (currentMap.containsKey(null)) {
                        if (sbResult.length() == 1) {
                            /* 前一个字符或后一个字符是否是中文字符 */
                            boolean before = Boolean.FALSE;
                            if (i - 1 < 0) {
                                before = Boolean.TRUE;
                            } else {
                                if (chars[i - 1] < 13312 || chars[i - 1] > 40895) {
                                    before = Boolean.TRUE;
                                }
                            }
                            boolean after = Boolean.FALSE;
                            if (i + 1 >= chars.length) {
                                after = Boolean.TRUE;
                            } else {
                                if (chars[i + 1] < 13312 || chars[i + 1] > 40895) {
                                    after = Boolean.TRUE;
                                }
                            }
                            if (before && after) {
                                isContain = Boolean.TRUE;
                                break;
                            }
                            /* From当前index开始匹配是否存在敏感词 */
                            int j = i + 1;
                            for (; j < chars.length; j++) {
                                if (currentMap.containsKey(chars[j])) {
                                    sbResult.append(chars[j]);
                                    currentMap = currentMap.get(chars[j]);
                                    if (currentMap.containsKey(null)) {
                                        isContain = Boolean.TRUE;
                                        break;
                                    } else {
                                        continue;
                                    }
                                } else {
                                    break;
                                }
                            }
                        } else {
                            isContain = Boolean.TRUE;
                            break;
                        }
                    } else {
                        /* From当前index开始匹配是否存在敏感词 */
                        int j = i + 1;
                        for (; j < chars.length; j++) {
                            if (currentMap.containsKey(chars[j])) {
                                sbResult.append(chars[j]);
                                currentMap = currentMap.get(chars[j]);
                                if (currentMap.containsKey(null)) {
                                    isContain = Boolean.TRUE;
                                    break;
                                } else {
                                    continue;
                                }
                            } else {
                                break;
                            }
                        }
                    }
                    if (isContain) {
                        break;
                    } else {
                        sbResult.setLength(0);
                    }
                }
            }
        }

        if (isContain) {
            return sbResult.toString();
        } else {
            return null;
        }
    }

    /**
     * 删除算法树的敏感词
     */
    public static void delSensitiveWord(String sensitiveWord) {
        if (null == sensitiveWord || sensitiveWord.length() == 0 || sensitiveWordsMap.size() == 0) {
            return;
        }
        int delIndex = 0;
        char[] chars = sensitiveWord.toCharArray();
        Map<Character, Map> current = sensitiveWordsMap;
        synchronized (lock) {
            int i = 0;
            for (; i < chars.length; i++) {
                if (current.containsKey(chars[i])) {
                    if (current.get(chars[i]).size() > 1) {
                        delIndex = i;
                    }

                } else {
                    break;
                }
                current = current.get(chars[i]);
            }
            if (!current.containsKey(null)) {
                return;
            }
            current = sensitiveWordsMap;
            if (i == chars.length) {
                for (i = 0; i < delIndex; i++) {
                    current = current.get(chars[i]);
                }
                if (i == chars.length) {
                    current.remove(chars[i]);
                } else {
                    if (i == 0 && chars.length == 1) {
                        if (current.get(chars[i]).size() == 1) {
                            current.remove(chars[i]);
                        } else {
                            current.get(chars[i]).remove(null);
                        }
                    } else {
                        if (i + 1 == chars.length) {
                            current.get(chars[i]).remove(null);
                        } else {
                            current.get(chars[i]).remove(chars[i + 1]);
                        }
                    }
                }
            }
        }
    }

    /**
     * 获取算法树的敏感词
     */
    public static LinkedList<String> getSevsitiveWords() {
        LinkedList<String> listWords = new LinkedList<String>();
        if (sensitiveWordsMap.size() == 0) {
            return listWords;
        }
        StringBuilder sbWord = new StringBuilder();
        getSevsitiveWords(sensitiveWordsMap, listWords, sbWord);
        return listWords;
    }

    /**
     * 算法树是否包含对应的敏感词
     */
    public static boolean containSensitiveWord(String sensitiveWord) {
        if (null == sensitiveWord || sensitiveWord.length() == 0 || sensitiveWordsMap.size() == 0) {
            return false;
        }
        return sensitiveWord.equals(checkSensitiveWord(sensitiveWord));
    }

    /**
     * 清空算法树
     */
    public static void clearSensitiveWord() {
        synchronized (lock) {
            sensitiveWordsMap = new HashMap<Character, Map>();
        }
    }

    /**
     * 递归获取算法树的敏感词
     */
    private static void getSevsitiveWords(Map<Character, Map> childMap, LinkedList<String> listWords,
                                          StringBuilder sbWord) {
        if (childMap.size() == 1 && childMap.containsKey(null)) {
            listWords.add(sbWord.toString());
            sbWord.setLength(sbWord.length() - 1);
            return;
        }
        for (Map.Entry<Character, Map> entry : childMap.entrySet()) {
            Character keyChar = entry.getKey();
            Map<Character, Map> valueMap = entry.getValue();
            if (null == keyChar) {
                continue;
            }
            sbWord.append(keyChar);
            if (valueMap.containsKey(null)) {
                listWords.add(sbWord.toString());
                if (valueMap.size() == 1) {
                    sbWord.setLength(sbWord.length() - 1);
                } else {
                    getSevsitiveWords(valueMap, listWords, sbWord);
                    sbWord.setLength(sbWord.length() - 1);
                }
            } else {
                getSevsitiveWords(valueMap, listWords, sbWord);
                sbWord.setLength(sbWord.length() - 1);
            }
        }
    }

    private final static Object lock = new Object();
    private static Map<Character, Map> sensitiveWordsMap = new HashMap<Character, Map>();
}` 


DFAUtilsTest

外汇名词解释https://www.fx61.com/definitions

import org.junit.Assert;
import org.junit.Test;

import java.util.LinkedList;

public class DFAUtilsTest {
    /*==========================AddSensitiveWord-start==========================*/
    @Test
    public void testAddSensitiveWord01() {
        DFAUtils.clearSensitiveWord();
        LinkedList<String> listWords = null;
        DFAUtils.addSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.addSensitiveWord("中哈");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        DFAUtils.addSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        DFAUtils.addSensitiveWord("中哈");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        DFAUtils.delSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.addSensitiveWord("中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        DFAUtils.addSensitiveWord("中中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(3, listWords.size());
        DFAUtils.addSensitiveWord("人");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(4, listWords.size());
        DFAUtils.addSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(5, listWords.size());
    }

    /*==========================AddSensitiveWord-end============================*/

    /*==========================CheckSensitiveWord-start==========================*/
    @Test
    public void testCheckSensitiveWord01() {
        DFAUtils.clearSensitiveWord();
        String sencitivaWord = null;
        LinkedList<String> listWords = null;
        DFAUtils.addSensitiveWord("大");
        DFAUtils.addSensitiveWord("大学");
        DFAUtils.addSensitiveWord("中中中国中中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(8, listWords.size());
        sencitivaWord = DFAUtils.checkSensitiveWord("滚");
        Assert.assertEquals("滚", sencitivaWord);

        sencitivaWord = DFAUtils.checkSensitiveWord("翻滚");
        Assert.assertEquals(null, sencitivaWord);

        sencitivaWord = DFAUtils.checkSensitiveWord("滚 ");
        Assert.assertEquals("滚", sencitivaWord);

        sencitivaWord = DFAUtils.checkSensitiveWord(" 滚");
        Assert.assertEquals("滚", sencitivaWord);

        sencitivaWord = DFAUtils.checkSensitiveWord("体操");
        Assert.assertEquals(null, sencitivaWord);

        sencitivaWord = DFAUtils.checkSensitiveWord("你好滚滚");
        Assert.assertEquals("滚滚", sencitivaWord);

        sencitivaWord = DFAUtils.checkSensitiveWord("滚你好滚");
        Assert.assertEquals(null, sencitivaWord);

        sencitivaWord = DFAUtils.checkSensitiveWord("滚轮胎");
        Assert.assertEquals(null, sencitivaWord);

        sencitivaWord = DFAUtils.checkSensitiveWord("你你国国");
        Assert.assertEquals(null, sencitivaWord);

        sencitivaWord = DFAUtils.checkSensitiveWord("中中国中中 中中中中国中中中");
        Assert.assertEquals("中中中国中中中", sencitivaWord);
    }

    /*==========================CheckSensitiveWord-start==========================*/

    /*==========================DelSensitiveWor-start==========================*/
    @Test
    public void testDelSensitiveWord01() {
        DFAUtils.clearSensitiveWord();
        LinkedList<String> listWords = null;
        DFAUtils.addSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("国");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(0, listWords.size());
    }

    @Test
    public void testDelSensitiveWord02() {
        DFAUtils.clearSensitiveWord();
        LinkedList<String> listWords = null;
        DFAUtils.addSensitiveWord("中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("中中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(0, listWords.size());
    }

    @Test
    public void testDelSensitiveWord03() {
        DFAUtils.clearSensitiveWord();
        LinkedList<String> listWords = null;
        DFAUtils.addSensitiveWord("中中");
        DFAUtils.addSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        DFAUtils.delSensitiveWord("");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        DFAUtils.delSensitiveWord("中中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        DFAUtils.delSensitiveWord(" 中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        DFAUtils.delSensitiveWord("中中 ");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        DFAUtils.delSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(0, listWords.size());
        DFAUtils.addSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("中中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(1, listWords.size());
        DFAUtils.delSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(0, listWords.size());
    }

    @Test
    public void testDelSensitiveWord04() {
        DFAUtils.clearSensitiveWord();
        LinkedList<String> listWords = null;
        DFAUtils.addSensitiveWord("中中中111");
        DFAUtils.addSensitiveWord("中中");
        DFAUtils.addSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(3, listWords.size());
        DFAUtils.delSensitiveWord("");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(3, listWords.size());
        DFAUtils.delSensitiveWord("中中中111");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        DFAUtils.addSensitiveWord("中中中111");
        DFAUtils.delSensitiveWord("中中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(3, listWords.size());
        DFAUtils.delSensitiveWord("中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        DFAUtils.addSensitiveWord("中中 ");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(3, listWords.size());
        DFAUtils.delSensitiveWord("中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
        DFAUtils.delSensitiveWord("中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(2, listWords.size());
    }
    /*==========================DelSensitiveWor-end============================*/

    /*==========================ContainSensitiveWord-start==========================*/
    @Test
    public void testContainSensitiveWord01() {
        DFAUtils.clearSensitiveWord();
        LinkedList<String> listWords = null;
        DFAUtils.addSensitiveWord("滚");
        DFAUtils.addSensitiveWord("中中中国中中中");
        listWords = DFAUtils.getSevsitiveWords();
        Assert.assertEquals(7, listWords.size());
        Assert.assertEquals(false, DFAUtils.containSensitiveWord(" "));
        Assert.assertEquals(true, DFAUtils.containSensitiveWord("操"));
    }

    /*==========================ContainSensitiveWord-end============================*/
}
阅读 47

推荐阅读