前缀树

前缀树,用来快速匹配一段文本中是否包含某些指定的文本数据。

算法类 Trie

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
/**
 * 前缀树
 */
public class Trie {
    private final TrieNode root;

    public Trie() {
        root = new TrieNode();
    }

    public void insert(String keyWord) {
        TrieNode node = root;
        for (int i = 0; i < keyWord.length(); i++) {
            char c = keyWord.charAt(i);
            if (!node.containsKey(c)) {
                node.put(c, new TrieNode());
            }
            node = node.get(c);
        }
        node.setEnd();
    }

    public boolean search(String text) {
        return search(text, 0);
    }

    public boolean search(String text, int startIndex) {
        if (startIndex < 0) {
            startIndex = 0;
        } else if (startIndex >= text.length()) {
            return false;
        }
        TrieNode node = root;
        for (int i = startIndex; i < text.length(); i++) {
            char c = text.charAt(i);
            TrieNode childNode = node.get(c);
            if (childNode == null) {
                node = root;
            } else if (childNode.isEnd()) {
                return true;
            } else {
                node = childNode;
            }
        }
        return node.isEnd();
    }

    static class TrieNode {
        private final Map<Character, TrieNode> children = new HashMap<>();
        private boolean isEndOfWord;

        public TrieNode() {
            isEndOfWord = false;
        }

        public boolean containsKey(char ch) {
            return children.containsKey(ch);
        }

        public TrieNode get(char ch) {
            return children.get(ch);
        }

        public void put(char ch, TrieNode node) {
            children.put(ch, node);
        }

        public boolean isEnd() {
            return isEndOfWord;
        }

        public void setEnd() {
            isEndOfWord = true;
        }
    }
}

使用场景案例

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
/**
 * 敏感词前缀树(字典树)
 */
@Slf4j
@Component
public class ChatSensitiveTrie {
    /**
     * 敏感词前缀树。
     * 注意:敏感词更新频率很低,且对实时性没有要求。不要加任何同步机制。连volatile都不要加。
     */
    private Trie prefixTrie = new Trie();


    /**
     * 初始化
     *
     * @param sensitiveWords 敏感词列表
     */
    public synchronized void init(List<String> sensitiveWords) {
        Trie trie = new Trie();
        for (String sensitiveWord : sensitiveWords) {
            // 忽略大小写
            String lowerWord = sensitiveWord.toLowerCase();
            trie.insert(lowerWord);
        }
        prefixTrie = trie;
        log.info("敏感词初始化成功,共初始化敏感词{}个", sensitiveWords.size());
        log.debug("敏感词内容:{}", sensitiveWords);
    }

    /**
     * 是否涉及敏感词
     *
     * @param chatMessage 聊天信息
     * @return 是否涉及敏感词 true/false
     */
    public boolean isSensitive(String chatMessage) {
        String lowerCaseMsg = chatMessage.toLowerCase();
        return prefixTrie.search(lowerCaseMsg);
    }
}