我也是一个懂状态机的男人了

DFA 算法简介

DFA算法全称是Deterministic finite automaton,翻译为中文是”确定有限状态机“。
此算法可以用于敏感字符的模式匹配,敏感词集合可以确定有限个状态,每个敏感词中的一个字符代表状态机中的一个状态。敏感词中的相邻字符代表相邻状态,前一个字符对应的状态可以转移到后一个字符对应的状态。除此之外,敏感词的最后一个字符对应的状态还需一个结束标识。
在进行匹配时顺序遍历文本,在状态机中查找相应状态,遇到结束标识时表示匹配到一个敏感词,记录匹配的长度,文本偏移相应的长度继续查找直到遍历到文本结尾。

C++ 代码实现

words_filter.hpp

/*************************************************************************
    > File Name: words_filter.hpp
    > Author: ce39906
    > Mail: ce39906@163.com
    > Created Time: 2019-02-25 15:05:00
 ************************************************************************/
#ifndef WORDS_FILTER_HPP
#define WORDS_FILTER_HPP
#include <map>
#include <vector>
#include <set>
#include <string>

struct TreeNode
{
    using TreeMap = std::map<unsigned char, TreeNode*>;

    TreeNode()
    {
        c_ = '0';
        is_end_ = false;
    }

    TreeNode(unsigned char c, bool is_end) :
        c_(c), is_end_(is_end)
    {
    }

    TreeNode* findChild(const unsigned char next_char) const
    {
        if (subtree_map_.count(next_char))
        {
            return subtree_map_.at(next_char);
        }

        return nullptr;
    }
    // insert and return child node
    TreeNode* insertChild(const unsigned char next_char)
    {
        // already have the child
        if (findChild(next_char))
        {
            return nullptr;
        }

        TreeNode* child = new TreeNode(next_char, false);
        if (child == nullptr)
        {
            return nullptr;
        }

        subtree_map_.insert(std::make_pair(next_char, child));
        return child;
    }
    // keyword
    unsigned char c_;
    // end flag
    bool is_end_;
    // subtree
    TreeMap subtree_map_;
};

class WordsFilterTree
{
  public:
    WordsFilterTree(const std::vector<std::string>& sensitive_words);

    bool addSensitiveWord(const std::string& sensitive_word);

    std::set<std::string> findAllSensitiveWords(const std::string& text,
                                                const int match_type = 2) const;

    std::string replaceAllSensitiveWords(const std::string& text,
                                         const bool unix_shell_colored = true,
                                         const int match_type = 2,
                                         const unsigned char replaced_char = '*') const;

  private:
    TreeNode root_;

    bool insert(TreeNode* parent, const std::string& sensitive_word);

    int checkSensitiveWord(const TreeNode* node,
                           const std::string& text,
                           const int begin_index,
                           const int match_type) const;
};

static const int kMinMatch = 1;
static const int kMaxMatch = 2;
static const int kBoldRedANSIColorCodeLen = 11;
static const std::string kBoldRedANSIColorCodePrefix = "\033[1;31m";
static const std::string kBoldRedANSIColorCodeSuffix = "\033[0m";

static int utf8StringLen(const std::string& word)
{
    const char* s = word.c_str();
    int len = 0;
    while (*s) len += (*s++ & 0xc0) != 0x80;
    return len;
}

#endif

words_filter.cpp

/*************************************************************************
    > File Name: words_filter.cpp
    > Author: ce39906
    > Mail: ce39906@163.com
    > Created Time: 2019-02-26 14:17:18
 ************************************************************************/
#include "words_filter.hpp"

WordsFilterTree::WordsFilterTree(const std::vector<std::string>& sensitive_words)
{
    for (const std::string& sensitive_word : sensitive_words)
    {
        insert(&root_, sensitive_word);
    }
}

bool WordsFilterTree::addSensitiveWord(const std::string& sensitive_word)
{
    return insert(&root_, sensitive_word);
}

std::set<std::string>
WordsFilterTree::findAllSensitiveWords(const std::string& text,
                                       const int match_type) const
{
    std::set<std::string> matched_words;
    int begin_index = 0;
    const int n = text.size();
    while (begin_index < n)
    {
        int match_len = checkSensitiveWord(&root_, text, begin_index, match_type);
        if (match_len == 0)
        {
            begin_index++;
        }
        else
        {
            matched_words.insert(text.substr(begin_index, match_len));
            begin_index += match_len;
        }
    }

    return matched_words;
}

std::string
WordsFilterTree::replaceAllSensitiveWords(const std::string& text,
                                          const bool unix_shell_colored,
                                          const int match_type,
                                          const unsigned char replaced_char) const
{
    //get one copy
    std::string replaced_text = text;
    int begin_index = 0;
    int shift_len = 0;
    const int n = text.size();
    while (begin_index < n)
    {
        int match_len = checkSensitiveWord(&root_, text, begin_index, match_type);
        if (match_len == 0)
        {
            begin_index++;
        }
        else
        {
            const std::string matched_word = text.substr(begin_index, match_len);
            int utf8Len = utf8StringLen(matched_word);
            std::string replaced_str = std::string(utf8Len, replaced_char);
            if (unix_shell_colored)
            {
                replaced_str =   kBoldRedANSIColorCodePrefix
                               + replaced_str
                               + kBoldRedANSIColorCodeSuffix;
            }

            replaced_text =   replaced_text.substr(0, begin_index - shift_len)
                            + replaced_str
                            + replaced_text.substr(begin_index + match_len - shift_len);

            begin_index += match_len;
            shift_len += match_len - utf8Len;
            shift_len -= unix_shell_colored ? kBoldRedANSIColorCodeLen : 0;
        }
    }

    return replaced_text;
}

bool WordsFilterTree::insert(TreeNode* parent, const std::string& sensitive_word)
{

    const int n = sensitive_word.size();
    for (int i = 0; i < n; i++)
    {
        if (!parent)
        {
            return false;
        }

        const unsigned char c = sensitive_word[i];
        TreeNode* child = parent->findChild(c);
        if (!child)
        {
            parent = parent->insertChild(c);
        }
        else
        {
            parent = child;
        }

        if (i == n - 1)
        {
            parent->is_end_ = true;
        }
    }

    return true;
}

int WordsFilterTree::checkSensitiveWord(const TreeNode* node,
                                        const std::string& text,
                                        const int begin_index,
                                        const int match_type) const
{
    bool flag = false;
    int match_len = 0;
    const int n = text.size();
    for (int i = begin_index; i < n; i++)
    {
        const unsigned char c = text[i];
        node = node->findChild(c);
        if (!node)
        {
            break;
        }
        else
        {
            match_len++;
            if (node->is_end_)
            {
                flag = true;
                if (match_type == kMinMatch)
                {
                    break;
                }
            }
        }
    }

    if (match_len < 2 || !flag)
    {
        match_len = 0;
    }

    return match_len;
}

测试

测试代码如下

/*************************************************************************
    > File Name: test_words_filter.cpp
    > Author: ce39906
    > Mail: ce39906@163.com
    > Created Time: 2019-02-25 16:36:46
 ************************************************************************/
#include <iostream>
#include "words_filter.hpp"
using namespace std;

int main()
{
    vector<string> sensitive_words = {
        "全部",
        "手机",
        "追逐",
        "haha",
        "Indian",
        "Force"
    };

    string text = "如果流浪是你的天赋,那么你一定是我最美的追逐,"
                  "如果爱情是你的游牧拥有过是不是该满足,谁带我踏"
                  "上孤独的丝路,追逐你的脚步,谁带我离开孤独的丝"
                  "路感受你的温度,我将眼泪流成天山上的湖,羌笛声"
                  "胡旋舞为你笑为你哭,爱上你的全部放弃我的全部,"
                  "爱上了你之后我开始领悟,陪你走了一段最唯美的国"
                  "度.Pakistan Armed Forces spokesperson, Major G"
                  "eneral Asif Ghafoor, on Twitter claimed Indian"
                  " Air Force violated the Line of Control. Pakis"
                  "tan Air Force immediately scrambled. Indian aircraf";

    WordsFilterTree words_filter(sensitive_words);
    cout << "Init sensitive words are :" << endl;
    for (const string& sensitive_word : sensitive_words)
    {
        cout << kBoldRedANSIColorCodePrefix << sensitive_word
             << kBoldRedANSIColorCodeSuffix << " ";
    }
    cout << endl;

    string word = "温度";
    cout << "Add new sensitive word :" << endl;
    cout << kBoldRedANSIColorCodePrefix << word << kBoldRedANSIColorCodeSuffix;
    words_filter.addSensitiveWord(word);
    cout << endl;

    set<string> matched_words = words_filter.findAllSensitiveWords(text, 1);
    cout << "Matched sensitvie words are :" << endl;
    for (const string& matched_word : matched_words)
    {
        cout << kBoldRedANSIColorCodePrefix << matched_word
             << kBoldRedANSIColorCodeSuffix << " ";
    }
    cout << endl;

    cout << "Origin text is :" << endl;
    cout << text << endl;
    string replaced_text = words_filter.replaceAllSensitiveWords(text);
    cout << "Replaced text is :" << endl;
    cout << replaced_text << endl;

    return 0;
}

代码地址

https://github.com/ce39906/self-practices/tree/master/cppcode/words_filter

编译

g++ -std=c++11 test_words_filter.cpp words_filter.cpp -o words_filter

结果

./words_filter

pic