C++ DFA实现敏感词过滤
我也是一个懂状态机的男人了
DFA 算法简介
DFA算法全称是Deterministic finite automaton,翻译为中文是”确定有限状态机“。
此算法可以用于敏感字符的模式匹配,敏感词集合可以确定有限个状态,每个敏感词中的一个字符代表状态机中的一个状态。敏感词中的相邻字符代表相邻状态,前一个字符对应的状态可以转移到后一个字符对应的状态。除此之外,敏感词的最后一个字符对应的状态还需一个结束标识。
在进行匹配时顺序遍历文本,在状态机中查找相应状态,遇到结束标识时表示匹配到一个敏感词,记录匹配的长度,文本偏移相应的长度继续查找直到遍历到文本结尾。
C++ 代码实现
words_filter.hpp
/*************************************************************************
> File Name: words_filter.hpp
> Author: ce39906
> Mail: ce39906@163.com
> Created Time: 2019-02-25 15:05:00
************************************************************************/
#ifndef WORDS_FILTER_HPP
#define WORDS_FILTER_HPP
#include <map>
#include <vector>
#include <set>
#include <string>
struct TreeNode
{
using TreeMap = std::map<unsigned char, TreeNode*>;
TreeNode()
{
c_ = '0';
is_end_ = false;
}
TreeNode(unsigned char c, bool is_end) :
c_(c), is_end_(is_end)
{
}
TreeNode* findChild(const unsigned char next_char) const
{
if (subtree_map_.count(next_char))
{
return subtree_map_.at(next_char);
}
return nullptr;
}
// insert and return child node
TreeNode* insertChild(const unsigned char next_char)
{
// already have the child
if (findChild(next_char))
{
return nullptr;
}
TreeNode* child = new TreeNode(next_char, false);
if (child == nullptr)
{
return nullptr;
}
subtree_map_.insert(std::make_pair(next_char, child));
return child;
}
// keyword
unsigned char c_;
// end flag
bool is_end_;
// subtree
TreeMap subtree_map_;
};
class WordsFilterTree
{
public:
WordsFilterTree(const std::vector<std::string>& sensitive_words);
bool addSensitiveWord(const std::string& sensitive_word);
std::set<std::string> findAllSensitiveWords(const std::string& text,
const int match_type = 2) const;
std::string replaceAllSensitiveWords(const std::string& text,
const bool unix_shell_colored = true,
const int match_type = 2,
const unsigned char replaced_char = '*') const;
private:
TreeNode root_;
bool insert(TreeNode* parent, const std::string& sensitive_word);
int checkSensitiveWord(const TreeNode* node,
const std::string& text,
const int begin_index,
const int match_type) const;
};
static const int kMinMatch = 1;
static const int kMaxMatch = 2;
static const int kBoldRedANSIColorCodeLen = 11;
static const std::string kBoldRedANSIColorCodePrefix = "\033[1;31m";
static const std::string kBoldRedANSIColorCodeSuffix = "\033[0m";
static int utf8StringLen(const std::string& word)
{
const char* s = word.c_str();
int len = 0;
while (*s) len += (*s++ & 0xc0) != 0x80;
return len;
}
#endif
words_filter.cpp
/*************************************************************************
> File Name: words_filter.cpp
> Author: ce39906
> Mail: ce39906@163.com
> Created Time: 2019-02-26 14:17:18
************************************************************************/
#include "words_filter.hpp"
WordsFilterTree::WordsFilterTree(const std::vector<std::string>& sensitive_words)
{
for (const std::string& sensitive_word : sensitive_words)
{
insert(&root_, sensitive_word);
}
}
bool WordsFilterTree::addSensitiveWord(const std::string& sensitive_word)
{
return insert(&root_, sensitive_word);
}
std::set<std::string>
WordsFilterTree::findAllSensitiveWords(const std::string& text,
const int match_type) const
{
std::set<std::string> matched_words;
int begin_index = 0;
const int n = text.size();
while (begin_index < n)
{
int match_len = checkSensitiveWord(&root_, text, begin_index, match_type);
if (match_len == 0)
{
begin_index++;
}
else
{
matched_words.insert(text.substr(begin_index, match_len));
begin_index += match_len;
}
}
return matched_words;
}
std::string
WordsFilterTree::replaceAllSensitiveWords(const std::string& text,
const bool unix_shell_colored,
const int match_type,
const unsigned char replaced_char) const
{
//get one copy
std::string replaced_text = text;
int begin_index = 0;
int shift_len = 0;
const int n = text.size();
while (begin_index < n)
{
int match_len = checkSensitiveWord(&root_, text, begin_index, match_type);
if (match_len == 0)
{
begin_index++;
}
else
{
const std::string matched_word = text.substr(begin_index, match_len);
int utf8Len = utf8StringLen(matched_word);
std::string replaced_str = std::string(utf8Len, replaced_char);
if (unix_shell_colored)
{
replaced_str = kBoldRedANSIColorCodePrefix
+ replaced_str
+ kBoldRedANSIColorCodeSuffix;
}
replaced_text = replaced_text.substr(0, begin_index - shift_len)
+ replaced_str
+ replaced_text.substr(begin_index + match_len - shift_len);
begin_index += match_len;
shift_len += match_len - utf8Len;
shift_len -= unix_shell_colored ? kBoldRedANSIColorCodeLen : 0;
}
}
return replaced_text;
}
bool WordsFilterTree::insert(TreeNode* parent, const std::string& sensitive_word)
{
const int n = sensitive_word.size();
for (int i = 0; i < n; i++)
{
if (!parent)
{
return false;
}
const unsigned char c = sensitive_word[i];
TreeNode* child = parent->findChild(c);
if (!child)
{
parent = parent->insertChild(c);
}
else
{
parent = child;
}
if (i == n - 1)
{
parent->is_end_ = true;
}
}
return true;
}
int WordsFilterTree::checkSensitiveWord(const TreeNode* node,
const std::string& text,
const int begin_index,
const int match_type) const
{
bool flag = false;
int match_len = 0;
const int n = text.size();
for (int i = begin_index; i < n; i++)
{
const unsigned char c = text[i];
node = node->findChild(c);
if (!node)
{
break;
}
else
{
match_len++;
if (node->is_end_)
{
flag = true;
if (match_type == kMinMatch)
{
break;
}
}
}
}
if (match_len < 2 || !flag)
{
match_len = 0;
}
return match_len;
}
测试
测试代码如下
/*************************************************************************
> File Name: test_words_filter.cpp
> Author: ce39906
> Mail: ce39906@163.com
> Created Time: 2019-02-25 16:36:46
************************************************************************/
#include <iostream>
#include "words_filter.hpp"
using namespace std;
int main()
{
vector<string> sensitive_words = {
"全部",
"手机",
"追逐",
"haha",
"Indian",
"Force"
};
string text = "如果流浪是你的天赋,那么你一定是我最美的追逐,"
"如果爱情是你的游牧拥有过是不是该满足,谁带我踏"
"上孤独的丝路,追逐你的脚步,谁带我离开孤独的丝"
"路感受你的温度,我将眼泪流成天山上的湖,羌笛声"
"胡旋舞为你笑为你哭,爱上你的全部放弃我的全部,"
"爱上了你之后我开始领悟,陪你走了一段最唯美的国"
"度.Pakistan Armed Forces spokesperson, Major G"
"eneral Asif Ghafoor, on Twitter claimed Indian"
" Air Force violated the Line of Control. Pakis"
"tan Air Force immediately scrambled. Indian aircraf";
WordsFilterTree words_filter(sensitive_words);
cout << "Init sensitive words are :" << endl;
for (const string& sensitive_word : sensitive_words)
{
cout << kBoldRedANSIColorCodePrefix << sensitive_word
<< kBoldRedANSIColorCodeSuffix << " ";
}
cout << endl;
string word = "温度";
cout << "Add new sensitive word :" << endl;
cout << kBoldRedANSIColorCodePrefix << word << kBoldRedANSIColorCodeSuffix;
words_filter.addSensitiveWord(word);
cout << endl;
set<string> matched_words = words_filter.findAllSensitiveWords(text, 1);
cout << "Matched sensitvie words are :" << endl;
for (const string& matched_word : matched_words)
{
cout << kBoldRedANSIColorCodePrefix << matched_word
<< kBoldRedANSIColorCodeSuffix << " ";
}
cout << endl;
cout << "Origin text is :" << endl;
cout << text << endl;
string replaced_text = words_filter.replaceAllSensitiveWords(text);
cout << "Replaced text is :" << endl;
cout << replaced_text << endl;
return 0;
}
代码地址
https://github.com/ce39906/self-practices/tree/master/cppcode/words_filter
编译
g++ -std=c++11 test_words_filter.cpp words_filter.cpp -o words_filter
结果
./words_filter