C# 敏感词过滤算法实现

系统运维2025-11-05 04:55:408122

 

本文转载自微信公众号「UP技术控」,作者conan。转载本文请联系UP技术控公众号。

敏感词、文字过滤是一个网站必不可少的功能,如何设计一个好的、高效的过滤算法是非常有必要的。

在实现文字过滤的算法中,DFA是唯一比较好的实现算法。DFA即Deterministic Finite Automaton,也就是确定有穷自动机,它是通过event和当前的state得到下一个state,即event+state=nextstate。在实现敏感词过滤的算法中,我们必须要减少运算,而DFA算法中几乎没有什么计算,有的只是状态的转换。

下面看一下在C#中的实现方式。

1、构建敏感词库类

/// <summary>
/// Builds the in-memory lexicon on first use. Words are bucketed by their first
/// character: <c>_memoryLexicon[c]</c> holds the group of words starting with
/// <c>c</c>, and each group stores the remainder of the word after that character.
/// Every word is also registered in its Traditional Chinese form when that differs.
/// </summary>
/// <returns>
/// <c>true</c> when the lexicon is available (already built or built by this call);
/// <c>false</c> when the word source returned <c>null</c>.
/// </returns>
private bool LoadDictionary()
{
    if (_memoryLexicon != null)
    {
        return true;
    }

    var words = new SensitiveWordBll().GetAllWords();
    if (words == null)
    {
        // Leave _memoryLexicon unset so that the next call retries the load
        // instead of reporting success with an empty lexicon.
        return false;
    }

    // One slot per possible char value, including char.MaxValue itself; an array of
    // length char.MaxValue would overflow when indexed with '\uFFFF'.
    var lexicon = new WordGroup[char.MaxValue + 1];
    foreach (string word in words)
    {
        if (string.IsNullOrEmpty(word))
        {
            continue;
        }

        AddToLexicon(lexicon, word);

        var traditionalWord = Microsoft.VisualBasic.Strings.StrConv(word,
            Microsoft.VisualBasic.VbStrConv.TraditionalChinese, 0);
        if (word != traditionalWord)
        {
            AddToLexicon(lexicon, traditionalWord);
        }
    }

    // Publish only after the lexicon has been fully populated.
    _memoryLexicon = lexicon;
    return true;
}

/// <summary>
/// Registers <paramref name="word"/> in the group indexed by its first character,
/// creating the group on demand. Empty or null words are ignored.
/// </summary>
private static void AddToLexicon(WordGroup[] lexicon, string word)
{
    if (string.IsNullOrEmpty(word))
    {
        return;
    }

    var group = lexicon[word[0]];
    if (group == null)
    {
        group = new WordGroup();
        lexicon[word[0]] = group;
    }

    // Only the tail is stored; the first character is implied by the slot index.
    group.Add(word.Substring(1));
}

2、构建敏感词检测类

/// <summary>
/// Tests whether <paramref name="blackWord"/> (the part of a sensitive word after its
/// first character) occurs in <c>_sourceText</c> starting right after <c>_cursor</c>.
/// Characters that are neither Chinese, numeric nor alphabetic are skipped, and up to
/// four consecutive non-matching characters are tolerated between matches.
/// On return <c>_wordlenght</c> holds the number of source characters consumed after
/// <c>_cursor</c>, and <c>_nextCursor</c> the position following the last one examined.
/// </summary>
/// <param name="blackWord">Remaining characters of the candidate word.</param>
/// <returns><c>true</c> when every character of <paramref name="blackWord"/> was matched.</returns>
private bool Check(string blackWord)
{
    _wordlenght = 0;
    // Position in the source text of the next character to examine.
    _nextCursor = _cursor + 1;

    var matched = false;
    var retries = 0;
    var index = 0;

    // Walk the word one character at a time; index only advances on a match.
    while (index < blackWord.Length)
    {
        // Source text exhausted before the whole word was matched.
        if (_nextCursor >= _sourceText.Length)
        {
            return false;
        }

        // Count the separator characters (not Chinese, digit or letter) to skip.
        var skipped = 0;
        for (var probe = _nextCursor; probe < _sourceText.Length; probe++)
        {
            var current = _sourceText[probe];
            if (IsChs(current) || IsNum(current) || IsAlphabet(current))
            {
                break;
            }

            skipped++;
            // Stop once skipping would run past the end of the source text.
            if (_nextCursor + skipped >= _sourceText.Length)
            {
                break;
            }

            _wordlenght++;
        }

        if (_nextCursor + skipped >= _sourceText.Length)
        {
            return false;
        }

        if (blackWord[index] == _sourceText[_nextCursor + skipped])
        {
            matched = true;
            retries = 0;
            index++;
        }
        else if (retries < 4 && _nextCursor < _sourceText.Length - 1)
        {
            // On a mismatch, retry the same word character against up to four
            // further source characters.
            retries++;
        }
        else
        {
            return false;
        }

        _nextCursor = _nextCursor + 1 + skipped;
        _wordlenght++;
    }

    return matched;
}
} // closing brace carried over from the original listing; its opening declaration is not shown

3、测试与使用方法

// ---------------------------------------------------------------------------
// Snippet 1: body of the filtering routine. The enclosing method declaration is
// not part of this listing; sourceText, isFirstCheckedReturn, ReplaceChar and
// SourceText are supplied by that missing context.
// Replaces every detected sensitive word in the source text with ReplaceChar,
// records the matched text in _illegalWords and returns the masked string
// (or null at the first hit when isFirstCheckedReturn is set).
// ---------------------------------------------------------------------------
_illegalWords = new List<string>();
if (string.IsNullOrEmpty(sourceText) && string.IsNullOrEmpty(_sourceText))
{
    return sourceText;
}
if (!string.IsNullOrEmpty(sourceText))
{
    _sourceText = sourceText;
}
_cursor = 0;
if (!LoadDictionary())
{
    return _sourceText;
}
var tempString = _sourceText.ToCharArray();
var sourceTextDbc = ToDBC(SourceText);
for (var i = 0; i < SourceText.Length; i++)
{
    // Look up the group of words that start with the current character.
    var group = _memoryLexicon[sourceTextDbc[i]];
    if (group != null)
    {
        for (var z = 0; z < group.Count(); z++)
        {
            string word = group.GetWord(z);
            bool isMatch;
            if (word.Length == 0)
            {
                // Single-character word: Check is not called, so reset the match
                // length explicitly instead of reusing the value left behind by a
                // previous Check call.
                _wordlenght = 0;
                isMatch = true;
            }
            else
            {
                isMatch = Check(word);
            }

            if (isMatch)
            {
                if (isFirstCheckedReturn)
                {
                    return null;
                }

                // The match spans _wordlenght + 1 characters starting at _cursor.
                var matchLength = _wordlenght + 1;
                var blackword = new string(tempString, _cursor, matchLength);
                for (var pos = 0; pos < matchLength; pos++)
                {
                    tempString[pos + _cursor] = ReplaceChar;
                }
                _illegalWords.Add(blackword);

                // Jump past the masked text; the loop increments below add the final +1.
                _cursor = _cursor + _wordlenght;
                i = i + _wordlenght;
                break;
            }
        }
    }
    _cursor++;
}
return new string(tempString);

// ---------------------------------------------------------------------------
// Snippet 2: usage example that times the filter against a naive
// string.Replace loop over the same word list.
// ---------------------------------------------------------------------------
var filter = new SensitiveWordFilter();
filter.SourceText = "dddddd";
var sourctText = filter.SourceText;
filter.ResetMemoryLexicon();

// Stopwatch is used for timing because DateTime.Now has coarse resolution.
var filterTimer = System.Diagnostics.Stopwatch.StartNew();
var ss = filter.Filter();
filterTimer.Stop();
var millisecond = filterTimer.Elapsed.TotalMilliseconds;
Console.WriteLine(millisecond);
Console.WriteLine(ss);

var words = System.IO.File.ReadAllLines(@"D:\Recv\敏感词库大全.txt", System.Text.Encoding.UTF8);
var ssx = sourctText;
var replaceTimer = System.Diagnostics.Stopwatch.StartNew();
foreach (var word in words)
{
    if (word.Length > 0)
    {
        // Mask with as many '*' as the word has characters
        // (the original passed an unquoted * to PadLeft, which does not compile).
        ssx = ssx.Replace(word, new string('*', word.Length));
    }
}
replaceTimer.Stop();
var millisecondx = replaceTimer.Elapsed.TotalMilliseconds;
Console.WriteLine(millisecondx);
Console.WriteLine(ssx);
本文地址:http://www.bzve.cn/news/307a63199061.html
版权声明

本文仅代表作者观点,不代表本站立场。
本文系作者授权发表,未经许可,不得转载。

全站热门

电脑剪映短剧教程(以电脑剪映为背景,教你制作出精彩的短剧作品)

探索fotodiox转接环的功能与应用(发现fotodiox转接环的适配能力和创意摄影用途)

华硕主板BIOS设置U盘启动教程(轻松学会设置华硕主板BIOS实现U盘启动)

SonyXZPremium屏幕细腻逼真(4K超高清分辨率带来视觉盛宴)

苹果6s手机质量评测(用心设计,品质保证,苹果6s手机的优势与不足)

使用Ghost11.5.1创建美味的大白菜菜谱(轻松学会使用Ghost11.5.1软件制作大白菜菜谱)

Windows重装系统教程(一步一步教你轻松搞定,让你的电脑焕然一新)

Vivoxmax(探索创新科技,畅享高品质音频体验)

友情链接

滇ICP备2023006006号-39