脏字处理类,很快

mac2022-06-30  74

引用地址:http://www.cnblogs.com/xingd/archive/2008/02/01/1061800.html

主要是参考上面的代码改的,bool的那个方法基本上没有改,string的那个方法是我参照bool的方法改的.

原楼主只写了一些片段(核心代码),楼主主要是实现了检测是不是含有脏字,并没有替换,相信大家都会改,但即使简单,也是需要人做的,再说了,这么简单的事情,大鸟,大侠们当然不屑了,我是菜鸟我不怕,copy后修改是我自身的属性,OK,你们不要笑我...

我试了多次,很爽,速度很快。有什么不对的地方请大伙指教。还有 while (index < text.Length - 1 && (fastCheck[text[++index]] & 1) == 0) ; 这一句,我不知道是什么意思。

using System; using System.Collections; using System.Collections.Generic; using System.Linq; using System.Text; using System.IO;

namespace CommonUnit
{
    /// <summary>
    /// Detects and masks "bad words" listed in a separator-delimited dictionary file.
    /// </summary>
    /// <remarks>
    /// Matching uses two per-character tables as a cheap pre-filter before any
    /// substring is hashed:
    /// <list type="bullet">
    /// <item><description><c>fastCheck[c]</c> has bit <c>i</c> (0..6) set when <c>c</c> occurs at
    /// position <c>i</c> of some dictionary word; bit 7 is set when it occurs at position 7 or later.</description></item>
    /// <item><description><c>charCheck[c]</c> is set when <c>c</c> on its own is a dictionary word.</description></item>
    /// </list>
    /// <see cref="ReplaceBadWord"/> stores its result in instance state
    /// (<see cref="IsHave"/>, <see cref="NewWord"/>).
    /// NOTE: the dictionary is read as "gb2312"; on .NET Core / .NET 5+ that code page is only
    /// available after <c>CodePagesEncodingProvider</c> has been registered by the host application.
    /// </remarks>
    public class BadWordParse
    {
        // char.MaxValue is 0xFFFF, so the tables need char.MaxValue + 1 slots for
        // '\uFFFF' itself to be a valid index.
        private const int CharTableSize = char.MaxValue + 1;

        private HashSet<string> hash = new HashSet<string>();
        private byte[] fastCheck = new byte[CharTableSize];
        private BitArray charCheck = new BitArray(CharTableSize);
        private int maxWordLength = 0;
        private int minWordLength = int.MaxValue;
        private bool _isHave = false;
        private string _replaceString = "*";
        private char _splitString = '|';
        private string _newWord;
        private string _badWordFilePath;

        /// <summary>
        /// Whether the most recent <see cref="ReplaceBadWord"/> call found at least one bad word.
        /// </summary>
        public bool IsHave
        {
            get { return _isHave; }
        }

        /// <summary>
        /// Replacement text used for masking. A one-character word is replaced by the first
        /// character of this string; a longer word is replaced by this string right-padded with
        /// its first character up to the word's length. A null or empty value falls back to "*".
        /// </summary>
        public string ReplaceString
        {
            set { _replaceString = value; }
        }

        /// <summary>
        /// Separator of the dictionary file. The dictionary is loaded in the constructor, so
        /// assigning this afterwards does not reload it; use the two-argument constructor to
        /// load a dictionary with a different separator.
        /// </summary>
        public char SplitString
        {
            set { _splitString = value; }
        }

        /// <summary>
        /// The text returned by the most recent <see cref="ReplaceBadWord"/> call.
        /// </summary>
        public string NewWord
        {
            get { return _newWord; }
        }

        /// <summary>
        /// Path of the dictionary file. Assigning it does not reload the dictionary.
        /// </summary>
        public string BadWordFilePath
        {
            get { return _badWordFilePath; }
            set { _badWordFilePath = value; }
        }

        /// <summary>
        /// Loads a '|'-separated dictionary from <paramref name="filePath"/>.
        /// A missing file yields an empty dictionary.
        /// </summary>
        /// <param name="filePath">Path of the dictionary file.</param>
        public BadWordParse(string filePath)
        {
            _badWordFilePath = filePath;
            LoadDictionary(filePath, _splitString);
        }

        /// <summary>
        /// Loads a dictionary from <paramref name="filePath"/> whose entries are separated by
        /// <paramref name="splitString"/>. A missing file yields an empty dictionary.
        /// </summary>
        /// <param name="filePath">Path of the dictionary file.</param>
        /// <param name="splitString">Character separating the entries of the file.</param>
        public BadWordParse(string filePath, char splitString)
        {
            _badWordFilePath = filePath;
            _splitString = splitString;
            LoadDictionary(filePath, splitString);
        }

        /// <summary>
        /// Reports whether <paramref name="text"/> contains any dictionary word.
        /// </summary>
        /// <param name="text">Text to scan; null or empty text contains no bad word.</param>
        /// <returns><c>true</c> when at least one dictionary word occurs in the text.</returns>
        public bool HasBadWord(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return false;
            }

            for (int index = 0; index < text.Length; index++)
            {
                if (LongestMatchAt(text, index) > 0)
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Masks every dictionary word found in <paramref name="text"/>, scanning left to right
        /// and taking the longest dictionary word at each position.
        /// </summary>
        /// <param name="text">Text to filter; null or empty text is returned unchanged.</param>
        /// <returns>The masked text, also available afterwards through <see cref="NewWord"/>.</returns>
        public string ReplaceBadWord(string text)
        {
            // IsHave describes this call only, so clear whatever an earlier call left behind.
            _isHave = false;

            if (string.IsNullOrEmpty(text))
            {
                _newWord = text;
                return text;
            }

            // Build the output separately instead of rewriting the text being scanned:
            // one pass, and replacement characters are never themselves rescanned.
            StringBuilder masked = new StringBuilder(text.Length);
            int index = 0;
            while (index < text.Length)
            {
                int length = LongestMatchAt(text, index);
                if (length == 0)
                {
                    masked.Append(text[index]);
                    index++;
                    continue;
                }

                _isHave = true;
                masked.Append(BuildMask(length));
                index += length;
            }

            _newWord = masked.ToString();
            return _newWord;
        }

        // Reads the dictionary file (if it exists) and registers each entry.
        private void LoadDictionary(string filePath, char separator)
        {
            if (!File.Exists(filePath))
            {
                return;
            }

            string content;
            using (StreamReader reader = new StreamReader(filePath, Encoding.GetEncoding("gb2312")))
            {
                content = reader.ReadToEnd();
            }

            foreach (string entry in content.Split(separator))
            {
                // Trimming drops line breaks around entries (e.g. a trailing newline at the
                // end of the file) that would otherwise make the entry impossible to match.
                AddWord(entry.Trim());
            }
        }

        // Registers one dictionary word in the lookup tables; empty entries are ignored.
        private void AddWord(string word)
        {
            if (word.Length == 0)
            {
                // An empty entry would drag minWordLength down to 0 and put "" in the set.
                return;
            }

            maxWordLength = Math.Max(maxWordLength, word.Length);
            minWordLength = Math.Min(minWordLength, word.Length);

            for (int i = 0; i < word.Length; i++)
            {
                // Positions 0..6 get their own bit; every later position shares bit 7.
                fastCheck[word[i]] |= (byte)(1 << Math.Min(i, 7));
            }

            if (word.Length == 1)
            {
                charCheck[word[0]] = true;
            }
            else
            {
                hash.Add(word);
            }
        }

        // Returns the length of the longest dictionary word starting at text[start], or 0.
        private int LongestMatchAt(string text, int start)
        {
            char first = text[start];
            if ((fastCheck[first] & 1) == 0)
            {
                // No dictionary word starts with this character.
                return 0;
            }

            int best = charCheck[first] ? 1 : 0;

            // Longest candidate that fits both the dictionary and the remaining text.
            int limit = Math.Min(maxWordLength, text.Length - start);
            for (int offset = 1; offset < limit; offset++)
            {
                // Quick reject: this character never appears at this position of any word,
                // so no longer candidate starting at 'start' can match either.
                if ((fastCheck[text[start + offset]] & (1 << Math.Min(offset, 7))) == 0)
                {
                    break;
                }

                int length = offset + 1;
                if (length >= minWordLength && hash.Contains(text.Substring(start, length)))
                {
                    best = length;
                }
            }

            return best;
        }

        // Builds the replacement for a matched word of the given length.
        private string BuildMask(int length)
        {
            string replacement = string.IsNullOrEmpty(_replaceString) ? "*" : _replaceString;
            if (length == 1)
            {
                return replacement.Substring(0, 1);
            }

            return replacement.PadRight(length, replacement[0]);
        }
    }
}

脏字字典(E://Text/badword.txt)的引用地址:http://www.cnblogs.com/goody9807/archive/2006/09/12/502094.html

以下是测试代码:

using System; using System.Collections.Generic; using System.Linq; using System.Text; using CommonLibrary; using NUnit.Framework;

namespace MyWebTest.CommonLibraryTest
{
    /// <summary>
    /// Smoke/timing test: runs the dictionary file itself through
    /// <c>BadWordParse.ReplaceBadWord</c> and prints the elapsed time and both strings.
    /// </summary>
    /// <remarks>
    /// NOTE(review): this file imports <c>CommonLibrary</c>, while the BadWordParse listing
    /// declares namespace <c>CommonUnit</c> - confirm which namespace the class really lives in.
    /// </remarks>
    [TestFixture]
    public class BadWordParseTest
    {
        /// <summary>
        /// Win32 tick counter. No longer used by <see cref="Test"/>; kept so the fixture's
        /// public members are unchanged.
        /// </summary>
        [System.Runtime.InteropServices.DllImport("kernel32.dll")]
        public static extern uint GetTickCount();

        /// <summary>
        /// Filters the dictionary text with its own dictionary and reports the timing.
        /// </summary>
        [Test]
        public void Test()
        {
            string filePath = "E://Text/badword.txt";
            string testString;

            // 'using' releases the file handle even if reading throws.
            using (System.IO.StreamReader sr = new System.IO.StreamReader(filePath, System.Text.Encoding.GetEncoding("gb2312")))
            {
                testString = sr.ReadToEnd();
            }

            // Stopwatch replaces the Windows-only GetTickCount P/Invoke for timing.
            System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
            BadWordParse bwp = new BadWordParse(filePath);
            string parsedString = bwp.ReplaceBadWord(testString);
            timer.Stop();

            Console.Write("使用时间:" + timer.ElapsedMilliseconds.ToString());
            Console.Write("\r\n");
            // Print the untouched input here; the original printed the masked text twice.
            Console.Write("原始字符串" + testString);
            Console.Write("\r\n");
            Console.Write("替换后字符串" + parsedString);

            // With the default one-character ReplaceString, masking keeps the text length.
            Assert.That(parsedString.Length, Is.EqualTo(testString.Length));
        }
    }
}

测试结果图片:

转载于:https://www.cnblogs.com/bbqqqbq/archive/2008/12/10/1352142.html

最新回复(0)