引用地址:http://www.cnblogs.com/xingd/archive/2008/02/01/1061800.html
主要是参考上面的代码改的,bool的那个方法基本上没有改,string的那个方法是我参照bool的方法改的.
原楼主只写了一些片段(核心代码),楼主主要是实现了检测是不是含有脏字,并没有替换,相信大家都会改,但即使简单,也是需要人做的,再说了,这么简单的事情,大鸟,大侠们当然不屑了,我是菜鸟我不怕,copy后修改是我自身的属性,OK,你们不要笑我...
我试了多次,很爽,速度很快....有什么不对的请大伙指教,还有 while (index < text.Length - 1 && (fastCheck[text[++index]] & 1) == 0) ; 这个不知道是什么意思..
using System; using System.Collections; using System.Collections.Generic; using System.Linq; using System.Text; using System.IO;
namespace CommonUnit
{
    /// <summary>
    /// Detects and masks "bad words" in text using a per-character bit table
    /// (<c>fastCheck</c>) to reject most positions cheaply before falling back
    /// to a hash-set lookup of candidate substrings.
    /// </summary>
    /// <remarks>
    /// Bit <c>i</c> (0..6) of <c>fastCheck[c]</c> is set when some bad word has
    /// character <c>c</c> at position <c>i</c>; bit 7 is shared by every position
    /// from 7 onwards. Single-character words live in <c>charCheck</c>, longer
    /// words in <c>hash</c>.
    /// </remarks>
    public class BadWordParse
    {
        // One slot per possible UTF-16 code unit, including '\uFFFF'.
        private const int CharTableSize = char.MaxValue + 1;
        private const string DefaultReplaceString = "*";

        private readonly HashSet<string> hash = new HashSet<string>();
        private readonly byte[] fastCheck = new byte[CharTableSize];
        private readonly BitArray charCheck = new BitArray(CharTableSize);
        private int maxWordLength = 0;
        private int minWordLength = int.MaxValue;
        private bool _isHave = false;
        private string _replaceString = DefaultReplaceString;
        private char _splitString = '|';
        private string _newWord;
        private string _badWordFilePath;

        /// <summary>
        /// Whether the most recent <see cref="ReplaceBadWord"/> call found at least one bad word.
        /// </summary>
        public bool IsHave
        {
            get { return _isHave; }
        }

        /// <summary>
        /// Mask used for replaced words. A matched word of length n is replaced by this
        /// string padded on the right with its first character up to n characters; a
        /// single-character word is replaced by the first character only.
        /// A null or empty value falls back to "*".
        /// </summary>
        public string ReplaceString
        {
            set { _replaceString = string.IsNullOrEmpty(value) ? DefaultReplaceString : value; }
        }

        /// <summary>
        /// Separator between words in the dictionary file. The dictionary is read when the
        /// object is constructed, so a new value only takes effect after <see cref="Reload"/>.
        /// </summary>
        public char SplitString
        {
            set { _splitString = value; }
        }

        /// <summary>
        /// The text produced by the most recent <see cref="ReplaceBadWord"/> call.
        /// </summary>
        public string NewWord
        {
            get { return _newWord; }
        }

        /// <summary>
        /// Path of the dictionary file. Changing it only takes effect after <see cref="Reload"/>.
        /// </summary>
        public string BadWordFilePath
        {
            get { return _badWordFilePath; }
            set { _badWordFilePath = value; }
        }

        /// <summary>
        /// Loads the dictionary from <paramref name="filePath"/>, using '|' as the word separator.
        /// A missing file yields an empty dictionary.
        /// </summary>
        /// <param name="filePath">Path of the gb2312-encoded dictionary file.</param>
        public BadWordParse(string filePath)
            : this(filePath, '|')
        {
        }

        /// <summary>
        /// Loads the dictionary from <paramref name="filePath"/>, splitting it on
        /// <paramref name="splitChar"/>. A missing file yields an empty dictionary.
        /// </summary>
        /// <param name="filePath">Path of the gb2312-encoded dictionary file.</param>
        /// <param name="splitChar">Separator between words in the file.</param>
        public BadWordParse(string filePath, char splitChar)
        {
            _badWordFilePath = filePath;
            _splitString = splitChar;
            LoadFromFile();
        }

        // Used by FromWords: creates an instance with an empty dictionary.
        private BadWordParse()
        {
        }

        /// <summary>
        /// Builds a filter from an in-memory word list, without touching the file system.
        /// </summary>
        /// <param name="words">Bad words; null, empty and whitespace-only entries are ignored.</param>
        /// <returns>A filter whose dictionary contains the given words.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
        public static BadWordParse FromWords(IEnumerable<string> words)
        {
            if (words == null)
            {
                throw new ArgumentNullException("words");
            }

            BadWordParse parser = new BadWordParse();
            parser.AddWords(words);
            return parser;
        }

        /// <summary>
        /// Discards the current dictionary and re-reads it from <see cref="BadWordFilePath"/>
        /// using the current separator.
        /// </summary>
        public void Reload()
        {
            LoadFromFile();
        }

        /// <summary>
        /// Reports whether <paramref name="text"/> contains any dictionary word.
        /// </summary>
        /// <param name="text">Text to scan; null or empty text contains no bad words.</param>
        /// <returns>true when at least one bad word occurs in the text.</returns>
        public bool HasBadWord(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return false;
            }

            for (int index = 0; index < text.Length; index++)
            {
                if (MatchLengthAt(text, index) > 0)
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Returns a copy of <paramref name="text"/> with every dictionary word masked
        /// (see <see cref="ReplaceString"/>). Also updates <see cref="IsHave"/> and
        /// <see cref="NewWord"/> for this call.
        /// </summary>
        /// <param name="text">Text to clean; null or empty text is returned unchanged.</param>
        /// <returns>The masked text, or the original instance when nothing matched.</returns>
        public string ReplaceBadWord(string text)
        {
            // Reset per-call state so a clean text after a dirty one reports IsHave == false.
            _isHave = false;

            if (string.IsNullOrEmpty(text))
            {
                _newWord = text;
                return text;
            }

            // The input is scanned read-only and the output is built separately, so a mask
            // can never shift scan positions or be re-scanned as if it were input.
            StringBuilder result = null;
            int copiedUpTo = 0;
            int index = 0;

            while (index < text.Length)
            {
                int matchLength = MatchLengthAt(text, index);
                if (matchLength == 0)
                {
                    index++;
                    continue;
                }

                if (result == null)
                {
                    // Allocate lazily: clean text is returned without any copying.
                    result = new StringBuilder(text.Length);
                }

                result.Append(text, copiedUpTo, index - copiedUpTo);
                result.Append(BuildMask(matchLength));
                index += matchLength;
                copiedUpTo = index;
                _isHave = true;
            }

            if (result == null)
            {
                _newWord = text;
                return text;
            }

            result.Append(text, copiedUpTo, text.Length - copiedUpTo);
            _newWord = result.ToString();
            return _newWord;
        }

        // Clears the lookup tables and re-reads the dictionary file, if it exists.
        private void LoadFromFile()
        {
            ResetTables();

            // File.Exists returns false for null/empty paths, so no separate check is needed.
            if (!File.Exists(_badWordFilePath))
            {
                return;
            }

            string content;
            // NOTE(review): on .NET Core / .NET 5+ "gb2312" is presumably only available after
            // registering CodePagesEncodingProvider - confirm for the target runtime.
            using (StreamReader reader = new StreamReader(_badWordFilePath, Encoding.GetEncoding("gb2312")))
            {
                content = reader.ReadToEnd();
            }

            AddWords(content.Split(_splitString));
        }

        // Empties every lookup structure and resets the tracked word-length bounds.
        private void ResetTables()
        {
            hash.Clear();
            Array.Clear(fastCheck, 0, fastCheck.Length);
            charCheck.SetAll(false);
            maxWordLength = 0;
            minWordLength = int.MaxValue;
        }

        // Registers each non-blank word in the bit table and the single/multi-character sets.
        private void AddWords(IEnumerable<string> words)
        {
            foreach (string rawWord in words)
            {
                if (rawWord == null)
                {
                    continue;
                }

                // Trim so that a trailing line break in the file does not become part of a word,
                // and skip blanks so they cannot drag minWordLength down to 0.
                string word = rawWord.Trim();
                if (word.Length == 0)
                {
                    continue;
                }

                maxWordLength = Math.Max(maxWordLength, word.Length);
                minWordLength = Math.Min(minWordLength, word.Length);

                // Positions 0..6 each get their own bit.
                for (int i = 0; i < 7 && i < word.Length; i++)
                {
                    fastCheck[word[i]] |= (byte)(1 << i);
                }

                // Positions 7 and beyond share the top bit.
                for (int i = 7; i < word.Length; i++)
                {
                    fastCheck[word[i]] |= 0x80;
                }

                if (word.Length == 1)
                {
                    charCheck[word[0]] = true;
                }
                else
                {
                    hash.Add(word);
                }
            }
        }

        // Returns the length of the shortest dictionary word starting at text[index], or 0 if none.
        private int MatchLengthAt(string text, int index)
        {
            // Bit 0 set means "some word starts with this character".
            if ((fastCheck[text[index]] & 1) == 0)
            {
                return 0;
            }

            // Single-character words.
            if (minWordLength == 1 && charCheck[text[index]])
            {
                return 1;
            }

            // Multi-character words: extend one character at a time while the bit table
            // says the character is possible at that offset.
            for (int offset = 1; offset < maxWordLength && index + offset < text.Length; offset++)
            {
                if ((fastCheck[text[index + offset]] & (1 << Math.Min(offset, 7))) == 0)
                {
                    break;
                }

                int length = offset + 1;
                if (length >= minWordLength && hash.Contains(text.Substring(index, length)))
                {
                    return length;
                }
            }

            return 0;
        }

        // Builds the mask for a matched word of the given length (see ReplaceString).
        private string BuildMask(int length)
        {
            char fill = _replaceString[0];
            if (length == 1)
            {
                return fill.ToString();
            }

            return _replaceString.PadRight(length, fill);
        }
    }
}
脏字字典(E://Text/badword.txt),引用地址:http://www.cnblogs.com/goody9807/archive/2006/09/12/502094.html
以下是测试代码:
using System; using System.Collections.Generic; using System.Linq; using System.Text; using CommonLibrary; using NUnit.Framework;
namespace MyWebTest.CommonLibraryTest
{
    /// <summary>
    /// Manual timing/smoke test: feeds the dictionary file itself through
    /// BadWordParse.ReplaceBadWord and prints the elapsed time and both strings.
    /// </summary>
    [TestFixture]
    public class BadWordParseTest
    {
        /// <summary>
        /// Win32 tick counter (kernel32.dll) used to time the run.
        /// </summary>
        [System.Runtime.InteropServices.DllImport("kernel32.dll")]
        public static extern uint GetTickCount();

        /// <summary>
        /// Reads the dictionary file as the test input, runs the replacement and writes
        /// the elapsed ticks, the original text and the masked text to the console.
        /// </summary>
        [Test]
        public void Test()
        {
            string filePath = "E://Text/badword.txt";
            string testString;

            // 'using' guarantees the reader is closed even if ReadToEnd throws.
            using (System.IO.StreamReader sr = new System.IO.StreamReader(filePath, System.Text.Encoding.GetEncoding("gb2312")))
            {
                testString = sr.ReadToEnd();
            }

            uint t = GetTickCount();
            // Fully qualified: the class is declared in namespace CommonUnit in this listing.
            CommonUnit.BadWordParse bwp = new CommonUnit.BadWordParse(filePath);
            string parsedString = bwp.ReplaceBadWord(testString);
            uint time = GetTickCount() - t;

            Console.Write("使用时间:" + time.ToString());
            Console.Write("\r\n");
            // Print the untouched input here; the original printed the masked text twice.
            Console.Write("原始字符串" + testString);
            Console.Write("\r\n");
            Console.Write("替换后字符串" + parsedString);
        }
    }
}
测试结果图片:
转载于:https://www.cnblogs.com/bbqqqbq/archive/2008/12/10/1352142.html