脏字处理类，很快

mac2022-06-30 85

引用地址：http://www.cnblogs.com/xingd/archive/2008/02/01/1061800.html

主要是参考上面的代码改的，bool的那个方法基本上没有改，string的那个方法是我参照bool的方法改的．

原楼主只写了一些片段（核心代码），楼主主要是实现了检测是不是含有脏字，并没有替换，相信大家都会改，但即使简单，也是需要人做的，再说了，这么简单的事情，大鸟，大侠们当然不屑了，我是菜鸟我不怕，copy后修改是我自身的属性，ＯＫ，你们不要笑我．．．

我试了多次，很爽，速度很快．．．．有什么不对的请大伙指教，还有 while (index < text.Length - 1 && (fastCheck[text[++index]] & 1) == 0) ; 这个不知道是什么意思．．

using System; using System.Collections; using System.Collections.Generic; using System.Linq; using System.Text; using System.IO;

namespace CommonUnit
{
    /// <summary>
    /// Detects and masks "bad words" in a text, using a word list loaded from a file.
    /// </summary>
    /// <remarks>
    /// Single-character words are kept in a per-character bit table; longer words are kept in a
    /// hash set. A per-character byte table (<c>fastCheck</c>) records at which positions a
    /// character occurs inside any loaded word, so that most text positions can be rejected
    /// without building a substring.
    /// </remarks>
    public class BadWordParse
    {
        // One slot per UTF-16 code unit, including '\uffff' itself (hence the + 1).
        private const int CharTableSize = char.MaxValue + 1;

        // Words of two or more characters.
        private HashSet<string> hash = new HashSet<string>();

        // Bit i (0..6) set: some word has this character at position i.
        // Bit 7 (0x80) set: some word has this character at position 7 or later.
        private byte[] fastCheck = new byte[CharTableSize];

        // True for characters that are one-character words on their own.
        private BitArray charCheck = new BitArray(CharTableSize);

        private int maxWordLength = 0;
        private int minWordLength = int.MaxValue;
        private bool _isHave = false;
        private string _replaceString = "*";
        private char _splitString = '|';
        private string _newWord;
        private string _badWordFilePath;

        /// <summary>
        /// Whether the most recent <see cref="ReplaceBadWord"/> call found at least one bad word.
        /// </summary>
        public bool IsHave
        {
            get { return _isHave; }
        }

        /// <summary>
        /// Replacement text. Its first character is used as the mask character.
        /// </summary>
        public string ReplaceString
        {
            set { _replaceString = value; }
        }

        /// <summary>
        /// Separator between words in the dictionary file. The dictionary is parsed in the
        /// constructor, so assigning this afterwards does not re-parse the loaded words.
        /// </summary>
        public char SplitString
        {
            set { _splitString = value; }
        }

        /// <summary>
        /// The text produced by the most recent <see cref="ReplaceBadWord"/> call.
        /// </summary>
        public string NewWord
        {
            get { return _newWord; }
        }

        /// <summary>
        /// Path of the dictionary file.
        /// </summary>
        public string BadWordFilePath
        {
            get { return _badWordFilePath; }
            set { _badWordFilePath = value; }
        }

        /// <summary>
        /// Loads the word list from <paramref name="filePath"/> (read as gb2312, words separated
        /// by '|'). If the file does not exist the instance is created with an empty word list.
        /// </summary>
        /// <param name="filePath">Path of the dictionary file.</param>
        public BadWordParse(string filePath)
        {
            _badWordFilePath = filePath;

            string srList = string.Empty;
            if (File.Exists(_badWordFilePath))
            {
                // using guarantees the reader is released even if ReadToEnd throws.
                using (StreamReader sr = new StreamReader(_badWordFilePath, Encoding.GetEncoding("gb2312")))
                {
                    srList = sr.ReadToEnd();
                }
            }

            foreach (string rawWord in srList.Split(_splitString))
            {
                // Drop surrounding whitespace (e.g. a trailing line break in the file) and skip
                // empty entries: an empty entry would force minWordLength to 0 and thereby
                // switch off the single-character check.
                string word = rawWord.Trim();
                if (word.Length == 0)
                {
                    continue;
                }

                maxWordLength = Math.Max(maxWordLength, word.Length);
                minWordLength = Math.Min(minWordLength, word.Length);

                for (int i = 0; i < 7 && i < word.Length; i++)
                {
                    fastCheck[word[i]] |= (byte)(1 << i);
                }

                for (int i = 7; i < word.Length; i++)
                {
                    fastCheck[word[i]] |= 0x80;
                }

                if (word.Length == 1)
                {
                    charCheck[word[0]] = true;
                }
                else
                {
                    hash.Add(word);
                }
            }
        }

        /// <summary>
        /// Tests whether <paramref name="text"/> contains any loaded bad word.
        /// </summary>
        /// <param name="text">Text to scan; null or empty yields false.</param>
        /// <returns>True if at least one bad word occurs in the text.</returns>
        public bool HasBadWord(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return false;
            }

            int index = 0;
            while (index < text.Length)
            {
                index = SkipToCandidate(text, index);
                if (MatchLengthAt(text, index) > 0)
                {
                    return true;
                }
                index++;
            }
            return false;
        }

        /// <summary>
        /// Masks every bad word in <paramref name="text"/> and returns the result. The result is
        /// also stored in <see cref="NewWord"/>, and <see cref="IsHave"/> reports whether
        /// anything was replaced by this call.
        /// </summary>
        /// <param name="text">Text to process; null is returned unchanged.</param>
        /// <returns>The text with bad words masked.</returns>
        public string ReplaceBadWord(string text)
        {
            // Reset so IsHave describes this call rather than an earlier one.
            _isHave = false;

            if (text == null)
            {
                _newWord = null;
                return null;
            }

            // Fall back to "*" so that a null/empty ReplaceString cannot crash on the first hit.
            string replacement = string.IsNullOrEmpty(_replaceString) ? "*" : _replaceString;
            char mask = replacement[0];

            for (int index = 0; index < text.Length; index++)
            {
                index = SkipToCandidate(text, index);

                int matched = MatchLengthAt(text, index);
                if (matched == 0)
                {
                    continue;
                }

                _isHave = true;

                if (matched == 1)
                {
                    // A one-character word is bad wherever it occurs, so mask all occurrences.
                    text = text.Replace(text[index], mask);
                    continue;
                }

                string sub = text.Substring(index, matched);
                string rp = replacement.PadRight(matched, mask);
                text = text.Replace(sub, rp);

                // Resume right after the inserted replacement. rp is longer than the matched
                // word when ReplaceString itself is longer, so advance by rp.Length rather than
                // by the word length (the loop's index++ supplies the final +1).
                index += rp.Length - 1;
            }

            _newWord = text;
            return text;
        }

        /// <summary>
        /// Starting at <paramref name="index"/>, advances to the next character that can begin a
        /// word (bit 0 of its fastCheck entry is set), stopping at the last character at most.
        /// </summary>
        private int SkipToCandidate(string text, int index)
        {
            if ((fastCheck[text[index]] & 1) == 0)
            {
                while (index < text.Length - 1 && (fastCheck[text[++index]] & 1) == 0)
                {
                }
            }
            return index;
        }

        /// <summary>
        /// Returns the length of the shortest loaded word that starts at
        /// <paramref name="index"/>, or 0 if none does.
        /// </summary>
        private int MatchLengthAt(string text, int index)
        {
            // Single-character words.
            if (minWordLength == 1 && charCheck[text[index]])
            {
                return 1;
            }

            // Multi-character words: extend one character at a time.
            int limit = Math.Min(maxWordLength, text.Length - index - 1);
            for (int j = 1; j <= limit; j++)
            {
                // Quick reject: no word has this character at offset j.
                if ((fastCheck[text[index + j]] & (1 << Math.Min(j, 7))) == 0)
                {
                    break;
                }

                if (j + 1 >= minWordLength && hash.Contains(text.Substring(index, j + 1)))
                {
                    return j + 1;
                }
            }
            return 0;
        }
    }
}

脏字字典（E://Text/badword.txt）的引用地址：http://www.cnblogs.com/goody9807/archive/2006/09/12/502094.html

以下是测试代码：

using System; using System.Collections.Generic; using System.Linq; using System.Text; using CommonLibrary; using NUnit.Framework;

namespace MyWebTest.CommonLibraryTest
{
    /// <summary>
    /// Timing/smoke test for BadWordParse: runs the replacement over the dictionary file's own
    /// contents and prints the elapsed time together with the text before and after.
    /// </summary>
    // NOTE(review): the BadWordParse listing in this post declares namespace CommonUnit, while
    // this file imports CommonLibrary - confirm which namespace the project actually uses.
    [TestFixture]
    public class BadWordParseTest
    {
        /// <summary>Win32 tick counter in milliseconds (kernel32.dll, Windows only).</summary>
        [System.Runtime.InteropServices.DllImport("kernel32.dll")]
        public static extern uint GetTickCount();

        [Test]
        public void Test()
        {
            string filePath = "E://Text/badword.txt";

            // The dictionary file doubles as the input text, so it contains every loaded word.
            string testString;
            using (System.IO.StreamReader sr = new System.IO.StreamReader(filePath, System.Text.Encoding.GetEncoding("gb2312")))
            {
                testString = sr.ReadToEnd();
            }

            // The measured span covers loading the dictionary and the replacement pass.
            uint t = GetTickCount();
            BadWordParse bwp = new BadWordParse(filePath);
            string parsedString = bwp.ReplaceBadWord(testString);
            uint time = GetTickCount() - t;

            Console.Write("使用时间：" + time.ToString());
            Console.Write("\r\n");
            // Print the untouched input here; the original printed parsedString under both labels.
            Console.Write("原始字符串" + testString);
            Console.Write("\r\n");
            Console.Write("替换后字符串" + parsedString);

            // With the default one-character mask every replacement keeps the text length.
            Assert.AreEqual(testString.Length, parsedString.Length);
        }
    }
}

测试结果图片：

转载于:https://www.cnblogs.com/bbqqqbq/archive/2008/12/10/1352142.html

最新回复(0)