最近自己写了一个文章分析程序,用的是 C# 2.0,数据库是 MySQL 5.0,自己弄了两个 MySQL 的 Helper 类。其中涉及到线程和委托的东西,我找高手指点了一下,结果弄出来了:分析 1 万篇文章,大概需要 1 个小时左右。想要这个程序或者想交流的可以联系我。
贴一下自己的代码:
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Data.OleDb;
using System.Drawing;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Windows.Forms;
using MySql.Data.MySqlClient;
using Rainsoft.WordSeg;
using WoWExpress.Core;

namespace CSVProject
{
    /// <summary>
    /// Main form of the article keyword analyser. It loads unprocessed articles
    /// and the stop-word table from MySQL, strips markup, splits each article
    /// into keywords by replacing stop words with spaces, counts the keywords
    /// per article and writes the counts back to the database.
    /// </summary>
    /// <remarks>
    /// Design notes (translated from the original author's comment):
    /// one object per article (id and body); articles are kept in a list so
    /// they can be processed in a loop.
    /// 1. Apply the stop-word table to each article: every stop word found in
    ///    the body is replaced by a separator.
    /// 2. Split the body on that separator; the remaining fragments are the
    ///    keywords. Identical keywords within one article are counted.
    /// A keyword record holds the article id, the keyword, its count in the
    /// article and its percentage in the article; database-wide counts and
    /// percentages can only be computed once all articles have been processed.
    /// As the data grows, everything has to be recomputed to stay accurate,
    /// which will get slower over time - to be addressed later.
    ///
    /// NOTE(review): the listing this file was recovered from had a space
    /// inserted after every opening and before every closing quote, and a few
    /// literals were corrupted by HTML rendering. Literals below are written
    /// without that padding; the individually restored ones are marked.
    /// The code is kept compatible with C# 2.0.
    /// </remarks>
    public partial class Form1 : Form
    {
        // NOTE(review): credentials are hard-coded exactly as in the original
        // (local, single-user tool). Move them to configuration before this
        // runs anywhere shared.
        private const string DbHost = "localhost";
        private const string DbName = "hwyd";
        private const string DbUser = "root";
        private const string DbPassword = "8152";
        private const string DbCharset = "utf8";

        /// <summary>Number of articles fetched and processed per batch.</summary>
        private const int BatchSize = 10;

        /// <summary>Increment used for both progress bars.</summary>
        private const int ProgressStep = 1;

        private static readonly Regex HtmlTagRegex = new Regex("<(.|\n)+?>");
        private static readonly Regex WhitespaceRegex = new Regex(@"\s+");
        private static readonly Regex NonWordRegex = new Regex(@"[^A-Za-z0-9\u4E00-\u9FBB]");

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>Builds the connection string used for read queries.</summary>
        private static string BuildConnectionString()
        {
            return WoWExpress.Core.MySqlHelper.GetConnectionString(DbHost, DbName, DbUser, DbPassword);
        }

        /// <summary>
        /// Builds a connection string with an explicit character set. The
        /// original author notes that inserts only worked correctly once the
        /// charset was specified.
        /// </summary>
        private static string BuildConnectionString(string charset)
        {
            return WoWExpress.Core.MySqlHelper.GetConnectionString(DbHost, DbName, DbUser, DbPassword, charset);
        }

        /// <summary>Returns the whole <c>stopwords</c> table as a DataSet.</summary>
        public DataSet GetStopwords()
        {
            return WoWExpress.Core.MySqlHelper.ExecuteDataset(
                BuildConnectionString(), CommandType.Text, "select * from stopwords");
        }

        /// <summary>Returns the whole <c>ccl_addonarticle</c> table as a DataSet.</summary>
        public DataSet GetArticles()
        {
            return WoWExpress.Core.MySqlHelper.ExecuteDataset(
                BuildConnectionString(), CommandType.Text, "select * from ccl_addonarticle");
        }

        /// <summary>
        /// Loads up to <paramref name="pageLength"/> articles that have not been
        /// processed yet (<c>isDo = 0</c>).
        /// </summary>
        /// <param name="pageLength">Maximum number of rows, as a decimal string.</param>
        /// <returns>The loaded articles; empty when none are pending.</returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="pageLength"/> is not a non-negative integer.
        /// </exception>
        public List<ArticleInfo> GetMyArticles(string pageLength)
        {
            // The value is spliced into the LIMIT clause, so make sure it really
            // is a number before it reaches the SQL text.
            int limit;
            if (!int.TryParse(pageLength, out limit) || limit < 0)
            {
                throw new ArgumentException("pageLength must be a non-negative integer.", "pageLength");
            }

            List<ArticleInfo> articlesInfo = new List<ArticleInfo>();
            string sql = "select aid,body,isDo from ccl_addonarticle where isDo = 0 limit " + limit.ToString();

            using (MySqlDataReader rdr = WoWExpress.Core.MySqlHelper.ExecuteReader(
                BuildConnectionString(), CommandType.Text, sql))
            {
                while (rdr.Read())
                {
                    string body = rdr.GetString(1);

                    // "Pre-segmentation" step kept from the original.
                    // NOTE(review): the segmented text is discarded, so this
                    // looks like pure overhead unless the segmenter has side
                    // effects - confirm and remove, or feed the result into the body.
                    this.segment(body);

                    articlesInfo.Add(new ArticleInfo(Convert.ToInt32(rdr.GetString(0)), body));
                }
            }

            return articlesInfo;
        }

        /// <summary>Returns the number of articles still waiting to be processed.</summary>
        public int GetArticlesCount()
        {
            int result = 0;
            string sql = "select count(*) from ccl_addonarticle where isDo = 0";

            using (MySqlDataReader rdr = WoWExpress.Core.MySqlHelper.ExecuteReader(
                BuildConnectionString(), CommandType.Text, sql))
            {
                if (rdr.Read())
                {
                    result = rdr.GetInt32(0);
                }
            }

            return result;
        }

        /// <summary>Loads the stop-word table; each word is trimmed.</summary>
        public List<StopwordsInfo> GetMyStopwords()
        {
            List<StopwordsInfo> stopwords = new List<StopwordsInfo>();

            using (MySqlDataReader rdr = WoWExpress.Core.MySqlHelper.ExecuteReader(
                BuildConnectionString(), CommandType.Text, "select * from stopwords"))
            {
                while (rdr.Read())
                {
                    stopwords.Add(new StopwordsInfo(Convert.ToInt32(rdr.GetString(0)), rdr.GetString(1).Trim()));
                }
            }

            return stopwords;
        }

        /// <summary>
        /// Cleans every article body (markup and non-word characters removed)
        /// and replaces each stop word with a single space so that the body can
        /// later be split into keywords.
        /// </summary>
        /// <param name="articlesInfo">Articles to clean; their bodies are updated in place.</param>
        /// <param name="stopwords">Stop words to blank out.</param>
        /// <returns>A new list holding the same (now cleaned) article objects.</returns>
        public List<ArticleInfo> UseStopwords(List<ArticleInfo> articlesInfo, List<StopwordsInfo> stopwords)
        {
            List<ArticleInfo> targetArticles = new List<ArticleInfo>();

            foreach (ArticleInfo articleInfo in articlesInfo)
            {
                string body = articleInfo.ArticleBody.ToString();

                body = this.stripHtml(body);
                body = this.StripHTML3(body);

                foreach (StopwordsInfo stopwordsInfo in stopwords)
                {
                    string stopword = stopwordsInfo.Stopwords.ToString();

                    // string.Replace throws on an empty search string, and a
                    // blank row in the table trims down to exactly that.
                    if (stopword.Length == 0)
                    {
                        continue;
                    }

                    body = body.Replace(stopword, " ");
                }

                // Second pass collapses the runs of spaces left by the replacements.
                body = this.stripHtml(body);

                articleInfo.ArticleBody = body;
                targetArticles.Add(articleInfo);
            }

            return targetArticles;
        }

        /// <summary>
        /// Splits every article body on spaces, counts each distinct keyword per
        /// article, and stores the result in the database.
        /// </summary>
        /// <param name="articlesInfo">Articles already cleaned by <see cref="UseStopwords"/>.</param>
        /// <returns>One entry per (article, keyword) pair, in first-seen order.</returns>
        public List<SingleKeywords> SplitArticle(List<ArticleInfo> articlesInfo)
        {
            List<SingleKeywords> singleKeywordsArray = new List<SingleKeywords>();

            // Lookup by article id, then keyword. It replaces the original linear
            // scan over the whole result list for every token.
            Dictionary<int, Dictionary<string, SingleKeywords>> byArticle =
                new Dictionary<int, Dictionary<string, SingleKeywords>>();

            // May run on a worker thread, so the progress bar is only touched
            // through the invoking helpers.
            OnRrogressBar2Set(articlesInfo.Count);

            foreach (ArticleInfo articleInfo in articlesInfo)
            {
                string body = articleInfo.ArticleBody.ToString().Trim();
                int articleId = articleInfo.ArticleId;

                Dictionary<string, SingleKeywords> seen;
                if (!byArticle.TryGetValue(articleId, out seen))
                {
                    seen = new Dictionary<string, SingleKeywords>();
                    byArticle.Add(articleId, seen);
                }

                // Every token is counted. The original loop stopped one short
                // and silently dropped the last keyword of each article.
                // An empty body still yields one empty keyword (as before), which
                // is what gets the article marked as processed downstream.
                foreach (string rawToken in body.Split(' '))
                {
                    string token = rawToken.Trim();

                    SingleKeywords existing;
                    if (seen.TryGetValue(token, out existing))
                    {
                        existing.SingleCount += 1;
                    }
                    else
                    {
                        SingleKeywords added = new SingleKeywords(articleId, token, 1, 0);
                        seen.Add(token, added);
                        singleKeywordsArray.Add(added);
                    }
                }

                // TODO (from the original): compute the per-article percentage here.

                OnRrogressBarAdd2(ProgressStep);
            }

            // Persisting is chained onto the split step.
            this.UpdateArticleAndInsertKeywords(singleKeywordsArray);

            return singleKeywordsArray;
        }

        /// <summary>
        /// Marks every article referenced by the keyword list as processed
        /// (<c>isDo = 1</c>) and inserts one <c>articlekeywords</c> row per entry.
        /// Keywords are converted to Simplified Chinese before insertion.
        /// </summary>
        /// <param name="singleKeywordsArray">Per-article keyword counts to store.</param>
        /// <returns>Always the string <c>"ok"</c>.</returns>
        public string UpdateArticleAndInsertKeywords(List<SingleKeywords> singleKeywordsArray)
        {
            string connectionString = BuildConnectionString(DbCharset);

            const string insertSql =
                "Insert articlekeywords(articleId,keywords,singleCount,singlePercent) values(?articleId,?KeywordsStr,?singleCount,?singlePercent)";

            // Each article is flagged once, not once per keyword row.
            Dictionary<int, bool> markedArticles = new Dictionary<int, bool>();

            // TODO (from the original): wrap the batch in a transaction.
            foreach (SingleKeywords singleKeywords in singleKeywordsArray)
            {
                int articleId = singleKeywords.ArticleId;

                if (!markedArticles.ContainsKey(articleId))
                {
                    MarkArticleProcessed(connectionString, articleId);
                    markedArticles.Add(articleId, true);
                }

                MySqlParameter[] keywordsParms = new MySqlParameter[]
                {
                    new MySqlParameter("?articleId", MySqlDbType.Int32, 4),
                    new MySqlParameter("?KeywordsStr", MySqlDbType.VarChar),
                    new MySqlParameter("?singleCount", MySqlDbType.Int32, 4),
                    new MySqlParameter("?singlePercent", MySqlDbType.Double, 4)
                };

                keywordsParms[0].Value = articleId;
                keywordsParms[1].Value = Traditional2Simplified(singleKeywords.KeywordsStr);
                keywordsParms[2].Value = singleKeywords.SingleCount;
                keywordsParms[3].Value = singleKeywords.SinglePercent;

                WoWExpress.Core.MySqlHelper.ExecuteNonQuery(connectionString, CommandType.Text, insertSql, keywordsParms);
            }

            return "ok";
        }

        /// <summary>Sets <c>isDo = 1</c> for one article, using a bound parameter.</summary>
        private static void MarkArticleProcessed(string connectionString, int articleId)
        {
            MySqlParameter aidParm = new MySqlParameter("?aid", MySqlDbType.Int32, 4);
            aidParm.Value = articleId;

            WoWExpress.Core.MySqlHelper.ExecuteNonQuery(
                connectionString,
                CommandType.Text,
                "Update ccl_addonarticle set isDo =1 where aid = ?aid",
                new MySqlParameter[] { aidParm });
        }

        /// <summary>
        /// Encoding helper: narrows every char of <paramref name="dbStr"/> to a
        /// byte and decodes the bytes with the system default encoding.
        /// </summary>
        private string DBStringToNormal(string dbStr)
        {
            byte[] raw = new byte[dbStr.Length];
            for (int i = 0; i < dbStr.Length; ++i)
            {
                raw[i] = (byte)(dbStr[i]);
            }

            return System.Text.Encoding.Default.GetString(raw, 0, dbStr.Length);
        }

        /// <summary>Converts Traditional Chinese text to Simplified Chinese.</summary>
        public string Traditional2Simplified(string str)
        {
            return Microsoft.VisualBasic.Strings.StrConv(str, Microsoft.VisualBasic.VbStrConv.SimplifiedChinese, 0);
        }

        /// <summary>
        /// Extracts the text from an HTML fragment: removes scripts and tags,
        /// decodes a handful of character entities and drops stray angle
        /// brackets and line breaks.
        /// </summary>
        /// <param name="strHtml">HTML source.</param>
        /// <returns>The text content.</returns>
        public string StripHTML2(string strHtml)
        {
            // NOTE(review): in the recovered listing the tag pattern had a blog
            // URL spliced in where an escaped backslash belongs ("[^/7]" for
            // "[^\7]"), and the numeric-entity pattern had lost its "&#" prefix.
            // Both are restored here to the widely circulated form of this
            // snippet - confirm against the author's original file.
            string[] aryReg =
            {
                @"<script[^>]*?>.*?</script>",
                @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
                @"([\r\n])[\s]+",
                @"&(quot|#34);",
                @"&(amp|#38);",
                @"&(lt|#60);",
                @"&(gt|#62);",
                @"&(nbsp|#160);",
                @"&(iexcl|#161);",
                @"&(cent|#162);",
                @"&(pound|#163);",
                @"&(copy|#169);",
                @"&#(\d+);",
                @"-->",
                @"<!--.*\n"
            };

            string[] aryRep =
            {
                "",
                "",
                "",
                "\"",
                "&",
                "<",
                ">",
                " ",
                "\xa1", // chr(161)
                "\xa2", // chr(162)
                "\xa3", // chr(163)
                "\xa9", // chr(169)
                "",
                "\r\n",
                ""
            };

            string strOutput = strHtml;
            for (int i = 0; i < aryReg.Length; i++)
            {
                strOutput = Regex.Replace(strOutput, aryReg[i], aryRep[i], RegexOptions.IgnoreCase);
            }

            // Strings are immutable: the results must be assigned back. The
            // original called Replace and threw the results away.
            strOutput = strOutput.Replace("<", "");
            strOutput = strOutput.Replace(">", "");
            strOutput = strOutput.Replace("\r\n", "");

            return strOutput;
        }

        /// <summary>
        /// Removes every character that is not an ASCII letter, an ASCII digit
        /// or in the range U+4E00-U+9FBB. Whitespace and punctuation are removed
        /// as well.
        /// </summary>
        /// <param name="strHtml">Text to filter.</param>
        /// <returns>The filtered text.</returns>
        public string StripHTML3(string strHtml)
        {
            // The original passed this pattern to string.Replace, which looks
            // for the literal text of the pattern and therefore never matched.
            return NonWordRegex.Replace(strHtml, "");
        }

        /// <summary>
        /// Aggregates per-article keyword counts into one total per keyword.
        /// </summary>
        /// <param name="singleKeywords">Per-article counts, e.g. from <see cref="SplitArticle"/>.</param>
        /// <returns>One entry per distinct keyword, in first-seen order; empty for empty input.</returns>
        public List<AllKeywords> ComputeKeywords(List<SingleKeywords> singleKeywords)
        {
            List<AllKeywords> allKeywordsArray = new List<AllKeywords>();
            Dictionary<string, AllKeywords> totals = new Dictionary<string, AllKeywords>();

            // All entries are visited. The original indexed element 0 without a
            // bounds check and skipped the last entry.
            foreach (SingleKeywords single in singleKeywords)
            {
                string key = single.KeywordsStr.Trim();

                AllKeywords total;
                if (totals.TryGetValue(key, out total))
                {
                    // Add this article's occurrences. The original added 1 per
                    // article, which is inconsistent with seeding the total from
                    // SingleCount.
                    total.AllCount += single.SingleCount;
                }
                else
                {
                    total = new AllKeywords(single.KeywordsStr, single.SingleCount, 0);
                    totals.Add(key, total);
                    allKeywordsArray.Add(total);
                }
            }

            // TODO (from the original): compute the database-wide percentage here.

            return allKeywordsArray;
        }

        /// <summary>
        /// Removes HTML tags, collapses every run of whitespace into a single
        /// space and trims the result.
        /// </summary>
        /// <param name="strHtml">Text to clean.</param>
        /// <returns>The cleaned text.</returns>
        private string stripHtml(string strHtml)
        {
            string strOutput = HtmlTagRegex.Replace(strHtml, "");

            // NOTE(review): the recovered listing had two Replace("<", "<") /
            // Replace(">", ">") calls here - no-ops as shown, probably entity
            // escapes lost in HTML rendering. They are omitted; re-add them if
            // the escaping was intentional.

            strOutput = WhitespaceRegex.Replace(strOutput, " ");

            // The original called Trim() without using its result.
            return strOutput.Trim();
        }

        /// <summary>Shows the first batch of articles after stop-word removal.</summary>
        private void btnUseStopword_Click(object sender, EventArgs e)
        {
            List<ArticleInfo> articles = this.GetMyArticles(BatchSize.ToString());
            List<StopwordsInfo> stopwords = this.GetMyStopwords();

            dataGridView3.DataSource = this.UseStopwords(articles, stopwords);
        }

        /// <summary>Shows the raw article table.</summary>
        private void btnGetArticle_Click(object sender, EventArgs e)
        {
            DataSet articleDS = this.GetArticles();
            dataGridView2.DataSource = articleDS.Tables[0];
        }

        /// <summary>Shows the stop-word table.</summary>
        private void btnStopwords_Click(object sender, EventArgs e)
        {
            DataSet stopwordsDataSet = this.GetStopwords();
            dataGridView1.DataSource = stopwordsDataSet.Tables[0];
        }

        /// <summary>
        /// Processes all pending articles in batches on a worker thread:
        /// counts the pending articles, runs as many full batches as fit, then
        /// one final batch for the remainder.
        /// </summary>
        private void btnGetKeywords_Click(object sender, EventArgs e)
        {
            int articlesCount = this.GetArticlesCount();
            int pageLength = BatchSize;
            int doCount = articlesCount / pageLength;
            int lastLength = articlesCount % pageLength;

            progressBar1.Maximum = doCount;
            progressBar1.Value = 0;
            progressBar1.Step = ProgressStep;

            Thread worker = new Thread(delegate()
            {
                // The stop-word table does not change between batches, so it is
                // loaded once per run.
                List<StopwordsInfo> stopwords = this.GetMyStopwords();

                for (int i = 0; i < doCount; i++)
                {
                    ProcessBatch(pageLength, stopwords);
                    OnRrogressBarAdd(ProgressStep);
                }

                // Remainder batch. When there were fewer pending articles than
                // one full batch it runs unconditionally, as before.
                if (lastLength != 0 || doCount == 0)
                {
                    ProcessBatch(lastLength, stopwords);
                }
            });

            worker.Start();
        }

        /// <summary>Loads, cleans, splits and stores one batch, then shows its keywords.</summary>
        private void ProcessBatch(int batchLength, List<StopwordsInfo> stopwords)
        {
            List<ArticleInfo> cleaned = this.UseStopwords(this.GetMyArticles(batchLength.ToString()), stopwords);
            List<SingleKeywords> keywords = this.SplitArticle(cleaned);

            OnGridViewDataBind(keywords);
        }

        /// <summary>Processes one batch and shows the aggregated keyword totals.</summary>
        private void btnAllCompute_Click(object sender, EventArgs e)
        {
            List<ArticleInfo> cleaned = this.UseStopwords(this.GetMyArticles(BatchSize.ToString()), this.GetMyStopwords());

            dataGridView5.DataSource = this.ComputeKeywords(this.SplitArticle(cleaned));
        }

        /* Cross-thread UI helpers: each re-invokes itself on the control's
           thread when called from the worker. */

        protected delegate void GridViewDataBind(object source);

        /// <summary>Binds <paramref name="source"/> to the keyword grid.</summary>
        protected void OnGridViewDataBind(object source)
        {
            if (dataGridView4 == null)
            {
                return;
            }

            if (dataGridView4.InvokeRequired)
            {
                dataGridView4.Invoke(new GridViewDataBind(OnGridViewDataBind), source);
                return;
            }

            dataGridView4.DataSource = source;
        }

        protected delegate void RrogressBarAdd(int step);

        /// <summary>Advances the overall progress bar by <paramref name="step"/>.</summary>
        protected void OnRrogressBarAdd(int step)
        {
            if (progressBar1 == null)
            {
                return;
            }

            if (progressBar1.InvokeRequired)
            {
                progressBar1.Invoke(new RrogressBarAdd(OnRrogressBarAdd), step);
                return;
            }

            AdvanceProgressBar(progressBar1, step);
        }

        protected delegate void RrogressBarAdd2(int step);

        /// <summary>Advances the per-batch progress bar by <paramref name="step"/>.</summary>
        protected void OnRrogressBarAdd2(int step)
        {
            if (progressBar2 == null)
            {
                return;
            }

            if (progressBar2.InvokeRequired)
            {
                progressBar2.Invoke(new RrogressBarAdd2(OnRrogressBarAdd2), step);
                return;
            }

            AdvanceProgressBar(progressBar2, step);
        }

        protected delegate void RrogressBar2Set(int maximum);

        /// <summary>
        /// Resets the per-batch progress bar: maximum set to
        /// <paramref name="maximum"/>, value to 0, step to 1.
        /// </summary>
        protected void OnRrogressBar2Set(int maximum)
        {
            if (progressBar2 == null)
            {
                return;
            }

            if (progressBar2.InvokeRequired)
            {
                progressBar2.Invoke(new RrogressBar2Set(OnRrogressBar2Set), maximum);
                return;
            }

            // The same reset now runs on both paths. On the UI thread the
            // original did "Value += maximum", which never configured the bar
            // and could push Value past Maximum.
            progressBar2.Maximum = maximum;
            progressBar2.Value = 0;
            progressBar2.Step = ProgressStep;
        }

        /// <summary>Adds <paramref name="step"/> to the bar's value without exceeding its maximum.</summary>
        private static void AdvanceProgressBar(ProgressBar bar, int step)
        {
            bar.Value = Math.Min(bar.Maximum, bar.Value + step);
        }

        /// <summary>
        /// Runs the word segmenter over <paramref name="articleStr"/>, using a
        /// space as separator.
        /// </summary>
        public string segment(string articleStr)
        {
            WordSegV1 seg = new WordSegV1();
            return seg.Segment(articleStr, ' ');
        }
    }
}
程序开发完毕后,我突然发现分词不是那么容易的,找了一下,又发现了好东西:C# 版本的开源中文分词 ictclas,和一个简单的 C# 版本的分词组件。中文分词组件好慢,等申请首页发布后我再给出另外下载的代码吧,呵呵,看博客园园长的了。
转载于:https://www.cnblogs.com/oxite/archive/2010/03/22/1691480.html