中文分词-简单文章分析玩出来的

mac2022-06-30  29

最近自己写了个文章分析程序,用的是 C# 2.0,数据库是 MySQL 5.0,自己写了两个 MySQL 的 Helper 类。其中涉及到线程和委托的东西,我找高手指点了一下,结果弄出来了:分析 1 万篇文章大概需要 1 个小时左右。想要这个程序或者想交流的可以联系我。

贴一下自己的代码:

 

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Data.OleDb;
using System.Drawing;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Windows.Forms;
using MySql.Data.MySqlClient;
using Rainsoft.WordSeg;
using WoWExpress.Core;

namespace CSVProject
{
    /// <summary>
    /// Main form of the article analysis tool. It loads unprocessed articles
    /// from MySQL, strips markup, cuts the text apart with a stopword list,
    /// counts the remaining words per article and stores the counts.
    /// </summary>
    /// <remarks>
    /// Design notes carried over from the original listing:
    /// one article is one object (id and body are the only fields needed so far),
    /// and the objects are kept in a list so they can be processed in a loop.
    ///  1. Every stopword found in an article is replaced by a separator.
    ///  2. The article is split on that separator; identical words are counted.
    /// Each keyword record holds the article id, the keyword, its count inside the
    /// article and its percentage inside the article (only known once the article
    /// has been processed completely). Database-wide counts and percentages can
    /// only be computed after all articles have been processed, and have to be
    /// recomputed whenever new data arrives, which will get slower over time.
    /// </remarks>
    public partial class Form1 : Form
    {
        // NOTE(review): the connection settings, including the password, are
        // hard-coded exactly as in the original listing. Move them to
        // configuration before using this outside a local experiment.
        private const string DbHost = "localhost";
        private const string DbName = "hwyd";
        private const string DbUser = "root";
        private const string DbPassword = "8152";

        // Character set used by the connection that writes keywords; the original
        // author notes that inserts only worked after it was specified.
        private const string DbCharset = "utf8";

        // Number of articles processed per batch by btnGetKeywords_Click.
        private const int BatchSize = 10;

        // Increment used for both progress bars (both are configured with Step = 1).
        private const int ProgressStep = 1;

        // Separator that replaces stopwords and on which articles are split.
        private static readonly char[] KeywordSeparators = new char[] { ' ' };

        // Patterns applied in order by StripHTML2; HtmlReplacements holds the
        // replacement text for the pattern at the same index.
        // NOTE(review): in the published listing the second pattern contained
        // "http://www.cnblogs.com/oxite/admin/file://" and "[^/7]", which looks like
        // the blog engine rewriting "\\" and "[^\7]". It is restored here to the
        // form of the commonly published snippet - confirm against the real source.
        private static readonly string[] HtmlPatterns = new string[]
        {
            @"<script[^>]*?>.*?</script>",
            @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
            @"([\r\n])[\s]+",
            @"&(quot|#34);",
            @"&(amp|#38);",
            @"&(lt|#60);",
            @"&(gt|#62);",
            @"&(nbsp|#160);",
            @"&(iexcl|#161);",
            @"&(cent|#162);",
            @"&(pound|#163);",
            @"&(copy|#169);",
            @"&#(\d+);",
            @"-->",
            @"<!--.*\n"
        };

        private static readonly string[] HtmlReplacements = new string[]
        {
            "",
            "",
            "",
            "\"",
            "&",
            "<",
            ">",
            "   ",
            "\xa1", // chr(161)
            "\xa2", // chr(162)
            "\xa3", // chr(163)
            "\xa9", // chr(169)
            "",
            "\r\n",
            ""
        };

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>Builds the connection string used by the read-only queries.</summary>
        private static string BuildConnectionString()
        {
            return WoWExpress.Core.MySqlHelper.GetConnectionString(DbHost, DbName, DbUser, DbPassword);
        }

        /// <summary>Builds the connection string used for writes (explicit character set).</summary>
        private static string BuildWriteConnectionString()
        {
            return WoWExpress.Core.MySqlHelper.GetConnectionString(DbHost, DbName, DbUser, DbPassword, DbCharset);
        }

        /// <summary>Returns the whole <c>stopwords</c> table as a data set.</summary>
        public DataSet GetStopwords()
        {
            string mysqlStr = "select * from stopwords";
            return WoWExpress.Core.MySqlHelper.ExecuteDataset(BuildConnectionString(), CommandType.Text, mysqlStr);
        }

        /// <summary>Returns the whole <c>ccl_addonarticle</c> table as a data set.</summary>
        public DataSet GetArticles()
        {
            string mysqlStr = "select * from ccl_addonarticle";
            return WoWExpress.Core.MySqlHelper.ExecuteDataset(BuildConnectionString(), CommandType.Text, mysqlStr);
        }

        /// <summary>
        /// Loads up to <paramref name="pageLength"/> articles that have not been
        /// processed yet (<c>isDo = 0</c>).
        /// </summary>
        /// <param name="pageLength">Maximum number of rows, as a non-negative integer in text form.</param>
        /// <returns>The loaded articles (id and body).</returns>
        /// <exception cref="ArgumentException"><paramref name="pageLength"/> is not a non-negative integer.</exception>
        public List<ArticleInfo> GetMyArticles(string pageLength)
        {
            // The value is spliced into the SQL text, so it is validated as a
            // number first instead of being trusted as arbitrary text.
            int limit;
            if (!int.TryParse(pageLength, out limit) || limit < 0)
            {
                throw new ArgumentException("pageLength must be a non-negative integer.", "pageLength");
            }

            List<ArticleInfo> articlesInfo = new List<ArticleInfo>();
            string mysqlStr = " select aid,body,isDo from ccl_addonarticle where isDo = 0 limit " + limit.ToString();

            using (MySqlDataReader rdr = WoWExpress.Core.MySqlHelper.ExecuteReader(BuildConnectionString(), CommandType.Text, mysqlStr))
            {
                while (rdr.Read())
                {
                    string body = rdr.GetString(1);

                    // Pre-segmentation step of the original listing.
                    // NOTE(review): the segmented text is discarded here, exactly as in
                    // the original; confirm whether it was meant to replace the body.
                    this.segment(body);

                    articlesInfo.Add(new ArticleInfo(Convert.ToInt32(rdr.GetString(0)), body));
                }
            }

            return articlesInfo;
        }

        /// <summary>Returns the number of articles that have not been processed yet.</summary>
        public int GetArticlesCount()
        {
            string mysqlStr = " select count(*) from ccl_addonarticle where isDo = 0";
            int result = 0;

            using (MySqlDataReader rdr = WoWExpress.Core.MySqlHelper.ExecuteReader(BuildConnectionString(), CommandType.Text, mysqlStr))
            {
                if (rdr.Read())
                {
                    result = rdr.GetInt32(0);
                }
            }

            return result;
        }

        /// <summary>Loads the stopword list; every word is trimmed.</summary>
        public List<StopwordsInfo> GetMyStopwords()
        {
            List<StopwordsInfo> stopwords = new List<StopwordsInfo>();
            string mysqlStr = "select * from stopwords";

            using (MySqlDataReader rdr = WoWExpress.Core.MySqlHelper.ExecuteReader(BuildConnectionString(), CommandType.Text, mysqlStr))
            {
                while (rdr.Read())
                {
                    stopwords.Add(new StopwordsInfo(Convert.ToInt32(rdr.GetString(0)), rdr.GetString(1).Trim()));
                }
            }

            return stopwords;
        }

        /// <summary>
        /// Strips markup from every article and replaces each stopword occurrence
        /// with a single space, so that the remaining text can be split into words.
        /// </summary>
        /// <param name="articlesInfo">Articles to clean; their bodies are overwritten in place.</param>
        /// <param name="stopwords">Stopwords to remove.</param>
        /// <returns>A new list holding the same (now cleaned) article objects.</returns>
        public List<ArticleInfo> UseStopwords(List<ArticleInfo> articlesInfo, List<StopwordsInfo> stopwords)
        {
            List<ArticleInfo> targetArticles = new List<ArticleInfo>();

            foreach (ArticleInfo articleInfo in articlesInfo)
            {
                string body = articleInfo.ArticleBody.ToString();

                // Remove HTML first, then everything that is not a letter, digit or CJK character.
                body = this.stripHtml(body);
                body = this.StripHTML3(body);

                foreach (StopwordsInfo stopwordsInfo in stopwords)
                {
                    string word = stopwordsInfo.Stopwords.ToString();

                    // string.Replace throws on an empty search string, which a
                    // blank row in the stopword table would produce after trimming.
                    if (word.Length == 0)
                    {
                        continue;
                    }

                    body = body.Replace(word, " ");
                }

                // Second pass collapses the runs of spaces left by the replacements.
                body = this.stripHtml(body);

                articleInfo.ArticleBody = body;
                targetArticles.Add(articleInfo);
            }

            return targetArticles;
        }

        /// <summary>
        /// Splits every cleaned article into words, counts each word per article
        /// and writes the result to the database.
        /// </summary>
        /// <param name="articlesInfo">Articles already processed by <see cref="UseStopwords"/>.</param>
        /// <returns>One entry per distinct (article, keyword) pair with its count.</returns>
        public List<SingleKeywords> SplitArticle(List<ArticleInfo> articlesInfo)
        {
            List<SingleKeywords> singleKeywordsArray = new List<SingleKeywords>();

            // Index over singleKeywordsArray keyed by "<articleId>:<keyword>"; it
            // replaces the linear search through the result list for every word.
            Dictionary<string, SingleKeywords> index = new Dictionary<string, SingleKeywords>();

            // Articles that yield no keyword at all still have to be marked as
            // processed, otherwise they would be fetched again by every batch.
            List<int> articlesWithoutKeywords = new List<int>();

            // This may run on a worker thread, so the progress bar is only touched
            // through the marshalling helpers.
            OnRrogressBar2Set(articlesInfo.Count);

            foreach (ArticleInfo articleInfo in articlesInfo)
            {
                int curArticleId = articleInfo.ArticleId;
                string curArticleBody = articleInfo.ArticleBody.ToString().Trim();
                string[] keywordsArray = curArticleBody.Split(KeywordSeparators, StringSplitOptions.RemoveEmptyEntries);
                bool foundKeyword = false;

                // Every token is visited, including the last one (the original
                // loop stopped one element early).
                foreach (string rawKeyword in keywordsArray)
                {
                    string keyword = rawKeyword.Trim();
                    if (keyword.Length == 0)
                    {
                        continue;
                    }

                    foundKeyword = true;
                    string key = curArticleId.ToString() + ":" + keyword;

                    SingleKeywords existing;
                    if (index.TryGetValue(key, out existing))
                    {
                        existing.SingleCount += 1;
                    }
                    else
                    {
                        SingleKeywords added = new SingleKeywords(curArticleId, keyword, 1, 0);
                        singleKeywordsArray.Add(added);
                        index.Add(key, added);
                    }
                }

                if (!foundKeyword)
                {
                    articlesWithoutKeywords.Add(curArticleId);
                }

                // TODO: compute the per-article percentage here.

                OnRrogressBarAdd2(ProgressStep);
            }

            // Persisting is done as part of splitting, as in the original design.
            this.UpdateArticleAndInsertKeywords(singleKeywordsArray);

            if (articlesWithoutKeywords.Count > 0)
            {
                string connectionString = BuildWriteConnectionString();
                foreach (int articleId in articlesWithoutKeywords)
                {
                    MarkArticleProcessed(connectionString, articleId);
                }
            }

            return singleKeywordsArray;
        }

        /// <summary>
        /// Marks every article referenced by the keywords as processed and inserts
        /// one <c>articlekeywords</c> row per keyword entry.
        /// </summary>
        /// <param name="singleKeywordsArray">Keyword entries to store.</param>
        /// <returns>Always the string "ok".</returns>
        public string UpdateArticleAndInsertKeywords(List<SingleKeywords> singleKeywordsArray)
        {
            string myConnectionStr = BuildWriteConnectionString();

            // Article ids whose isDo flag has already been set by this call, so the
            // UPDATE runs once per article instead of once per keyword.
            Dictionary<int, bool> markedArticles = new Dictionary<int, bool>();

            string insertSql = "Insert articlekeywords(articleId,keywords,singleCount,singlePercent) values(?articleId,?KeywordsStr,?singleCount,?singlePercent)";

            // TODO (from the original author): this should run inside a transaction.
            foreach (SingleKeywords singleKeywords in singleKeywordsArray)
            {
                int articleId = singleKeywords.ArticleId;
                if (!markedArticles.ContainsKey(articleId))
                {
                    MarkArticleProcessed(myConnectionStr, articleId);
                    markedArticles.Add(articleId, true);
                }

                MySqlParameter[] keywordsParms = new MySqlParameter[]
                {
                    new MySqlParameter("?articleId", MySqlDbType.Int32, 4),
                    new MySqlParameter("?KeywordsStr", MySqlDbType.VarChar),
                    new MySqlParameter("?singleCount", MySqlDbType.Int32, 4),
                    new MySqlParameter("?singlePercent", MySqlDbType.Double, 4)
                };

                keywordsParms[0].Value = articleId;
                // Keywords are stored in simplified Chinese.
                keywordsParms[1].Value = Traditional2Simplified(singleKeywords.KeywordsStr);
                keywordsParms[2].Value = singleKeywords.SingleCount;
                keywordsParms[3].Value = singleKeywords.SinglePercent;

                WoWExpress.Core.MySqlHelper.ExecuteNonQuery(myConnectionStr, CommandType.Text, insertSql, keywordsParms);
            }

            return "ok";
        }

        /// <summary>Sets <c>isDo = 1</c> for one article.</summary>
        private static void MarkArticleProcessed(string connectionString, int articleId)
        {
            string updateSql = "Update ccl_addonarticle set isDo =1 where aid = ?aid";
            MySqlParameter[] updateParms = new MySqlParameter[]
            {
                new MySqlParameter("?aid", MySqlDbType.Int32, 4)
            };
            updateParms[0].Value = articleId;

            WoWExpress.Core.MySqlHelper.ExecuteNonQuery(connectionString, CommandType.Text, updateSql, updateParms);
        }

        /// <summary>
        /// Helper that re-interprets a string: every character is narrowed to one
        /// byte and the bytes are decoded with the system default encoding.
        /// </summary>
        private string DBStringToNormal(string dbStr)
        {
            byte[] bytes = new byte[dbStr.Length];
            for (int i = 0; i < dbStr.Length; ++i)
            {
                bytes[i] = (byte)(dbStr[i]);
            }

            return System.Text.Encoding.Default.GetString(bytes, 0, dbStr.Length);
        }

        /// <summary>Converts traditional Chinese text to simplified Chinese.</summary>
        public string Traditional2Simplified(string str)
        {
            return Microsoft.VisualBasic.Strings.StrConv(str, Microsoft.VisualBasic.VbStrConv.SimplifiedChinese, 0);
        }

        /// <summary>
        /// Extracts the text from HTML: removes scripts, tags and comments, decodes
        /// a few common entities and finally drops angle brackets and line breaks.
        /// </summary>
        /// <param name="strHtml">HTML source; null or empty yields an empty string.</param>
        /// <returns>The extracted text.</returns>
        public string StripHTML2(string strHtml)
        {
            if (string.IsNullOrEmpty(strHtml))
            {
                return string.Empty;
            }

            string strOutput = strHtml;
            for (int i = 0; i < HtmlPatterns.Length; i++)
            {
                strOutput = Regex.Replace(strOutput, HtmlPatterns[i], HtmlReplacements[i], RegexOptions.IgnoreCase);
            }

            // Strings are immutable: the results have to be assigned back (the
            // original listing discarded them, so these calls had no effect).
            strOutput = strOutput.Replace("<", "");
            strOutput = strOutput.Replace(">", "");
            strOutput = strOutput.Replace("\r\n", "");

            return strOutput;
        }

        /// <summary>
        /// Removes every character that is not an ASCII letter, a digit or a CJK
        /// ideograph in the range U+4E00..U+9FBB.
        /// </summary>
        /// <param name="strHtml">Text to clean; null or empty yields an empty string.</param>
        /// <returns>The cleaned text.</returns>
        public string StripHTML3(string strHtml)
        {
            if (string.IsNullOrEmpty(strHtml))
            {
                return string.Empty;
            }

            // The original listing passed this pattern to string.Replace, which
            // searches for the literal text and therefore never changed anything.
            return Regex.Replace(strHtml, @"[^A-Za-z0-9\u4E00-\u9FBB]", "");
        }

        /// <summary>
        /// Aggregates per-article keyword counts into one total per keyword.
        /// </summary>
        /// <param name="singleKeywords">Per-article keyword entries; may be empty.</param>
        /// <returns>One entry per distinct keyword, in order of first appearance.</returns>
        public List<AllKeywords> ComputeKeywords(List<SingleKeywords> singleKeywords)
        {
            List<AllKeywords> allKeywordsArray = new List<AllKeywords>();

            // Index over allKeywordsArray keyed by the trimmed keyword.
            Dictionary<string, AllKeywords> index = new Dictionary<string, AllKeywords>();

            foreach (SingleKeywords single in singleKeywords)
            {
                string keyword = single.KeywordsStr.Trim();

                AllKeywords total;
                if (index.TryGetValue(keyword, out total))
                {
                    // Add the occurrences of this article, not just 1: the first
                    // entry of a keyword also starts at its per-article count.
                    total.AllCount += single.SingleCount;
                }
                else
                {
                    total = new AllKeywords(single.KeywordsStr, single.SingleCount, 0);
                    allKeywordsArray.Add(total);
                    index.Add(keyword, total);
                }
            }

            // TODO: compute the global percentages here.

            return allKeywordsArray;
        }

        /// <summary>
        /// Removes HTML tags and collapses every run of whitespace into one space.
        /// </summary>
        /// <param name="strHtml">Text to clean; null or empty yields an empty string.</param>
        /// <returns>The cleaned, trimmed text.</returns>
        private string stripHtml(string strHtml)
        {
            if (string.IsNullOrEmpty(strHtml))
            {
                return string.Empty;
            }

            string strOutput = Regex.Replace(strHtml, "<(.|\n)+?>", "");

            // NOTE(review): the published listing replaced "<" with "<" and ">" with
            // ">" at this point, which does nothing; presumably the replacement text
            // was an HTML entity decoded during publishing. Stray angle brackets are
            // therefore left unchanged - confirm the intended replacement.

            strOutput = Regex.Replace(strOutput, @"\s+", " ");

            // The original called Trim() without using its result.
            return strOutput.Trim();
        }

        /// <summary>Shows the first ten unprocessed articles after stopword removal.</summary>
        private void btnUseStopword_Click(object sender, EventArgs e)
        {
            List<ArticleInfo> targetArticles = this.UseStopwords(this.GetMyArticles("10"), this.GetMyStopwords());
            dataGridView3.DataSource = targetArticles;
        }

        /// <summary>Shows the article table.</summary>
        private void btnGetArticle_Click(object sender, EventArgs e)
        {
            DataSet articleDS = this.GetArticles();
            dataGridView2.DataSource = articleDS.Tables[0];
        }

        /// <summary>Shows the stopword table.</summary>
        private void btnStopwords_Click(object sender, EventArgs e)
        {
            DataSet stopwordsDataSet = this.GetStopwords();
            dataGridView1.DataSource = stopwordsDataSet.Tables[0];
        }

        /// <summary>
        /// Processes all unprocessed articles in batches on a worker thread and
        /// reports the overall progress on <c>progressBar1</c>.
        /// </summary>
        private void btnGetKeywords_Click(object sender, EventArgs e)
        {
            // Paging: count the articles, derive the number of full batches and the
            // size of the final partial batch.
            int articlesCount = this.GetArticlesCount();
            int doCount = articlesCount / BatchSize;
            int lastLength = articlesCount % BatchSize;

            progressBar1.Maximum = doCount;
            progressBar1.Value = 0;
            progressBar1.Step = ProgressStep;

            Thread thread = new Thread(delegate(object arg)
            {
                // The stopword list does not change between batches, so it is
                // loaded once instead of once per batch.
                List<StopwordsInfo> stopwords = this.GetMyStopwords();

                for (int i = 0; i < doCount; i++)
                {
                    ProcessBatch(BatchSize, stopwords);
                    OnRrogressBarAdd(ProgressStep);
                }

                // Remainder; this is the only batch when there are fewer articles
                // than one full batch.
                if (lastLength != 0)
                {
                    ProcessBatch(lastLength, stopwords);
                }
            });
            thread.Start();
        }

        /// <summary>Loads, cleans, splits and stores one batch, then shows its keywords.</summary>
        private void ProcessBatch(int batchSize, List<StopwordsInfo> stopwords)
        {
            List<ArticleInfo> articles = this.GetMyArticles(batchSize.ToString());
            List<SingleKeywords> singleKeywordsArray = this.SplitArticle(this.UseStopwords(articles, stopwords));
            OnGridViewDataBind(singleKeywordsArray);
        }

        /// <summary>Shows the global keyword totals for the first ten unprocessed articles.</summary>
        private void btnAllCompute_Click(object sender, EventArgs e)
        {
            List<AllKeywords> allKeywordsArray = this.ComputeKeywords(this.SplitArticle(this.UseStopwords(this.GetMyArticles("10"), this.GetMyStopwords())));
            dataGridView5.DataSource = allKeywordsArray;
        }

        // ---- Cross-thread UI helpers ----

        protected delegate void GridViewDataBind(object source);

        /// <summary>Binds <paramref name="source"/> to <c>dataGridView4</c> on the UI thread.</summary>
        protected void OnGridViewDataBind(object source)
        {
            if (dataGridView4 == null)
            {
                return;
            }

            if (dataGridView4.InvokeRequired)
            {
                // Re-enter this method on the thread that owns the control.
                dataGridView4.Invoke(new GridViewDataBind(OnGridViewDataBind), source);
                return;
            }

            dataGridView4.DataSource = source;
        }

        protected delegate void RrogressBarAdd(int step);

        /// <summary>Advances the overall progress bar by <paramref name="step"/>.</summary>
        protected void OnRrogressBarAdd(int step)
        {
            if (progressBar1 == null)
            {
                return;
            }

            if (progressBar1.InvokeRequired)
            {
                progressBar1.Invoke(new RrogressBarAdd(OnRrogressBarAdd), step);
                return;
            }

            // ProgressBar.Value throws when set above Maximum, so it is clamped.
            progressBar1.Value = Math.Min(progressBar1.Maximum, progressBar1.Value + step);
        }

        protected delegate void RrogressBarAdd2(int step);

        /// <summary>Advances the per-batch progress bar by <paramref name="step"/>.</summary>
        protected void OnRrogressBarAdd2(int step)
        {
            if (progressBar2 == null)
            {
                return;
            }

            if (progressBar2.InvokeRequired)
            {
                progressBar2.Invoke(new RrogressBarAdd2(OnRrogressBarAdd2), step);
                return;
            }

            progressBar2.Value = Math.Min(progressBar2.Maximum, progressBar2.Value + step);
        }

        protected delegate void RrogressBar2Set(int maximum);

        /// <summary>
        /// Resets the per-batch progress bar: sets its maximum to
        /// <paramref name="maximum"/>, its value to 0 and its step to 1.
        /// </summary>
        protected void OnRrogressBar2Set(int maximum)
        {
            if (progressBar2 == null)
            {
                return;
            }

            if (progressBar2.InvokeRequired)
            {
                progressBar2.Invoke(new RrogressBar2Set(OnRrogressBar2Set), maximum);
                return;
            }

            // The same reset is applied on both paths; the original added the
            // maximum to Value when it was already running on the UI thread.
            progressBar2.Maximum = maximum;
            progressBar2.Value = 0;
            progressBar2.Step = ProgressStep;
        }

        /// <summary>
        /// Runs the Rainsoft word segmenter on <paramref name="articleStr"/>, passing
        /// a space as the second argument, and returns the segmenter's result.
        /// </summary>
        public string segment(string articleStr)
        {
            WordSegV1 seg = new WordSegV1();
            return seg.Segment(articleStr, ' ');
        }
    }
}

 

 

 

程序开发完毕后,我突然发现分词不是那么容易的。找了一下,又发现了好东西:C# 版本的开源中文分词 ICTCLAS,以及一个简单的 C# 版本的分词组件(中文分词组件)。好慢,等申请首页发布后我再给出另外下载的代码吧,呵呵,看博客园园长的了。

转载于:https://www.cnblogs.com/oxite/archive/2010/03/22/1691480.html

最新回复(0)