提取HTML代码中文字的C#函数

mac2022-06-30  24

/// <summary>
/// Strips HTML markup from a string and returns the remaining text.
/// </summary>
/// <remarks>
/// The rules below are applied in order, case-insensitively:
/// script elements (including multi-line bodies) and tags are removed,
/// a line break followed by whitespace is collapsed, a fixed set of
/// character entities is decoded, any other numeric entity is dropped,
/// and comments are removed. Finally every remaining '&lt;' and '&gt;'
/// character and every CR LF pair is deleted from the result.
/// NOTE(review): "&amp;amp;" is decoded before the other entities, so input
/// such as "&amp;amp;lt;" is effectively decoded twice. Left as is because
/// callers may depend on the existing rule order.
/// </remarks>
/// <param name="strHtml">Source text that may contain HTML markup.</param>
/// <returns>
/// The text with markup removed; an empty string when
/// <paramref name="strHtml"/> is null or empty.
/// </returns>
public static string StripHTML(string strHtml)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return string.Empty;
    }

    // Each row is { pattern, replacement }. Keeping both in one table
    // prevents the two lists from drifting out of sync.
    string[,] rules =
    {
        // [\s\S] instead of '.' so script bodies spanning several lines match.
        { @"<script[^>]*?>[\s\S]*?</script>", "" },
        { @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>", "" },
        { @"([\r\n])[\s]+", "" },
        { @"&(quot|#34);", "\"" },
        { @"&(amp|#38);", "&" },
        { @"&(lt|#60);", "<" },
        { @"&(gt|#62);", ">" },
        { @"&(nbsp|#160);", " " },
        { @"&(iexcl|#161);", "\u00A1" },
        { @"&(cent|#162);", "\u00A2" },
        { @"&(pound|#163);", "\u00A3" },
        { @"&(copy|#169);", "\u00A9" },
        // Any other numeric entity is dropped rather than decoded.
        { @"&#(\d+);", "" },
        // Turning "-->" into a line break lets the next rule, which stops
        // at '\n', consume the whole comment.
        { @"-->", "\r\n" },
        { @"<!--.*\n", "" }
    };

    string strOutput = strHtml;
    for (int i = 0; i < rules.GetLength(0); i++)
    {
        // The static Regex.Replace caches parsed patterns between calls.
        strOutput = Regex.Replace(strOutput, rules[i, 0], rules[i, 1], RegexOptions.IgnoreCase);
    }

    // string.Replace returns a new string; the result must be assigned,
    // otherwise this final cleanup has no effect.
    strOutput = strOutput.Replace("<", "");
    strOutput = strOutput.Replace(">", "");
    strOutput = strOutput.Replace("\r\n", "");

    return strOutput;
}

转载于:https://www.cnblogs.com/webman/archive/2007/06/20/790603.html

相关资源:JAVA上百实例源码以及开源项目
最新回复(0)