C#实现抓取和分析网页类实例
本文实例讲述了C#实现抓取和分析网页类。分享给大家供大家参考。具体分析如下:
这里介绍了抓取和分析网页的类。
其主要功能有:
1、提取网页的纯文本,去除所有HTML标签和JavaScript代码
2、提取网页的链接,包括href和frame及iframe
3、提取网页的title等(其它的标签可依此类推,正则是一样的)
4、可以实现简单的表单提交及cookie保存
/*
 * Author: Sunjoy at CCNU
 * If you improve this class, please send a copy of the code to me (ccnusjy at gmail.com)
 */
using System;
using System.Data;
using System.Configuration;
using System.Net;
using System.IO;
using System.Text;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;

/// <summary>
/// Downloads a single web page and exposes its title, hyperlinks and plain text.
/// Cookies are kept in one container per host so that later pages of the same
/// host are requested with the cookies obtained earlier (including a form login).
/// </summary>
public class WebPage
{
    #region Private members

    private const string UserAgentString = "Mozilla/4.0(compatible;MSIE5.01;WindowsNT5.0)";

    // Responses that declare a Content-Length above this value (4 MiB) are not downloaded.
    private const int MaxContentLength = 1 << 22;

    // A sniffed <meta> charset is rejected when decoding with it yields at least
    // this many more undecodable characters than decoding with the default encoding.
    private const int UndecodableThreshold = 100;

    private static readonly string[] SkippedExtensions = { ".rar", ".dat", ".msi" };

    private const RegexOptions PageRegexOptions = RegexOptions.Multiline | RegexOptions.IgnoreCase;

    private static readonly Regex AnchorRegex = new Regex(
        "(?m)<a[^><]+href=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>(?<text>(\\w|\\W)*?)</", PageRegexOptions);
    private static readonly Regex FrameRegex = new Regex(
        "<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>", PageRegexOptions);
    // The source listing had a literal-space alternative here, which is redundant
    // with \s and presumably a decoded "&nbsp;" entity; the entity is matched
    // explicitly and listed before the bare '&' so it is removed as a whole.
    private static readonly Regex LinkTextCleanupRegex = new Regex(
        "(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", PageRegexOptions);

    private static readonly Regex ScriptRegex = new Regex(@"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", PageRegexOptions);
    private static readonly Regex StyleRegex = new Regex(@"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", PageRegexOptions);
    private static readonly Regex SelectRegex = new Regex(@"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", PageRegexOptions);
    private static readonly Regex AnchorElementRegex = new Regex(@"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", PageRegexOptions);
    // Same remark as for LinkTextCleanupRegex: plain spaces are removed by
    // WhitespaceRegex right afterwards, so only the entity needs naming here.
    private static readonly Regex TagOrNbspRegex = new Regex("(<[^>]+?>)|&nbsp;", PageRegexOptions);
    private static readonly Regex WhitespaceRegex = new Regex("(\\s)+", PageRegexOptions);

    private static readonly Regex TitleRegex = new Regex(
        @"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", PageRegexOptions);
    // Accepts both content="text/html; charset=gb2312" and <meta charset="utf-8">.
    private static readonly Regex MetaCharsetRegex = new Regex(
        "charset\\s*=\\s*[\"']?(?<cding>[A-Za-z0-9_\\-]+)", RegexOptions.IgnoreCase);
    private static readonly Regex NonLatin1Regex = new Regex(@"[^\x00-\xff]+");

    private Uri m_uri;                                   // address of the page (after redirects once downloaded)
    private List<Link> m_links = new List<Link>();       // links found on the page
    private string m_title = "";                         // page title
    private string m_html = "";                          // raw HTML of the page
    private string m_outstr;                             // plain text including link text; null until computed
    private string m_outstrNoLink;                       // plain text without link text; null until computed
    private bool m_good;                                 // whether the page was downloaded successfully
    private int m_pagesize;                              // length of the HTML in characters
    private static Dictionary<string, CookieContainer> webcookies =
        new Dictionary<string, CookieContainer>();       // cookie container per host, shared by all pages
    private string m_post;                               // POST data sent to the login page
    private string m_loginurl;                           // address of the login page

    #endregion

    #region Private methods

    /// <summary>
    /// Extracts the links (anchors, frames and iframes) from the page HTML.
    /// The HTML is parsed whenever the link list is still empty.
    /// </summary>
    /// <returns>The list of links of this page.</returns>
    private List<Link> getLinks()
    {
        if (m_links.Count != 0)
        {
            return m_links;
        }

        Regex[] patterns = { AnchorRegex, FrameRegex };
        for (int i = 0; i < patterns.Length; i++)
        {
            for (Match match = patterns[i].Match(m_html); match.Success; match = match.NextMatch())
            {
                try
                {
                    // Relative addresses are resolved against the page address.
                    string url = new Uri(m_uri, match.Groups["url"].Value).AbsoluteUri;
                    string text = "";
                    if (i == 0)
                    {
                        // Only anchors carry link text; frames have none.
                        text = LinkTextCleanupRegex.Replace(match.Groups["text"].Value, "");
                    }
                    m_links.Add(new Link(url, text));
                }
                catch (Exception ex)
                {
                    // A single malformed link must not abort the whole extraction.
                    Console.WriteLine(ex.Message);
                }
            }
        }
        return m_links;
    }

    /// <summary>
    /// Removes script, style and select elements, optionally anchor elements,
    /// all remaining tags, "&amp;nbsp;" entities and all whitespace.
    /// </summary>
    /// <param name="html">HTML code; null is treated as empty.</param>
    /// <param name="withLink">Whether the text inside anchors is kept.</param>
    /// <returns>The plain text without any whitespace.</returns>
    private static string StripHtml(string html, bool withLink)
    {
        string text = html ?? "";
        text = ScriptRegex.Replace(text, "");
        text = StyleRegex.Replace(text, "");
        text = SelectRegex.Replace(text, "");
        if (!withLink)
        {
            text = AnchorElementRegex.Replace(text, "");
        }
        text = TagOrNbspRegex.Replace(text, "");
        return WhitespaceRegex.Replace(text, "");
    }

    /// <summary>
    /// Returns the first characters of the plain text of an HTML fragment.
    /// The plain text is computed once per <paramref name="withLink"/> value and
    /// cached, so asking for text with and without links no longer returns
    /// whichever variant happened to be computed first.
    /// </summary>
    /// <param name="instr">HTML code (always the page HTML for the callers in this class).</param>
    /// <param name="firstN">Maximum number of characters to return; negative values are treated as 0.</param>
    /// <param name="withLink">Whether the text inside anchors is included.</param>
    /// <returns>The plain text, truncated to <paramref name="firstN"/> characters.</returns>
    private string getFirstNchar(string instr, int firstN, bool withLink)
    {
        string text;
        if (withLink)
        {
            if (m_outstr == null)
            {
                m_outstr = StripHtml(instr, true);
            }
            text = m_outstr;
        }
        else
        {
            if (m_outstrNoLink == null)
            {
                m_outstrNoLink = StripHtml(instr, false);
            }
            text = m_outstrNoLink;
        }

        if (firstN < 0)
        {
            firstN = 0;
        }
        return text.Length > firstN ? text.Substring(0, firstN) : text;
    }

    /// <summary>
    /// Packs the first four bytes of an IP address into an unsigned integer,
    /// most significant byte first. The result is only meaningful for IPv4.
    /// </summary>
    /// <param name="x">IP address.</param>
    /// <returns>The address as an unsigned 32-bit number.</returns>
    private uint getuintFromIP(IPAddress x)
    {
        byte[] bt = x.GetAddressBytes();
        // Shifting uint values avoids the int overflow of bt[0]*256*256*256.
        return ((uint)bt[0] << 24) | ((uint)bt[1] << 16) | ((uint)bt[2] << 8) | bt[3];
    }

    /// <summary>
    /// Resolves a host name and returns its first IPv4 address.
    /// </summary>
    /// <param name="host">Host name to resolve.</param>
    /// <returns>The first IPv4 address, or null when the lookup fails or yields none.</returns>
    private static IPAddress ResolveIPv4(string host)
    {
        try
        {
            foreach (IPAddress address in Dns.GetHostEntry(host).AddressList)
            {
                if (address.AddressFamily == System.Net.Sockets.AddressFamily.InterNetwork)
                {
                    return address;
                }
            }
        }
        catch (System.Net.Sockets.SocketException)
        {
            // Unresolvable host: treated as "no address".
        }
        catch (ArgumentException)
        {
            // Host name rejected by the resolver: treated as "no address".
        }
        return null;
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links whose URL or text matches a pattern.
    /// </summary>
    /// <param name="pattern">Regular expression (multiline, case-insensitive).</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <param name="matchText">True to match the link text, false to match the link URL.</param>
    /// <returns>The matching links in page order.</returns>
    private List<Link> FilterLinks(string pattern, int count, bool matchText)
    {
        if (m_links.Count == 0)
        {
            getLinks();
        }

        List<Link> result = new List<Link>();
        if (count <= 0 || m_links.Count == 0)
        {
            return result;
        }

        // Built once instead of once per link.
        Regex regex = new Regex(pattern, PageRegexOptions);
        foreach (Link link in m_links)
        {
            if (regex.Match(matchText ? link.text : link.url).Success)
            {
                result.Add(link);
                if (result.Count >= count)
                {
                    break;
                }
            }
        }
        return result;
    }

    /// <summary>
    /// Tells whether a string is null or consists of whitespace only.
    /// </summary>
    private static bool IsBlank(string value)
    {
        return value == null || value.Trim() == "";
    }

    /// <summary>
    /// Tells whether the address ends with one of the extensions that are never downloaded.
    /// </summary>
    private static bool HasSkippedExtension(string url)
    {
        foreach (string extension in SkippedExtensions)
        {
            if (url.EndsWith(extension, StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Looks up an encoding by name.
    /// </summary>
    /// <param name="name">Charset name; anything after a ';' and surrounding quotes or blanks are ignored.</param>
    /// <returns>The encoding, or null when the name is empty or unknown.</returns>
    private static Encoding TryGetEncoding(string name)
    {
        if (name == null)
        {
            return null;
        }

        int semicolon = name.IndexOf(';');
        if (semicolon >= 0)
        {
            name = name.Substring(0, semicolon);
        }
        name = name.Trim(' ', '\t', '"', '\'');
        if (name.Length == 0)
        {
            return null;
        }

        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }

    /// <summary>
    /// Reads a stream to its end.
    /// </summary>
    private static byte[] ReadAllBytes(Stream input)
    {
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = input.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }
            return buffer.ToArray();
        }
    }

    /// <summary>
    /// Decodes bytes through a StreamReader, the same way the response stream
    /// itself would be read with that encoding.
    /// </summary>
    private static string Decode(byte[] raw, Encoding encoding)
    {
        using (StreamReader reader = new StreamReader(new MemoryStream(raw), encoding))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Counts the characters that decoders emit for bytes they cannot decode
    /// ('?' and U+FFFD).
    /// </summary>
    private static int CountUndecodable(string text)
    {
        int count = 0;
        foreach (char c in text)
        {
            if (c == '?' || c == '\uFFFD')
            {
                count++;
            }
        }
        return count;
    }

    /// <summary>
    /// Decodes the downloaded bytes. A charset named in the Content-Type header
    /// wins (falling back to the default encoding when it is unknown); otherwise
    /// a charset declared inside the HTML is tried.
    /// </summary>
    /// <param name="raw">Raw response body.</param>
    /// <param name="contentType">Value of the Content-Type header.</param>
    /// <returns>The decoded HTML.</returns>
    private static string DecodeHtml(byte[] raw, string contentType)
    {
        const string charsetKey = "charset=";
        int ix = contentType.IndexOf(charsetKey, StringComparison.OrdinalIgnoreCase);
        if (ix != -1)
        {
            Encoding headerEncoding = TryGetEncoding(contentType.Substring(ix + charsetKey.Length));
            return Decode(raw, headerEncoding ?? Encoding.Default);
        }

        string defaultText = Decode(raw, Encoding.Default);
        Match meta = MetaCharsetRegex.Match(defaultText);
        Encoding sniffed = meta.Success ? TryGetEncoding(meta.Groups["cding"].Value) : null;
        if (sniffed == null)
        {
            return defaultText;
        }

        // The raw bytes are decoded again instead of re-encoding defaultText,
        // which loses every byte the default encoding could not represent.
        string sniffedText = Decode(raw, sniffed);

        // Literal '?' characters occur equally in both decodings, so only the
        // difference says whether the declared charset was wrong.
        if (CountUndecodable(sniffedText) - CountUndecodable(defaultText) >= UndecodableThreshold)
        {
            return defaultText;
        }
        return sniffedText;
    }

    /// <summary>
    /// Unescapes the address and percent-encodes every run of characters
    /// outside Latin-1 using GB2312.
    /// </summary>
    /// <param name="_url">Address as given by the caller.</param>
    /// <returns>The normalized address.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="_url"/> is null.</exception>
    private static string NormalizeUrl(string _url)
    {
        if (_url == null)
        {
            throw new ArgumentNullException("_url");
        }

        try
        {
            _url = Uri.UnescapeDataString(_url);
        }
        catch (Exception)
        {
            // Best effort: when unescaping fails the address is used as given.
        }

        // Every non-Latin-1 run is encoded, not only the first one found.
        return NonLatin1Regex.Replace(_url, delegate(Match run)
        {
            return System.Web.HttpUtility.UrlEncode(run.Value, Encoding.GetEncoding("GB2312"));
        });
    }

    /// <summary>
    /// Posts the login form and collects the cookies of the response.
    /// </summary>
    /// <param name="_loginurl">Address of the login page.</param>
    /// <param name="_post">URL-encoded form data.</param>
    /// <returns>The cookie container filled by the login, or null when the login request failed.</returns>
    private static CookieContainer Login(string _loginurl, string _post)
    {
        byte[] bytes = Encoding.Default.GetBytes(_post);
        CookieContainer jar = new CookieContainer();
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(_loginurl);
            request.ContentType = "application/x-www-form-urlencoded";
            request.AllowAutoRedirect = false;
            request.UserAgent = UserAgentString;
            request.Timeout = 60000;
            request.KeepAlive = true;
            request.ContentLength = bytes.Length;
            request.Method = "POST";
            request.CookieContainer = jar;

            using (Stream body = request.GetRequestStream())
            {
                body.Write(bytes, 0, bytes.Length);
            }

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                foreach (Cookie cookie in response.Cookies)
                {
                    jar.Add(cookie);
                }
            }
            return jar;
        }
        catch (Exception ex)
        {
            // The caller falls back to an anonymous request.
            Console.WriteLine(ex.Message);
            return null;
        }
    }

    /// <summary>
    /// Downloads the page and fills the fields of this instance. Failures are
    /// written to the console and leave the page marked as not usable.
    /// </summary>
    /// <param name="_url">Normalized address of the page.</param>
    /// <param name="loginCookies">
    /// Cookies obtained by a login; they replace the shared container of the host.
    /// Null to use (or create) the shared container of the host.
    /// </param>
    private void Fetch(string _url, CookieContainer loginCookies)
    {
        m_uri = null;
        m_links = new List<Link>();
        m_html = "";
        m_outstr = null;
        m_outstrNoLink = null;
        m_title = "";
        m_good = true;

        try
        {
            m_uri = new Uri(_url);

            if (HasSkippedExtension(_url))
            {
                m_good = false;
                return;
            }

            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
            rqst.AllowAutoRedirect = true;
            rqst.MaximumAutomaticRedirections = 3;
            rqst.UserAgent = UserAgentString;
            rqst.KeepAlive = true;
            rqst.Timeout = 30000;

            CookieContainer jar = loginCookies;
            lock (WebPage.webcookies)
            {
                if (jar != null)
                {
                    WebPage.webcookies[m_uri.Host] = jar;
                }
                else if (!WebPage.webcookies.TryGetValue(m_uri.Host, out jar))
                {
                    jar = new CookieContainer();
                    WebPage.webcookies[m_uri.Host] = jar;
                }
            }
            rqst.CookieContainer = jar;

            using (HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse())
            {
                string contentType = rsps.ContentType ?? "";
                if (!contentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase)
                    || rsps.ContentLength > MaxContentLength)
                {
                    m_good = false;
                    return;
                }

                byte[] raw;
                using (Stream sm = rsps.GetResponseStream())
                {
                    raw = ReadAllBytes(sm);
                }

                m_html = DecodeHtml(raw, contentType);
                m_pagesize = m_html.Length;
                m_uri = rsps.ResponseUri;
            }
        }
        catch (Exception ex)
        {
            // m_uri is still null when the address itself could not be parsed.
            Console.WriteLine(ex.Message + (m_uri != null ? m_uri.ToString() : _url));
            m_good = false;
        }
    }

    #endregion

    #region Public methods

    /// <summary>
    /// Returns the first characters of the plain text of the page, link text included.
    /// </summary>
    /// <param name="firstN">Maximum number of characters.</param>
    /// <returns>The plain text.</returns>
    public string getContext(int firstN)
    {
        return getFirstNchar(m_html, firstN, true);
    }

    /// <summary>
    /// Returns the first characters of the plain text of the page, link text excluded.
    /// </summary>
    /// <param name="firstN">Maximum number of characters.</param>
    /// <returns>The plain text.</returns>
    public string getContextWithOutLink(int firstN)
    {
        return getFirstNchar(m_html, firstN, false);
    }

    /// <summary>
    /// Returns a limited number of links whose URL matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links.</returns>
    public List<Link> getSpecialLinksByUrl(string pattern, int count)
    {
        return FilterLinks(pattern, count, false);
    }

    /// <summary>
    /// Returns a limited number of links whose text matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links.</returns>
    public List<Link> getSpecialLinksByText(string pattern, int count)
    {
        return FilterLinks(pattern, count, true);
    }

    /// <summary>
    /// Returns the links whose host resolves to an IPv4 address inside a range
    /// (both ends inclusive). Links whose host cannot be resolved to an IPv4
    /// address are skipped.
    /// </summary>
    /// <param name="_ip_start">First address of the range (expected to be IPv4).</param>
    /// <param name="_ip_end">Last address of the range (expected to be IPv4).</param>
    /// <returns>The links inside the range.</returns>
    public List<Link> getSpecialLinksByIP(string _ip_start, string _ip_end)
    {
        uint rangeStart = getuintFromIP(IPAddress.Parse(_ip_start));
        uint rangeEnd = getuintFromIP(IPAddress.Parse(_ip_end));

        if (m_links.Count == 0)
        {
            getLinks();
        }

        List<Link> result = new List<Link>();
        // Each host is resolved once, even when many links point to it.
        Dictionary<string, IPAddress> resolved =
            new Dictionary<string, IPAddress>(StringComparer.OrdinalIgnoreCase);

        foreach (Link link in m_links)
        {
            Uri linkUri;
            if (!Uri.TryCreate(link.url, UriKind.Absolute, out linkUri))
            {
                continue;
            }

            IPAddress ip;
            if (!resolved.TryGetValue(linkUri.Host, out ip))
            {
                ip = ResolveIPv4(linkUri.Host);
                resolved[linkUri.Host] = ip;
            }
            if (ip == null)
            {
                continue;
            }

            uint value = getuintFromIP(ip);
            if (value >= rangeStart && value <= rangeEnd)
            {
                result.Add(link);
            }
        }
        return result;
    }

    /// <summary>
    /// Searches the plain text of the page (link text included) for a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression; its first capture group is returned.</param>
    /// <returns>The value of group 1 of the first match, or an empty string when nothing matches.</returns>
    public string getSpecialWords(string pattern)
    {
        string text = Context;
        Regex regex = new Regex(pattern, PageRegexOptions);
        Match mc = regex.Match(text);
        return mc.Success ? mc.Groups[1].Value : string.Empty;
    }

    #endregion

    #region Constructors

    /// <summary>
    /// Downloads a page using the shared cookie container of its host.
    /// </summary>
    /// <param name="_url">Normalized address of the page.</param>
    private void Init(string _url)
    {
        Fetch(_url, null);
    }

    /// <summary>
    /// Downloads the page at the given address.
    /// </summary>
    /// <param name="_url">Address of the page.</param>
    /// <exception cref="ArgumentNullException"><paramref name="_url"/> is null.</exception>
    public WebPage(string _url)
    {
        Init(NormalizeUrl(_url));
    }

    /// <summary>
    /// Downloads the page at the given address, logging in first when login
    /// data is supplied and no cookies are stored for the host yet.
    /// </summary>
    /// <param name="_url">Address of the page.</param>
    /// <param name="_loginurl">Address of the login page; blank to skip the login.</param>
    /// <param name="_post">POST data for the login page; blank to skip the login.</param>
    /// <exception cref="ArgumentNullException"><paramref name="_url"/> is null.</exception>
    public WebPage(string _url, string _loginurl, string _post)
    {
        _url = NormalizeUrl(_url);

        // An unparsable address also skips the login; Init then records the failure.
        Uri target = null;
        bool skipLogin = IsBlank(_loginurl) || IsBlank(_post)
            || !Uri.TryCreate(_url, UriKind.Absolute, out target);
        if (!skipLogin)
        {
            lock (WebPage.webcookies)
            {
                skipLogin = WebPage.webcookies.ContainsKey(target.Host);
            }
        }

        if (skipLogin)
        {
            Init(_url);
            return;
        }

        m_post = _post;
        m_loginurl = _loginurl;

        // A failed login yields null, which makes Fetch behave exactly like Init.
        Fetch(_url, Login(_loginurl, _post));
    }

    #endregion

    #region Properties

    /// <summary>
    /// Absolute address of the page; empty when the address could not be parsed. Read-only.
    /// </summary>
    public string URL
    {
        get { return m_uri != null ? m_uri.AbsoluteUri : ""; }
    }

    /// <summary>
    /// Title of the page, taken from its title element. Read-only.
    /// </summary>
    public string Title
    {
        get
        {
            if (m_title == "")
            {
                Match mc = TitleRegex.Match(m_html);
                if (mc.Success)
                {
                    m_title = mc.Groups["title"].Value.Trim();
                }
            }
            return m_title;
        }
    }

    /// <summary>
    /// All links of the page. Read-only.
    /// </summary>
    public List<Link> Links
    {
        get
        {
            if (m_links.Count == 0)
            {
                getLinks();
            }
            return m_links;
        }
    }

    /// <summary>
    /// The complete plain text of the page, link text included. Read-only.
    /// </summary>
    public string Context
    {
        get
        {
            if (m_outstr == null)
            {
                getContext(0);
            }
            return m_outstr;
        }
    }

    /// <summary>
    /// Size of the page: the length of its HTML in characters.
    /// </summary>
    public int PageSize
    {
        get { return m_pagesize; }
    }

    /// <summary>
    /// All links of the page that point to the host of the page itself (http or https).
    /// </summary>
    public List<Link> InsiteLinks
    {
        get
        {
            if (m_uri == null)
            {
                return new List<Link>();
            }
            // The host is escaped so '.' matches literally, and must be followed
            // by a delimiter so "a.com" does not match "a.com.other.org".
            string pattern = "^https?://" + Regex.Escape(m_uri.Host) + "(?=[:/?#]|$)";
            return getSpecialLinksByUrl(pattern, int.MaxValue);
        }
    }

    /// <summary>
    /// Whether the page was downloaded successfully.
    /// </summary>
    public bool IsGood
    {
        get { return m_good; }
    }

    /// <summary>
    /// Host of the page; empty when the address could not be parsed.
    /// </summary>
    public string Host
    {
        get { return m_uri != null ? m_uri.Host : ""; }
    }

    /// <summary>
    /// POST data sent to the login page; null when no login was attempted.
    /// </summary>
    public string PostStr
    {
        get { return m_post; }
    }

    /// <summary>
    /// Address of the login page; null when no login was attempted.
    /// </summary>
    public string LoginURL
    {
        get { return m_loginurl; }
    }

    #endregion
}

/// <summary>
/// A hyperlink: its target address and its visible text.
/// </summary>
public class Link
{
    public string url;   // target address of the link
    public string text;  // visible text of the link

    /// <summary>
    /// Creates a link.
    /// </summary>
    /// <param name="_url">Target address.</param>
    /// <param name="_text">Visible text.</param>
    public Link(string _url, string _text)
    {
        url = _url;
        text = _text;
    }
}
希望本文所述对大家的C#程序设计有所帮助。