C#实现抓取和分析网页类实例
本文通过实例讲述了如何用C#实现一个抓取和分析网页的类,分享给大家供大家参考。具体分析如下:
这里介绍了抓取和分析网页的类。
其主要功能有:
1、提取网页的纯文本,去除所有HTML标签和JavaScript代码
2、提取网页的链接,包括a标签的href以及frame和iframe的src
3、提取网页的title等(其它的标签可依此类推,正则是一样的)
4、可以实现简单的表单提交及cookie保存
/*
 * Author: Sunjoy at CCNU
 * If you improve this class, please send me a copy of the code (ccnusjy at gmail.com).
 */
using System;
using System.Collections.Generic;
using System.Configuration;
using System.Data;
using System.IO;
using System.Net;
using System.Net.Sockets;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;
/// <summary>
/// Downloads a web page over HTTP and exposes its title, its links and its
/// plain text. Cookies are kept per host in a container shared by all
/// instances, and a form login can optionally be posted before the page
/// itself is requested.
/// </summary>
public class WebPage
{
    #region Private members

    // Options applied to every HTML-scanning regex in this class.
    private const RegexOptions HtmlOptions = RegexOptions.Multiline | RegexOptions.IgnoreCase;

    // NOTE(review): the pasted source had all spaces stripped from this literal
    // ("Mozilla/4.0(compatible;MSIE5.01;WindowsNT5.0)"); the conventional spacing
    // of the MSIE token is restored here - confirm against the original class.
    private const string UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";

    // Responses that declare a body larger than this (1 << 22 bytes) are rejected.
    private const long MaxContentLength = 1 << 22;

    // <a ... href=...>text</ : captures the target ("url") and the inner markup ("text").
    private static readonly Regex AnchorRegex = new Regex(
        "<a[^><]+href=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>(?<text>[\\s\\S]*?)</", HtmlOptions);

    // <frame src=...> and <iframe src=...> : captures the target ("url").
    private static readonly Regex FrameRegex = new Regex(
        "<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>", HtmlOptions);

    // Everything removed from a link's inner markup to obtain its text.
    // NOTE(review): the pasted source had a literal space where "&nbsp;" now
    // stands; it looks like the entity was HTML-decoded by the article. "&nbsp;"
    // must come before the bare "&" alternative or "nbsp;" would be left behind.
    private static readonly Regex LinkTextNoiseRegex = new Regex(
        "(<[^>]+>)|\\s|&nbsp;|&|\"", HtmlOptions);

    private static readonly Regex ScriptRegex = new Regex(@"<script[^>]*>[\s\S]*?</script[^>]*>", HtmlOptions);
    private static readonly Regex StyleRegex = new Regex(@"<style[^>]*>[\s\S]*?</style[^>]*>", HtmlOptions);
    private static readonly Regex SelectRegex = new Regex(@"<select[^>]*>[\s\S]*?</select[^>]*>", HtmlOptions);
    private static readonly Regex AnchorBlockRegex = new Regex(@"<a[^>]*>[\s\S]*?</a[^>]*>", HtmlOptions);
    private static readonly Regex TagOrNbspRegex = new Regex("(<[^>]+?>)|&nbsp;", HtmlOptions);
    private static readonly Regex WhitespaceRegex = new Regex(@"\s+", HtmlOptions);
    private static readonly Regex TitleRegex = new Regex(@"<title[^>]*>(?<title>[\s\S]*?)</title[^>]*>", HtmlOptions);

    // charset=xxx inside the document itself (e.g. a <meta> tag).
    private static readonly Regex MetaCharsetRegex = new Regex(
        "charset\\s*=\\s*[\"']?(?<cding>[\\w\\-]+)", RegexOptions.IgnoreCase);

    // A run of characters outside U+0000..U+00FF.
    private static readonly Regex NonLatinRunRegex = new Regex(@"[^\x00-\xff]+");

    private Uri m_uri;              // address of the page (updated to the final address after redirects)
    private List<Link> m_links;     // links found on the page
    private bool m_linksParsed;     // true once m_html has been scanned for links
    private string m_title;         // page title
    private string m_html;          // raw HTML of the page
    private string m_outstr;        // plain text of the page, link text included
    private string m_outstrNoLink;  // plain text of the page, link text excluded
    private bool m_good;            // whether the page could be downloaded
    private int m_pagesize;         // length of m_html in characters

    // Cookie containers of every host visited so far, shared by all instances.
    private static Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();

    private string m_post;          // POST data sent to the login page
    private string m_loginurl;      // address of the login page

    #endregion

    #region Private methods

    /// <summary>
    /// Puts every field that describes a downloaded page back to its empty state.
    /// The login fields (m_post, m_loginurl) are left untouched.
    /// </summary>
    private void ResetState()
    {
        m_links = new List<Link>();
        m_linksParsed = false;
        m_html = "";
        m_outstr = "";
        m_outstrNoLink = "";
        m_title = "";
        m_pagesize = 0;
        m_good = true;
    }

    /// <summary>
    /// Scans the page's HTML for anchors, frames and iframes and stores them in
    /// <c>m_links</c>. The scan runs once per page, even when it finds nothing.
    /// </summary>
    /// <returns>The list of links found on the page.</returns>
    private List<Link> getLinks()
    {
        if (m_linksParsed)
        {
            return m_links;
        }
        m_linksParsed = true;

        Regex[] scanners = new Regex[] { AnchorRegex, FrameRegex };
        for (int i = 0; i < scanners.Length; i++)
        {
            for (Match match = scanners[i].Match(m_html); match.Success; match = match.NextMatch())
            {
                // Resolve the target against the page address; unparsable targets are skipped.
                Uri absolute;
                if (!Uri.TryCreate(m_uri, match.Groups["url"].Value, out absolute))
                {
                    continue;
                }

                // Only anchors carry text; frames get an empty string.
                string text = "";
                if (i == 0)
                {
                    text = LinkTextNoiseRegex.Replace(match.Groups["text"].Value, "");
                }
                m_links.Add(new Link(absolute.AbsoluteUri, text));
            }
        }
        return m_links;
    }

    /// <summary>
    /// Removes scripts, style sheets, select boxes, tags and all whitespace from
    /// a piece of HTML.
    /// </summary>
    /// <param name="html">HTML code.</param>
    /// <param name="withLink">False to drop anchor elements together with their text.</param>
    /// <returns>The remaining text.</returns>
    private static string StripHtml(string html, bool withLink)
    {
        string text = ScriptRegex.Replace(html, "");
        text = StyleRegex.Replace(text, "");
        text = SelectRegex.Replace(text, "");
        if (!withLink)
        {
            text = AnchorBlockRegex.Replace(text, "");
        }
        text = TagOrNbspRegex.Replace(text, "");
        return WhitespaceRegex.Replace(text, "");
    }

    /// <summary>
    /// Extracts the leading characters of the plain text of a piece of HTML.
    /// The stripped text is cached separately for each value of
    /// <paramref name="withLink"/>, so both variants can be requested in any order.
    /// </summary>
    /// <param name="instr">HTML code.</param>
    /// <param name="firstN">Maximum number of characters to return.</param>
    /// <param name="withLink">Whether the text inside links is kept.</param>
    /// <returns>The plain text, cut to at most <paramref name="firstN"/> characters.</returns>
    private string getFirstNchar(string instr, int firstN, bool withLink)
    {
        string text = withLink ? m_outstr : m_outstrNoLink;
        if (string.IsNullOrEmpty(text))
        {
            text = StripHtml(instr, withLink);
            if (withLink)
            {
                m_outstr = text;
            }
            else
            {
                m_outstrNoLink = text;
            }
        }
        return text.Length > firstN ? text.Substring(0, firstN) : text;
    }

    /// <summary>
    /// Packs the first four bytes of an address into an unsigned integer, most
    /// significant byte first.
    /// </summary>
    /// <param name="x">IP address; only IPv4 addresses give a meaningful result.</param>
    /// <returns>The address as an unsigned 32-bit number.</returns>
    private uint getuintFromIP(IPAddress x)
    {
        byte[] bt = x.GetAddressBytes();
        // Shifting uint values cannot overflow, whatever the checked/unchecked setting.
        return ((uint)bt[0] << 24) | ((uint)bt[1] << 16) | ((uint)bt[2] << 8) | (uint)bt[3];
    }

    /// <summary>
    /// Collects up to <paramref name="count"/> links whose address or text matches a pattern.
    /// </summary>
    /// <param name="pattern">Regular expression (multiline, case-insensitive).</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <param name="matchUrl">True to test the link address, false to test the link text.</param>
    /// <returns>The matching links in page order.</returns>
    private List<Link> filterLinks(string pattern, int count, bool matchUrl)
    {
        List<Link> matches = new List<Link>();
        List<Link> all = getLinks();
        if (all.Count == 0)
        {
            return matches;
        }

        // Built once instead of once per link.
        Regex filter = new Regex(pattern, HtmlOptions);
        foreach (Link link in all)
        {
            if (matches.Count >= count)
            {
                break;
            }
            if (filter.IsMatch(matchUrl ? link.url : link.text))
            {
                matches.Add(link);
            }
        }
        return matches;
    }

    /// <summary>
    /// Tells whether a string is null, empty or made of whitespace only.
    /// </summary>
    private static bool IsBlank(string value)
    {
        return value == null || value.Trim() == "";
    }

    /// <summary>
    /// Tells whether an address ends in one of the binary extensions that are never downloaded.
    /// </summary>
    private static bool HasSkippedExtension(string url)
    {
        return url.EndsWith(".rar", StringComparison.OrdinalIgnoreCase)
            || url.EndsWith(".dat", StringComparison.OrdinalIgnoreCase)
            || url.EndsWith(".msi", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Percent-encodes one run of non-Latin characters using GB2312.
    /// </summary>
    private static string EncodeNonLatinRun(Match run)
    {
        return HttpUtility.UrlEncode(run.Value, Encoding.GetEncoding("GB2312"));
    }

    /// <summary>
    /// Unescapes an address and then percent-encodes every run of characters
    /// outside U+0000..U+00FF with GB2312.
    /// </summary>
    /// <param name="url">Address as supplied by the caller; null is treated as empty.</param>
    /// <returns>The normalized address.</returns>
    private static string NormalizeUrl(string url)
    {
        if (url == null)
        {
            return string.Empty;
        }
        try
        {
            url = Uri.UnescapeDataString(url);
        }
        catch (Exception)
        {
            // Best effort, as in the original: keep the address as it was given.
        }
        return NonLatinRunRegex.Replace(url, new MatchEvaluator(EncodeNonLatinRun));
    }

    /// <summary>
    /// Extracts the charset parameter of a Content-Type header value.
    /// </summary>
    /// <returns>The charset name without quotes, or null when the header has none.</returns>
    private static string CharsetFromContentType(string contentType)
    {
        const string Key = "charset=";
        int ix = contentType.IndexOf(Key, StringComparison.OrdinalIgnoreCase);
        if (ix == -1)
        {
            return null;
        }
        string value = contentType.Substring(ix + Key.Length);
        int end = value.IndexOf(';');
        if (end != -1)
        {
            // Drop any parameter that follows the charset.
            value = value.Substring(0, end);
        }
        return value.Trim().Trim('"', '\'');
    }

    /// <summary>
    /// Looks up an encoding by name.
    /// </summary>
    /// <returns>The encoding, or null when the name is empty or unknown.</returns>
    private static Encoding FindEncoding(string name)
    {
        if (string.IsNullOrEmpty(name))
        {
            return null;
        }
        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }

    /// <summary>
    /// Reads a stream to its end.
    /// </summary>
    private static byte[] ReadAllBytes(Stream input)
    {
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = input.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }
            return buffer.ToArray();
        }
    }

    /// <summary>
    /// Decodes a byte array through a StreamReader, as the original code did
    /// when it read the response stream directly.
    /// </summary>
    private static string Decode(byte[] body, Encoding encoding)
    {
        using (StreamReader reader = new StreamReader(new MemoryStream(body), encoding))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Turns a response body into text. The charset of the Content-Type header
    /// is used when it names a known encoding; otherwise the body is decoded
    /// with the system default encoding, searched for a "charset=" declaration,
    /// and decoded again from the raw bytes with the declared encoding.
    /// </summary>
    private static string DecodeBody(byte[] body, string contentType)
    {
        Encoding declared = FindEncoding(CharsetFromContentType(contentType));
        if (declared != null)
        {
            return Decode(body, declared);
        }

        string fallback = Decode(body, Encoding.Default);
        Encoding sniffed = FindEncoding(MetaCharsetRegex.Match(fallback).Groups["cding"].Value);
        if (sniffed == null)
        {
            return fallback;
        }

        string decoded = Decode(body, sniffed);
        // Heuristic kept from the original: a text holding 100 or more '?' is
        // taken as a failed decoding and the default-encoded text is used instead.
        return decoded.Split('?').Length > 100 ? fallback : decoded;
    }

    /// <summary>
    /// Requests <c>m_uri</c> with the given cookies and fills in the page fields.
    /// Any failure is written to the console and marks the page as not good.
    /// </summary>
    /// <param name="cookies">Cookie container attached to the request.</param>
    private void Fetch(CookieContainer cookies)
    {
        try
        {
            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
            rqst.AllowAutoRedirect = true;
            rqst.MaximumAutomaticRedirections = 3;
            rqst.UserAgent = UserAgent;
            rqst.KeepAlive = true;
            rqst.Timeout = 30000;
            rqst.CookieContainer = cookies;

            // "using" releases the connection on every path, including exceptions.
            using (HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse())
            {
                string contentType = rsps.ContentType ?? "";
                if (!contentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase)
                    || rsps.ContentLength > MaxContentLength)
                {
                    m_good = false;
                    return;
                }

                byte[] body;
                using (Stream sm = rsps.GetResponseStream())
                {
                    body = ReadAllBytes(sm);
                }
                m_html = DecodeBody(body, contentType);
                m_pagesize = m_html.Length;
                m_uri = rsps.ResponseUri;
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine(ex.Message + m_uri.ToString());
            m_good = false;
        }
    }

    /// <summary>
    /// Posts form data to a login page and collects the cookies it sets.
    /// </summary>
    /// <param name="loginUrl">Address of the login page.</param>
    /// <param name="post">URL-encoded form data.</param>
    /// <returns>The cookie container of the session, or null when the login request failed.</returns>
    private static CookieContainer Login(string loginUrl, string post)
    {
        byte[] bytes = Encoding.Default.GetBytes(post);
        CookieContainer jar = new CookieContainer();
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(loginUrl);
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            request.AllowAutoRedirect = false;
            request.UserAgent = UserAgent;
            request.Timeout = 60000;
            request.KeepAlive = true;
            request.ContentLength = bytes.Length;
            request.CookieContainer = jar;

            using (Stream requestStream = request.GetRequestStream())
            {
                requestStream.Write(bytes, 0, bytes.Length);
            }
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                foreach (Cookie ck in response.Cookies)
                {
                    jar.Add(ck);
                }
            }
            return jar;
        }
        catch (Exception)
        {
            // The caller falls back to an anonymous request.
            return null;
        }
    }

    #endregion

    #region Public methods

    /// <summary>
    /// Returns the leading plain text of the page, link text included.
    /// </summary>
    /// <param name="firstN">Maximum number of characters.</param>
    /// <returns>The plain text, cut to at most <paramref name="firstN"/> characters.</returns>
    public string getContext(int firstN)
    {
        return getFirstNchar(m_html, firstN, true);
    }

    /// <summary>
    /// Returns the leading plain text of the page, link text excluded.
    /// </summary>
    /// <param name="firstN">Maximum number of characters.</param>
    /// <returns>The plain text, cut to at most <paramref name="firstN"/> characters.</returns>
    public string getContextWithOutLink(int firstN)
    {
        return getFirstNchar(m_html, firstN, false);
    }

    /// <summary>
    /// Returns links of this page whose address matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links.</returns>
    public List<Link> getSpecialLinksByUrl(string pattern, int count)
    {
        return filterLinks(pattern, count, true);
    }

    /// <summary>
    /// Returns links of this page whose text matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links.</returns>
    public List<Link> getSpecialLinksByText(string pattern, int count)
    {
        return filterLinks(pattern, count, false);
    }

    /// <summary>
    /// Returns the links of this page whose host resolves to an address inside
    /// an IP range (bounds included). Each host is resolved through DNS; links
    /// whose host cannot be resolved to an IPv4 address are skipped.
    /// </summary>
    /// <param name="_ip_start">First address of the range.</param>
    /// <param name="_ip_end">Last address of the range.</param>
    /// <returns>The links inside the range.</returns>
    public List<Link> getSpecialLinksByIP(string _ip_start, string _ip_end)
    {
        // The bounds never change, so they are converted once.
        uint lower = getuintFromIP(IPAddress.Parse(_ip_start));
        uint upper = getuintFromIP(IPAddress.Parse(_ip_end));

        List<Link> matches = new List<Link>();
        foreach (Link link in getLinks())
        {
            IPAddress ip = null;
            try
            {
                // The first entry may be an IPv6 address, which cannot be
                // compared as a 32-bit number; take the first IPv4 one.
                foreach (IPAddress candidate in Dns.GetHostEntry(new Uri(link.url).Host).AddressList)
                {
                    if (candidate.AddressFamily == AddressFamily.InterNetwork)
                    {
                        ip = candidate;
                        break;
                    }
                }
            }
            catch (Exception)
            {
                // Unresolvable host: skip the link, as the original did.
                continue;
            }
            if (ip == null)
            {
                continue;
            }

            uint value = getuintFromIP(ip);
            if (value >= lower && value <= upper)
            {
                matches.Add(link);
            }
        }
        return matches;
    }

    /// <summary>
    /// Searches the plain text of the page (link text included) for a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression; its first capture group is the value returned.</param>
    /// <returns>The text of group 1 of the first match, or an empty string when nothing matches.</returns>
    public string getSpecialWords(string pattern)
    {
        if (m_outstr == "")
        {
            getContext(Int16.MaxValue);
        }
        Match mc = new Regex(pattern, HtmlOptions).Match(m_outstr);
        return mc.Success ? mc.Groups[1].Value : string.Empty;
    }

    #endregion

    #region Constructors

    /// <summary>
    /// Downloads a page anonymously, reusing the cookies already stored for its
    /// host. An address that cannot be parsed marks the page as not good
    /// instead of throwing.
    /// </summary>
    /// <param name="_url">Normalized address of the page.</param>
    private void Init(string _url)
    {
        // Done first so that every property is usable even when the address is invalid.
        ResetState();

        Uri target;
        if (!Uri.TryCreate(_url, UriKind.Absolute, out target))
        {
            Console.WriteLine("Invalid URI: " + _url);
            m_good = false;
            return;
        }
        m_uri = target;

        if (HasSkippedExtension(_url))
        {
            m_good = false;
            return;
        }

        CookieContainer cookies;
        lock (WebPage.webcookies)
        {
            if (!WebPage.webcookies.TryGetValue(m_uri.Host, out cookies))
            {
                cookies = new CookieContainer();
                WebPage.webcookies[m_uri.Host] = cookies;
            }
        }
        Fetch(cookies);
    }

    /// <summary>
    /// Downloads the page at the given address.
    /// </summary>
    /// <param name="_url">Address of the page.</param>
    public WebPage(string _url)
    {
        Init(NormalizeUrl(_url));
    }

    /// <summary>
    /// Downloads the page at the given address after posting form data to a
    /// login page. The login is skipped when the login address or the form data
    /// is blank, or when cookies are already stored for the page's host; when
    /// the login request fails the page is requested anonymously.
    /// </summary>
    /// <param name="_url">Address of the page.</param>
    /// <param name="_loginurl">Address of the login page.</param>
    /// <param name="_post">URL-encoded form data for the login page.</param>
    public WebPage(string _url, string _loginurl, string _post)
    {
        _url = NormalizeUrl(_url);

        Uri target;
        bool hasTarget = Uri.TryCreate(_url, UriKind.Absolute, out target);
        bool hasCookies = false;
        if (hasTarget)
        {
            // The dictionary is shared between instances; read it under the same lock as the writers.
            lock (WebPage.webcookies)
            {
                hasCookies = WebPage.webcookies.ContainsKey(target.Host);
            }
        }

        if (!hasTarget || IsBlank(_loginurl) || IsBlank(_post) || hasCookies)
        {
            Init(_url);
            return;
        }

        m_post = _post;
        m_loginurl = _loginurl;

        CookieContainer cookies = Login(_loginurl, _post);
        if (cookies == null)
        {
            Init(_url);
            return;
        }

        ResetState();
        m_uri = target;
        if (HasSkippedExtension(_url))
        {
            m_good = false;
            return;
        }
        lock (WebPage.webcookies)
        {
            WebPage.webcookies[m_uri.Host] = cookies;
        }
        Fetch(cookies);
    }

    #endregion

    #region Properties

    /// <summary>
    /// Address of the page (read-only); empty when the address could not be parsed.
    /// </summary>
    public string URL
    {
        get
        {
            return m_uri == null ? string.Empty : m_uri.AbsoluteUri;
        }
    }

    /// <summary>
    /// Title of the page (read-only), taken from its title element on first access.
    /// </summary>
    public string Title
    {
        get
        {
            if (m_title == "")
            {
                Match mc = TitleRegex.Match(m_html);
                if (mc.Success)
                {
                    m_title = mc.Groups["title"].Value.Trim();
                }
            }
            return m_title;
        }
    }

    /// <summary>
    /// All links found on the page (read-only).
    /// </summary>
    public List<Link> Links
    {
        get
        {
            return getLinks();
        }
    }

    /// <summary>
    /// The whole plain text of the page, link text included (read-only).
    /// </summary>
    public string Context
    {
        get
        {
            if (m_outstr == "")
            {
                getContext(Int16.MaxValue);
            }
            return m_outstr;
        }
    }

    /// <summary>
    /// Size of the page, as the number of characters of its HTML.
    /// </summary>
    public int PageSize
    {
        get
        {
            return m_pagesize;
        }
    }

    /// <summary>
    /// All links of the page that point to the page's own host, over http or https.
    /// </summary>
    public List<Link> InsiteLinks
    {
        get
        {
            if (m_uri == null)
            {
                return new List<Link>();
            }
            // The host is escaped so its dots match literally, and the look-ahead
            // stops "example.com" from matching "example.com.other.org".
            string pattern = "^https?://" + Regex.Escape(m_uri.Host) + "(?=[:/?#]|$)";
            return getSpecialLinksByUrl(pattern, Int32.MaxValue);
        }
    }

    /// <summary>
    /// Whether the page could be downloaded.
    /// </summary>
    public bool IsGood
    {
        get
        {
            return m_good;
        }
    }

    /// <summary>
    /// Host of the page; empty when the address could not be parsed.
    /// </summary>
    public string Host
    {
        get
        {
            return m_uri == null ? string.Empty : m_uri.Host;
        }
    }

    /// <summary>
    /// POST data sent to the login page of this page.
    /// </summary>
    public string PostStr
    {
        get
        {
            return m_post;
        }
    }

    /// <summary>
    /// Address of the login page of this page.
    /// </summary>
    public string LoginURL
    {
        get
        {
            return m_loginurl;
        }
    }

    #endregion
}
/// <summary>
/// A link: its address together with its text.
/// </summary>
public class Link
{
    /// <summary>Address of the link.</summary>
    public string url;

    /// <summary>Text of the link.</summary>
    public string text;

    /// <summary>
    /// Creates a link from an address and a text; both values are stored as given.
    /// </summary>
    /// <param name="_url">Address of the link.</param>
    /// <param name="_text">Text of the link.</param>
    public Link(string _url, string _text)
    {
        this.url = _url;
        this.text = _text;
    }
}
希望本文所述对大家的C#程序设计有所帮助。