C#自写的一个HTML解析类(类似XElement语法)
功能:
1、轻松获取指定的HTML元素。
2、可以根据属性标签进行筛选
3、返回的都是List强类型,无需转换
用过XElement的都知道用来解析XML非常的方便,但是对于HTML的格式多样化实在是没办法兼容。
所以我就写了这么一个类似XElement的XHtmlElement
用法:
// Usage example (runs inside an ASP.NET page: Server and Response come from the page,
// FileHelper is a project helper that reads a file into a string).
string filePath = Server.MapPath("~/file/test.htm");
// Load the HTML source.
string mailBody = FileHelper.FileToString(filePath);
XHtmlElement xh = new XHtmlElement(mailBody);

// <a> elements that are direct children of <body> and carry class="icon".
var link = xh.Descendants("body").ChildDescendants("a").Where(c => c.Attributes.Any(a => a.Key == "class" && a.Value == "icon")).ToList();

// <a> elements that have an href attribute.
var links = xh.Descendants("a").Where(c => c.Attributes.Any(a => a.Key == "href")).ToList();
foreach (var r in links)
{
    Response.Write(r.Attributes.Single(c => c.Key == "href").Value); // write out the href value
}

// The nearest <img> elements: Descendants returns a list holding every <img>
// on the first level where one is found, not a single element.
var img = xh.Descendants("img");

// The nearest <p> element together with the other <p> elements on the same level.
var ps = xh.Descendants("p");
代码:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
namespaceSyntacticSugar
{
/// <summary>
/// Regex-based HTML parser exposing an XElement-like query API.
/// Created: 2015-4-23.
/// Author: sunkaixuan (QQ: 610262374); suggestions on naming and syntax are welcome.
/// </summary>
public class XHtmlElement
{
    /// <summary>
    /// Options shared by all tag-matching expressions: HTML tag names are
    /// case-insensitive and a tag may span several lines.
    /// </summary>
    private const RegexOptions TagOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    /// <summary>Start of an IE conditional comment, e.g. <c>&lt;!--[if IE]&gt;</c>.</summary>
    private const string ConditionalCommentBeginReg = @"\s{0,10}\<\!\-\-\[if";

    /// <summary>End of an IE conditional comment, e.g. <c>&lt;![endif]--&gt;</c>.</summary>
    private const string ConditionalCommentEndReg = @"\[endif\]\-\-\>";

    private readonly string _html;

    /// <summary>
    /// Creates a parser over the given HTML text. The text is not parsed until a query method is called.
    /// </summary>
    /// <param name="html">HTML source to query.</param>
    public XHtmlElement(string html)
    {
        _html = html;
    }

    /// <summary>
    /// Gets the nearest elements with the given tag name that share one nesting level:
    /// the matching root-level elements if there are any, otherwise the matches of the
    /// first nested level (searched depth-first) that contains at least one match.
    /// </summary>
    /// <param name="elementName">Tag name to look for (case-insensitive); null matches every element.</param>
    /// <returns>The matching elements; an empty list (never null) when nothing matches.</returns>
    /// <exception cref="ArgumentNullException">The parser was constructed with a null HTML string.</exception>
    /// <exception cref="FormatException">The HTML could not be split into elements.</exception>
    public List<HtmlInfo> Descendants(string elementName = null)
    {
        if (_html == null)
        {
            throw new ArgumentNullException("html", "html不能为空!");
        }

        List<HtmlInfo> rootElements = RootDescendants(_html);
        List<HtmlInfo> matches = FilterByTagName(rootElements, elementName);
        if (matches.Count > 0)
        {
            return matches;
        }

        return GetDescendantsSource(rootElements, elementName);
    }

    /// <summary>
    /// Gets the first-level (root) elements of an HTML fragment.
    /// </summary>
    /// <param name="html">Fragment to parse; null means the HTML passed to the constructor.</param>
    /// <returns>One <see cref="HtmlInfo"/> per root element, in the order they were extracted.</returns>
    /// <exception cref="ArgumentNullException">Both <paramref name="html"/> and the constructor HTML are null.</exception>
    /// <exception cref="FormatException">The HTML could not be split into elements.</exception>
    public List<HtmlInfo> RootDescendants(string html = null)
    {
        /*
         * Approach:
         * 1. Take the first tag and scan for its closing tag; every nested tag of the
         *    same name met on the way requires one more closing tag.
         * 2. Once that element is extracted, repeat for the 2nd ... Nth element.
         */
        if (html == null)
        {
            html = _html;
        }

        if (html == null)
        {
            throw new ArgumentNullException("html", "html不能为空!");
        }

        List<string> eleList = new List<string>();
        GetElementsStringList(html, eleList);

        List<HtmlInfo> reval = new List<HtmlInfo>(eleList.Count);
        foreach (string element in eleList)
        {
            HtmlInfo data = new HtmlInfo();
            data.OldFullHtml = element;
            data.SameLeveHtml = html;
            data.TagName = ExtractTagName(element);
            // Everything between the first '>' and the last '<' of the element.
            data.InnerHtml = Regex.Match(element, @"(?<=\>).+(?=<)", RegexOptions.Singleline).Value;
            // Attributes are read from the opening tag only.
            data.Attributes = ParseAttributes(Regex.Match(element, "<.+?>", RegexOptions.Singleline).Value);
            reval.Add(data);
        }

        return reval;
    }

    #region private

    /// <summary>Keeps the elements whose tag name equals <paramref name="elementName"/> (ordinal, ignoring case); null keeps all.</summary>
    private static List<HtmlInfo> FilterByTagName(IEnumerable<HtmlInfo> elements, string elementName)
    {
        return elements
            .Where(c => elementName == null || string.Equals(c.TagName, elementName, StringComparison.OrdinalIgnoreCase))
            .ToList();
    }

    /// <summary>
    /// Reads the tag name at the start of an element string. Letters followed by digits are
    /// accepted so that h1-h6 work, and a '/' may follow directly so that &lt;img/&gt; works.
    /// </summary>
    private static string ExtractTagName(string elementHtml)
    {
        Match anchored = Regex.Match(elementHtml, @"^\s*<\s*([A-Za-z][A-Za-z0-9]*)(?=[\s/>])");
        if (anchored.Success)
        {
            return anchored.Groups[1].Value;
        }

        // Fallback for blocks that do not start with a normal tag (conditional comments):
        // take the first tag-like word anywhere in the block.
        return Regex.Match(elementHtml, @"(?<=\s{1}|\<)[a-zA-Z]+(?=\>|\s)", RegexOptions.IgnoreCase).Value;
    }

    /// <summary>
    /// Extracts name="value" / name='value' pairs from an opening tag. The value is taken from
    /// the quotes themselves, so it may contain '=' or be empty. When an attribute name is
    /// repeated the first occurrence wins.
    /// </summary>
    private static Dictionary<string, string> ParseAttributes(string beginTag)
    {
        Dictionary<string, string> attributes = new Dictionary<string, string>();
        MatchCollection pairs = Regex.Matches(
            beginTag,
            @"(?<name>[A-Za-z_:][\w:.\-]*)\s*=\s*(?:""(?<value>[^""]*)""|'(?<value>[^']*)')");
        foreach (Match pair in pairs)
        {
            string name = pair.Groups["name"].Value;
            if (!attributes.ContainsKey(name))
            {
                attributes.Add(name, pair.Groups["value"].Value);
            }
        }

        return attributes;
    }

    /// <summary>
    /// Depth-first search below <paramref name="allList"/> for the first nesting level that
    /// contains elements named <paramref name="elementName"/>.
    /// </summary>
    /// <returns>The matches of that level, or an empty list when none exist.</returns>
    private List<HtmlInfo> GetDescendantsSource(List<HtmlInfo> allList, string elementName)
    {
        foreach (HtmlInfo parent in allList)
        {
            // Nothing to descend into unless the inner HTML contains markup.
            if (string.IsNullOrEmpty(parent.InnerHtml) || !parent.InnerHtml.Contains("<"))
            {
                continue;
            }

            List<HtmlInfo> children = RootDescendants(parent.InnerHtml);
            List<HtmlInfo> matches = FilterByTagName(children, elementName);
            if (matches.Count == 0)
            {
                matches = GetDescendantsSource(children, elementName);
            }

            if (matches.Count > 0)
            {
                return matches;
            }
        }

        return new List<HtmlInfo>();
    }

    /// <summary>
    /// Splits <paramref name="html"/> into its root-level element strings and appends them to
    /// <paramref name="eleList"/>. Text that is not inside an element is ignored.
    /// </summary>
    /// <exception cref="FormatException">A tag was detected but no element text could be extracted for it.</exception>
    private void GetElementsStringList(string html, List<string> eleList)
    {
        while (true)
        {
            // Name of the first opening tag left in the remaining HTML.
            string tagName = Regex.Match(html, @"(?<=\<\s{0,5})[a-zA-Z][a-zA-Z0-9]*(?=[\s/>])").Value;
            if (string.IsNullOrEmpty(tagName))
            {
                return;
            }

            // The look-ahead after the name keeps "<p" from also matching "<pre", "<param", ...
            string currentTagBeginReg = @"<\s{0,10}" + Regex.Escape(tagName) + @"(?=[\s/>]).*?>";
            string currentTagEndReg = @"\<\/" + Regex.Escape(tagName) + @"\s*\>";

            string firstBeginTag = Regex.Match(html, currentTagBeginReg, TagOptions).Value;
            string eleHtml;

            // Case 1: <a/>          self-closing tag
            // Case 2: <a></a>       paired tags
            // Case 3: <a>           no closing tag (void element or malformed HTML)
            // Case 4: <!--[if ...   conditional comment block
            if (firstBeginTag.EndsWith("/>", StringComparison.Ordinal))
            {
                eleHtml = firstBeginTag;
            }
            else if (!Regex.IsMatch(html, currentTagEndReg, TagOptions))
            {
                if (Regex.IsMatch(html, ConditionalCommentBeginReg))
                {
                    eleHtml = GetElementString(html, ConditionalCommentBeginReg, ConditionalCommentEndReg, 1);
                }
                else
                {
                    eleHtml = firstBeginTag;
                }
            }
            else
            {
                eleHtml = GetElementString(html, currentTagBeginReg, currentTagEndReg, 1);
            }

            if (string.IsNullOrEmpty(eleHtml))
            {
                // Unbalanced or otherwise unparsable markup.
                throw new FormatException("SORRY,您的HTML格式不能解析!!!");
            }

            eleList.Add(eleHtml);

            // Remove only the occurrence that was just extracted, so identical siblings
            // (e.g. two equal <li> items) are each returned.
            int start = html.IndexOf(eleHtml, StringComparison.Ordinal);
            html = html.Remove(start, eleHtml.Length);
            html = Regex.Replace(html, @"<\!DOCTYPE.*?>", "", RegexOptions.IgnoreCase);

            if (Regex.IsMatch(html, @"^\s*$"))
            {
                return;
            }
        }
    }

    /// <summary>
    /// Finds the first balanced element: grows the candidate one closing tag at a time, starting
    /// with <paramref name="i"/> closing tags, until it holds as many opening as closing tags.
    /// </summary>
    /// <returns>The balanced element text, or an empty string when the tags never balance.</returns>
    private string GetElementString(string html, string currentTagBeginReg, string currentTagEndReg, int i)
    {
        while (true)
        {
            string candidate = GetRegNextByNum(html, currentTagBeginReg, currentTagEndReg, i);
            if (candidate.Length == 0)
            {
                // No candidate with that many closing tags exists.
                return candidate;
            }

            // Self-closing tags of the same name need no closing tag, so they are not counted.
            int beginCount = Regex.Matches(candidate, currentTagBeginReg, TagOptions)
                .Cast<Match>()
                .Count(m => !m.Value.EndsWith("/>", StringComparison.Ordinal));
            int endCount = Regex.Matches(candidate, currentTagEndReg, TagOptions).Count;
            if (beginCount == endCount)
            {
                return candidate;
            }

            i++;
        }
    }

    /// <summary>
    /// Returns the first match that starts with an opening tag and runs up to and including the
    /// <paramref name="i"/>-th following closing tag; empty when there is no such match.
    /// </summary>
    private string GetRegNextByNum(string val, string currentTagBeginReg, string currentTagEndReg, int i)
    {
        string pattern = currentTagBeginReg + @"((.*?)" + currentTagEndReg + "){" + i + "}";
        return Regex.Match(val, pattern, TagOptions).Value;
    }

    #endregion
}
/// <summary>
/// Query extensions that allow chaining on the lists returned by <see cref="XHtmlElement"/>.
/// Every method works on the FIRST element of the sequence only.
/// </summary>
public static class XHtmlElementExtendsion
{
    /// <summary>
    /// Gets the nearest elements with the given tag name that share one nesting level,
    /// searching inside the inner HTML of the first element of the sequence.
    /// </summary>
    /// <param name="htmlInfoList">Elements to search in; only the first one is used.</param>
    /// <param name="elementName">Tag name to look for; null matches every element.</param>
    /// <returns>The matching elements; an empty list when the sequence or its inner HTML is empty or nothing matches.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="htmlInfoList"/> is null.</exception>
    public static List<HtmlInfo> Descendants(this IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
    {
        string html = FirstInnerHtml(htmlInfoList);
        if (string.IsNullOrEmpty(html))
        {
            return new List<HtmlInfo>();
        }

        XHtmlElement xhe = new XHtmlElement(html);
        // Guard against a null result so that callers can always chain LINQ operators.
        return xhe.Descendants(elementName) ?? new List<HtmlInfo>();
    }

    /// <summary>
    /// Gets the direct children of the first element of the sequence.
    /// </summary>
    /// <param name="htmlInfoList">Elements to search in; only the first one is used.</param>
    /// <param name="elementName">Tag name to keep (case-insensitive, like Descendants); null keeps every child.</param>
    /// <returns>The matching direct children; an empty list when there are none.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="htmlInfoList"/> is null.</exception>
    public static List<HtmlInfo> ChildDescendants(this IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
    {
        string html = FirstInnerHtml(htmlInfoList);
        if (string.IsNullOrEmpty(html))
        {
            return new List<HtmlInfo>();
        }

        XHtmlElement xhe = new XHtmlElement(html);
        return xhe.RootDescendants(html)
            .Where(c => elementName == null || string.Equals(c.TagName, elementName, StringComparison.OrdinalIgnoreCase))
            .ToList();
    }

    /// <summary>
    /// Gets the parent element of the first element of the sequence by locating that element's
    /// sibling-level HTML (<c>SameLeveHtml</c>) inside the complete document.
    /// </summary>
    /// <param name="htmlInfoList">Elements whose parent is wanted; only the first one is used.</param>
    /// <param name="fullHtml">The complete HTML in which the parent is searched.</param>
    /// <returns>The parsed parent element(s); an empty list when no parent can be located.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="htmlInfoList"/> or <paramref name="fullHtml"/> is null.</exception>
    public static List<HtmlInfo> ParentDescendant(this IEnumerable<HtmlInfo> htmlInfoList, string fullHtml)
    {
        if (htmlInfoList == null)
        {
            throw new ArgumentNullException("htmlInfoList");
        }

        if (fullHtml == null)
        {
            throw new ArgumentNullException("fullHtml");
        }

        HtmlInfo first = htmlInfoList.FirstOrDefault();
        if (first == null || string.IsNullOrEmpty(first.SameLeveHtml))
        {
            return new List<HtmlInfo>();
        }

        string sameLevelHtml = first.SameLeveHtml;

        // Swap the sibling-level HTML for a unique, regex-safe placeholder so the enclosing
        // tag pair can be matched without the content interfering with the pattern.
        string placeholder = Guid.NewGuid().ToString();
        string markedHtml = fullHtml.Replace(sameLevelHtml, placeholder);
        string parentHtml = Regex.Match(markedHtml, @"<[^<]+?>[^<]*?" + placeholder + @".*?<\/.+?>").Value;
        if (parentHtml.Length == 0)
        {
            return new List<HtmlInfo>();
        }

        parentHtml = parentHtml.Replace(placeholder, sameLevelHtml);
        XHtmlElement xhe = new XHtmlElement(parentHtml);
        return xhe.RootDescendants();
    }

    /// <summary>Returns the inner HTML of the first element, or null when the sequence is empty.</summary>
    private static string FirstInnerHtml(IEnumerable<HtmlInfo> htmlInfoList)
    {
        if (htmlInfoList == null)
        {
            throw new ArgumentNullException("htmlInfoList");
        }

        HtmlInfo first = htmlInfoList.FirstOrDefault();
        return first == null ? null : first.InnerHtml;
    }
}
/// <summary>
/// Describes one parsed HTML element.
/// </summary>
public class HtmlInfo
{
    /// <summary>
    /// Tag name of the element, e.g. "a" or "div".
    /// </summary>
    public string TagName { get; set; }

    /// <summary>
    /// Attributes of the element, keyed by attribute name.
    /// </summary>
    public Dictionary<string, string> Attributes { get; set; }

    /// <summary>
    /// HTML found between the opening and the closing tag.
    /// </summary>
    public string InnerHtml { get; set; }

    /// <summary>
    /// The element's text exactly as it appeared in the parsed source
    /// (as filled in by XHtmlElement.RootDescendants).
    /// </summary>
    public string OldFullHtml { get; set; }

    /// <summary>
    /// The HTML of the nesting level this element was found on, i.e. the element together
    /// with its siblings (as filled in by XHtmlElement.RootDescendants).
    /// </summary>
    public string SameLeveHtml { get; set; }

    /// <summary>
    /// Rebuilds the element's HTML from <see cref="TagName"/>, <see cref="Attributes"/> and
    /// <see cref="InnerHtml"/>, in the form <c>&lt;tag name="value" ...&gt;inner&lt;/tag&gt;</c>.
    /// </summary>
    /// <returns>The element's HTML; attribute values are written as stored, without escaping.</returns>
    public string FullHtml
    {
        get
        {
            StringBuilder reval = new StringBuilder();
            reval.Append('<').Append(TagName);
            if (Attributes != null)
            {
                foreach (KeyValuePair<string, string> attribute in Attributes)
                {
                    // The leading space separates each attribute from the tag name and from
                    // the previous attribute; without it the output is not valid HTML.
                    reval.Append(' ').Append(attribute.Key).Append("=\"").Append(attribute.Value).Append('"');
                }
            }

            reval.Append('>').Append(InnerHtml).Append("</").Append(TagName).Append('>');
            return reval.ToString();
        }
    }
}
}
前台HTML:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml"> <head> <title></title> </head> <body> <a id="1">我是1</a> <a id="2" class="icon">icon</a> <img /> </body> </html>