C#自写的一个HTML解析类(类似XElement语法)
功能:
1、轻松获取指定的HTML元素。
2、可以根据属性标签进行筛选
3、返回的都是List强类型,无需转换
用过XElement的都知道用来解析XML非常的方便,但是对于HTML的格式多样化实在是没办法兼容。
所以我就写了这么一个类似XElement的XHTMLElement
用法:
// Usage sample: runs inside an ASP.NET page (Server / Response) and relies on the
// project's FileHelper to read the file into a string.
string filePath = Server.MapPath("~/file/test.htm");

// Load the HTML source.
string mailBody = FileHelper.FileToString(filePath);
XHtmlElement xh = new XHtmlElement(mailBody);

// <a> elements directly under <body> whose class attribute equals "icon".
var link = xh.Descendants("body")
             .ChildDescendants("a")
             .Where(c => c.Attributes.Any(a => a.Key == "class" && a.Value == "icon"))
             .ToList();

// <a> elements that carry an href attribute.
var links = xh.Descendants("a")
              .Where(c => c.Attributes.Any(a => a.Key == "href"))
              .ToList();
foreach (var r in links)
{
    // Write out the href value; the Where above guarantees the key exists.
    Response.Write(r.Attributes["href"]);
}

// The nearest <img> elements (all matches found at the first level that contains one).
var img = xh.Descendants("img");

// The nearest <p> element together with the other <p> elements on the same level.
var ps = xh.Descendants("p");
代码:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Text;
using System.Text.RegularExpressions;

namespace SyntacticSugar
{
    /// <summary>
    /// Regex-based HTML reader with an XElement-like query surface.
    /// Created 2015-4-23 by sunkaixuan (qq: 610262374).
    /// </summary>
    /// <remarks>
    /// This is a tolerant scanner, not a full HTML parser: it does not understand a
    /// '&gt;' character inside an attribute value, and it does not treat script/style
    /// content specially.
    /// </remarks>
    public class XHtmlElement
    {
        // Either a complete comment (which also covers IE conditional comments, since they
        // end with "-->"), or an opening / self-closing tag whose name is captured as "name".
        private static readonly Regex MarkupTokenRegex = new Regex(
            @"<!--.*?-->|<\s*(?<name>[a-zA-Z][a-zA-Z0-9:_\-]*)(?=[\s/>])[^>]*>",
            RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // name="value" or name='value'. The look-behind stops the match from starting in
        // the middle of a longer name (so data-id is not reported as "id").
        private static readonly Regex AttributeRegex = new Regex(
            @"(?<![\w:.\-])(?<name>[a-zA-Z_:][\w:.\-]*)\s*=\s*(?:""(?<value>[^""]*)""|'(?<value>[^']*)')",
            RegexOptions.None);

        private readonly string _html;

        /// <summary>
        /// Wraps an HTML string for querying. The string is not parsed until a query runs.
        /// </summary>
        /// <param name="html">HTML source text.</param>
        public XHtmlElement(string html)
        {
            _html = html;
        }

        /// <summary>
        /// Returns the nearest elements with the given tag name: the matches among the
        /// top-level elements, or, when there are none, the matches on the first deeper
        /// level (searched depth-first, in document order) that contains at least one.
        /// </summary>
        /// <param name="elementName">Tag name, compared case-insensitively; null matches every element.</param>
        /// <returns>The matching elements; an empty list (never null) when nothing matches.</returns>
        /// <exception cref="ArgumentNullException">The instance was created with a null HTML string.</exception>
        public List<HtmlInfo> Descendants(string elementName = null)
        {
            if (_html == null)
            {
                throw new ArgumentNullException("html", "html不能这空!");
            }

            List<HtmlInfo> rootElements = RootDescendants(_html);
            List<HtmlInfo> matches = rootElements.Where(c => IsNamed(c, elementName)).ToList();
            if (matches.Count == 0)
            {
                matches = GetDescendantsSource(rootElements, elementName);
            }

            return matches;
        }

        /// <summary>
        /// Returns the top-level elements of <paramref name="html"/> in document order.
        /// </summary>
        /// <remarks>
        /// Comments are skipped, and so is any tag inside them. A tag that is never closed
        /// is returned as a lone tag with an empty <see cref="HtmlInfo.InnerHtml"/>, and
        /// scanning resumes right after it. Text outside of elements is ignored.
        /// </remarks>
        /// <param name="html">HTML to scan; null means the HTML given to the constructor.</param>
        /// <returns>One <see cref="HtmlInfo"/> per top-level element; empty when there is none.</returns>
        /// <exception cref="ArgumentNullException">Both <paramref name="html"/> and the constructor HTML are null.</exception>
        public List<HtmlInfo> RootDescendants(string html = null)
        {
            if (html == null)
            {
                html = _html;
            }

            if (html == null)
            {
                throw new ArgumentNullException("html", "html不能这空!");
            }

            var elements = new List<HtmlInfo>();
            Match token = MarkupTokenRegex.Match(html);
            while (token.Success)
            {
                int resumeAt = token.Index + token.Length;
                Group nameGroup = token.Groups["name"];

                // The name group is only set for real tags; comments fall through and are skipped.
                if (nameGroup.Success)
                {
                    var element = new HtmlInfo();
                    element.TagName = nameGroup.Value;
                    element.SameLeveHtml = html;
                    element.Attributes = ParseAttributes(token.Value);
                    element.InnerHtml = string.Empty;
                    element.OldFullHtml = token.Value;

                    if (!IsSelfClosing(token.Value))
                    {
                        Match closingTag = FindClosingTag(html, nameGroup.Value, resumeAt);
                        if (closingTag != null)
                        {
                            element.InnerHtml = html.Substring(resumeAt, closingTag.Index - resumeAt);
                            resumeAt = closingTag.Index + closingTag.Length;
                            element.OldFullHtml = html.Substring(token.Index, resumeAt - token.Index);
                        }
                    }

                    elements.Add(element);
                }

                // Continue after the consumed span so identical siblings are all reported.
                token = MarkupTokenRegex.Match(html, resumeAt);
            }

            return elements;
        }

        /// <summary>
        /// Case-insensitive (ordinal) tag-name test shared by every query method.
        /// </summary>
        /// <param name="element">Element to test.</param>
        /// <param name="elementName">Wanted tag name; null matches everything.</param>
        /// <returns>True when the element should be included.</returns>
        internal static bool IsNamed(HtmlInfo element, string elementName)
        {
            return elementName == null
                || string.Equals(element.TagName, elementName, StringComparison.OrdinalIgnoreCase);
        }

        #region private

        /// <summary>
        /// Depth-first search below <paramref name="allList"/> for the first level that
        /// contains a matching element; returns the matches found on that level.
        /// </summary>
        private List<HtmlInfo> GetDescendantsSource(List<HtmlInfo> allList, string elementName)
        {
            foreach (HtmlInfo parent in allList)
            {
                if (string.IsNullOrEmpty(parent.InnerHtml) || !parent.InnerHtml.Contains("<"))
                {
                    continue;
                }

                // Parse the children once and reuse them for both the filter and the recursion.
                List<HtmlInfo> children = RootDescendants(parent.InnerHtml);
                List<HtmlInfo> matches = children.Where(c => IsNamed(c, elementName)).ToList();
                if (matches.Count == 0)
                {
                    matches = GetDescendantsSource(children, elementName);
                }

                if (matches.Count > 0)
                {
                    return matches;
                }
            }

            return new List<HtmlInfo>();
        }

        /// <summary>
        /// Finds the closing tag that balances an already-consumed opening tag, counting
        /// nested tags of the same name. Returns null when the tag is never closed.
        /// </summary>
        private static Match FindClosingTag(string html, string tagName, int startAt)
        {
            // Comments are matched as a whole so that tags inside them do not affect the depth.
            var sameNameTag = new Regex(
                @"<!--.*?-->|<\s*(?<close>/)?\s*" + Regex.Escape(tagName) + @"(?=[\s/>])[^>]*>",
                RegexOptions.IgnoreCase | RegexOptions.Singleline);

            int depth = 1;
            for (Match tag = sameNameTag.Match(html, startAt); tag.Success; tag = tag.NextMatch())
            {
                if (tag.Value.StartsWith("<!--", StringComparison.Ordinal))
                {
                    continue;
                }

                if (tag.Groups["close"].Success)
                {
                    depth--;
                    if (depth == 0)
                    {
                        return tag;
                    }
                }
                else if (!IsSelfClosing(tag.Value))
                {
                    depth++;
                }
            }

            return null;
        }

        /// <summary>
        /// True when the tag text ends with "/&gt;".
        /// </summary>
        private static bool IsSelfClosing(string tag)
        {
            return tag.EndsWith("/>", StringComparison.Ordinal);
        }

        /// <summary>
        /// Reads the quoted attributes of an opening tag. Values are taken verbatim, so a
        /// value may contain '='. When a name repeats, the first occurrence is kept.
        /// </summary>
        private static Dictionary<string, string> ParseAttributes(string openTag)
        {
            var attributes = new Dictionary<string, string>();
            foreach (Match attribute in AttributeRegex.Matches(openTag))
            {
                string name = attribute.Groups["name"].Value;
                if (!attributes.ContainsKey(name))
                {
                    attributes.Add(name, attribute.Groups["value"].Value);
                }
            }

            return attributes;
        }

        #endregion
    }

    /// <summary>
    /// Query extensions that continue from the first element of a previous result.
    /// </summary>
    public static class XHtmlElementExtendsion
    {
        /// <summary>
        /// Runs <see cref="XHtmlElement.Descendants"/> on the inner HTML of the first
        /// element of <paramref name="htmlInfoList"/>.
        /// </summary>
        /// <param name="htmlInfoList">Previous result; only its first element is used.</param>
        /// <param name="elementName">Tag name, compared case-insensitively; null matches every element.</param>
        /// <returns>The matching elements; empty when the input is empty or nothing matches.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="htmlInfoList"/> is null.</exception>
        public static List<HtmlInfo> Descendants(this IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
        {
            string html = FirstInnerHtml(htmlInfoList);
            if (html == null)
            {
                return new List<HtmlInfo>();
            }

            return new XHtmlElement(html).Descendants(elementName);
        }

        /// <summary>
        /// Returns the direct children of the first element of <paramref name="htmlInfoList"/>.
        /// </summary>
        /// <param name="htmlInfoList">Previous result; only its first element is used.</param>
        /// <param name="elementName">Tag name, compared case-insensitively; null matches every child.</param>
        /// <returns>The matching children; empty when the input is empty or nothing matches.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="htmlInfoList"/> is null.</exception>
        public static List<HtmlInfo> ChildDescendants(this IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
        {
            string html = FirstInnerHtml(htmlInfoList);
            if (html == null)
            {
                return new List<HtmlInfo>();
            }

            return new XHtmlElement(html)
                .RootDescendants(html)
                .Where(c => XHtmlElement.IsNamed(c, elementName))
                .ToList();
        }

        /// <summary>
        /// Locates, inside <paramref name="fullHtml"/>, the element that encloses the
        /// sibling-level HTML of the first element of <paramref name="htmlInfoList"/>.
        /// </summary>
        /// <param name="htmlInfoList">Previous result; only its first element is used.</param>
        /// <param name="fullHtml">The complete document the elements were read from.</param>
        /// <returns>The enclosing element(s); empty when the input is empty or no parent is found.</returns>
        /// <exception cref="ArgumentNullException">An argument is null.</exception>
        public static List<HtmlInfo> ParentDescendant(this IEnumerable<HtmlInfo> htmlInfoList, string fullHtml)
        {
            if (htmlInfoList == null)
            {
                throw new ArgumentNullException("htmlInfoList");
            }

            if (fullHtml == null)
            {
                throw new ArgumentNullException("fullHtml");
            }

            HtmlInfo first = htmlInfoList.FirstOrDefault();

            // string.Replace rejects an empty search string, so there is nothing to locate.
            if (first == null || string.IsNullOrEmpty(first.SameLeveHtml))
            {
                return new List<HtmlInfo>();
            }

            string sameLevelHtml = first.SameLeveHtml;

            // Swap the sibling-level HTML for a unique marker so that the regex only has to
            // find "<open tag> text marker ... </close tag>".
            string marker = Guid.NewGuid().ToString();
            string markedHtml = fullHtml.Replace(sameLevelHtml, marker);
            string parentHtml = Regex.Match(
                markedHtml,
                @"<[^<]+?>[^<]*?" + Regex.Escape(marker) + @".*?<\/.+?>").Value;
            parentHtml = parentHtml.Replace(marker, sameLevelHtml);

            return new XHtmlElement(parentHtml).RootDescendants();
        }

        /// <summary>
        /// Inner HTML of the first element, or null when the sequence is empty.
        /// </summary>
        private static string FirstInnerHtml(IEnumerable<HtmlInfo> htmlInfoList)
        {
            if (htmlInfoList == null)
            {
                throw new ArgumentNullException("htmlInfoList");
            }

            HtmlInfo first = htmlInfoList.FirstOrDefault();
            if (first == null)
            {
                return null;
            }

            return first.InnerHtml ?? string.Empty;
        }
    }

    /// <summary>
    /// Describes one HTML element found by <see cref="XHtmlElement"/>.
    /// </summary>
    public class HtmlInfo
    {
        /// <summary>
        /// Tag name as written in the source.
        /// </summary>
        public string TagName { get; set; }

        /// <summary>
        /// Quoted attributes of the opening tag, keyed by attribute name.
        /// </summary>
        public Dictionary<string, string> Attributes { get; set; }

        /// <summary>
        /// Markup between the opening and closing tag.
        /// </summary>
        public string InnerHtml { get; set; }

        /// <summary>
        /// The element exactly as it appeared in the source.
        /// </summary>
        public string OldFullHtml { get; set; }

        /// <summary>
        /// The HTML fragment this element was read from (the element and its siblings).
        /// </summary>
        public string SameLeveHtml { get; set; }

        /// <summary>
        /// Rebuilds the element from <see cref="TagName"/>, <see cref="Attributes"/> and
        /// <see cref="InnerHtml"/>, e.g. &lt;a id="1"&gt;text&lt;/a&gt;.
        /// </summary>
        public string FullHtml
        {
            get
            {
                string attributesString = string.Empty;
                if (Attributes != null && Attributes.Count > 0)
                {
                    // Leading space separates the tag name from the first attribute.
                    attributesString = " " + string.Join(
                        " ",
                        Attributes.Select(c => string.Format("{0}=\"{1}\"", c.Key, c.Value)));
                }

                return string.Format("<{0}{2}>{1}</{0}>", TagName, InnerHtml, attributesString);
            }
        }
    }
}
前台HTML:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
    <title></title>
</head>
<body>
    <!-- Sample markup queried by the usage code: two links (one with class "icon") and an image. -->
    <a id="1">我是1</a>
    <a id="2" class="icon">icon</a>
    <img />
</body>
</html>