Java如何用正则表达式读取网页内容
学习Java的正则表达式,抓取网页并解析HTML的部分内容
packagecom.xiaofeng.picup;
importjava.io.BufferedReader;
importjava.io.IOException;
importjava.io.InputStreamReader;
importjava.net.MalformedURLException;
importjava.net.URL;
importjava.util.ArrayList;
importjava.util.HashMap;
importjava.util.List;
importjava.util.regex.Matcher;
importjava.util.regex.Pattern;
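/**
 * Fetches the article title and content of a web page (test code).
 * The URL is entered manually; this could be extended to crawl the
 * full content of a page automatically.
 */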
publicclassWebContent...{
/***//**
*读取一个网页全部内容
*/
publicStringgetOneHtml(Stringhtmlurl)throwsIOException...{
URLurl;
Stringtemp;
StringBuffersb=newStringBuffer();
try...{
url=newURL(htmlurl);
BufferedReaderin=newBufferedReader(newInputStreamReader(url
.openStream(),"utf-8"));//读取网页全部内容
while((temp=in.readLine())!=null)...{
sb.append(temp);
}
in.close();
}catch(MalformedURLExceptionme)...{
System.out.println("你输入的URL格式有问题!请仔细输入");
me.getMessage();
throwme;
}catch(IOExceptione)...{
e.printStackTrace();
throwe;
}
returnsb.toString();
}
/***//**
*
*@params
*@return获得网页标题
*/
publicStringgetTitle(Strings)...{
Stringregex;
Stringtitle="";
List<String>list=newArrayList<String>();
regex="<title>.*?</title>";
Patternpa=Pattern.compile(regex,Pattern.CANON_EQ);
Matcherma=pa.matcher(s);
while(ma.find())...{
list.add(ma.group());
}
for(inti=0;i<list.size();i++)...{
title=title+list.get(i);
}
returnoutTag(title);
}
/***//**
*
*@params
*@return获得链接
*/
publicList<String>getLink(Strings)...{
Stringregex;
List<String>list=newArrayList<String>();
regex="<a[^>]*href=("([^"]*)"|'([^']*)'|([^s>]*))[^>]*>(.*?)</a>";
Patternpa=Pattern.compile(regex,Pattern.DOTALL);
Matcherma=pa.matcher(s);
while(ma.find())...{
list.add(ma.group());
}
returnlist;
}
/***//**
*
*@params
*@return获得脚本代码
*/
publicList<String>getScript(Strings)...{
Stringregex;
List<String>list=newArrayList<String>();
regex="<script.*?</script>";
Patternpa=Pattern.compile(regex,Pattern.DOTALL);
Matcherma=pa.matcher(s);
while(ma.find())...{
list.add(ma.group());
}
returnlist;
}
/***//**
*
*@params
*@return获得CSS
*/
publicList<String>getCSS(Strings)...{
Stringregex;
List<String>list=newArrayList<String>();
regex="<style.*?</style>";
Patternpa=Pattern.compile(regex,Pattern.DOTALL);
Matcherma=pa.matcher(s);
while(ma.find())...{
list.add(ma.group());
}
returnlist;
}
/***//**
*
*@params
*@return去掉标记
*/
publicStringoutTag(Strings)...{
returns.replaceAll("<.*?>","");
}