Java精确抽取网页发布时间
对网页中各种不同格式的发布时间进行抽取,并将发布时间以规整的“yyyy-MM-dd HH:mm:ss”格式表示出来。只能尽量追求精确:由于网页发布时间的格式十分灵活,所以做不到百分之百地正确抽取。
packagewhu.extract.pubtime.core;
importjava.util.ArrayList;
importjava.util.Calendar;
importjava.util.Collections;
importjava.util.List;
importjava.util.regex.Matcher;
importjava.util.regex.Pattern;
importwhu.utils.TimeUtil;
/**
 * Best-effort extraction of the publication time of a web page.
 *
 * <p>Created on 2014-03-13 14:49:05.
 *
 * <p>The date is looked up in the page URL first and, failing that, in the page text.
 * Every result is normalised to {@code yyyy-MM-dd HH:mm:ss}; when only a date is known the
 * time part is {@code 00:00:00}. A candidate is accepted only if it is a real calendar date
 * and does not lie after the current day. Publication dates come in many shapes, so the
 * result is a heuristic and may be {@code null} or wrong for unusual pages.
 *
 * <p>The class keeps no mutable static state: patterns are compiled once and the current
 * time is read on every call.
 */
public class FetchPubTime {

    /** Compact 8-digit date in a URL, e.g. {@code http://www.baidu.com/20140311/2356.html}. */
    private static final Pattern URL_COMPACT_DATE =
            Pattern.compile("[-/_](20\\d{2})(\\d{2})(\\d{2})");

    /**
     * Separated year, month and day in a URL, e.g. {@code http://www.baidu.com/2014-3-11/2356.html}.
     * The day must not be followed by another digit; otherwise the leading digits of a numeric
     * id (as in {@code /2014-3/2356.html}) would be mistaken for a day of month.
     */
    private static final Pattern URL_YEAR_MONTH_DAY =
            Pattern.compile("[-/_](20\\d{2})[-/_](\\d{1,2})[-/_](\\d{1,2})(?!\\d)");

    /** Separated year and month only, e.g. {@code http://www.baidu.com/2014-3/2356.html}. */
    private static final Pattern URL_YEAR_MONTH =
            Pattern.compile("[-/_](20\\d{2})[-/_](\\d{1,2})");

    /**
     * Page text: either a full {@code yyyy-M-d H:m:s} timestamp ({@code -} or {@code /} as date
     * separator) or a date written with the CJK year/month/day markers. The markers are spelled
     * as Unicode escapes so the pattern does not depend on the source file encoding.
     */
    private static final Pattern PAGE_DETAILED = Pattern.compile(
            "(20\\d{2})[-/](\\d{1,2})[-/](\\d{1,2})\\s+(\\d{1,2}):(\\d{1,2}):(\\d{1,2})"
                    + "|(20\\d{2})\u5e74(\\d{1,2})\u6708(\\d{1,2})\u65e5");

    /** Page text: a bare {@code yyyy-M-d} or {@code yyyy/M/d} date without a time. */
    private static final Pattern PAGE_DATE_ONLY =
            Pattern.compile("(20\\d{2})[-/](\\d{1,2})[-/](\\d{1,2})");

    /**
     * Returns the publication time taken from the URL, falling back to the page content.
     *
     * @param url        page URL; may be {@code null}
     * @param urlContent page source or text; ignored when {@code null} or blank
     * @return the time as {@code yyyy-MM-dd HH:mm:ss}, or {@code null} if none was found
     */
    public static String getPubTimeVarious(String url, String urlContent) {
        String fromUrl = getPubTimeFromUrl(url);
        if (fromUrl != null) {
            return fromUrl;
        }
        // Nothing usable in the link: fall back to the page text.
        if (urlContent == null || urlContent.trim().isEmpty()) {
            return null;
        }
        return extractPageDate(urlContent);
    }

    /**
     * Extracts the publication date encoded in a URL.
     *
     * <p>Three layouts are tried in order of decreasing precision: a compact 8-digit date,
     * separated year-month-day, and separated year-month (reported as the first of the month).
     *
     * @param url page URL; may be {@code null}
     * @return the date as {@code yyyy-MM-dd 00:00:00}, or {@code null} if the URL holds no
     *         valid date that is not in the future
     */
    public static String getPubTimeFromUrl(String url) {
        if (url == null) {
            return null;
        }
        Calendar now = Calendar.getInstance();
        String found = firstPlausibleTimestamp(URL_COMPACT_DATE, url, now);
        if (found == null) {
            found = firstPlausibleTimestamp(URL_YEAR_MONTH_DAY, url, now);
        }
        if (found == null) {
            found = firstPlausibleTimestamp(URL_YEAR_MONTH, url, now);
        }
        return found;
    }

    /**
     * Extracts the publication time from page source or text.
     *
     * <p>Recognises {@code 2013-12-19 15:48:33}, {@code 2013-12-19}, {@code 2012/3/05} and dates
     * written with the CJK year/month/day markers. Full timestamps and CJK dates take
     * precedence over bare dates; within each kind the first valid, non-future occurrence wins.
     * Based on a snippet published on oschina (Jan 21, 2013).
     *
     * @param text text to search; may be {@code null}
     * @return the time as {@code yyyy-MM-dd HH:mm:ss}, or {@code null} if none was found
     */
    public static String extractPageDate(String text) {
        if (text == null) {
            return null;
        }
        Calendar now = Calendar.getInstance();
        String found = firstPlausibleTimestamp(PAGE_DETAILED, text, now);
        if (found == null) {
            found = firstPlausibleTimestamp(PAGE_DATE_ONLY, text, now);
        }
        return found;
    }

    /** Returns the normalised timestamp of the first match that passes validation, else null. */
    private static String firstPlausibleTimestamp(Pattern pattern, String input, Calendar now) {
        Matcher matcher = pattern.matcher(input);
        while (matcher.find()) {
            String timestamp = toTimestamp(matcher, now);
            if (timestamp != null) {
                return timestamp;
            }
        }
        return null;
    }

    /**
     * Converts the numeric groups of the current match into a normalised timestamp.
     *
     * <p>The groups that took part in the match are read in order as year, month, then
     * optionally day, and optionally hour, minute and second. Groups of an alternative that
     * did not match are {@code null} and are skipped.
     *
     * @return the timestamp, or {@code null} if the fields do not form a real date/time or the
     *         date lies after {@code now}
     */
    private static String toTimestamp(Matcher match, Calendar now) {
        List<Integer> fields = new ArrayList<Integer>(6);
        for (int group = 1; group <= match.groupCount(); group++) {
            String digits = match.group(group);
            if (digits != null) {
                fields.add(Integer.valueOf(digits));
            }
        }

        int year = fields.get(0);
        int month = fields.get(1);
        int day = fields.size() > 2 ? fields.get(2) : 1;
        boolean hasTime = fields.size() == 6;
        int hour = hasTime ? fields.get(3) : 0;
        int minute = hasTime ? fields.get(4) : 0;
        int second = hasTime ? fields.get(5) : 0;

        if (month < 1 || month > 12 || day < 1 || hour > 23 || minute > 59 || second > 59) {
            return null;
        }

        // Calendar.MONTH is zero-based, hence month - 1. Start on the 1st so that the month
        // length can be queried before the (possibly invalid) day is applied.
        Calendar date = Calendar.getInstance();
        date.clear();
        date.set(year, month - 1, 1);
        if (day > date.getActualMaximum(Calendar.DAY_OF_MONTH)) {
            return null;
        }
        date.set(Calendar.DAY_OF_MONTH, day);

        // Only the date (at midnight) is compared, so a timestamp from later today is accepted.
        if (date.after(now)) {
            return null;
        }

        return year + "-" + twoDigits(month) + "-" + twoDigits(day)
                + " " + twoDigits(hour) + ":" + twoDigits(minute) + ":" + twoDigits(second);
    }

    /** Left-pads a non-negative value below 100 to two digits. */
    private static String twoDigits(int value) {
        return value < 10 ? "0" + value : String.valueOf(value);
    }
}
以上就是本文的全部内容,希望对大家学习 Java 程序设计有所帮助。