Java精确抽取网页发布时间
对网页中各种不同格式的发布时间进行抽取,并将其统一表示为规整的“yyyy-MM-dd HH:mm:ss”格式。由于网页发布时间的格式十分灵活,只能尽量追求精确,无法做到百分之百正确抽取。
package whu.extract.pubtime.core;

import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.GregorianCalendar;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts the publication time of a web page, first from its URL and then
 * from its content, and reports it as a {@code yyyy-MM-dd HH:mm:ss} string.
 *
 * <p>Every candidate is checked to be a real calendar date (month 1-12, day
 * within the month, leap years honoured) whose day is not later than today.
 * Candidates that fail are skipped and the next occurrence is tried.
 *
 * <p>The extraction is heuristic: publication dates are written in too many
 * ways for it to be right every time. All methods return {@code null} when
 * no acceptable date is found.
 *
 * <p>Created on 2014-03-13 14:49:05.
 */
public class FetchPubTime {

    /**
     * Eight consecutive digits after a separator, e.g.
     * {@code http://www.baidu.com/20140311/2356.html}. Trailing digits are
     * allowed on purpose so that 14-digit timestamps are still recognised.
     */
    private static final Pattern URL_WHOLE =
            Pattern.compile("[-/_](20\\d{2})(\\d{2})(\\d{2})");

    /**
     * Year, month and day separated by {@code -}, {@code /} or {@code _}, e.g.
     * {@code http://www.baidu.com/2014-3-11/2356.html}. The day must not be
     * followed by another digit, otherwise the first digits of an article id
     * ({@code /2014-3/2356.html}) would be mistaken for the day.
     */
    private static final Pattern URL_SEP_YMD =
            Pattern.compile("[-/_](20\\d{2})[-/_](\\d{1,2})[-/_](\\d{1,2})(?!\\d)");

    /**
     * Year and month only, e.g. {@code http://www.baidu.com/2014-3/2356.html};
     * the day defaults to the first of the month.
     */
    private static final Pattern URL_SEP_YM =
            Pattern.compile("[-/_](20\\d{2})[-/_](\\d{1,2})(?!\\d)");

    /**
     * A date followed by {@code H:m:s} in page text. Groups 1-3 hold a
     * {@code 2013-12-19} / {@code 2012/3/05} style date, groups 4-6 a
     * {@code 2013年12月19日} style date, groups 7-9 the time of day.
     */
    private static final Pattern PAGE_DATE_TIME = Pattern.compile(
            "(?<!\\d)(?:(20\\d{2})[-/](\\d{1,2})[-/](\\d{1,2})\\s+"
            + "|(20\\d{2})\u5e74(\\d{1,2})\u6708(\\d{1,2})\u65e5\\s*)"
            + "(\\d{1,2}):(\\d{1,2}):(\\d{1,2})(?!\\d)");

    /** A date without a time of day in page text; groups as in {@link #PAGE_DATE_TIME}. */
    private static final Pattern PAGE_DATE_ONLY = Pattern.compile(
            "(?<!\\d)(?:(20\\d{2})[-/](\\d{1,2})[-/](\\d{1,2})(?!\\d)"
            + "|(20\\d{2})\u5e74(\\d{1,2})\u6708(\\d{1,2})\u65e5)");

    /**
     * Returns the publication time of a page, preferring the date embedded in
     * the URL and falling back to the page content.
     *
     * @param url        the page URL; may be {@code null}
     * @param urlContent the page source or text; may be {@code null} or blank
     * @return the time as {@code yyyy-MM-dd HH:mm:ss}, or {@code null} if none was found
     */
    public static String getPubTimeVarious(String url, String urlContent) {
        String pubTime = getPubTimeFromUrl(url);
        // Nothing usable in the link: look at the page text instead.
        if (pubTime == null && urlContent != null && !urlContent.trim().isEmpty()) {
            return extractPageDate(urlContent);
        }
        return pubTime;
    }

    /**
     * Extracts the publication date from a URL. Three layouts are tried in
     * order: eight consecutive digits, separated year-month-day, and separated
     * year-month (day defaults to 01). The time of day is always 00:00:00.
     *
     * @param url the page URL; may be {@code null}
     * @return the date as {@code yyyy-MM-dd 00:00:00}, or {@code null} if the URL
     *         carries no valid, non-future date
     */
    public static String getPubTimeFromUrl(String url) {
        if (url == null) {
            return null;
        }
        List<Pattern> layouts = new ArrayList<Pattern>(3);
        layouts.add(URL_WHOLE);
        layouts.add(URL_SEP_YMD);
        layouts.add(URL_SEP_YM);
        for (Pattern layout : layouts) {
            String pubTime = firstUrlDate(layout, url);
            if (pubTime != null) {
                return pubTime;
            }
        }
        return null;
    }

    /**
     * Extracts the publication time from page source or text. A date that
     * carries a time of day ({@code 2013-12-19 15:48:33},
     * {@code 2013年12月19日 15:58:42}) is preferred; otherwise the first plain
     * date ({@code 2013-12-19}, {@code 2012/3/05}, {@code 2013年12月19日}) is used
     * with the time set to 00:00:00.
     *
     * @param text the text to search; may be {@code null}
     * @return the time as {@code yyyy-MM-dd HH:mm:ss}, or {@code null} if the text
     *         contains no valid, non-future date
     */
    public static String extractPageDate(String text) {
        if (text == null) {
            return null;
        }
        // No line-break stripping is needed: \s in the pattern already spans them.
        String pubTime = firstPageDate(PAGE_DATE_TIME.matcher(text));
        if (pubTime == null) {
            pubTime = firstPageDate(PAGE_DATE_ONLY.matcher(text));
        }
        return pubTime;
    }

    /** Returns the first acceptable date matched by {@code layout} in {@code url}, or null. */
    private static String firstUrlDate(Pattern layout, String url) {
        Matcher m = layout.matcher(url);
        while (m.find()) {
            // The year-month layout has no day group; use the first of the month.
            int day = m.groupCount() >= 3 ? Integer.parseInt(m.group(3)) : 1;
            String pubTime = buildPubTime(
                    Integer.parseInt(m.group(1)), Integer.parseInt(m.group(2)), day, 0, 0, 0);
            if (pubTime != null) {
                return pubTime;
            }
        }
        return null;
    }

    /** Returns the first acceptable date (and time, if captured) found by {@code m}, or null. */
    private static String firstPageDate(Matcher m) {
        while (m.find()) {
            // Only one of the two date alternatives participates in a match.
            int base = m.group(1) != null ? 1 : 4;
            int hour = 0;
            int minute = 0;
            int second = 0;
            if (m.groupCount() >= 9) {
                hour = Integer.parseInt(m.group(7));
                minute = Integer.parseInt(m.group(8));
                second = Integer.parseInt(m.group(9));
            }
            String pubTime = buildPubTime(
                    Integer.parseInt(m.group(base)),
                    Integer.parseInt(m.group(base + 1)),
                    Integer.parseInt(m.group(base + 2)),
                    hour, minute, second);
            if (pubTime != null) {
                return pubTime;
            }
        }
        return null;
    }

    /**
     * Formats the fields as {@code yyyy-MM-dd HH:mm:ss}, or returns null when
     * they do not name a real date and time or the day lies after today.
     * {@code month} is 1-based here; all fields are non-negative by construction.
     */
    private static String buildPubTime(int year, int month, int day,
                                       int hour, int minute, int second) {
        if (month < 1 || month > 12 || day < 1) {
            return null;
        }
        if (hour > 23 || minute > 59 || second > 59) {
            return null;
        }
        // Calendar months are zero-based, hence month - 1.
        Calendar firstOfMonth = new GregorianCalendar(year, month - 1, 1);
        if (day > firstOfMonth.getActualMaximum(Calendar.DAY_OF_MONTH)) {
            return null;
        }
        // "Now" is read on every call so a long-running process never compares
        // against a stale clock. The check is at day granularity.
        Calendar now = new GregorianCalendar();
        int today = now.get(Calendar.YEAR) * 10000
                + (now.get(Calendar.MONTH) + 1) * 100
                + now.get(Calendar.DAY_OF_MONTH);
        if (year * 10000 + month * 100 + day > today) {
            return null;
        }
        StringBuilder out = new StringBuilder(19);
        out.append(year).append('-');
        appendTwoDigits(out, month).append('-');
        appendTwoDigits(out, day).append(' ');
        appendTwoDigits(out, hour).append(':');
        appendTwoDigits(out, minute).append(':');
        appendTwoDigits(out, second);
        return out.toString();
    }

    /** Appends {@code value} (0-99) zero-padded to two digits. */
    private static StringBuilder appendTwoDigits(StringBuilder out, int value) {
        if (value < 10) {
            out.append('0');
        }
        return out.append(value);
    }
}
以上就是本文的全部内容,希望对大家学习java程序设计有所帮助。