java代理实现爬取代理IP的示例
仅仅使用了一个java文件,运行main方法即可,需要依赖的jar包是com.alibaba.fastjson(版本1.2.28)和Jsoup(版本1.10.2)
如果用了pom,那么就是以下两个:
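<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.28</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>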
完整的代码如下:
packagecom.tuniu.fcm.facade.IPProxy; importcom.alibaba.fastjson.JSONObject; importorg.jsoup.Jsoup; importorg.jsoup.nodes.Document; importjava.util.ArrayList; importjava.util.HashMap; importjava.util.List; importjava.util.Map; importjava.util.regex.Matcher; importjava.util.regex.Pattern; /** *获取代理IP,需要 *com.alibaba.fastjson.JSONObject以及Jsoup */ publicclassProxyCralwerUnusedVPN{ ThreadLocallocalWantedNumber=newThreadLocal (); ThreadLocal >localProxyInfos=newThreadLocal
>(); publicstaticvoidmain(String[]args){ ProxyCralwerUnusedVPNproxyCrawler=newProxyCralwerUnusedVPN(); /** *想要获取的代理IP个数,由需求方自行指定。(如果个数太多,将导致返回变慢) */ proxyCrawler.startCrawler(1); } /** *暴露给外部模块调用的入口 *@paramwantedNumber调用方期望获取到的代理IP个数 */ publicStringstartCrawler(intwantedNumber){ localWantedNumber.set(wantedNumber); kuaidailiCom("http://www.xicidaili.com/nn/",15); kuaidailiCom("http://www.xicidaili.com/nt/",15); kuaidailiCom("http://www.xicidaili.com/wt/",15); kuaidailiCom("http://www.kuaidaili.com/free/inha/",15); kuaidailiCom("http://www.kuaidaili.com/free/intr/",15); kuaidailiCom("http://www.kuaidaili.com/free/outtr/",15); /** *构造返回数据 */ ProxyResponseresponse=newProxyResponse(); response.setSuccess("true"); Map
dataInfoMap=newHashMap (); dataInfoMap.put("numFound",localProxyInfos.get().size()); dataInfoMap.put("pageNum",1); dataInfoMap.put("proxy",localProxyInfos.get()); response.setData(dataInfoMap); StringresponseString=JSONObject.toJSON(response).toString(); System.out.println(responseString); returnresponseString; } privatevoidkuaidailiCom(StringbaseUrl,inttotalPage){ StringipReg="\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\d{1,6}"; PatternipPtn=Pattern.compile(ipReg); for(inti=1;i =localWantedNumber.get()){ return; } try{ Documentdoc=Jsoup.connect(baseUrl+i+"/") .header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") .header("Accept-Encoding","gzip,deflate,sdch") .header("Accept-Language","zh-CN,zh;q=0.8,en;q=0.6") .header("Cache-Control","max-age=0") .header("User-Agent","Mozilla/5.0(Macintosh;IntelMacOSX10_11_4)AppleWebKit/537.36(KHTML,likeGecko)Chrome/51.0.2704.103Safari/537.36") .header("Cookie","Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244;_gat=1;_ga=GA1.2.1061361785.1462812244") .header("Host","www.kuaidaili.com") .header("Referer","http://www.kuaidaili.com/free/outha/") .timeout(30*1000) .get(); Matcherm=ipPtn.matcher(doc.text()); while(m.find()){ if(getCurrentProxyNumber()>=localWantedNumber.get()){ break; } String[]strs=m.group().split(""); if(checkProxy(strs[0],Integer.parseInt(strs[1]))){ System.out.println("获取到可用代理IP\t"+strs[0]+"\t"+strs[1]); addProxy(strs[0],strs[1],"http"); } } }catch(Exceptione){ e.printStackTrace(); } } } privatestaticbooleancheckProxy(Stringip,Integerport){ try{ //http://1212.ip138.com/ic.asp可以换成任何比较快的网页 Jsoup.connect("http://1212.ip138.com/ic.asp") .timeout(2*1000) .proxy(ip,port) .get(); returntrue; }catch(Exceptione){ returnfalse; } } privateintgetCurrentProxyNumber(){ List proxyInfos=localProxyInfos.get(); if(proxyInfos==null){ proxyInfos=newArrayList (); localProxyInfos.set(proxyInfos); return0; } else{ returnproxyInfos.size(); } } 
privatevoidaddProxy(Stringip,Stringport,Stringprotocol){ List proxyInfos=localProxyInfos.get(); if(proxyInfos==null){ proxyInfos=newArrayList (); proxyInfos.add(newProxyInfo(ip,port,protocol)); } else{ proxyInfos.add(newProxyInfo(ip,port,protocol)); } } } classProxyInfo{ privateStringuserName=""; privateStringip; privateStringpassword=""; privateStringtype; privateStringport; privateintis_internet=1; publicProxyInfo(Stringip,Stringport,Stringtype){ this.ip=ip; this.type=type; this.port=port; } publicStringgetUserName(){ returnuserName; } publicvoidsetUserName(StringuserName){ this.userName=userName; } publicStringgetIp(){ returnip; } publicvoidsetIp(Stringip){ this.ip=ip; } publicStringgetPassword(){ returnpassword; } publicvoidsetPassword(Stringpassword){ this.password=password; } publicStringgetType(){ returntype; } publicvoidsetType(Stringtype){ this.type=type; } publicStringgetPort(){ returnport; } publicvoidsetPort(Stringport){ this.port=port; } publicintgetIs_internet(){ returnis_internet; } publicvoidsetIs_internet(intis_internet){ this.is_internet=is_internet; } } classProxyResponse{ privateStringsuccess; privateMap data; publicStringgetSuccess(){ returnsuccess; } publicvoidsetSuccess(Stringsuccess){ this.success=success; } publicMap getData(){ returndata; } publicvoidsetData(Map data){ this.data=data; } }
以上这篇java代理实现爬取代理IP的示例就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持毛票票。
声明:本文内容来源于网络,版权归原作者所有,内容由互联网用户自发贡献自行上传,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任。如果您发现有涉嫌版权的内容,欢迎发送邮件至:czq8825#qq.com(发邮件时,请将#更换为@)进行举报,并提供相关证据,一经查实,本站将立刻删除涉嫌侵权内容。