java代理实现爬取代理IP的示例
整个示例只有一个 Java 文件,直接运行 main 方法即可。需要依赖的 jar 包是 com.alibaba.fastjson(版本 1.2.28)和 jsoup(版本 1.10.2)。
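如果使用 Maven(pom.xml),对应以下两个依赖:
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.28</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>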
完整的代码如下:
package com.tuniu.fcm.facade.IPProxy;

import com.alibaba.fastjson.JSONObject;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
*获取代理IP,需要
*com.alibaba.fastjson.JSONObject以及Jsoup
*/
publicclassProxyCralwerUnusedVPN{
ThreadLocallocalWantedNumber=newThreadLocal();
ThreadLocal>localProxyInfos=newThreadLocal>();
publicstaticvoidmain(String[]args){
ProxyCralwerUnusedVPNproxyCrawler=newProxyCralwerUnusedVPN();
/**
*想要获取的代理IP个数,由需求方自行指定。(如果个数太多,将导致返回变慢)
*/
proxyCrawler.startCrawler(1);
}
/**
*暴露给外部模块调用的入口
*@paramwantedNumber调用方期望获取到的代理IP个数
*/
publicStringstartCrawler(intwantedNumber){
localWantedNumber.set(wantedNumber);
kuaidailiCom("http://www.xicidaili.com/nn/",15);
kuaidailiCom("http://www.xicidaili.com/nt/",15);
kuaidailiCom("http://www.xicidaili.com/wt/",15);
kuaidailiCom("http://www.kuaidaili.com/free/inha/",15);
kuaidailiCom("http://www.kuaidaili.com/free/intr/",15);
kuaidailiCom("http://www.kuaidaili.com/free/outtr/",15);
/**
*构造返回数据
*/
ProxyResponseresponse=newProxyResponse();
response.setSuccess("true");
MapdataInfoMap=newHashMap();
dataInfoMap.put("numFound",localProxyInfos.get().size());
dataInfoMap.put("pageNum",1);
dataInfoMap.put("proxy",localProxyInfos.get());
response.setData(dataInfoMap);
StringresponseString=JSONObject.toJSON(response).toString();
System.out.println(responseString);
returnresponseString;
}
privatevoidkuaidailiCom(StringbaseUrl,inttotalPage){
StringipReg="\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\d{1,6}";
PatternipPtn=Pattern.compile(ipReg);
for(inti=1;i=localWantedNumber.get()){
return;
}
try{
Documentdoc=Jsoup.connect(baseUrl+i+"/")
.header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
.header("Accept-Encoding","gzip,deflate,sdch")
.header("Accept-Language","zh-CN,zh;q=0.8,en;q=0.6")
.header("Cache-Control","max-age=0")
.header("User-Agent","Mozilla/5.0(Macintosh;IntelMacOSX10_11_4)AppleWebKit/537.36(KHTML,likeGecko)Chrome/51.0.2704.103Safari/537.36")
.header("Cookie","Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244;_gat=1;_ga=GA1.2.1061361785.1462812244")
.header("Host","www.kuaidaili.com")
.header("Referer","http://www.kuaidaili.com/free/outha/")
.timeout(30*1000)
.get();
Matcherm=ipPtn.matcher(doc.text());
while(m.find()){
if(getCurrentProxyNumber()>=localWantedNumber.get()){
break;
}
String[]strs=m.group().split("");
if(checkProxy(strs[0],Integer.parseInt(strs[1]))){
System.out.println("获取到可用代理IP\t"+strs[0]+"\t"+strs[1]);
addProxy(strs[0],strs[1],"http");
}
}
}catch(Exceptione){
e.printStackTrace();
}
}
}
privatestaticbooleancheckProxy(Stringip,Integerport){
try{
//http://1212.ip138.com/ic.asp可以换成任何比较快的网页
Jsoup.connect("http://1212.ip138.com/ic.asp")
.timeout(2*1000)
.proxy(ip,port)
.get();
returntrue;
}catch(Exceptione){
returnfalse;
}
}
privateintgetCurrentProxyNumber(){
ListproxyInfos=localProxyInfos.get();
if(proxyInfos==null){
proxyInfos=newArrayList();
localProxyInfos.set(proxyInfos);
return0;
}
else{
returnproxyInfos.size();
}
}
privatevoidaddProxy(Stringip,Stringport,Stringprotocol){
ListproxyInfos=localProxyInfos.get();
if(proxyInfos==null){
proxyInfos=newArrayList();
proxyInfos.add(newProxyInfo(ip,port,protocol));
}
else{
proxyInfos.add(newProxyInfo(ip,port,protocol));
}
}
}
/**
 * Description of one proxy server as it appears in the crawler's response.
 *
 * <p>Accessor names (including {@code getIs_internet}) are kept exactly as they
 * are because bean-style serializers derive property names from them.
 */
class ProxyInfo {

    /** User name for the proxy; empty by default. */
    private String userName = "";

    /** Proxy host address. */
    private String ip;

    /** Password for the proxy; empty by default. */
    private String password = "";

    /** Proxy protocol, e.g. "http". */
    private String type;

    /** Proxy port, stored as text. */
    private String port;

    /** Defaults to 1. */
    private int is_internet = 1;

    /**
     * @param ip   proxy host address
     * @param port proxy port, as text
     * @param type proxy protocol, e.g. "http"
     */
    public ProxyInfo(String ip, String port, String type) {
        this.ip = ip;
        this.type = type;
        this.port = port;
    }

    public String getUserName() {
        return userName;
    }

    public void setUserName(String userName) {
        this.userName = userName;
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public String getPassword() {
        return password;
    }

    public void setPassword(String password) {
        this.password = password;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getPort() {
        return port;
    }

    public void setPort(String port) {
        this.port = port;
    }

    public int getIs_internet() {
        return is_internet;
    }

    public void setIs_internet(int is_internet) {
        this.is_internet = is_internet;
    }
}
/**
 * Envelope returned by the crawler: a success flag plus a map of result data.
 */
class ProxyResponse {

    /** Success flag as text, e.g. "true". */
    private String success;

    /** Result payload keyed by field name. */
    private Map<String, Object> data;

    public String getSuccess() {
        return success;
    }

    public void setSuccess(String success) {
        this.success = success;
    }

    public Map<String, Object> getData() {
        return data;
    }

    public void setData(Map<String, Object> data) {
        this.data = data;
    }
}
以上这篇java代理实现爬取代理IP的示例就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持毛票票。
声明:本文内容来源于网络,版权归原作者所有,内容由互联网用户自发贡献自行上传,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任。如果您发现有涉嫌版权的内容,欢迎发送邮件至:czq8825#qq.com(发邮件时,请将#更换为@)进行举报,并提供相关证据,一经查实,本站将立刻删除涉嫌侵权内容。