python手机号前7位归属地爬虫代码实例
需求分析
项目上需要用手机号前7位来判断号码是否合法,并查询归属地。旧的数据已经是几年前的,过于陈旧,因此打算用 Python 爬虫重新爬取一份。
单线程版本
# coding: utf-8
"""Single-threaded crawler for mobile-number prefix location records.

For every 3-digit section head the spider builds 11-digit numbers of the form
``head + NNNN + "0000"``, queries the lookup endpoint once per number and
appends each parsed record to ``./result.txt`` as one comma-separated line.
"""
from datetime import datetime


class PhoneInfoSpider:
    """Query location records for a list of 3-digit phone section heads."""

    # Lookup endpoint; the 11-digit number is appended as the ``tel`` value.
    QUERY_URL = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel='
    # Parsed records are appended here, one per line.
    RESULT_FILE = './result.txt'
    # Seconds to wait for the server before giving up on a single request;
    # without a timeout one stalled connection would block the whole crawl.
    REQUEST_TIMEOUT = 10

    def __init__(self, phoneSections):
        """Store the iterable of section heads (strings such as ``'133'``)."""
        self.phoneSections = phoneSections

    @staticmethod
    def parsePhoneInfo(textData):
        """Extract one CSV record from a raw response body.

        The body is split into lines; the first single-quoted value on lines
        1, 2, 3 and 5 (0-based) is taken as number, province, mobile area and
        postcode respectively (field names follow the original variable
        names; the response layout itself is assumed, not checked).

        Returns:
            ``"number,province,mobile_area,postcode"``, or ``None`` when the
            body has fewer than 9 lines.

        Raises:
            IndexError: if one of the expected lines has no quoted value.
        """
        text = textData.splitlines(True)
        if len(text) < 9:
            return None
        number, province, mobile_area, postcode = (
            text[index].split('\'')[1] for index in (1, 2, 3, 5)
        )
        return number + "," + province + "," + mobile_area + "," + postcode

    def phoneInfoHandler(self, textData):
        """Parse a response body, print the record and append it to the result file.

        Bodies with fewer than 9 lines are ignored. File errors are reported
        on stdout and do not stop the crawl.
        """
        line_text = self.parsePhoneInfo(textData)
        if line_text is None:
            return
        print(line_text)
        try:
            # ``with`` closes the handle after every record; an explicit
            # encoding keeps non-ASCII province names portable.
            with open(self.RESULT_FILE, 'a', encoding='utf-8') as f:
                f.write(line_text + '\n')
        except (OSError, UnicodeError) as e:
            print(type(e).__name__, ":", e)

    def requestPhoneInfo(self, phoneNum):
        """Fetch the record for one 11-digit number and hand it to the parser.

        Any request or parsing error is printed and swallowed so that a
        single bad number does not abort the crawl.
        """
        # Imported here so the parsing/writing code stays importable (and
        # testable) when the third-party ``requests`` package is missing.
        import requests

        try:
            response = requests.get(self.QUERY_URL + phoneNum,
                                    timeout=self.REQUEST_TIMEOUT)
            self.phoneInfoHandler(response.text)
        except Exception as e:
            print(type(e).__name__, ":", e)

    def requestAllSections(self, last=0, perSection=10):
        """Query every generated number of every section head.

        Args:
            last: middle index to resume from in the FIRST section only
                (use it to continue after an aborted run); later sections
                always start at 0.
            perSection: exclusive upper bound of the 4-digit middle index.
                The default of 10 is a quick sample; pass 10000 to cover a
                whole section.
        """
        for head in self.phoneSections:
            head_begin = datetime.now()
            print(head + "begintime:" + str(head_begin))
            for i in range(last, perSection):
                # Zero-pad the middle part; the last four digits are fixed.
                middle = str(i).zfill(4)
                self.requestPhoneInfo(head + middle + "0000")
            last = 0
            head_end = datetime.now()
            print(head + "endtime:" + str(head_end))


def main():
    """Crawl all known section heads and print start/end timestamps."""
    task_begin = datetime.now()
    print("phonecheckbegintime:" + str(task_begin))
    # China Telecom, China Unicom, China Mobile, virtual operators.
    dx = ['133', '149', '153', '173', '177', '180', '181', '189', '199']
    # '166' used to be listed twice, which crawled that section twice.
    lt = ['130', '131', '132', '145', '146', '155', '156', '166', '171',
          '175', '176', '185', '186']
    yd = ['134', '135', '136', '137', '138', '139', '147', '148', '150',
          '151', '152', '157', '158', '159', '172', '178', '182', '183',
          '184', '187', '188', '198']
    add = ['170']
    all_num = dx + lt + yd + add
    print(len(all_num))
    # Section heads to crawl.
    spider = PhoneInfoSpider(all_num)
    spider.requestAllSections()
    task_end = datetime.now()
    print("phonecheckendtime:" + str(task_end))


if __name__ == '__main__':
    main()
发现爬取一个号段共需 10000 次查询,单线程版大概要 1 个半小时以上,太慢了。
多线程版本
# coding: utf-8
"""Multi-threaded crawler for mobile-number prefix location records.

For each 3-digit section head a queue of tasks 1..10000 is filled and drained
by ``threadNum`` worker threads. Every task maps to one 11-digit number
``head + NNNN + "0000"``; the parsed record is appended to ``./result.txt``.
"""
from datetime import datetime
import queue
import threading

# Number of worker threads started per section head.
threadNum = 32
# Number of tasks (middle indexes 0000..9999) queued per section head.
NUMBERS_PER_SECTION = 10000
# Lookup endpoint; the 11-digit number is appended as the ``tel`` value.
QUERY_URL = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel='
# Parsed records are appended here, one per line.
RESULT_FILE = './result.txt'
# Seconds to wait for the server before giving up on a single request.
REQUEST_TIMEOUT = 10

# Serialises appends to RESULT_FILE across worker threads.
writeLock = threading.Lock()

# Shared state read by the workers; set by main() before threads start.
q = None
phone_head = None


class MyThread(threading.Thread):
    """Thread that simply runs the zero-argument callable it was given."""

    def __init__(self, func):
        threading.Thread.__init__(self)
        self.func = func

    def run(self):
        self.func()


def buildPhoneNum(head, task):
    """Return the 11-digit number for a queue task.

    Tasks are numbered from 1, so task ``n`` maps to middle index ``n - 1``,
    zero-padded to four digits and followed by the fixed suffix ``"0000"``.
    """
    return head + str(task - 1).zfill(4) + "0000"


def parsePhoneInfo(textData):
    """Extract one CSV record from a raw response body.

    The first single-quoted value on lines 1, 2, 3 and 5 (0-based) is taken
    as number, province, mobile area and postcode respectively (field names
    follow the original variable names; the layout is assumed, not checked).

    Returns:
        ``"number,province,mobile_area,postcode"``, or ``None`` when the body
        has fewer than 9 lines.

    Raises:
        IndexError: if one of the expected lines has no quoted value.
    """
    text = textData.splitlines(True)
    if len(text) < 9:
        return None
    number, province, mobile_area, postcode = (
        text[index].split('\'')[1] for index in (1, 2, 3, 5)
    )
    return number + "," + province + "," + mobile_area + "," + postcode


def phoneInfoHandler(textData):
    """Parse a response body, print the record and append it to the result file.

    Bodies with fewer than 9 lines are ignored. File errors are reported on
    stdout and do not stop the worker.
    """
    line_text = parsePhoneInfo(textData)
    if line_text is None:
        return
    print(line_text)
    try:
        # One writer at a time, and the handle is closed after every record.
        with writeLock:
            with open(RESULT_FILE, 'a', encoding='utf-8') as f:
                f.write(line_text + '\n')
    except (OSError, UnicodeError) as e:
        print(type(e).__name__, ":", e)


def requestPhoneInfo():
    """Worker loop: take tasks from the global queue until it is empty.

    Reads the module globals ``q`` (task queue) and ``phone_head`` (current
    section head). Request and parsing errors are printed and swallowed so
    that one bad number does not stop the worker.
    """
    # Imported here so the parsing/writing code stays importable (and
    # testable) when the third-party ``requests`` package is missing.
    import requests

    while True:
        try:
            # Queue is already thread-safe; no extra lock is required.
            p = q.get_nowait()
        except queue.Empty:
            break
        print("queuesize:" + str(q.qsize()))
        # Derive the number from the task itself. Using q.qsize() here (as
        # before) raced with other workers and produced duplicate or
        # skipped numbers.
        phoneNum = buildPhoneNum(phone_head, p)
        print("phoneNum:" + phoneNum)
        try:
            response = requests.get(QUERY_URL + phoneNum,
                                    timeout=REQUEST_TIMEOUT)
            phoneInfoHandler(response.text)
        except Exception as e:
            print(type(e).__name__, ":", e)


def main():
    """Crawl every section head with a pool of worker threads."""
    global q, phone_head

    task_begin = datetime.now()
    print("phonecheckbegintime:" + str(task_begin))
    dx = ['133', '149', '153', '173', '177', '180', '181', '189', '199']
    # '166' used to be listed twice, which crawled that section twice.
    lt = ['130', '131', '132', '145', '155', '156', '166', '171', '175',
          '176', '185', '186']
    yd = ['134', '135', '136', '137', '138', '139', '147', '150', '151',
          '152', '157', '158', '159', '172', '178', '182', '183', '184',
          '187', '188', '198']
    all_num = dx + lt + yd
    print(len(all_num))
    for head in all_num:
        head_begin = datetime.now()
        print(head + "begintime:" + str(head_begin))
        q = queue.Queue()
        for p in range(NUMBERS_PER_SECTION):
            q.put(p + 1)
        print(q.qsize())
        # All workers of this round share the same section head.
        phone_head = head
        threads = []
        for _ in range(threadNum):
            thread = MyThread(requestPhoneInfo)
            thread.start()
            threads.append(thread)
        # Finish the whole section before moving on to the next head.
        for thread in threads:
            thread.join()
        head_end = datetime.now()
        print(head + "endtime:" + str(head_end))
    task_end = datetime.now()
    print("phonecheckendtime:" + str(task_end))


if __name__ == '__main__':
    main()
多线程版爬取 1 个号码段的 1000 条数据,大概 2~3 分钟就能完成;CPU 使用率飙升,大概维持在 70% 左右。
总共 40 多个号段,全部爬完大概需要 1~2 个小时,总数据量在 41 万条左右。
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持毛票票。