python 写的一个爬虫程序源码
写爬虫是一项复杂、枯燥、反复的工作,考虑的问题包括采集效率、链路异常处理、数据质量(与站点编码规范关系很大)等。整理了自己写的一个爬虫程序,单台服务器可以启用1~8个实例同时采集,然后将数据入库。
#-*-coding:utf-8-*- #!/usr/local/bin/python importsys,time,os,string importmechanize importurlparse fromBeautifulSoupimportBeautifulSoup importre importMySQLdb importlogging importcgi fromoptparseimportOptionParser #----------------------------------------------------------------------------# #Name:TySpider.py# #Purpose:WebSiteSpiderModule# #Author:刘天斯# #Email:liutiansi@gamil.com# #Created:2010/02/16# #Copyright:(c)2010# #----------------------------------------------------------------------------# """ |-------------------------------------------------------------------------- |定义logingclass; |-------------------------------------------------------------------------- | |功能:记录系统相关日志信息。 | | """ classPubclilog(): def__init__(self): self.logfile='website_log.txt' definiLog(self): logger=logging.getLogger() filehandler=logging.FileHandler(self.logfile) streamhandler=logging.StreamHandler() fmt=logging.Formatter('%(asctime)s,%(funcName)s,%(message)s') logger.setLevel(logging.DEBUG) logger.addHandler(filehandler) logger.addHandler(streamhandler) return[logger,filehandler] """ |-------------------------------------------------------------------------- |定义tySpiderclass; |-------------------------------------------------------------------------- | |功能:抓取分类、标题等信息 | | """ classBaseTySpider: #初始化相关成员方法 def__init__(self,X,log_switch): #数据库连接 self.conn=MySQLdb.connect(db='dbname',host='192.168.0.10',user='dbuser',passwd='SDFlkj934y5jsdgfjh435',charset='utf8') #分类及标题页面Community self.CLASS_URL='http://test.abc.com/aa/CommTopicsPage?' #发表回复页 self.Content_URL='http://test.bac.com/aa/CommMsgsPage?' 
#开始comm值 self.X=X #当前commid取模,方面平均到表 self.mod=self.X%5 #Community文件下载页 self.body="" #self.bodySoup对象 self.soup=None #发表回复页下载内容变量 self.Contentbody="" #发表回复页内容self.ContentbodySoup对象 self.Contentsoup=None #日志开关 self.log_switch=log_switch #======================获取名称及分类方法========================== def_SpiderClass(self,nextpage=None): ifnextpage==None: FIXED_QUERY='cmm='+str(self.X) else: FIXED_QUERY=nextpage[1:] try: rd=mechanize.Browser() rd.addheaders=[("User-agent","Tianya/2010(compatible;MSIE6.0;WindowsNT5.1)")] rd.open(self.CLASS_URL+FIXED_QUERY) self.body=rd.response().read() #rd=mechanize.Request(self.CLASS_URL+FIXED_QUERY) #response=mechanize.urlopen(rd) #self.body=response.read() exceptException,e: ifself.log_switch=="on": logapp=Pubclilog() logger,hdlr=logapp.iniLog() logger.info(self.CLASS_URL+FIXED_QUERY+str(e)) hdlr.flush() logger.removeHandler(hdlr) return self.soup=BeautifulSoup(self.body) NextPageObj=self.soup("a",{'class':re.compile("fs-paging-itemfs-paging-next")}) self.cursor=self.conn.cursor() ifnextpage==None: try: Ttag=str(self.soup.table) #printTtag """ ------------------分析结构体----------------- <tablecellspacing="0"cellpadding="0"> <tr> <td> <h1title="Dunhill">Dunhill</h1> </td> <tdvalign="middle"> <divclass="fs-comm-cat"> <spanclass="fs-iconsfs-icon-cat"></span><ahref="TopByCategoryPage?cid=211&ref=commnav-cat">中国</a>»<ahref="TopByCategoryPage?cid=211&subcid=273&ref=commnav-cat">人民</a> </div> </td> </tr> </table> """ soupTable=BeautifulSoup(Ttag) #定位到第一个h1标签 tableh1=soupTable("h1") #printself.X #print"Name:"+tableh1[0].string.strip().encode('utf-8') #处理无类型的 try: #定位到表格中符合规则“^TopByCategory”A链接块,tablea[0]为第一个符合条件的连接文字,tablea[1]... 
tablea=soupTable("a",{'href':re.compile("^TopByCategory")}) iftablea[0].string.strip()=="": pass #print"BigCLass:"+tablea[0].string.strip().encode('utf-8') #print"SubClass:"+tablea[1].string.strip().encode('utf-8') exceptException,e: ifself.log_switch=="on": logapp=Pubclilog() logger,hdlr=logapp.iniLog() logger.info("[noClassInfo]"+str(self.X)+str(e)) hdlr.flush() logger.removeHandler(hdlr) self.cursor.execute("insertintobaname"+str(self.mod)+"values('%d','%d','%s')"%(self.X,-1,tableh1[0].string.strip().encode('utf-8'))) self.conn.commit() self._SpiderTitle() ifNextPageObj: NextPageURL=NextPageObj[0]['href'] self._SpiderClass(NextPageURL) return else: return #获取链接二对象的href值 classlink=tablea[1]['href'] par_dict=cgi.parse_qs(urlparse.urlparse(classlink).query) #print"CID:"+par_dict["cid"][0] #print"SubCID:"+par_dict["subcid"][0] #print"---------------------------------------" #插入数据库 self.cursor.execute("insertintoclassvalues('%d','%s')"%(int(par_dict["cid"][0]),tablea[0].string.strip().encode('utf-8'))) self.cursor.execute("insertintosubclassvalues('%d','%d','%s')"%(int(par_dict["subcid"][0]),int(par_dict["cid"][0]),tablea[1].string.strip().encode('utf-8'))) self.cursor.execute("insertintobaname"+str(self.mod)+"values('%d','%d','%s')"%(self.X,int(par_dict["subcid"][0]),tableh1[0].string.strip().encode('utf-8'))) self.conn.commit() self._SpiderTitle() ifNextPageObj: NextPageURL=NextPageObj[0]['href'] self._SpiderClass(NextPageURL) self.body=None self.soup=None Ttag=None soupTable=None table=None table1=None classlink=None par_dict=None exceptException,e: ifself.log_switch=="on": logapp=Pubclilog() logger,hdlr=logapp.iniLog() logger.info("[ClassInfo]"+str(self.X)+str(e)) hdlr.flush() logger.removeHandler(hdlr) else: self._SpiderTitle() ifNextPageObj: NextPageURL=NextPageObj[0]['href'] self._SpiderClass(NextPageURL) #====================获取标题方法========================= def_SpiderTitle(self): #查找标题表格对象(table) soupTitleTable=self.soup("table",{'class':"fs-topic-list"}) 
#查找标题行对象(tr) TitleTr=soupTitleTable[0]("tr",{'onmouseover':re.compile("^this\.className='fs-row-hover'")}) """ -----------分析结构体-------------- <trclass="fs-alt-row"onmouseover="this.className='fs-row-hover'"onmouseout="this.className='fs-alt-row'"> <tdvalign="middle"class="fs-hot-topic-dots-ctn"> <divclass="fs-hot-topic-dots"style="background-position:0-0px"title="点击量:12"></div> </td> <tdvalign="middle"class="fs-topic-name"> <ahref="CommMsgsPage?cmm=16081&tid=2718969307756232842&ref=regulartopics"id="a53"title="【新人报到】欢迎美国人民加入"target="_blank">【新人报到】欢迎美国人民加入</a> <spanclass="fs-meta"> <spanclass="fs-iconsfs-icon-mini-reply"></span>0 / <spanclass="fs-iconsfs-icon-pageview"></span>12</span> </td> <tdvalign="middle"> <aclass="fs-tiny-user-avatarumhook"href="ProfilePage?uid=8765915421039908242"title="中国人"><imgsrc="http://img1.sohu.com.cn/aa/images/138/0/P/1/s.jpg"/></a> </td> <tdvalign="middle"style="padding-left:4px"> <ahref="Profile?uid=8765915421039908242"id="b53"title="中国人"class="umhook">中国人</a> </td> <tdvalign="middle"class="fs-topic-last-mdfyfs-meta">2-14</td> </tr> """ forCurrTrinTitleTr: try: #初始化置顶及精华状态 Title_starred='N' Title_sticky='N' #获取当前记录的BeautifulSoup对象 soupCurrTr=BeautifulSoup(str(CurrTr)) #BeautifulSoup分析HTML有误,只能通过span的标志数来获取贴子状态,会存在一定误差 #如只有精华时也会当成置顶来处理。 TitleStatus=soupCurrTr("span",{'title':""}) TitlePhotoViewer=soupCurrTr("a",{'href':re.compile("^PhotoViewer")}) ifTitlePhotoViewer.__len__()==1: TitlePhotoViewerBool=0 else: TitlePhotoViewerBool=1 ifTitleStatus.__len__()==3-TitlePhotoViewerBool: Title_starred='Y' Title_sticky='Y' elifTitleStatus.__len__()==2-TitlePhotoViewerBool: Title_sticky='Y' #获取贴子标题 Title=soupCurrTr.a.next.strip() #获取贴子ID par_dict=cgi.parse_qs(urlparse.urlparse(soupCurrTr.a['href']).query) #获取回复数及浏览器 TitleNum=soupCurrTr("td",{'class':"fs-topic-name"}) TitleArray=string.split(str(TitleNum[0]),'\n') Title_ReplyNum=string.split(TitleArray[len(TitleArray)-4],'>')[2] 
Title_ViewNum=string.split(TitleArray[len(TitleArray)-2],'>')[2][:-6] #获取贴子作者 TitleAuthorObj=soupCurrTr("td",{'style':"padding-left:4px"}) Title_Author=TitleAuthorObj[0].next.next.next.string.strip().encode('utf-8') #获取回复时间 TitleTime=soupCurrTr("td",{'class':re.compile("^fs-topic-last-mdfyfs-meta")}) """ print"X:"+str(self.X) print"Title_starred:"+Title_starred print"Title_sticky:"+Title_sticky print"Title:"+Title #获取贴子内容连接URL print"Title_link:"+soupCurrTr.a['href'] print"CID:"+par_dict["tid"][0] print"Title_ReplyNum:"+Title_ReplyNum print"Title_ViewNum:"+Title_ViewNum print"Title_Author:"+Title_Author print"TitleTime:"+TitleTime[0].string.strip().encode('utf-8') """ #入库 self.cursor.execute("insertintoTitle"+str(self.mod)+"values('%s','%d','%s','%d','%d','%s','%s','%s','%s')"%(par_dict["tid"][0],\ self.X,Title,int(Title_ReplyNum),int(Title_ViewNum),Title_starred,Title_sticky,\ Title_Author.decode('utf-8'),TitleTime[0].string.strip().encode('utf-8'))) self.conn.commit() self._SpiderContent(par_dict["tid"][0]) exceptException,e: ifself.log_switch=="on": logapp=Pubclilog() logger,hdlr=logapp.iniLog() logger.info("[Title]"+str(self.X)+'-'+par_dict["tid"][0]+'-'+str(e)) hdlr.flush() logger.removeHandler(hdlr) #======================获取发表及回复方法======================= def_SpiderContent(self,ID,nextpage=None): ifnextpage==None: FIXED_QUERY='cmm='+str(self.X)+'&tid='+ID+'&ref=regulartopics' else: FIXED_QUERY=nextpage[9:] rd=mechanize.Browser() rd.addheaders=[("User-agent","Tianya/2010(compatible;MSIE6.0;WindowsNT5.1)")] rd.open(self.Content_URL+FIXED_QUERY) self.Contentbody=rd.response().read() #rd=mechanize.Request(self.Content_URL+FIXED_QUERY) #response=mechanize.urlopen(rd) #self.Contentbody=response.read() self.Contentsoup=BeautifulSoup(self.Contentbody) NextPageObj=self.Contentsoup("a",{'class':re.compile("fs-paging-itemfs-paging-next")}) try: Tdiv=self.Contentsoup("div",{'class':"fs-user-action"}) i=0 forCurrdivinTdiv: ifi==0: Ctype='Y' else: Ctype='N' #发表时间 
soupCurrdiv=BeautifulSoup(str(Currdiv)) PosttimeObj=soupCurrdiv("span",{'class':"fs-meta"}) Posttime=PosttimeObj[0].next[1:] Posttime=Posttime[0:-3] #IP地址 IPObj=soupCurrdiv("a",{'href':re.compile("CommMsgAddress")}) ifIPObj: IP=IPObj[0].next.strip() else: IP='' #发表/回复内容 ContentObj=soupCurrdiv("div",{'class':"fs-user-action-body"}) Content=ContentObj[0].renderContents().strip() """ print"ID:"+str(self.X) print"ID:"+ID print"Ctype:"+Ctype print"POSTTIME:"+Posttime print"IP:"+IP print"Content:"+Content """ self.cursor.execute("insertintoContent"+str(self.mod)+"values('%s','%d','%s','%s','%s','%s')"%(ID,self.X,Ctype,Posttime,IP,Content.decode('utf-8'))) self.conn.commit() i+=1 exceptException,e: ifself.log_switch=="on": logapp=Pubclilog() logger,hdlr=logapp.iniLog() logger.info("[Content]"+str(self.X)+'-'+ID+'-'+str(e)) hdlr.flush() logger.removeHandler(hdlr) #如“下一页”有链接刚继续遍历 ifNextPageObj: NextPageURL=NextPageObj[0]['href'] self._SpiderContent(ID,NextPageURL) def__del__(self): try: self.cursor.close() self.conn.close() exceptException,e: pass #遍历comm范围 definitapp(StartValue,EndValue,log_switch): forxinrange(StartValue,EndValue): app=BaseTySpider(x,log_switch) app._SpiderClass() app=None if__name__=="__main__": #定义命令行参数 MSG_USAGE="TySpider.py[-sStartNumberEndNumber]-l[on|off][-v][-h]" parser=OptionParser(MSG_USAGE) parser.add_option("-s","--set",nargs=2,action="store", dest="comm_value", type="int", default=False, help="配置名称ID值范围。".decode('utf-8')) parser.add_option("-l","--log",action="store", dest="log_switch", type="string", default="on", help="错误日志开关".decode('utf-8')) parser.add_option("-v","--version",action="store_true",dest="verbose", help="显示版本信息".decode('utf-8')) opts,args=parser.parse_args() ifopts.comm_value: ifopts.comm_value[0]>opts.comm_value[1]: print"终止值比起始值还小?" exit(); ifopts.log_switch=="on": log_switch="on" else: log_switch="off" initapp(opts.comm_value[0],opts.comm_value[1],log_switch) exit(); ifopts.verbose: print"WebSiteSciderV1.0beta." exit;