A targeted Python crawler for campus forum post information
Introduction
I wrote this small crawler mainly to scrape internship information from the campus forum. It is built mainly on the Requests library.
Source code
URLs.py
Its main job is to take an initial URL (which contains a page parameter) and build the list of URLs for the pages from the current page number up to pageNum.
import re

def getURLs(url, attr, pageNum=1):
    # expand the page-number parameter named by attr, from the current page up to pageNum
    all_links = []
    try:
        now_page_number = int(re.search(attr + r'=(\d+)', url, re.S).group(1))
        for i in range(now_page_number, pageNum + 1):
            new_url = re.sub(attr + r'=\d+', attr + '=%s' % i, url, flags=re.S)
            all_links.append(new_url)
        return all_links
    except TypeError:
        print "arguments TypeError: attr should be string."
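A quick usage sketch (the board URL is the cc98 one used later in this article; the page count of 3 is arbitrary):
# sketch: expand the "page" parameter from the current page up to page 3
links = getURLs('http://www.cc98.org/list.asp?boardid=459&page=1&action=', 'page', 3)
for link in links:
    print link
# expected output:
# http://www.cc98.org/list.asp?boardid=459&page=1&action=
# http://www.cc98.org/list.asp?boardid=459&page=2&action=
# http://www.cc98.org/list.asp?boardid=459&page=3&action=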
uni_2_native.py
Because the Chinese text in the scraped pages comes back as numeric character references of the form &#XXXXX;, the content still needs to be converted to native characters after crawling.
import sys
import re

reload(sys)
sys.setdefaultencoding('utf-8')

def get_native(raw):
    tostring = raw
    while True:
        # numeric character references look like &#20013; -- decode them one at a time
        obj = re.search(r'&#(\d+);', tostring, flags=re.S)
        if obj is None:
            break
        else:
            raw, code = obj.group(0), obj.group(1)
            tostring = re.sub(raw, unichr(int(code)), tostring)
    return tostring
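A small usage sketch: each decimal reference is decoded in turn (20013 and 25991 are the decimal code points of 中 and 文):
# sketch: decode decimal numeric character references into native characters
print get_native(u'&#20013;&#25991; abc')   # -> 中文 abc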
Storing the results in the database: saveInfo.py (note that despite the saveSqlite class name, this version writes to MySQL via MySQLdb)
# -*- coding: utf-8 -*-
import MySQLdb

class saveSqlite():
    def __init__(self):
        self.infoList = []

    def saveSingle(self, author=None, title=None, date=None, url=None, reply=0, view=0):
        if author is None or title is None or date is None or url is None:
            print "No info saved!"
        else:
            singleDict = {}
            singleDict['author'] = author
            singleDict['title'] = title
            singleDict['date'] = date
            singleDict['url'] = url
            singleDict['reply'] = reply
            singleDict['view'] = view
            self.infoList.append(singleDict)

    def toMySQL(self):
        conn = MySQLdb.connect(host='localhost', user='root', passwd='', port=3306, db='db_name', charset='utf8')
        cursor = conn.cursor()
        # sql = "select * from info"
        # n = cursor.execute(sql)
        # for row in cursor.fetchall():
        #     for r in row:
        #         print r
        #     print '\n'
        sql = "delete from info"
        cursor.execute(sql)
        conn.commit()
        sql = "insert into info(title, author, url, date, reply, view) values(%s, %s, %s, %s, %s, %s)"
        params = []
        for each in self.infoList:
            params.append((each['title'], each['author'], each['url'], each['date'], each['reply'], each['view']))
        cursor.executemany(sql, params)
        conn.commit()
        cursor.close()
        conn.close()

    def show(self):
        for each in self.infoList:
            print "author: " + each['author']
            print "title: " + each['title']
            print "date: " + each['date']
            print "url: " + each['url']
            print "reply: " + str(each['reply'])
            print "view: " + str(each['view'])
            print '\n'

if __name__ == '__main__':
    save = saveSqlite()
    save.saveSingle('网', 'aaa', '2008-10-10 10:10:10', 'www.baidu.com', 1, 1)
    # save.show()
    save.toMySQL()
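toMySQL assumes an info table already exists in db_name. A minimal sketch of creating it (the column types and lengths below are my assumptions, not from the original script):
# -*- coding: utf-8 -*-
import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', passwd='', port=3306, db='db_name', charset='utf8')
cursor = conn.cursor()
# column types are assumptions; backticks keep MySQL happy with keyword-like column names
cursor.execute("""
    create table if not exists info (
        `title`  varchar(255),
        `author` varchar(64),
        `url`    varchar(255),
        `date`   varchar(32),
        `reply`  int,
        `view`   int
    )
""")
conn.commit()
cursor.close()
conn.close()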
Main crawler code
import requests
from lxml import etree
from cc98 import uni_2_native, URLs, saveInfo

# forge a header to match the site you want to crawl
headers = {
    'Accept': '',
    'Accept-Encoding': '',
    'Accept-Language': '',
    'Connection': '',
    'Cookie': '',
    'Host': '',
    'Referer': '',
    'Upgrade-Insecure-Requests': '',
    'User-Agent': ''
}

url = 'http://www.cc98.org/list.asp?boardid=459&page=1&action='
cc98 = 'http://www.cc98.org/'

print "get information from cc98..."
urls = URLs.getURLs(url, "page", 50)
savetools = saveInfo.saveSqlite()

for url in urls:
    r = requests.get(url, headers=headers)
    html = uni_2_native.get_native(r.text)
    selector = etree.HTML(html)
    content_tr_list = selector.xpath('//form/table[@class="tableborder1 list-topic-table"]/tbody/tr')
    for each in content_tr_list:
        href = each.xpath('./td[2]/a/@href')
        if len(href) == 0:
            continue
        else:
            # print len(href)
            # not great to use a for loop here, since each list holds just one element,
            # but I could not get the data by index
            for each_href in href:
                link = cc98 + each_href
            title_author_time = each.xpath('./td[2]/a/@title')
            # print len(title_author_time)
            for info in title_author_time:
                info_split = info.split('\n')
                title = info_split[0][1:len(info_split[0]) - 1]
                author = info_split[1][3:]
                date = info_split[2][3:]
            hot = each.xpath('./td[4]/text()')
            # print len(hot)
            for hot_num in hot:
                reply_view = hot_num.strip().split('/')
                reply, view = reply_view[0], reply_view[1]
            savetools.saveSingle(author=author, title=title, date=date, url=link, reply=reply, view=view)

print "All got! Now saving to Database..."
# savetools.show()
savetools.toMySQL()
print "ALL CLEAR! Have Fun!"
That is all for this article. I hope it helps with your study, and I hope you will keep supporting 毛票票.