A targeted Python crawler for campus forum post information
Introduction
I wrote this small crawler mainly to scrape internship information from the campus forum. It is built mainly on the Requests library.
Source code
URLs.py
Its main job is to take an initial URL (which contains a page parameter) and build the list of URLs for the pages from the current page number up to pageNum.
import re

def getURLs(url, attr, pageNum=1):
    # expand the page-number parameter named by attr, from the current page up to pageNum
    all_links = []
    try:
        now_page_number = int(re.search(attr + r'=(\d+)', url, re.S).group(1))
        for i in range(now_page_number, pageNum + 1):
            new_url = re.sub(attr + r'=\d+', attr + '=%s' % i, url, flags=re.S)
            all_links.append(new_url)
        return all_links
    except TypeError:
        print "arguments TypeError: attr should be string."
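A quick usage sketch (the board URL is the cc98 one used later in this article; the page count of 3 is arbitrary):
# sketch: expand the "page" parameter from the current page up to page 3
links = getURLs('http://www.cc98.org/list.asp?boardid=459&page=1&action=', 'page', 3)
for link in links:
    print link
# expected output:
# http://www.cc98.org/list.asp?boardid=459&page=1&action=
# http://www.cc98.org/list.asp?boardid=459&page=2&action=
# http://www.cc98.org/list.asp?boardid=459&page=3&action=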
uni_2_native.py
Because the Chinese text in the scraped pages comes back as numeric character references of the form &#XXXXX;, the content still needs to be converted to native characters after crawling.
import sys
import re

reload(sys)
sys.setdefaultencoding('utf-8')

def get_native(raw):
    tostring = raw
    while True:
        # numeric character references look like &#20013; -- decode them one at a time
        obj = re.search(r'&#(\d+);', tostring, flags=re.S)
        if obj is None:
            break
        else:
            raw, code = obj.group(0), obj.group(1)
            tostring = re.sub(raw, unichr(int(code)), tostring)
    return tostring
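A small usage sketch: each decimal reference is decoded in turn (20013 and 25991 are the decimal code points of 中 and 文):
# sketch: decode decimal numeric character references into native characters
print get_native(u'&#20013;&#25991; abc')   # -> 中文 abc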
Storing the results in the database: saveInfo.py (note that despite the saveSqlite class name, this version writes to MySQL via MySQLdb)
# -*- coding: utf-8 -*-
import MySQLdb

class saveSqlite():
    def __init__(self):
        self.infoList = []

    def saveSingle(self, author=None, title=None, date=None, url=None, reply=0, view=0):
        if author is None or title is None or date is None or url is None:
            print "No info saved!"
        else:
            singleDict = {}
            singleDict['author'] = author
            singleDict['title'] = title
            singleDict['date'] = date
            singleDict['url'] = url
            singleDict['reply'] = reply
            singleDict['view'] = view
            self.infoList.append(singleDict)

    def toMySQL(self):
        conn = MySQLdb.connect(host='localhost', user='root', passwd='', port=3306, db='db_name', charset='utf8')
        cursor = conn.cursor()
        # sql = "select * from info"
        # n = cursor.execute(sql)
        # for row in cursor.fetchall():
        #     for r in row:
        #         print r
        #     print '\n'
        sql = "delete from info"
        cursor.execute(sql)
        conn.commit()
        sql = "insert into info(title, author, url, date, reply, view) values(%s, %s, %s, %s, %s, %s)"
        params = []
        for each in self.infoList:
            params.append((each['title'], each['author'], each['url'], each['date'], each['reply'], each['view']))
        cursor.executemany(sql, params)
        conn.commit()
        cursor.close()
        conn.close()

    def show(self):
        for each in self.infoList:
            print "author: " + each['author']
            print "title: " + each['title']
            print "date: " + each['date']
            print "url: " + each['url']
            print "reply: " + str(each['reply'])
            print "view: " + str(each['view'])
            print '\n'

if __name__ == '__main__':
    save = saveSqlite()
    save.saveSingle('网', 'aaa', '2008-10-10 10:10:10', 'www.baidu.com', 1, 1)
    # save.show()
    save.toMySQL()
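toMySQL assumes an info table already exists in db_name. A minimal sketch of creating it (the column types and lengths below are my assumptions, not from the original script):
# -*- coding: utf-8 -*-
import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', passwd='', port=3306, db='db_name', charset='utf8')
cursor = conn.cursor()
# column types are assumptions; backticks keep MySQL happy with keyword-like column names
cursor.execute("""
    create table if not exists info (
        `title`  varchar(255),
        `author` varchar(64),
        `url`    varchar(255),
        `date`   varchar(32),
        `reply`  int,
        `view`   int
    )
""")
conn.commit()
cursor.close()
conn.close()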
Main crawler code
import requests
from lxml import etree
from cc98 import uni_2_native, URLs, saveInfo

# forge a header to match the site you want to crawl
headers = {
    'Accept': '',
    'Accept-Encoding': '',
    'Accept-Language': '',
    'Connection': '',
    'Cookie': '',
    'Host': '',
    'Referer': '',
    'Upgrade-Insecure-Requests': '',
    'User-Agent': ''
}

url = 'http://www.cc98.org/list.asp?boardid=459&page=1&action='
cc98 = 'http://www.cc98.org/'

print "get information from cc98..."
urls = URLs.getURLs(url, "page", 50)
savetools = saveInfo.saveSqlite()

for url in urls:
    r = requests.get(url, headers=headers)
    html = uni_2_native.get_native(r.text)
    selector = etree.HTML(html)
    content_tr_list = selector.xpath('//form/table[@class="tableborder1 list-topic-table"]/tbody/tr')
    for each in content_tr_list:
        href = each.xpath('./td[2]/a/@href')
        if len(href) == 0:
            continue
        else:
            # print len(href)
            # not great to use a for loop here, since each list holds just one element,
            # but I could not get the data by index
            for each_href in href:
                link = cc98 + each_href
            title_author_time = each.xpath('./td[2]/a/@title')
            # print len(title_author_time)
            for info in title_author_time:
                info_split = info.split('\n')
                title = info_split[0][1:len(info_split[0]) - 1]
                author = info_split[1][3:]
                date = info_split[2][3:]
            hot = each.xpath('./td[4]/text()')
            # print len(hot)
            for hot_num in hot:
                reply_view = hot_num.strip().split('/')
                reply, view = reply_view[0], reply_view[1]
            savetools.saveSingle(author=author, title=title, date=date, url=link, reply=reply, view=view)

print "All got! Now saving to Database..."
# savetools.show()
savetools.toMySQL()
print "ALL CLEAR! Have Fun!"
That is all for this article. I hope it helps with your study, and I hope you will keep supporting 毛票票.