python爬取本站电子书信息并入库的实现代码
入门级爬虫:只抓取书籍名称,信息及下载地址并存储到数据库
数据库工具类:DBUtil.py
class DBUtils(object):
    """Thin helper around a pymysql connection/cursor pair."""

    def connDB(self):
        """Open a database connection and return the pair ``(conn, cur)``.

        NOTE(review): credentials are hard-coded in source — move them to
        configuration before reuse.
        """
        # Imported lazily so the other helpers work (and are testable)
        # without the MySQL driver installed.
        import pymysql
        conn = pymysql.connect(host='192.168.251.114', port=3306, user='root',
                               passwd='b6f3g2', db='yangsj', charset='utf8')
        cur = conn.cursor()
        return (conn, cur)

    def exeUpdate(self, conn, cur, sql):
        """Execute an INSERT/UPDATE statement and commit.

        Returns the affected row count reported by the driver.
        """
        sta = cur.execute(sql)
        conn.commit()
        return sta

    def exeDelete(self, conn, cur, IDs):
        """Delete ``students`` rows by id (demo helper, unused by the crawler).

        ``IDs`` is a whitespace-separated string of integer ids; returns the
        total number of rows deleted.
        """
        sta = 0
        # BUG FIX: the original called IDs.split('') — an empty separator
        # raises ValueError. Split on whitespace instead.
        for eachID in IDs.split():
            # Parameterized query instead of %-formatting the value into the
            # SQL text (avoids SQL injection / malformed statements).
            sta += cur.execute("delete from students where Id=%s", (int(eachID),))
        conn.commit()
        return sta

    def exeQuery(self, cur, sql):
        """Execute a SELECT; return ``(affected_row_count, cursor)`` so the
        caller can fetch rows from the cursor."""
        effect_row = cur.execute(sql)
        return (effect_row, cur)

    def connClose(self, conn, cur):
        """Release resources: close the cursor, then the connection."""
        cur.close()
        conn.close()


if __name__ == '__main__':
    dbUtil = DBUtils()
    conn, cur = dbUtil.connDB()
书籍操作文件bookOpe.py
from DBUtil import DBUtils
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging

logging.basicConfig(
    level=logging.INFO
)


class BookOperator(object):
    """Persists a Book and its download mirrors into the database."""

    def __addBook(self, dbUtil, conn, cur, book):
        """Insert the book row; return its auto-generated primary key."""
        logging.info("add book:%s" % book.bookName)
        # conn.escape() both quotes and escapes the value — the original
        # interpolated raw scraped text into the SQL string (injection-prone
        # and broken by quotes in titles).
        insertBookSql = ("insert into book (bookName,bookUrl,bookInfo) values (%s,%s,%s);"
                         % (conn.escape(book.bookName),
                            conn.escape(book.downLoadUrl),
                            conn.escape(book.mainInfo)))
        dbUtil.exeUpdate(conn, cur, insertBookSql)
        # lastrowid is the AUTO_INCREMENT id of the row just inserted on THIS
        # connection. The original re-connected and ran
        # "select id from book order by id desc limit 1", which is race-prone
        # under concurrent inserts.
        return cur.lastrowid

    def __addBookDownLoadInfos(self, dbUtil, conn, cur, downLoadInfos, bookId):
        """Insert one book_down_url row per download mirror of the book."""
        logging.info("add bookId:%s" % bookId)
        for downLoadInfo in downLoadInfos:
            insertBookDownLoadInfo = (
                "insert into book_down_url (bookId,downName,downUrl) values (%s,%s,%s);"
                % (int(bookId),
                   conn.escape(downLoadInfo.downName),
                   conn.escape(downLoadInfo.downUrl)))
            dbUtil.exeUpdate(conn, cur, insertBookDownLoadInfo)

    def addBookInfo(self, book):
        """Public entry point: store *book* and all its download links.

        Uses a single connection for the whole operation and always closes
        it — the original opened three connections and leaked them on error.
        """
        logging.info("add bookInfo:%s" % book.bookName)
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        try:
            bookId = self.__addBook(dbUtil, conn, cur, book)
            self.__addBookDownLoadInfos(dbUtil, conn, cur, book.downLoadInfos, bookId)
        finally:
            dbUtil.connClose(conn, cur)


if __name__ == '__main__':
    bookope = BookOperator()
    book = Book("aaa", "yang", "cccc")
    book.addDownLoadUrl(DownLoadInfo("aaa.html", "书籍"))
    bookope.addBookInfo(book)
书籍信息文件bookInfo.py
class Book(object):
    """One scraped e-book: descriptive text, source URL, title and a list
    of download mirrors.

    NOTE(review): the original file set ``sys.encoding = "utf8"`` — ``sys``
    has no such attribute, so the assignment was a misleading no-op and has
    been removed.
    """

    def __init__(self, mainInfo, downLoadUrl, bookName):
        self.mainInfo = mainInfo        # descriptive text scraped from the page
        self.downLoadUrl = downLoadUrl  # URL of the page the book came from
        self.bookName = bookName        # book title
        self.downLoadInfos = []         # list of DownLoadInfo mirrors

    def addDownLoadUrl(self, downloadInfo):
        """Append one DownLoadInfo mirror to this book."""
        self.downLoadInfos.append(downloadInfo)

    def print_book_info(self):
        """Debug helper: print the book title."""
        print("bookName:%s" % (self.bookName))


class DownLoadInfo(object):
    """One download mirror: its URL plus a display name."""

    def __init__(self, downUrl, downName):
        self.downUrl = downUrl    # download link
        self.downName = downName  # human-readable mirror name

    def print_down_info(self):
        """Debug helper: print the mirror URL and name."""
        print("downLoad%s-%s" % (self.downUrl, self.downName))
51job界面解析文件FiveOneJobFetch.py
import requests
from bs4 import BeautifulSoup
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging


class PageFetch(object):
    """Fetches and parses the book-list and book-detail pages of the site."""

    # NOTE(review): scheme restored — the original "//www.nhooo.com/" is a
    # scheme-relative URL, which requests rejects with MissingSchema.
    host = "https://www.nhooo.com/"  # site root
    category = "books/"              # book-list category path

    def __init__(self, pageUrl):
        # pageUrl is a short list-page name such as "list152_1.html"
        self.pageUrl = pageUrl
        self.url = PageFetch.host + PageFetch.category + pageUrl  # full URL

    @staticmethod
    def getPageContent(url):
        """Return the decoded HTML of *url*, or "" on a non-200 response."""
        req = requests.get(url)
        if req.status_code == 200:
            req.encoding = "gb2312"  # the site serves gb2312-encoded pages
            return req.text
        return ""

    def __getMaxPageNumAndUrl(self):
        """Walk the pager until its last link; return (maxPageNum, lastLink).

        Pager URLs look like ``list45_2.html`` where 2 is the page number.
        """
        fetchUrl = self.pageUrl
        maxPageNum = 0
        maxLink = ""
        while maxLink == "":
            url = PageFetch.host + PageFetch.category + fetchUrl
            reqContent = PageFetch.getPageContent(url)
            soup = BeautifulSoup(reqContent, "html.parser")
            for ul in soup.select(".plist"):
                maxPageNum = ul.select("strong")[0].text
                alink = ul.select("a")
                if alink[-1]['href'] == "#":
                    # "#" marks the last page: the second link is the pattern
                    maxLink = alink[1]['href']
                else:
                    # keep following the "next" link
                    fetchUrl = alink[-1]['href']
        return maxPageNum, maxLink

    def __formatPage(self, pageNum):
        """Build the short URL for page pageNum+1, e.g. ``list45_2.html``."""
        lineBeginSite = self.pageUrl.index("_") + 1
        docBeginSite = self.pageUrl.index(".")
        return self.pageUrl[:lineBeginSite] + str(pageNum + 1) + self.pageUrl[docBeginSite:]

    def getBookPageList(self):
        """Return the full URL of every page of this book list."""
        maxPageNum, urlPattern = self.__getMaxPageNumAndUrl()
        return [self.host + self.category + self.__formatPage(i)
                for i in range(int(maxPageNum))]

    @staticmethod
    def getDownloadPage(url):
        """Return the detail-page URL of every book listed on *url*."""
        downPage = []
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        # NOTE(review): selector reconstructed as ".cur-cat-list .btn-dl"
        # (descendant combinator) — verify against the live page markup.
        for a in soup.select(".cur-cat-list .btn-dl"):
            downPage.append(PageFetch.host + a['href'])
        return downPage

    @staticmethod
    def getBookInfo(url):
        """Scrape one detail page into a Book with its download mirrors."""
        logging.info("获取书籍信息url:%s" % url)
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        # Strip the "截图:" marker; single quotes are dropped because the
        # values are later interpolated into SQL strings.
        mainInfo = soup.select("#soft-intro")[0].text.replace("截图:", "").replace("'", "")
        title = soup.select("dl dt h1")[0].text.replace("'", "")
        book = Book(mainInfo, url, title)
        for ul in soup.select(".ul_Address"):
            for li in ul.select("li"):
                a = li.select("a")[0]
                book.addDownLoadUrl(DownLoadInfo(a['href'], a.text))
        return book


if __name__ == '__main__':
    p = PageFetch("list152_1.html")
    shortPageList = p.getBookPageList()
    downPage = []
    for page in shortPageList:
        downPage = downPage + PageFetch.getDownloadPage(page)
    print("================汇总如下===============================")
    for bookDownLoadPage in downPage:
        book = PageFetch.getBookInfo(bookDownLoadPage)
        print(book.bookName + ":%s" % book.downLoadUrl)
        for d in book.downLoadInfos:
            print("%s-%s" % (d.downUrl, d.downName))
执行文件:51Job.py(将以上各文件复制到同一文件夹下,执行此文件即可)
from FiveOneJobFetch import PageFetch
from bookInfo import Book
from bookInfo import DownLoadInfo
from bookOpe import BookOperator


def main(url):
    """Crawl one book-list entry page and store every book it reaches."""
    fetcher = PageFetch(url)
    operator = BookOperator()
    # First collect every book detail page across all list pages...
    detailPages = []
    for listPage in fetcher.getBookPageList():
        detailPages.extend(PageFetch.getDownloadPage(listPage))
    # ...then scrape and persist each book.
    for detailPage in detailPages:
        operator.addBookInfo(PageFetch.getBookInfo(detailPage))
    print("数据抓取成功:" + url)


if __name__ == '__main__':
    urls = ["list152_35.html", "list300_2.html", "list476_6.html",
            "list977_2.html", "list572_5.html", "list509_2.html",
            "list481_1.html", "list576_1.html", "list482_1.html",
            "list483_1.html", "list484_1.html"]
    for url in urls:
        main(url)
数据库表:书籍信息表和下载地址表
-- Book master table: one row per scraped e-book.
CREATE TABLE `book` (
    `id` INT(11) NOT NULL AUTO_INCREMENT,
    `bookName` VARCHAR(200) NULL DEFAULT NULL,  -- book title
    `bookUrl` VARCHAR(500) NULL DEFAULT NULL,   -- detail-page URL
    `bookInfo` TEXT NULL,                       -- scraped description
    PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=2936;
-- Download mirrors: many rows per book, linked by bookId (no FK declared).
CREATE TABLE `book_down_url` (
    `id` INT(11) NOT NULL AUTO_INCREMENT,
    `bookId` INT(11) NOT NULL DEFAULT '0',         -- book.id of the owner
    `downName` VARCHAR(200) NOT NULL DEFAULT '0',  -- mirror display name
    `downUrl` VARCHAR(2000) NOT NULL DEFAULT '0',  -- download link
    PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=44441;
git地址:https://git.oschina.net/yangsj/BookFetch/tree/master