python爬取本站电子书信息并入库的实现代码
入门级爬虫:只抓取书籍名称,信息及下载地址并存储到数据库
数据库工具类:DBUtil.py
importpymysql
class DBUtils(object):
    """Small helper around a pymysql connection/cursor pair."""

    def connDB(self):
        """Open a new connection and cursor; return ``(conn, cur)``.

        NOTE(review): credentials are hard-coded here -- move them to
        configuration / environment variables.
        """
        conn = pymysql.connect(host='192.168.251.114', port=3306, user='root',
                               passwd='b6f3g2', db='yangsj', charset='utf8')
        cur = conn.cursor()
        return (conn, cur)

    def exeUpdate(self, conn, cur, sql):
        """Execute an INSERT/UPDATE statement, commit, return the row count."""
        sta = cur.execute(sql)
        conn.commit()
        return sta

    def exeDelete(self, conn, cur, IDs):
        """Delete `students` rows by id; *IDs* is a space-separated id string.

        Not used by the demo.
        """
        sta = 0
        # BUG FIX: the pasted code had IDs.split('') which raises ValueError
        # (empty separator); restored to a space separator -- presumably the
        # space was lost in the paste, TODO confirm the expected delimiter.
        for eachID in IDs.split(' '):
            # Parameterized instead of %-interpolation (SQL-injection safe).
            sta += cur.execute("delete from students where Id=%s", (int(eachID),))
        conn.commit()
        return sta

    def exeQuery(self, cur, sql):
        """Execute a SELECT; return ``(row_count, cursor)`` for fetching."""
        effect_row = cur.execute(sql)
        return (effect_row, cur)

    def connClose(self, conn, cur):
        """Release the cursor, then the connection."""
        cur.close()
        conn.close()
if __name__ == '__main__':
    # Connectivity smoke test when run as a script.
    dbUtil = DBUtils()
    conn, cur = dbUtil.connDB()
    # BUG FIX: the connection was previously opened and never closed (leak).
    dbUtil.connClose(conn, cur)
书籍操作文件bookOpe.py
fromDBUtilimportDBUtils
frombookInfoimportBook
frombookInfoimportDownLoadInfo
importlogging
logging.basicConfig(
level=logging.INFO
)
class BookOperator(object):
    """Persists a Book and its download links through DBUtils/pymysql."""

    def __addBook(self, book):
        """Insert one `book` row (name, page url, description)."""
        logging.info("add book:%s", book.bookName)
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        # SECURITY FIX: the original interpolated scraped text straight into
        # the SQL string (injection / quote-breakage risk); parameterize.
        cur.execute(
            "insert into book (bookName,bookUrl,bookInfo) values (%s,%s,%s);",
            (book.bookName, book.downLoadUrl, book.mainInfo),
        )
        conn.commit()
        dbUtil.connClose(conn, cur)

    def __selectLastBookId(self):
        """Return the highest `book.id` -- assumed to be the row just inserted.

        NOTE(review): racy under concurrent writers; cursor.lastrowid right
        after the insert would be safer.
        """
        logging.info("selectLastBookId")
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        selectLastBookSql = "select id from book order by id desc limit 1"
        effect_row, cur = dbUtil.exeQuery(cur, selectLastBookSql)
        bookId = cur.fetchone()[0]
        dbUtil.connClose(conn, cur)
        return bookId

    def __addBookDownLoadInfos(self, downLoadInfos, bookId):
        """Insert one `book_down_url` row per DownLoadInfo; commit once."""
        logging.info("add bookId:%s", bookId)
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        for downLoadInfo in downLoadInfos:
            # Parameterized for the same injection-safety reason as __addBook.
            cur.execute(
                "insert into book_down_url (bookId,downName,downUrl) values (%s,%s,%s);",
                (bookId, downLoadInfo.downName, downLoadInfo.downUrl),
            )
        conn.commit()
        dbUtil.connClose(conn, cur)

    def addBookInfo(self, book):
        """Public entry point: store *book*, then all of its download links."""
        logging.info("add bookInfo:%s", book.bookName)
        self.__addBook(book)
        bookId = self.__selectLastBookId()
        self.__addBookDownLoadInfos(book.downLoadInfos, bookId)
if __name__ == '__main__':
    # Manual smoke test: persist one dummy book with a single download link.
    operator = BookOperator()
    sample = Book("aaa", "yang", "cccc")
    sample.addDownLoadUrl(DownLoadInfo("aaa.html", "书籍"))
    operator.addBookInfo(sample)
书籍信息文件bookInfo.py
importsys
sys.encoding="utf8"
class Book(object):
    """A scraped book: description text, source page URL, title, and links."""

    def __init__(self, mainInfo, downLoadUrl, bookName):
        self.mainInfo = mainInfo        # description scraped from the page
        self.downLoadUrl = downLoadUrl  # URL of the detail page it came from
        self.bookName = bookName        # title
        self.downLoadInfos = []         # accumulated DownLoadInfo entries

    def addDownLoadUrl(self, downloadInfo):
        """Attach one DownLoadInfo to this book."""
        self.downLoadInfos.append(downloadInfo)

    def print_book_info(self):
        """Dump the title to stdout."""
        print("bookName:%s" % (self.bookName))
class DownLoadInfo(object):
    """One named download link belonging to a book."""

    def __init__(self, downUrl, downName):
        self.downUrl = downUrl    # target of the link
        self.downName = downName  # link text / mirror name

    def print_down_info(self):
        """Dump this link to stdout."""
        print("downLoad%s-%s" % (self.downUrl, self.downName))
51job界面解析文件FiveOneJobFetch.py
importrequests
frombs4importBeautifulSoup
importsys
frombookInfoimportBook
frombookInfoimportDownLoadInfo
importlogging
sys.encoding="utf8"
class PageFetch(object):
    """Scraper for book listing/detail pages on www.nhooo.com.

    NOTE(review): the original host was scheme-relative ("//www.nhooo.com/"),
    which ``requests`` cannot fetch (MissingSchema); "https:" was added --
    confirm the scheme against the live site.
    """

    host = "https://www.nhooo.com/"  # site root (scheme + domain)
    category = "books/"              # category path segment

    def __init__(self, pageUrl):
        """*pageUrl* is the listing file name, e.g. ``"list45_2.html"``."""
        self.pageUrl = pageUrl
        self.url = PageFetch.host + PageFetch.category + pageUrl  # full URL

    def __getPageContent(self):
        """Fetch this instance's own page; delegates to the static helper."""
        return PageFetch.getPageContent(self.url)

    @staticmethod
    def getPageContent(url):
        """GET *url*; return its body decoded as gb2312, or "" on non-200."""
        req = requests.get(url)
        if req.status_code == 200:
            # The site serves gb2312; requests' guess would mojibake the text.
            req.encoding = "gb2312"
            return req.text
        return ""

    def __getMaxPageNumAndUrl(self):
        """Follow the pager to the end; return ``(maxPageNum, maxLink)``.

        Pager links look like ``list45_2.html`` where 2 is the page number.
        (Debug prints from the original were removed.)
        """
        fetchUrl = self.pageUrl
        maxPageNum = 0
        maxLink = ""
        while maxLink == "":
            url = PageFetch.host + PageFetch.category + fetchUrl
            reqContent = PageFetch.getPageContent(url)
            soup = BeautifulSoup(reqContent, "html.parser")
            for ul in soup.select(".plist"):
                # <strong> inside the pager holds the total page count.
                maxPageNum = ul.select("strong")[0].text
                alink = ul.select("a")
                if alink[-1]['href'] == "#":
                    # "next" is inert: we are on the last page.
                    maxLink = alink[1]['href']
                else:
                    fetchUrl = alink[-1]['href']
        return maxPageNum, maxLink

    def __formatPage(self, pageNum):
        """Return the file name of page *pageNum*+1, e.g. 0 -> "list45_1.html"."""
        lineBeginSite = self.pageUrl.index("_") + 1
        docBeginSite = self.pageUrl.index(".")
        return self.pageUrl[:lineBeginSite] + str(pageNum + 1) + self.pageUrl[docBeginSite:]

    def getBookPageList(self):
        """Return the full URL of every listing page in this category."""
        maxPageNum, _urlPattern = self.__getMaxPageNumAndUrl()
        return [self.host + self.category + self.__formatPage(i)
                for i in range(int(maxPageNum))]

    @staticmethod
    def getDownloadPage(url):
        """Return detail-page URLs for every book on listing page *url*."""
        downPage = []
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        # NOTE(review): selector reconstructed as a descendant selector
        # ".cur-cat-list .btn-dl" (spacing was lost in the paste) -- confirm
        # against the live markup.
        for a in soup.select(".cur-cat-list .btn-dl"):
            downPage.append(PageFetch.host + a['href'])
        return downPage

    @staticmethod
    def getBookInfo(url):
        """Scrape one detail page into a Book with its DownLoadInfo entries."""
        logging.info("获取书籍信息url:%s", url)
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        # Strip the "截图:" label and single quotes from the description.
        mainInfo = soup.select("#soft-intro")[0].text.replace("截图:", "").replace("'", "")
        # NOTE(review): reconstructed as descendant selector "dl dt h1" -- confirm.
        title = soup.select("dl dt h1")[0].text.replace("'", "")
        book = Book(mainInfo, url, title)
        for ul in soup.select(".ul_Address"):
            for li in ul.select("li"):
                anchor = li.select("a")[0]
                book.addDownLoadUrl(DownLoadInfo(anchor['href'], anchor.text))
        return book
if __name__ == '__main__':
    # Manual run: crawl one category, then print every book and its links.
    fetcher = PageFetch("list152_1.html")
    listingPages = fetcher.getBookPageList()
    detailPages = []
    for listingPage in listingPages:
        detailPages = detailPages + PageFetch.getDownloadPage(listingPage)
    print("================汇总如下===============================")
    for detailPage in detailPages:
        book = PageFetch.getBookInfo(detailPage)
        print(book.bookName + ":%s" % book.downLoadUrl)
        for info in book.downLoadInfos:
            print("%s-%s" % (info.downUrl, info.downName))
    # p = PageFetch("list977_1.html")
    # p = p.getMaxPageNumAndUrl()
    # print(p)
执行文件 51Job.py:将以上文件复制到同一个文件夹下,然后执行此文件即可
fromFiveOneJobFetchimportPageFetch
frombookInfoimportBook
frombookInfoimportDownLoadInfo
frombookOpeimportBookOperator
def main(url):
    """Crawl one category (starting at *url*) and persist every book found."""
    fetcher = PageFetch(url)
    bookOperator = BookOperator()
    # Collect every detail-page URL across all listing pages first.
    detailPages = []
    for listingPage in fetcher.getBookPageList():
        detailPages.extend(PageFetch.getDownloadPage(listingPage))
    # Then scrape and store each book.
    for detailPage in detailPages:
        bookOperator.addBookInfo(PageFetch.getBookInfo(detailPage))
    print("数据抓取成功:" + url)
if __name__ == '__main__':
    # Category start pages to crawl, one main() run each.
    urls = [
        "list152_35.html", "list300_2.html", "list476_6.html",
        "list977_2.html", "list572_5.html", "list509_2.html",
        "list481_1.html", "list576_1.html", "list482_1.html",
        "list483_1.html", "list484_1.html",
    ]
    for url in urls:
        main(url)
数据库表:书籍信息表和下载地址表
-- Book master table (whitespace restored: the pasted DDL had all spaces
-- stripped and would not parse).
CREATE TABLE `book` (
  `id` INT(11) NOT NULL AUTO_INCREMENT,
  `bookName` VARCHAR(200) NULL DEFAULT NULL,
  `bookUrl` VARCHAR(500) NULL DEFAULT NULL,
  `bookInfo` TEXT NULL,
  PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=2936;
-- Download-link table, one row per mirror/link of a book (whitespace
-- restored: the pasted DDL had all spaces stripped and would not parse).
CREATE TABLE `book_down_url` (
  `id` INT(11) NOT NULL AUTO_INCREMENT,
  `bookId` INT(11) NOT NULL DEFAULT '0',
  `downName` VARCHAR(200) NOT NULL DEFAULT '0',
  `downUrl` VARCHAR(2000) NOT NULL DEFAULT '0',
  PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=44441;
git地址:https://git.oschina.net/yangsj/BookFetch/tree/master