Python爬取读者并制作成PDF
学了下 BeautifulSoup 后,做了个网络爬虫,爬取《读者》杂志并用 ReportLab 制作成 PDF。
crawler.py
#!/usr/bin/env python
#coding=utf-8
"""
Author: Anemone
Filename: crawler.py
Last modified: 2015-02-19 16:47
E-mail: anemone@82flex.com
"""
importurllib2
frombs4importBeautifulSoup
importre
importsys
reload(sys)
sys.setdefaultencoding('utf-8')
def getEachArticle(url):
    """Download one article page and extract its metadata and body text.

    Args:
        url: URL of the article page to fetch.

    Returns:
        A dict with four keys:
            "title":   string content of the first <h1> element.
            "writer":  stripped string of the element with id "pub_date".
            "from":    stripped string of the element with id "media_name".
            "context": page text found after the first "BAIDU_CLB...;"
                       marker; the whole page text when the marker is absent.

    Raises:
        AttributeError: if the <h1>, "pub_date" or "media_name" element is
            missing, or if one of the latter two has no single string child.
        Whatever urllib2.urlopen raises for network/HTTP failures.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()

    soup = BeautifulSoup(html)
    title = soup.find("h1").string
    writer = soup.find(id="pub_date").string.strip()
    _from = soup.find(id="media_name").string.strip()

    text = soup.get_text()
    pieces = re.split(r"BAIDU_CLB.*;", text)
    # Without the marker there is only one piece; fall back to the full text
    # instead of raising IndexError on pieces[1].
    body = pieces[1] if len(pieces) > 1 else text

    return {"title": title, "writer": writer, "from": _from, "context": body}
def _fetchSoup(page_url):
    """Fetch page_url and return it parsed by BeautifulSoup, closing the response."""
    response = urllib2.urlopen(page_url)
    try:
        return BeautifulSoup(response.read())
    finally:
        response.close()


def _fetchArticleWithRetry(href, max_retries):
    """Call getEachArticle(href) up to max_retries times; return None if all fail."""
    last_error = None
    for _ in range(max(1, max_retries)):
        try:
            return getEachArticle(href)
        except Exception as err:  # best effort: network and parse errors alike
            last_error = err
    print("Skipping {0}: {1}".format(href, last_error))
    return None


def getCatalog(issue, max_retries=3):
    """Crawl every article of one issue, grouped by column.

    Args:
        issue: Issue identifier string such as "201424". The first four
            characters and the last two characters are joined with "_" to
            form the issue's directory on www.52duzhe.com.
        max_retries: How many times each article download is attempted
            before the article is skipped. Values below 1 are treated as 1.

    Returns:
        An OrderedDict (a dict subclass) mapping each <h2> heading string of
        the catalog page to the list of article dicts produced by
        getEachArticle, in page order. Articles that still fail after
        max_retries attempts are left out.

    Raises:
        Whatever urllib2.urlopen raises if the index or catalog page cannot
        be fetched, and AttributeError if the index page has no table link.
    """
    from collections import OrderedDict

    url = "http://www.52duzhe.com/" + issue[:4] + "_" + issue[-2:] + "/"

    # The catalog is read from the page that the first link inside the first
    # table of index.html points to, not from index.html itself.
    index_soup = _fetchSoup(url + "index.html")
    catalog_soup = _fetchSoup(url + index_soup.table.a.get("href"))

    # OrderedDict keeps the columns in the order they appear on the page.
    duzhe = OrderedDict()
    for heading in catalog_soup.find_all("h2"):
        print(heading.string)
        articles = duzhe[heading.string] = list()
        for link in heading.parent.find_all("a"):
            target = link.get("href")
            if not target:
                # An anchor without href cannot be turned into an article URL.
                continue
            href = url + target
            print(href)
            article = _fetchArticleWithRetry(href, max_retries)
            if article is not None:
                articles.append(article)
    return duzhe
def readDuZhe(duzhe):
    """Print the title of every article in the crawled issue.

    Args:
        duzhe: Mapping of column name to a list of article dicts, each
            holding a "title" entry.
    """
    for articles in duzhe.values():
        for article in articles:
            print(article["title"])
if __name__ == "__main__":
    # Script entry: crawl the hard-coded issue "201424" and list its titles.
    catalog = getCatalog("201424")
    readDuZhe(catalog)
getpdf.py
#!/usr/bin/env python
#coding=utf-8
"""
Author: Anemone
Filename: getpdf.py
Last modified: 2015-02-20 19:19
E-mail: anemone@82flex.com
"""
#coding=utf-8
importreportlab.rl_config
fromreportlab.pdfbaseimportpdfmetrics
fromreportlab.pdfbase.ttfontsimportTTFont
fromreportlab.libimportfonts
importcopy
fromreportlab.platypusimportParagraph,SimpleDocTemplate,flowables
fromreportlab.lib.stylesimportgetSampleStyleSheet
importcrawler
def _makeStyle(base, fontSize, leading, firstLineIndent=None):
    """Return a copy of base using the 'song' font with the given metrics."""
    style = copy.deepcopy(base)
    style.fontName = 'song'
    style.fontSize = fontSize
    style.leading = leading
    if firstLineIndent is not None:
        style.firstLineIndent = firstLineIndent
    return style


def _markupSafe(text):
    """Escape &, < and > so scraped text is not parsed as Paragraph markup."""
    from xml.sax.saxutils import escape
    return escape(text) if text else ""


def writePDF(issue, duzhe):
    """Render a crawled issue to the file "duzhe<issue>.pdf".

    The document starts with a table of contents (one section per column,
    listing its article titles), followed by every article on its own page:
    title, a small "writer from" line, then the body paragraphs.

    Args:
        issue: Issue identifier string; used in the heading and file name.
        duzhe: Mapping of column name to a list of article dicts with the
            keys "title", "writer", "from" and "context".

    Notes:
        Registers the TrueType fonts "simsun.ttc" and "msyh.ttc"; presumably
        both must be resolvable by ReportLab's font lookup -- TODO confirm
        on non-Windows hosts.
    """
    reportlab.rl_config.warnOnMissingFontGlyphs = 0
    pdfmetrics.registerFont(TTFont('song', "simsun.ttc"))
    pdfmetrics.registerFont(TTFont('hei', "msyh.ttc"))
    # Map the (bold, italic) variants of 'song': bold text uses 'hei'.
    fonts.addMapping('song', 0, 0, 'song')
    fonts.addMapping('song', 0, 1, 'song')
    fonts.addMapping('song', 1, 0, 'hei')
    fonts.addMapping('song', 1, 1, 'hei')

    base = getSampleStyleSheet()['Normal']
    normalStyle = _makeStyle(base, 11, 11, firstLineIndent=20)
    titleStyle = _makeStyle(base, 15, 20)
    firstTitleStyle = _makeStyle(base, 20, 20, firstLineIndent=50)
    smallStyle = _makeStyle(base, 8, 8)

    story = []
    story.append(Paragraph("<b>读者{0}期</b>".format(issue), firstTitleStyle))

    # Table of contents.
    for eachColumn in duzhe:
        story.append(Paragraph('__' * 28, titleStyle))
        story.append(Paragraph('<b>{0}</b>'.format(_markupSafe(eachColumn)), titleStyle))
        for eachArticle in duzhe[eachColumn]:
            story.append(Paragraph(_markupSafe(eachArticle["title"]), normalStyle))
    story.append(flowables.PageBreak())

    # Article bodies, one article per page.
    for eachColumn in duzhe:
        for eachArticle in duzhe[eachColumn]:
            story.append(Paragraph("<b>{0}</b>".format(_markupSafe(eachArticle["title"])), titleStyle))
            story.append(Paragraph("{0} {1}".format(_markupSafe(eachArticle["writer"]),
                                                    _markupSafe(eachArticle["from"])), smallStyle))
            # str.split("") raises ValueError (empty separator), so split the
            # body on line boundaries and drop blank lines instead.
            for eachPara in (eachArticle["context"] or "").splitlines():
                eachPara = eachPara.strip()
                if eachPara:
                    story.append(Paragraph(_markupSafe(eachPara), normalStyle))
            story.append(flowables.PageBreak())

    doc = SimpleDocTemplate("duzhe" + issue + ".pdf")
    print("WritingPDF...")
    doc.build(story)
def main(issue):
    """Crawl the given issue via crawler.getCatalog and write it out as a PDF.

    Args:
        issue: Issue identifier string passed to both the crawler and writePDF.
    """
    writePDF(issue, crawler.getCatalog(issue))
if __name__ == "__main__":
    # Prompt for the issue identifier and hand it straight to main().
    main(raw_input("Enterissue(201501):"))
以上就是本文的全部内容了,希望大家能够喜欢。