Python爬取读者并制作成PDF
学了下 BeautifulSoup 后,做了个网络爬虫,爬取《读者》杂志并用 reportlab 制作成 PDF。
crawler.py
#!/usr/bin/env python
#coding=utf-8
"""
   Author:        Anemone
   Filename:      getmain.py
   Lastmodified: 2015-02-19 16:47
   E-mail:        anemone@82flex.com
"""
import re
import sys
import urllib2

from bs4 import BeautifulSoup

# Python 2 only: site.py removes sys.setdefaultencoding at start-up, and
# reload(sys) brings it back so the process-wide default encoding can be
# switched to UTF-8.
# NOTE(review): implicit str/unicode conversions elsewhere in this script
# presumably depend on this default -- do not remove without checking.
reload(sys)
sys.setdefaultencoding('utf-8')
def _tagText(tag, strip=False):
    """Return the text inside *tag*, or an empty string when *tag* is None."""
    if tag is None:
        return u""
    # get_text() also copes with nested markup, where .string would be None.
    return tag.get_text(strip=strip)


def getEachArticle(url):
    """Download one article page and extract its metadata and body text.

    Args:
        url: Address of the article page, e.g.
            'http://www.52duzhe.com/2015_01/duzh20150104.html'.

    Returns:
        A dict with the keys "title", "writer", "from" and "context".
        "title" is the text of the first <h1>; "writer" and "from" are the
        stripped texts of the elements with id "pub_date" and "media_name".
        "context" is the piece of page text that follows the first
        "BAIDU_CLB...;" marker (up to the next such marker, if any).
        A missing element yields an empty string, and when the marker is
        absent the whole page text is returned as "context".

    Raises:
        Whatever urllib2.urlopen() / read() raise on network failure.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()

    soup = BeautifulSoup(html)
    title = _tagText(soup.find("h1"))
    writer = _tagText(soup.find(id="pub_date"), strip=True)
    _from = _tagText(soup.find(id="media_name"), strip=True)

    text = soup.get_text()
    pieces = re.split("BAIDU_CLB.*;", text)
    # Without the marker re.split() returns a single piece; fall back to the
    # full text instead of failing with IndexError.
    context = pieces[1] if len(pieces) > 1 else text

    return {"title": title, "writer": writer, "from": _from,
            "context": context}
def _fetchSoup(url):
    """Download *url* and return the page parsed with BeautifulSoup."""
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
    return BeautifulSoup(html)


def _fetchArticle(href, maxRetries):
    """Call getEachArticle(href) up to *maxRetries* times; None if all fail."""
    for _attempt in range(maxRetries):
        try:
            return getEachArticle(href)
        except Exception:
            # Exception rather than a bare except, so Ctrl-C still aborts.
            continue
    return None


def getCatalog(issue, maxRetries=5):
    """Crawl every article of one issue, grouped by column heading.

    The issue's index.html is fetched first; the first link inside its
    first <table> is followed, and on that page every <h2> is treated as a
    column heading whose parent element holds the links to its articles.

    Args:
        issue: Issue identifier such as "201424". Its first four characters
            and last two characters form the URL directory ("2014_24").
        maxRetries: How many times each article download is attempted
            before the article is skipped. Values below 1 skip every
            article.

    Returns:
        A dict mapping each <h2> text to the list of article dicts returned
        by getEachArticle() for the links under that heading. Articles that
        still fail after *maxRetries* attempts are left out.

    Raises:
        Whatever urllib2 raises when one of the two catalogue pages cannot
        be downloaded.
    """
    url = "http://www.52duzhe.com/" + issue[:4] + "_" + issue[-2:] + "/"
    indexSoup = _fetchSoup(url + "index.html")
    contentsSoup = _fetchSoup(url + indexSoup.table.a.get("href"))

    duzhe = dict()
    for heading in contentsSoup.find_all("h2"):
        print(heading.string)
        duzhe[heading.string] = list()
        for link in heading.parent.find_all("a"):
            target = link.get("href")
            if not target:
                # An anchor without href cannot be turned into a URL.
                continue
            href = url + target
            print(href)
            article = _fetchArticle(href, maxRetries)
            if article is None:
                print("Skipped after repeated failures: " + href)
                continue
            duzhe[heading.string].append(article)
    return duzhe
def readDuZhe(duzhe):
    """Print the title of every article, one per line, column by column.

    Args:
        duzhe: Mapping of column name to a list of article dicts, each
            carrying a "title" key.
    """
    for articles in duzhe.values():
        for article in articles:
            # Single-argument print(...) parses on Python 2 and Python 3.
            print(article["title"])
if __name__ == '__main__':
    # Demo run against a fixed issue; use raw_input("issue(201501):") here
    # instead to pick the issue interactively.
    readDuZhe(getCatalog("201424"))
getpdf.py
#!/usr/bin/env python
#coding=utf-8
"""
   Author:        Anemone
   Filename:      writetopdf.py
   Lastmodified: 2015-02-20 19:19
   E-mail:        anemone@82flex.com
"""
#coding=utf-8
import copy

import reportlab.rl_config
from reportlab.lib import fonts
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.platypus import Paragraph, SimpleDocTemplate, flowables

import crawler
def _cloneStyle(base, **overrides):
    """Return a deep copy of *base* with the given attributes replaced."""
    style = copy.deepcopy(base)
    for attr, value in overrides.items():
        setattr(style, attr, value)
    return style


def writePDF(issue, duzhe, songFont="simsun.ttc", heiFont="msyh.ttc"):
    """Render the crawled issue into the file "duzhe<issue>.pdf".

    The PDF starts with a contents page (column headings followed by their
    article titles), then one section per article, each ending in a page
    break.

    Args:
        issue: Issue identifier; used in the heading and the file name.
        duzhe: Mapping of column name to a list of article dicts with the
            keys "title", "writer", "from" and "context".
        songFont: TrueType font file registered as 'song' (regular text).
        heiFont: TrueType font file registered as 'hei' (bold text).
    """
    # Imported here because no other part of the file needs it.
    from xml.sax.saxutils import escape

    def plain(text):
        # Paragraph parses its text as XML-style markup, so scraped text
        # must have &, < and > escaped. None becomes an empty string.
        return escape(text) if text else ""

    reportlab.rl_config.warnOnMissingFontGlyphs = 0
    pdfmetrics.registerFont(TTFont('song', songFont))
    pdfmetrics.registerFont(TTFont('hei', heiFont))
    # addMapping(family, bold, italic, fontName): within the 'song' family
    # the bold variants are drawn with 'hei', the rest with 'song'.
    for bold, italic, face in ((0, 0, 'song'), (0, 1, 'song'),
                               (1, 0, 'hei'), (1, 1, 'hei')):
        fonts.addMapping('song', bold, italic, face)

    base = getSampleStyleSheet()['Normal']
    normalStyle = _cloneStyle(base, fontName='song', fontSize=11,
                              leading=11, firstLineIndent=20)
    titleStyle = _cloneStyle(base, fontName='song', fontSize=15, leading=20)
    firstTitleStyle = _cloneStyle(base, fontName='song', fontSize=20,
                                  leading=20, firstLineIndent=50)
    smallStyle = _cloneStyle(base, fontName='song', fontSize=8, leading=8)

    story = [Paragraph("<b>读者{0}期</b>".format(plain(issue)),
                       firstTitleStyle)]

    # Contents page: a ruler line, the column heading, then its titles.
    for column in duzhe:
        story.append(Paragraph('__' * 28, titleStyle))
        story.append(Paragraph('<b>{0}</b>'.format(plain(column)),
                               titleStyle))
        for article in duzhe[column]:
            story.append(Paragraph(plain(article["title"]), normalStyle))
    story.append(flowables.PageBreak())

    # Article pages.
    for column in duzhe:
        for article in duzhe[column]:
            story.append(Paragraph(
                "<b>{0}</b>".format(plain(article["title"])), titleStyle))
            story.append(Paragraph(
                "{0} {1}".format(plain(article["writer"]),
                                 plain(article["from"])), smallStyle))
            # NOTE(review): the original called split("") here, which raises
            # ValueError (empty separator). Splitting on line breaks is
            # assumed to be the intent -- confirm. Blank lines are dropped.
            for line in (article["context"] or "").splitlines():
                if line.strip():
                    story.append(Paragraph(plain(line), normalStyle))
            story.append(flowables.PageBreak())

    doc = SimpleDocTemplate("duzhe" + issue + ".pdf")
    print("WritingPDF...")
    doc.build(story)
def main(issue):
    """Crawl one issue and write it out as a PDF.

    Args:
        issue: Issue identifier such as "201501"; handed unchanged to
            crawler.getCatalog() and writePDF().
    """
    writePDF(issue, crawler.getCatalog(issue))
if __name__ == '__main__':
    # Stray whitespace around the typed issue would end up in the URL and
    # in the output file name, so trim it.
    main(raw_input("Enterissue(201501):").strip())
以上就是本文的全部内容了,希望大家能够喜欢。
