Python 实现的一个火车票转让信息采集器
好吧,我承认我是对晚上看到一张合适的票转让但打过电话去说已经被搞走了这件事情感到蛋疼。直接上文件吧。
# -*- coding: utf-8 -*-
"""
Poll second-hand listing sites for train tickets being transferred during
the Spring Festival travel rush and e-mail any new sleeper-berth offers.

Author: piglei2007@gmail.com
Date: 2011.01.25

Written for Python 2 (urllib2 / urlparse / BeautifulSoup 3).
"""
import datetime
import io
import os
import re
import socket
import time
import traceback
import urllib2
import urlparse

from BeautifulSoup import BeautifulSoup

# Never let a hung HTTP connection block the (cron-driven) run forever.
socket.setdefaulttimeout(20)

# Collapses every run of whitespace inside a listing title.
BLANK_RE = re.compile(r"\s+")

# Install a global opener that keeps cookies and sends browser-like headers.
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
opener.addheaders = [
    ("User-agent",
     "Mozilla/5.0 (X11; U; FreeBSD i386; en-US; rv:1.9.1) "
     "Gecko/20090704 Firefox/3.5"),
    ("Accept", "*/*"),
]
urllib2.install_opener(opener)

# Listing-page URL templates, keyed by the source name that parse_content()
# understands. Filled with ``train`` and ``date`` via %-formatting.
SOURCE = {
    "58": "http://bj.58.com/huochepiao/?Num=%(train)s&StartTime=%(date)s00",
    "ganji": "http://bj.ganji.com/piao/cc_%(train)s/%(date)s/",
}

# One already-reported listing URL per line, stored as UTF-8.
RECORD_FILE = "/tmp/ticket_records.txt"


def _to_unicode(value):
    """Return *value* as ``unicode``, decoding byte strings as UTF-8."""
    if isinstance(value, unicode):
        return value
    return value.decode("utf-8", "replace")


def parse_record():
    """Load the set of listing URLs that were already reported.

    Returns:
        A set of unicode URLs read from RECORD_FILE. If the file cannot be
        opened it is created empty and an empty set is returned.
    """
    try:
        with io.open(RECORD_FILE, "r", encoding="utf-8",
                     errors="replace") as record_file:
            stripped = (line.strip() for line in record_file)
            return set(line for line in stripped if line)
    except IOError:
        # First run (or unreadable file): create it so later runs find it.
        with io.open(RECORD_FILE, "w", encoding="utf-8"):
            pass
        return set()


def flush_record(records):
    """Overwrite RECORD_FILE with *records*, one URL per line.

    Args:
        records: iterable of URLs (byte strings or unicode).
    """
    lines = sorted(_to_unicode(record) for record in records)
    with io.open(RECORD_FILE, "w", encoding="utf-8") as record_file:
        record_file.write(u"\n".join(lines))


def _fetch(url):
    """Download *url* and return the raw response body, closing the handle."""
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def main(config):
    """Scrape every configured train/date on every source and mail new hits.

    Args:
        config: dict with the keys ``"trains"`` (train codes), ``"dates"``
            (``YYYYMMDD`` strings) and ``"people"`` (recipient addresses).

    Only listings whose text contains u"卧" (sleeper berth) and whose URL is
    not yet in the record file are reported. The record file is rewritten
    only after the mail was sent, so a failed send is retried on the next run.
    """
    existed = parse_record()
    to_email = []
    for train in config["trains"]:
        for date in config["dates"]:
            for source, url_template in SOURCE.items():
                page_url = url_template % dict(train=train, date=date)
                try:
                    content = _fetch(page_url)
                except (urllib2.URLError, socket.error):
                    # One unreachable site must not prevent the remaining
                    # sources/dates from being checked in this run.
                    traceback.print_exc()
                    continue
                soup = BeautifulSoup(content)
                for href, text in parse_content(source, soup, train):
                    # Resolve against the real page URL, not the template.
                    link = urlparse.urljoin(page_url, href)
                    # Sleeper berths only!
                    if link not in existed and u"卧" in text:
                        to_email.append([text, link])
                        existed.add(link)
    if to_email:
        # The mail is sent as HTML, so separate the hits with line breaks.
        content = u"<br />\n".join(
            u"|".join(entry) for entry in to_email
        ).encode("utf-8")
        simple_mail(config["people"], content)
    flush_record(existed)


def parse_content(type, soup, train):
    """Extract the ticket listings for *train* from a parsed listing page.

    Args:
        type: source name, ``"58"`` or ``"ganji"`` (a key of SOURCE). The
            parameter name shadows the builtin but is kept for callers.
        soup: BeautifulSoup document of the listing page.
        train: train code, e.g. ``"k471"``.

    Returns:
        A list of ``[href, text]`` pairs; empty for an unknown source or a
        page without matching listings. Entries without a link are skipped.
    """
    result = []
    if type == "58":
        info_table = soup.find("table", id="infolist")
        if info_table is not None:
            # Match the train code unless it is followed by "时刻表"
            # (timetable), which marks a schedule link, not a listing.
            pattern = re.compile(u"%s(?!时刻表)" % re.escape(train), re.I)
            for node in info_table.findAll("tr", text=pattern):
                link = node.parent
                href = link.get("href") if link is not None else None
                if not href:
                    continue
                result.append([href, BLANK_RE.sub("", link.text)])
    elif type == "ganji":
        for item in soup.findAll("dl", {"class": "list_piao"}):
            title = item.dt
            link = title.a if title is not None else None
            href = link.get("href") if link is not None else None
            if not href:
                continue
            result.append([href, link.text])
    return result


# SMTP account used for the notification mail. These are placeholders:
# do not commit real credentials to source control.
EMAIL_HOST = 'smtp.sohu.com'
EMAIL_HOST_USER = 'yourname@sohu.com'
EMAIL_HOST_PASSWORD = 'yourpassword'
EMAIL_PORT = 25


def simple_mail(to, content):
    """Send *content* as a UTF-8 HTML mail to every address in *to*.

    Args:
        to: sequence of recipient addresses.
        content: UTF-8 encoded HTML body.

    Raises:
        smtplib.SMTPException or socket.error if connecting, logging in or
        sending fails; the connection is closed in either case.
    """
    import smtplib
    from email.header import Header
    from email.mime.text import MIMEText

    subject = "[%s]有票来啦!!!!" % datetime.datetime.today().isoformat(" ")
    message = MIMEText(content, 'html', 'UTF-8')
    # The subject is not ASCII, so it has to be an RFC 2047 encoded header.
    message['Subject'] = Header(subject, 'UTF-8')
    message['From'] = EMAIL_HOST_USER
    message['To'] = ",".join(to)

    server = smtplib.SMTP(EMAIL_HOST, EMAIL_PORT)
    try:
        server.login(EMAIL_HOST_USER, EMAIL_HOST_PASSWORD)
        server.sendmail(EMAIL_HOST_USER, to, message.as_string())
    finally:
        server.close()


def switch_time_zone():
    """Make the process use the Asia/Shanghai time zone for local times."""
    os.environ["TZ"] = "Asia/Shanghai"
    # time.tzset() only exists on Unix.
    if hasattr(time, "tzset"):
        time.tzset()


switch_time_zone()


if __name__ == '__main__':
    config = {
        "trains": ("k471",),
        "dates": ("20110129",),
        "people": (
            "youremail@sohu.com",
        ),
    }
    try:
        main(config)
        print("%s:ok" % datetime.datetime.today())
    except Exception:
        # Top-level boundary of a cron job: report the failure and exit.
        print(traceback.format_exc())
然后放入cron,你懂的。