Python制作爬虫采集小说
开发工具:python3.4
操作系统:win8
主要功能:到指定的小说网页爬取小说目录,按章节保存到本地,并将已爬过的网页地址保存到本地配置文件,避免重复下载。
被爬网站:http://www.cishuge.com/
小说名称:灵棺夜行
代码出处:本人亲自码的
# Standard-library imports.  (The pasted source had every space stripped,
# e.g. "importurllib.request", which made each of these lines a SyntaxError.)
import urllib.request
import http.cookiejar
import socket
import time
import re

# Global socket timeout (seconds) applied to all network operations.
timeout = 20
socket.setdefaulttimeout(timeout)

# Politeness delay (seconds) between page downloads.
# NOTE(review): the original called time.sleep() right here, at import time,
# which blocked for 10s before anything ran and throttled nothing — the
# sleep belongs between requests in the download loop.  The pointless
# module-load sleep is removed; the constant is kept for the loop to use.
sleep_download_time = 10
def makeMyOpener(head=None):
    """Build a urllib opener with cookie support and browser-like headers.

    Parameters
    ----------
    head : dict or None
        Mapping of HTTP header names to values.  When None, a default
        browser-imitating header set is used.  (The original used a mutable
        dict as the default argument — replaced with the None-sentinel
        idiom; observable behavior for callers is unchanged.)

    Returns
    -------
    urllib.request.OpenerDirector
        Opener whose ``addheaders`` carry the requested headers and which
        stores cookies in an in-memory CookieJar.
    """
    if head is None:
        head = {
            'Connection': 'Keep-Alive',
            'Accept': 'text/html,application/xhtml+xml,*/*',
            'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            # The pasted source had the spaces stripped out of this string
            # ("Mozilla/5.0(WindowsNT6.3;..."), which is not a valid UA
            # value; restored to the standard IE11 user-agent string.
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        }
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    # addheaders expects a list of (name, value) tuples.
    opener.addheaders = list(head.items())
    return opener
def saveFile(save_path, txts):
    """Write each item of *txts* to *save_path*, one per line.

    Parameters
    ----------
    save_path : str
        Destination file path; an existing file is truncated.
    txts : iterable of str
        Lines to write (a newline is appended to each).
    """
    # 'with' guarantees the handle is closed even if a write raises (the
    # original leaked the handle on error).  Encoding is pinned to utf-8 so
    # the output does not depend on the platform locale (the original relied
    # on the Windows default codepage).
    with open(save_path, 'w', encoding='utf-8') as f_obj:
        for item in txts:
            f_obj.write(item + '\n')
# --- get_code_list: fetch the novel's chapter index ------------------------
# Base URL of the chapter list; also the prefix of every chapter page URL.
code_list = 'http://www.cishuge.com/read/0/771/'
oper = makeMyOpener()
uop = oper.open(code_list, timeout=1000)
# The site serves GBK; undecodable bytes are dropped instead of raising.
data = uop.read().decode('gbk', 'ignore')
# Each chapter is a <li>: group 1 = relative href, group 2 = chapter title.
# NOTE(review): the pasted source had '<li><ahref=' — the space in
# '<a href' is restored, otherwise the pattern can never match real HTML.
pattern = re.compile(r'<li><a href="(.*?)".*?>(.*?)</a></li>', re.S)
items = re.findall(pattern, data)
print('获取列表完成')

# Load the log of already-downloaded chapter URLs (one URL per line).
url_path = 'url_file.txt'
try:
    with open(url_path, 'r') as url_r:
        url_arr = url_r.readlines(100000)
except FileNotFoundError:
    # First run: no log file yet — treat as "nothing downloaded".
    url_arr = []
print(len(url_arr))
# Set gives O(1) membership per chapter instead of scanning the list.
downloaded = set(url_arr)

# Re-open the log for appending; every newly downloaded URL is recorded.
url_file = open(url_path, 'a')
print('获取已下载网址')
try:
    for tmp in items:
        # Chapter title becomes the file name; strip whitespace and the
        # characters that are illegal in Windows file names.  (The original
        # tmp[1].replace('','') was a no-op — presumably meant to strip
        # spaces before the paste mangled it.)
        save_path = re.sub(r'[\\/:*?"<>|\s]', '', tmp[1]) + '.txt'
        url = code_list + tmp[0]
        # Skip chapters already recorded in the log.
        if url + '\n' in downloaded:
            continue
        print('写日志:' + url + '\n')
        url_file.write(url + '\n')
        opene = makeMyOpener()
        op1 = opene.open(url, timeout=1000)
        data = op1.read().decode('gbk', 'ignore')
        opene.close()
        # NOTE(review): pattern preserved from the original; assumes chapter
        # text lines end with <br/> — confirm against the live site HTML.
        pattern = re.compile(' (.*?)<br/>', re.S)
        txts = re.findall(pattern, data)
        saveFile(save_path, txts)
        # Politeness throttle between chapter downloads (the original slept
        # once at import time, which throttled nothing).
        time.sleep(sleep_download_time)
finally:
    # Close the log even if a download raises mid-loop (the original leaked
    # the handle on any exception).
    url_file.close()
虽然代码还是有点瑕疵,还是分享给大家,一起改进