python生成lmdb格式的文件实例
在crnn训练的时候需要用到lmdb格式的数据集,下面是python生成lmdb格式数据集的代码。注意一定要在linux系统下运行,否则读入图像的时候会出问题;可能遇到的问题都在代码里面注释了,看代码即可。
#-*-coding:utf-8-*- importos importlmdb#先pipinstall这个模块哦 importcv2 importglob importnumpyasnp defcheckImageIsValid(imageBin): ifimageBinisNone: returnFalse imageBuf=np.fromstring(imageBin,dtype=np.uint8) img=cv2.imdecode(imageBuf,cv2.IMREAD_GRAYSCALE) ifimgisNone: returnFalse imgH,imgW=img.shape[0],img.shape[1] ifimgH*imgW==0: returnFalse returnTrue defwriteCache(env,cache): withenv.begin(write=True)astxn: fork,vincache.iteritems(): txn.put(k,v) defcreateDataset(outputPath,imagePathList,labelList,lexiconList=None,checkValid=True): """ CreateLMDBdatasetforCRNNtraining. #ARGS: outputPath:LMDBoutputpath imagePathList:listofimagepath labelList:listofcorrespondinggroundtruthtexts lexiconList:(optional)listoflexiconlists checkValid:iftrue,checkthevalidityofeveryimage """ #print(len(imagePathList),len(labelList)) assert(len(imagePathList)==len(labelList)) nSamples=len(imagePathList) print'...................' env=lmdb.open(outputPath,map_size=8589934592)#1099511627776)所需要的磁盘空间的最小值,之前是1T,我改成了8g,否则会报磁盘空间不足,这个数字是字节 cache={} cnt=1 foriinxrange(nSamples): imagePath=imagePathList[i] label=labelList[i] ifnotos.path.exists(imagePath): print('%sdoesnotexist'%imagePath) continue withopen(imagePath,'r')asf: imageBin=f.read() ifcheckValid: ifnotcheckImageIsValid(imageBin): print('%sisnotavalidimage'%imagePath)#注意一定要在linux下,否则f.read就不可用了,就会输出这个信息 continue imageKey='image-%09d'%cnt labelKey='label-%09d'%cnt cache[imageKey]=imageBin cache[labelKey]=label iflexiconList: lexiconKey='lexicon-%09d'%cnt cache[lexiconKey]=''.join(lexiconList[i]) ifcnt%1000==0: writeCache(env,cache) cache={} print('Written%d/%d'%(cnt,nSamples)) cnt+=1 nSamples=cnt-1 cache['num-samples']=str(nSamples) writeCache(env,cache) print('Createddatasetwith%dsamples'%nSamples) defread_text(path): withopen(path)asf: text=f.read() text=text.strip() returntext if__name__=='__main__': #lmdb输出目录 outputPath='D:/ruanjianxiazai/tuxiangyangben/fengehou/train'#训练集和验证集要跑两遍这个程序,分两次生成 
path="D:/ruanjianxiazai/tuxiangyangben/fengehou/chenguang/*.jpg"#将txt与jpg的都放在同一个文件里面 imagePathList=glob.glob(path) print'------------',len(imagePathList),'------------' imgLabelLists=[] forpinimagePathList: try: imgLabelLists.append((p,read_text(p.replace('.jpg','.txt')))) except: continue #imgLabelList=[(p,read_text(p.replace('.jpg','.txt')))forpinimagePathList] #sortbylabelList imgLabelList=sorted(imgLabelLists,key=lambdax:len(x[1])) imgPaths=[p[0]forpinimgLabelList] txtLists=[p[1]forpinimgLabelList] createDataset(outputPath,imgPaths,txtLists,lexiconList=None,checkValid=True)
以上这篇python生成lmdb格式的文件实例就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持毛票票。