tensorflow 变长序列存储实例
问题
问题是这样的,要把一个数组存到tfrecord中,然后读取
a=np.array([[0,54,91,153,177,1], [0,50,89,147,196], [0,38,79,157], [0,49,89,147,177], [0,32,73,145]])
图片我都存储了,这个不还是小意思,一顿操作
import tensorflow as tf
import numpy as np


def _int64_feature(value):
    """Wrap an int or a list of ints in a tf.train.Feature (Int64List)."""
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


# Write an array to TFRecord.
# a is an array which contains lists of variant length.
a = np.array([[0, 54, 91, 153, 177, 1],
              [0, 50, 89, 147, 196],
              [0, 38, 79, 157],
              [0, 49, 89, 147, 177],
              [0, 32, 73, 145]])

writer = tf.python_io.TFRecordWriter('file')
for i in range(a.shape[0]):
    feature = {'i': _int64_feature(i),
               'data': _int64_feature(a[i])}
    # Create an example protocol buffer
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    # Serialize to string and write on the file
    writer.write(example.SerializeToString())
writer.close()

# Use Dataset API to read the TFRecord file.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)


def _parse_function(example_proto):
    # NOTE(review): 'data' was written with a different length per record, so
    # FixedLenFeature([], tf.int64) cannot parse it — this is the error the
    # article demonstrates next.
    keys_to_features = {'i': tf.FixedLenFeature([], tf.int64),
                        'data': tf.FixedLenFeature([], tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return parsed_features['i'], parsed_features['data']


dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat()
dataset = dataset.batch(1)
iterator = dataset.make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
    print(sess.run([i, data]))
    print(sess.run([i, data]))
    print(sess.run([i, data]))
报了奇怪的错误,Name:
我来试试不存为int64,而是存为bytes。又是一顿厉害的操作
数据转为bytes
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np


def _byte_feature(value):
    """Wrap a bytes object in a tf.train.Feature (BytesList)."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _int64_feature(value):
    """Wrap an int or a list of ints in a tf.train.Feature (Int64List)."""
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


# Write an array to TFRecord.
# a is an array which contains lists of variant length.
a = np.array([[0, 54, 91, 153, 177, 1],
              [0, 50, 89, 147, 196],
              [0, 38, 79, 157],
              [0, 49, 89, 147, 177],
              [0, 32, 73, 145]])

writer = tf.python_io.TFRecordWriter('file')
for i in range(a.shape[0]):  # i = 0..4
    # Store the row length under 'len' (instead of the meaningless index i)
    # so the raw bytes can later be restored to the right number of integers.
    feature = {'len': _int64_feature(len(a[i])),
               # a[i] is a plain Python list (the rows have unequal lengths,
               # so numpy keeps them as list objects); wrap it in np.array to
               # get a buffer that can be serialized with tobytes().
               'data': _byte_feature(np.array(a[i]).tobytes())}
    # Create an example protocol buffer
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    # Serialize to string and write on the file
    writer.write(example.SerializeToString())
writer.close()

# Use Dataset API to read the TFRecord file.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)


def _parse_function(example_proto):
    keys_to_features = {'len': tf.FixedLenFeature([], tf.int64),
                        'data': tf.FixedLenFeature([], tf.string)}  # now string
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return parsed_features['len'], parsed_features['data']


dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat()
dataset = dataset.batch(1)
iterator = dataset.make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
    print(sess.run([i, data]))
    print(sess.run([i, data]))
    print(sess.run([i, data]))
"""
[array([6],dtype=int64),array([b'\x00\x00\x00\x006\x00\x00\x00[\x00\x00\x00\x99\x00\x00\x00\xb1\x00\x00\x00\x01\x00\x00\x00'],
dtype=object)]
[array([5],dtype=int64),array([b'\x00\x00\x00\x002\x00\x00\x00Y\x00\x00\x00\x93\x00\x00\x00\xc4\x00\x00\x00'],
dtype=object)]
[array([4],dtype=int64),array([b'\x00\x00\x00\x00&\x00\x00\x00O\x00\x00\x00\x9d\x00\x00\x00'],
dtype=object)]
"""
bytes数据解码
如愿的输出来了,但是这个bytes我该如何解码呢
方法一,我们自己解析
a, b = sess.run([i, data])
c = np.frombuffer(b[0], dtype=np.int64, count=a[0])  # np.int 已在新版 numpy 中移除,按写入时的整型宽度使用 np.int64(32 位平台上请确认默认整型宽度)
方法二使用tensorflow的解析函数
def _parse_function(example_proto):
    """Parse one Example: fixed-length 'len' plus raw-bytes 'data'."""
    keys_to_features = {'len': tf.FixedLenFeature([], tf.int64),
                        'data': tf.FixedLenFeature([], tf.string)}  # now string
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    # Decode the raw bytes back to integers. The bytes were produced by
    # np.array(...).tobytes() with numpy's default int type — int64 matches
    # on 64-bit Linux, but on platforms where the default is int32 this
    # dtype must be adjusted (TODO confirm for the target platform).
    dat = tf.decode_raw(parsed_features['data'], tf.int64)
    return parsed_features['len'], dat
"""
[array([6]),array([[0,54,91,153,177,1]])]
[array([5]),array([[0,50,89,147,196]])]
[array([4]),array([[0,38,79,157]])]
"""
可以看到是二维数组,这是因为我们使用的是batch输出,虽然我们的batch_size=1,但是还是会以二维list的格式输出。我手贱再来修改点东西,
def _parse_function(example_proto):
    # Using shape [1] instead of [] gives each parsed feature an extra
    # dimension, so after batch(1) the outputs come back 3-D.
    keys_to_features = {'len': tf.FixedLenFeature([1], tf.int64),
                        'data': tf.FixedLenFeature([1], tf.string)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    dat = tf.decode_raw(parsed_features['data'], tf.int64)
    return parsed_features['len'], dat
"""
[array([[6]]),array([[[0,54,91,153,177,1]]])]
[array([[5]]),array([[[0,50,89,147,196]]])]
[array([[4]]),array([[[0,38,79,157]]])]
"""
呦呵,又变成3维的了,让他报个错试试
def _parse_function(example_proto):
    # Deliberately wrong: 'len' was written as a single int64, so requesting
    # shape [2] makes parsing fail — this triggers the InvalidArgumentError
    # shown below.
    keys_to_features = {'len': tf.FixedLenFeature([2], tf.int64),  # 1 changed to 2
                        'data': tf.FixedLenFeature([1], tf.string)}  # string
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return parsed_features['len'], parsed_features['data']
"""
InvalidArgumentError:Key:len.Can'tparseserializedExample.
[[Node:ParseSingleExample/ParseSingleExample=ParseSingleExample[Tdense=[DT_STRING,DT_INT64],dense_keys=["data","len"],dense_shapes=[[1],[2]],num_sparse=0,sparse_keys=[],sparse_types=[]](arg0,ParseSingleExample/Const,ParseSingleExample/Const_1)]]
[[Node:IteratorGetNext_22=IteratorGetNext[output_shapes=[[?,2],[?,1]],output_types=[DT_INT64,DT_STRING],_device="/job:localhost/replica:0/task:0/device:CPU:0"](OneShotIterator_22)]]
"""
可以看到dense_keys=["data","len"],dense_shapes=[[1],[2]],,tf.FixedLenFeature是读取固定长度的数据,我猜测[]的意思就是读取全部数据,[1]就是读取一个数据,每个数据可能包含多个数据,形如[[1,2],[3,3,4],[2]....],哈哈这都是我瞎猜的,做我女朋友好不好。
tensorflow变长数组存储
反正是可以读取了。但是如果是自己定义的变长数组,每次都要自己解析,这样很麻烦(我瞎编的),所以tensorflow就定义了变长数组的解析方法tf.VarLenFeature,我们就不需要把变长数组变为bytes再解析了,又是一顿操作
import tensorflow as tf
import numpy as np


def _int64_feature(value):
    """Wrap an int or a list of ints in a tf.train.Feature (Int64List)."""
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


# Write an array to TFRecord.
# a is an array which contains lists of variant length.
a = np.array([[0, 54, 91, 153, 177, 1],
              [0, 50, 89, 147, 196],
              [0, 38, 79, 157],
              [0, 49, 89, 147, 177],
              [0, 32, 73, 145]])

writer = tf.python_io.TFRecordWriter('file')
for i in range(a.shape[0]):  # i = 0..4
    feature = {'i': _int64_feature(i),
               'data': _int64_feature(a[i])}
    # Create an example protocol buffer
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    # Serialize to string and write on the file
    writer.write(example.SerializeToString())
writer.close()

# Use Dataset API to read the TFRecord file.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)


def _parse_function(example_proto):
    # VarLenFeature parses a variable-length feature into a SparseTensor;
    # sparse_tensor_to_dense converts it back to an ordinary dense tensor.
    keys_to_features = {'i': tf.FixedLenFeature([], tf.int64),
                        'data': tf.VarLenFeature(tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return parsed_features['i'], tf.sparse_tensor_to_dense(parsed_features['data'])


dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat()
dataset = dataset.batch(1)
iterator = dataset.make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
    print(sess.run([i, data]))
    print(sess.run([i, data]))
    print(sess.run([i, data]))
"""
[array([0],dtype=int64),array([[0,54,91,153,177,1]],dtype=int64)]
[array([1],dtype=int64),array([[0,50,89,147,196]],dtype=int64)]
[array([2],dtype=int64),array([[0,38,79,157]],dtype=int64)]
"""
batch输出
输出还是数组,哈哈哈。再来一波操作
dataset = dataset.batch(2)
"""
Cannot batch tensors with different shapes in component 1. First element had shape [6] and element 1 had shape [5].
"""
这是因为一个batch中数据的shape必须是一致的,第一个元素长度为6,第二个元素长度为5,就会报错。办法就是补成一样的长度,在这之前先测试点别的
a = np.array([[0, 54, 91, 153, 177, 1],
              [0, 50, 89, 147, 196],
              [0, 38, 79, 157],
              [0, 49, 89, 147, 177],
              [0, 32, 73, 145]])
for i in range(a.shape[0]):
    print(type(a[i]))
""" """
可以发现长度不一的array每一个数据是list(一开始我以为是object)。然后补齐
a = np.array([[0, 54, 91, 153, 177, 1],
              [0, 50, 89, 147, 196, 0],
              [0, 38, 79, 157, 0, 0],
              [0, 49, 89, 147, 177, 0],
              [0, 32, 73, 145, 0, 0]])
for i in range(a.shape[0]):
    print(type(a[i]))
""" """
返回的是numpy。为什么要做这件事呢?
def _int64_feature(value):
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
tensorflow要求我们输入的是list或者直接是numpy.ndarray,如果是list中包含numpy.ndarray,即[numpy.ndarray]就会报错。上面的那个数组是变长的,返回的是list,没有什么错误,我们补齐看看
a = np.array([[0, 54, 91, 153, 177, 1],
              [0, 50, 89, 147, 196, 0],
              [0, 38, 79, 157, 0, 0],
              [0, 49, 89, 147, 177, 0],
              [0, 32, 73, 145, 0, 0]])
"""
TypeError: only size-1 arrays can be converted to Python scalars
"""
这就是因为返回的不是list,而是numpy.ndarray,而_int64_feature函数中先判断numpy.ndarray不是list,所以转成了[numpy.ndarray]就报错了。可以做些修改,一种方法是将numpy.ndarray转为list
for i in range(a.shape[0]):  # i = 0..4
    feature = {'i': _int64_feature(i),
               # After padding, a[i] is a numpy.ndarray; tolist() turns it
               # into a plain Python list so _int64_feature accepts it.
               'data': _int64_feature(a[i].tolist())}
这样补齐了我们就可以修改batch的值了
dataset = dataset.batch(2)
"""
[array([0, 2], dtype=int64), array([[0, 54, 91, 153, 177, 1],
       [0, 38, 79, 157, 0, 0]], dtype=int64)]
[array([1, 3], dtype=int64), array([[0, 50, 89, 147, 196, 0],
       [0, 49, 89, 147, 177, 0]], dtype=int64)]
[array([4, 0], dtype=int64), array([[0, 32, 73, 145, 0, 0],
       [0, 54, 91, 153, 177, 1]], dtype=int64)]
"""
当然tensorflow不会让我自己补齐,已经提供了补齐函数padded_batch,
# -*- coding: utf-8 -*-
import tensorflow as tf


def _int64_feature(value):
    """Wrap an int or a list of ints in a tf.train.Feature (Int64List)."""
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


# Variable-length rows kept as plain Python lists this time.
a = [[0, 54, 91, 153, 177, 1],
     [0, 50, 89, 147, 196],
     [0, 38, 79, 157],
     [0, 49, 89, 147, 177],
     [0, 32, 73, 145]]

writer = tf.python_io.TFRecordWriter('file')
for v in a:
    feature = {'data': _int64_feature(v)}
    # Create an example protocol buffer
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    # Serialize to string and write on the file
    writer.write(example.SerializeToString())
writer.close()

# Use Dataset API to read the TFRecord file.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)


def _parse_function(example_proto):
    keys_to_features = {'data': tf.VarLenFeature(tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return tf.sparse_tensor_to_dense(parsed_features['data'])


dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat()
# padded_batch pads every element of a batch to the longest shape in that
# batch; [None] means "pad the single variable dimension as needed".
dataset = dataset.padded_batch(2, padded_shapes=([None]))
iterator = dataset.make_one_shot_iterator()
data = iterator.get_next()
with tf.Session() as sess:
    print(sess.run([data]))
    print(sess.run([data]))
    print(sess.run([data]))
"""
[array([[0,54,91,153,177,1],
[0,50,89,147,196,0]])]
[array([[0,38,79,157,0],
[0,49,89,147,177]])]
[array([[0,32,73,145,0,0],
[0,54,91,153,177,1]])]
"""
可以看到的确是自动补齐了。
图片batch
直接来测试一下图片数据
# -*- coding: utf-8 -*-
import tensorflow as tf
import matplotlib.pyplot as plt


def _byte_feature(value):
    """Wrap a bytes object in a tf.train.Feature (BytesList)."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


files = tf.gfile.Glob('*.jpeg')
writer = tf.python_io.TFRecordWriter('file')
for file in files:
    with tf.gfile.FastGFile(file, 'rb') as f:
        img_buff = f.read()
    # Store the raw, still-JPEG-encoded bytes; decoding happens at read time.
    feature = {'img': _byte_feature(tf.compat.as_bytes(img_buff))}
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    writer.write(example.SerializeToString())
writer.close()

filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)


def _parse_function(example_proto):
    keys_to_features = {'img': tf.FixedLenFeature([], tf.string)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    image = tf.image.decode_jpeg(parsed_features['img'])
    return image


dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat()
# Decoded images have different heights/widths, so stacking them into one
# batch fails — this raises the "Cannot batch tensors with different shapes"
# error shown below.
dataset = dataset.batch(2)
iterator = dataset.make_one_shot_iterator()
image = iterator.get_next()
with tf.Session() as sess:
    img = sess.run([image])
    print(len(img))
    print(img[0].shape)
    plt.imshow(img[0][0])
"""
Cannotbatchtensorswithdifferentshapesincomponent0.Firstelementhadshape[440,440,3]andelement1hadshape[415,438,3].
"""
看到了没有,一个batch中图片的尺寸不同,就不可以batch了,我们必须要将一个batch的图片resize成相同的大小。
def _parse_function(example_proto):
    """Parse one Example and return its image resized to 224x224."""
    keys_to_features = {'img': tf.FixedLenFeature([], tf.string)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    image = tf.image.decode_jpeg(parsed_features['img'])
    # Resizing directly would turn uint8 into raw float values, but
    # plt.imshow only displays uint8 or floats in [0, 1]. convert_image_dtype
    # maps uint8 to float32 in [0, 1] (equivalent to dividing by 255.0),
    # so the resized result stays displayable.
    image = tf.image.convert_image_dtype(image, tf.float32)
    image = tf.image.resize_images(image, (224, 224))
    return image
但是有时候我们希望输入图片尺寸是不一样的,不需要resize,这样只能将batch_size=1。一个batch中的图片shape必须是一样的,我们可以这样折中训练,使用tensorflow提供的动态填充接口,将一个batch中的图片填充为相同的shape。
dataset=dataset.padded_batch(2,padded_shapes=([None,None,3]))
如果我们想要将图片的名称作为标签保存下来要怎么做呢?
# -*- coding: utf-8 -*-
import tensorflow as tf
import matplotlib.pyplot as plt
import os

# Character set used to map filename characters to integer labels.
out_charset = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"


def _byte_feature(value):
    """Wrap a bytes object in a tf.train.Feature (BytesList)."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _int64_feature(values):
    """Wrap an int or a list of ints in a tf.train.Feature (Int64List)."""
    if not isinstance(values, list):
        values = [values]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))


files = tf.gfile.Glob('*.jpg')
writer = tf.python_io.TFRecordWriter('file')
for file in files:
    with tf.gfile.FastGFile(file, 'rb') as f:
        img_buff = f.read()
    filename = os.path.basename(file).split('.')[0]
    # Encode each character of the (extension-less) filename as its index in
    # out_charset, producing a variable-length integer label per image.
    label = list(map(lambda x: out_charset.index(x), filename))
    feature = {'label': _int64_feature(label),
               'filename': _byte_feature(tf.compat.as_bytes(filename)),
               'img': _byte_feature(tf.compat.as_bytes(img_buff))}
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    writer.write(example.SerializeToString())
writer.close()

filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)


def _parse_function(example_proto):
    keys_to_features = {
        'label': tf.VarLenFeature(tf.int64),
        'filename': tf.FixedLenFeature([], tf.string),
        'img': tf.FixedLenFeature([], tf.string)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    label = tf.sparse_tensor_to_dense(parsed_features['label'])
    filename = parsed_features['filename']
    image = tf.image.decode_jpeg(parsed_features['img'])
    return image, label, filename


dataset = dataset.map(_parse_function)
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat()
# Three components are returned, so padded_shapes needs an entry for each:
# decoded images and labels are variable-length and get None (pad-as-needed)
# dims, while filename is a single undecoded byte string and needs no padding.
dataset = dataset.padded_batch(3, padded_shapes=([None, None, 3], [None], []))
iterator = dataset.make_one_shot_iterator()
image, label, filename = iterator.get_next()
with tf.Session() as sess:
    print(label.eval())
瞎试
如果写入的数据是一个list会是怎样呢
a = np.arange(16).reshape(2, 4, 2)
"""
TypeError: [0, 1] has type list, but expected one of: int, long
"""
不过想想也是,tf.train.Feature(int64_list=tf.train.Int64List(value=value))这个函数就是存储数据类型为int64的list的。但是如果我们要存储词向量该怎么办呢?例如一句话是一个样本s1='我爱你',假如使用one-hot编码,我=[0,0,1],爱=[0,1,0],你=[1,0,0],s1=[[0,0,1],[0,1,0],[1,0,0]]。这一个样本该怎么存储呢?
以上这篇tensorflow变长序列存储实例就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持毛票票。
声明:本文内容来源于网络,版权归原作者所有,内容由互联网用户自发贡献自行上传,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任。如果您发现有涉嫌版权的内容,欢迎发送邮件至:czq8825#qq.com(发邮件时,请将#更换为@)进行举报,并提供相关证据,一经查实,本站将立刻删除涉嫌侵权内容。