python pandas 对时间序列文件处理的实例
如下所示:
# The star import is kept only so that code relying on bare NumPy names keeps
# working.  It must stay *before* ``import copy``: NumPy exports its own
# ``copy`` function, which would otherwise shadow the stdlib module.
from numpy import *  # noqa: F401,F403

import copy
import os

import numpy as np
import pandas as pd

try:
    import matplotlib.pylab as plt
except ImportError:  # none of the CSV helpers below plot anything
    plt = None

# Default (Windows) locations used by the original workflow.
_DATA_DIR = r'D:\python\data'
_DATA2_DIR = r'D:\python\data2'
_DATA3_DIR = r'D:\python\data3'
# Shop ids scanned by getShopData2 / formatData when none are given.
_ALL_SHOPS = range(1, 2001)


def _select_shop(frame, shop_id):
    """Return a copy of columns B..C of *frame* for rows whose B equals *shop_id*."""
    subset = frame.loc[:, 'B':'C']
    # Copy so later processing never writes into a view of the caller's frame.
    return subset[subset.B == shop_id].copy()


def _daily_counts(rows):
    """Count the rows of *rows* per calendar day of column C.

    Returns a Series named 'D' whose index (named 'C') holds one entry per
    day between the first and last timestamp; days without rows count as 0.
    """
    stamps = pd.DatetimeIndex(pd.to_datetime(rows['C']), name='C')
    counts = pd.Series(1, index=stamps, name='D')
    return counts.resample('D').sum().fillna(0)


def _append_csv(counts, path):
    """Append *counts* to the CSV at *path*, writing the header only once.

    The header row is emitted when the file does not exist yet or is empty,
    so repeated appends produce a single well-formed CSV regardless of the
    pandas version's default for ``header``.
    """
    is_new = not os.path.exists(path) or os.path.getsize(path) == 0
    counts.to_csv(path, mode='a', header=is_new)


def _save_shop_counts(filename, shop_id, out_path):
    """Append the daily row counts of one shop in *filename* to *out_path*."""
    rows = _select_shop(pd.read_csv(filename), shop_id)
    print(rows.shape)
    if rows.empty:
        return
    _append_csv(_daily_counts(rows), out_path)


def read(filename, shop_id=855, out_path='855_pay.csv', chunk_size=1000000):
    """Stream *filename* in chunks and write one shop's daily row counts.

    Args:
        filename: CSV with adjacent columns B (shop id) and C (timestamp).
        shop_id: value of column B to keep.
        out_path: CSV file to (over)write with columns C (day) and D (count).
        chunk_size: number of rows read per chunk.

    Nothing is written when no row matches *shop_id*.
    """
    pieces = []
    for chunk in pd.read_csv(filename, chunksize=chunk_size):
        rows = _select_shop(chunk, shop_id)
        if not rows.empty:
            pieces.append(_daily_counts(rows))
    print("Iteration is stopped.")
    if not pieces:
        return
    # A day can straddle two chunks and chunks can leave gaps between them,
    # so the per-chunk counts are aggregated by day once more.
    total = pd.concat(pieces).resample('D').sum().fillna(0)
    total = total.rename('D').rename_axis('C')
    total.to_csv(out_path, header=True)


def read2(filename, chunk_size=100000):
    """Read *filename* chunk by chunk and return it as a single DataFrame.

    Args:
        filename: path of the CSV file.
        chunk_size: number of rows read per chunk.

    Returns:
        A DataFrame with a fresh 0..n-1 index.
    """
    chunks = list(pd.read_csv(filename, chunksize=chunk_size))
    print("Iteration is stopped.")
    if not chunks:
        # No data rows: fall back to a plain read, which yields an empty frame
        # that still carries the column names.
        return pd.read_csv(filename)
    return pd.concat(chunks, ignore_index=True)


def read3save(filename, shop_id=855, out_path='855_pay.csv'):
    """Append one shop's daily row counts from *filename* to *out_path*.

    Prints the shape of the selected rows; returns without writing when no
    row matches *shop_id*.
    """
    _save_shop_counts(filename, shop_id, out_path)


def loadDataSet(fileName, delim='\t'):
    """Load a delimited text file of numbers into a NumPy matrix.

    Args:
        fileName: path of the text file, one row of numbers per line.
        delim: field separator.

    Returns:
        numpy.matrix of floats, one row per non-blank line.
    """
    with open(fileName) as fr:
        stripped = (line.strip() for line in fr)
        # Blank lines would otherwise fail in float('').
        datArr = [[float(field) for field in line.split(delim)]
                  for line in stripped if line]
    # np.asmatrix is what the removed ``mat`` alias used to point at.
    return np.asmatrix(datArr)


def getShopData(shop_file='shopInfo.txt', out_dir=_DATA_DIR):
    """Write per-shop daily row counts for the files user_pay.001.001..008.

    Args:
        shop_file: text file with one integer shop id per line.
        out_dir: directory receiving ``<shop id>_pay.csv``; counts from each
            input file are appended to the shop's CSV.
    """
    with open(shop_file) as fr:
        tokens = [line.strip() for line in fr]
    tokens = [token for token in tokens if token]
    for i in range(1, 9):
        name = "user_pay.001.00%d" % i
        frame = pd.read_csv(name).loc[:, 'B':'C']
        for token in tokens:
            # Always filter from the per-file frame, never from the previous
            # shop's result.
            rows = frame[frame.B == int(token)].copy()
            print(rows.shape)
            if rows.empty:
                continue
            savename = os.path.join(out_dir, '%s_pay.csv' % token)
            _append_csv(_daily_counts(rows), savename)
        del frame
    print("over")


def tset(filename, shop_id=855, out_path=None):
    """Append one shop's daily row counts from *filename* to a test CSV.

    *out_path* defaults to ``my_pay.csv`` inside the default data directory.
    """
    if out_path is None:
        out_path = os.path.join(_DATA_DIR, 'my_pay.csv')
    _save_shop_counts(filename, shop_id, out_path)


def getShopData2(filename, out_dir=_DATA2_DIR, shop_ids=None):
    """Split *filename* into one daily-count CSV per shop.

    Args:
        filename: CSV with adjacent columns B (shop id) and C (timestamp).
        out_dir: directory receiving ``<shop id>_pay.csv`` (columns C, D).
        shop_ids: iterable of shop ids to extract; defaults to 1..2000.

    Shops without any row produce no file.
    """
    if shop_ids is None:
        shop_ids = _ALL_SHOPS
    frame = pd.read_csv(filename).loc[:, 'B':'C'].copy()
    # Parse the timestamps once instead of once per shop.
    frame['C'] = pd.to_datetime(frame['C'])
    for i in shop_ids:
        print(i)
        rows = frame[frame.B == i]
        if rows.empty:
            continue
        savename = os.path.join(out_dir, '%s_pay.csv' % i)
        _append_csv(_daily_counts(rows), savename)
    print("over")


def formatData(in_dir=_DATA2_DIR, out_dir=_DATA3_DIR, shop_ids=None):
    """Re-aggregate every per-shop CSV by day and rewrite it into *out_dir*.

    Args:
        in_dir: directory holding ``<shop id>_pay.csv`` files with a C column.
        out_dir: directory receiving the normalised files (overwritten).
        shop_ids: iterable of shop ids to process; defaults to 1..2000.

    Shops whose input file does not exist are skipped.
    """
    if shop_ids is None:
        shop_ids = _ALL_SHOPS
    for i in shop_ids:
        s = str(i)
        print(s)
        name = os.path.join(in_dir, '%s_pay.csv' % s)
        if not os.path.exists(name):
            continue
        data = pd.read_csv(name)
        data['C'] = pd.to_datetime(data['C'])
        data = data.set_index(['C']).resample('D').sum().fillna(0)
        savename = os.path.join(out_dir, '%s_pay.csv' % s)
        data.to_csv(savename, mode='w')
    print("over")
以上这篇 python pandas 对时间序列文件处理的实例就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持毛票票。