pandas数据处理进阶详解

2023-08-12 01:30:03 261

一、pandas的统计分析

1、关于pandas的数值统计（统计detail中的单价的相关指标）

import pandas as pd

# Numeric summary statistics of the order-detail table, focused on the
# `amounts` column (and `counts` where two columns are compared).

# Load the data.
detail = pd.read_excel("./meal_order_detail.xlsx")
print("detail:\n", detail)

print("detail的列索引名称:\n", detail.columns)
print("detail的形状:\n", detail.shape)
print("detail数据类型:\n", detail.dtypes)

# Select the columns once instead of re-slicing for every statistic.
amounts = detail.loc[:, 'amounts']
counts = detail.loc[:, 'counts']

print("amounts的最大值：\n", amounts.max())
print("amounts的最小值：\n", amounts.min())
print("amounts的均值：\n", amounts.mean())
print("amounts的中位数：\n", amounts.median())
print("amounts的方差：\n", amounts.var())
print("amounts的describe：\n", amounts.describe())
# describe() on two columns at once, then on `counts` alone.
print("counts和amounts的describe：\n", detail.loc[:, ['counts', 'amounts']].describe())
print("counts的describe：\n", counts.describe())
# Series.ptp() was removed in pandas 1.0; the range is simply max - min.
print("amounts的极差：\n", amounts.max() - amounts.min())
print("amounts的标准差：\n", amounts.std())
# mode() returns a Series because several values can tie for most frequent.
print("amounts的众数：\n", amounts.mode())
print("counts的众数：\n", counts.mode())
print("amounts的非空值的数目：\n", amounts.count())
# idxmax()/idxmin() return the index *label* of the extreme value
# (the label-based counterparts of np.argmax()/np.argmin()).
print("amounts的最大值的位置：\n", amounts.idxmax())
print("amounts的最小值的位置：\n", amounts.idxmin())

2、pandas对于非数值型数据的统计分析

（1）对于dataframe转化数据类型，其他类型转化为object类型

# Cast the column to the generic `object` dtype.
# Plain column assignment replaces the column; `detail.loc[:, col] = ...`
# writes into the existing column in place on pandas >= 2.0 and would keep
# the old numeric dtype.
detail['amounts'] = detail['amounts'].astype('object')

（2）类别型数据

# Cast the column to the categorical dtype so describe() reports
# count / unique / top / freq instead of numeric statistics.
# Plain column assignment is required: `detail.loc[:, col] = ...` sets values
# in place on pandas >= 2.0 and would leave the original dtype unchanged.
detail['amounts'] = detail['amounts'].astype('category')
print("统计类别型数据的describe指标:\n", detail['amounts'].describe())

（3）统计实例

# Which dish is the most popular in `detail`, and how many times was it sold?
# Columns are converted with plain assignment (`detail[col] = ...`) because
# `detail.loc[:, col] = ...` sets values in place on pandas >= 2.0 and would
# keep the original dtype.

# Case 1: plain rice counts as a dish.
detail['dishes_name'] = detail['dishes_name'].astype('category')
print("按照dishes_name统计描述信息：\n", detail['dishes_name'].describe())

# Case 2: plain rice does not count as a dish -- remove it, then describe again.
# drop() needs the row labels, so build a boolean mask first and read the
# labels of the matching rows from it.
bool_id = detail['dishes_name'] == '白饭/大碗'

# Row labels of the plain-rice rows.
index = detail.loc[bool_id, :].index

# Remove those rows from `detail` itself.
detail.drop(labels=index, axis=0, inplace=True)

# Convert to categorical again after the removal.
detail['dishes_name'] = detail['dishes_name'].astype('category')

# Describe the remaining dishes.
print("按照dishes_name统计描述信息：\n", detail['dishes_name'].describe())

# Which order contains the most rows (dishes), and how many?
# Treat order_id as a category so describe() reports the most frequent id.
detail['order_id'] = detail['order_id'].astype("category")
print("按照order_id统计描述信息为:\n", detail['order_id'].describe())

二、pandas时间数据

datetime64[ns]---numpy里面的时间点类
Timestamp---pandas默认的时间点类型----封装了datetime64[ns]
DatetimeIndex---pandas默认支持的时间序列结构

1、可以通过pd.to_datetime将时间点数据转化为pandas默认支持的时间点数据

# Parse one date string into pandas' own time-point type and show both the
# parsed value and its Python type.
res = pd.to_datetime("2016/01/01")
for label, shown in (("res:\n", res), ("res的类型：\n", type(res))):
    print(label, shown)

2、时间序列转化--可以通过pd.to_datetime或者pd.DatetimeIndex将时间序列转化为pandas默认支持的时间序列结构

# Two equivalent ways to turn a list of date strings into pandas' native
# time-series structure: pd.to_datetime and the DatetimeIndex constructor.
res1 = pd.DatetimeIndex(['2016-01-01', '2016-01-02', '2016-02-05', '2011-09-01'])
res = pd.to_datetime(['2016-01-01', '2016-01-01', '2016-01-01', '2011-01-01'])

# Show each result followed by its type.
for value_label, type_label, series in (
    ("res:\n", "res的类型：\n", res),
    ("res1:\n", "res1的类型：\n", res1),
):
    print(value_label, series)
    print(type_label, type(series))

3、获取时间序列的属性（年、月、日、季度、星期、是否闰年）

import pandas as pd

# Read the order-detail table, convert its order-time column to pandas
# datetimes, and extract calendar attributes from every time point.

# Load the data.
detail = pd.read_excel("./meal_order_detail.xlsx")
# print("detail:\n", detail)
print("detail的列索引名称:\n", detail.columns)
print("detail的形状:\n", detail.shape)
# print("detail数据类型:\n", detail.dtypes)
print("*" * 80)
# Show the raw place_order_time column.
print(detail.loc[:, 'place_order_time'])

# Convert to pandas' native time-series type. Plain column assignment is used
# because `detail.loc[:, col] = ...` sets values in place on pandas >= 2.0 and
# may keep the column's previous dtype.
detail['place_order_time'] = pd.to_datetime(detail['place_order_time'])

# print(detail.dtypes)
print("*" * 80)

# Each element of the column is a Timestamp, so its attributes can be
# collected with list comprehensions.
order_times = detail['place_order_time']

year = [i.year for i in order_times]
print("年：\n", year)

month = [i.month for i in order_times]
print("月：\n", month)

day = [i.day for i in order_times]
print("日：\n", day)

quarter = [i.quarter for i in order_times]
print("季度：\n", quarter)

# weekday is a method (Monday == 0); it must be called, otherwise the list
# holds bound-method objects instead of numbers.
weekday = [i.weekday() for i in order_times]
print("周几：\n", weekday)

# Timestamp.weekday_name was removed in pandas 1.0; day_name() replaces it.
weekday_name = [i.day_name() for i in order_times]
print("周几：\n", weekday_name)

is_leap_year = [i.is_leap_year for i in order_times]
print("是否闰年：\n", is_leap_year)

4、时间加减

import pandas as pd

# Shift a single time point forward by adding a Timedelta to it.
res = pd.to_datetime("2016-01-01")
print("res:\n", res)
print("res的类型：\n", type(res))

print("时间推后一天：\n", res + pd.Timedelta(days=1))
print("时间推后一小时：\n", res + pd.Timedelta(hours=1))

# Add a column holding every order time moved one day later, then show the table.
shifted_times = detail.loc[:, 'place_order_time'] + pd.Timedelta(days=1)
detail.loc[:, 'place_over_time'] = shifted_times
print(detail)

# Elapsed time between two dates: subtracting one parsed date from another
# gives the difference as a single value.
later = pd.to_datetime('2019-10-9')
earlier = pd.to_datetime('1996-11-07')
res = later - earlier
print(res)

5、获取pandas的Timestamp所能表示的最早时间节点和最晚时间节点

# Print the lower bound, then the upper bound, that pd.Timestamp exposes.
for bound in (pd.Timestamp.min, pd.Timestamp.max):
    print(bound)

三、分组聚合

import pandas as pd
import numpy as np

# Group-by aggregation and DataFrame.agg on the users table.

# Load the data.
users = pd.read_excel("./users.xlsx")
print("users:\n", users)
print("users的列索引：\n", users.columns)
print("users的数据类型：\n", users.dtypes)

# Average age per group.
# groupby(by=...) takes the column(s) to group on: one name or a list of names.
# Single grouping column, single statistic:
#   res = users.groupby(by='ORGANIZE_NAME')['age'].mean()
# Single grouping column, statistic over several columns:
#   res = users.groupby(by='ORGANIZE_NAME')[['age', 'USER_ID']].mean()
res = users.groupby(by=['ORGANIZE_NAME', 'poo', 'sex'])['age'].mean()
print(res)

# agg() applies named statistics. The statistics are given by string name:
# passing NumPy reducers such as np.mean is deprecated since pandas 2.1
# (FutureWarning), and pandas recommends the string form.

# Mean of age together with the maximum of USER_ID.
print(users.agg({'age': 'mean', 'USER_ID': 'max'}))

# Both the sum and the mean of age and of USER_ID.
print(users[['age', 'USER_ID']].agg(['sum', 'mean']))

# A different number of statistics per column.
print(users.agg({'age': 'min', 'USER_ID': ['mean', 'sum']}))


def hh(x):
    """Return ``x`` incremented by one."""
    return x + 1


# Apply a custom function to the data.
# Alternatives using apply():
#   res = users['age'].apply(hh)
#   res = users[['age', 'USER_ID']].apply(lambda x: x + 1)
res = users['age'].transform(lambda x: x + 1)
# Author's note: transform cannot perform computations across columns.
print(res)

四、透视表与交叉表

import pandas as pd

# Pivot tables and cross-tabulations on the order-detail table.

# Load the data and show its basic structure.
detail = pd.read_excel("./meal_order_detail.xlsx")
print("detail:\n", detail)
print("detail的列名：\n", detail.columns)
print("detail的数据类型：\n", detail.dtypes)

# The day attribute is read from pandas datetime values, so convert first.
# Plain column assignment is used because `detail.loc[:, col] = ...` sets
# values in place on pandas >= 2.0 and may keep the column's previous dtype.
detail['place_order_time'] = pd.to_datetime(detail['place_order_time'])

# Extract the day-of-month of every order time with a list comprehension.
detail['day'] = [i.day for i in detail['place_order_time']]

# A pivot table is an extended form of group-by aggregation.
# pd.pivot_table arguments:
#   data    -- the DataFrame
#   values  -- the column(s) the statistic is computed on
#   index   -- column(s) used to group the rows
#   columns -- column(s) used to group the columns
#   aggfunc -- which statistic to compute on `values`
#
# res = pd.pivot_table(
#     data=detail[['amounts', 'order_id', 'counts', 'dishes_name', 'day']],
#     values='amounts', columns=['day', 'counts'],
#     index=['order_id', 'dishes_name'], aggfunc='mean', margins=True)
# print(res)
# res.to_excel("./hh.xlsx")

# A cross-tabulation is a lightweight pivot table.
# With only index and columns it counts how often each pair of values occurs:
#   res = pd.crosstab(index=detail['counts'], columns=detail['amounts'])
# `values` and `aggfunc` must be passed together.
res = pd.crosstab(
    index=detail['order_id'],
    columns=detail['counts'],
    values=detail['amounts'],
    aggfunc='mean',
)
print(res)

五、案例

1、营业额案例

import pandas as pd

# Turnover case study: total sales per day of month from the order-detail
# table, which carries a time column.

# Load the data.
detail = pd.read_excel("./meal_order_detail.xlsx")
print("detail:\n", detail)
print("detail的列名：\n", detail.columns)
print("detail的数据类型：\n", detail.dtypes)

# Sales value of each row, stored as a new column: counts * amounts.
detail['pay'] = detail['counts'] * detail['amounts']

# print(detail)

# The day attribute is read from pandas datetime values, so convert first.
# Plain column assignment is used because `detail.loc[:, col] = ...` sets
# values in place on pandas >= 2.0 and may keep the column's previous dtype.
detail['place_order_time'] = pd.to_datetime(detail['place_order_time'])

# Extract the day-of-month of every order time with a list comprehension.
detail['day'] = [i.day for i in detail['place_order_time']]
# print(detail)

# Group by day and sum the sales value.
res = detail.groupby(by='day')['pay'].sum()
print(res)
# print(type(res))

# Wrap the per-day totals in a one-column DataFrame indexed by day.
# NOTE(review): the column name 'monty' looks like a typo for 'money'; it is
# kept unchanged in case something depends on it -- confirm.
df = pd.DataFrame(res.values, columns=['monty'], index=res.index)
print(df)
print(type(df))

2、连锁超市案例

import pandas as pd

# Supermarket-chain case study: load the orders and keep only valid rows.

# Load the data.
# 'ansi' is only an alias of the Windows-only 'mbcs' codec, so it fails on
# other platforms. 'gbk' is the portable name for the code page used by a
# Chinese-locale Windows.
# NOTE(review): assumes the file was saved on such a system -- confirm, and
# adjust the encoding if the file was written differently.
order = pd.read_csv("./order.csv", encoding='gbk')
print("order:\n", order)
print("order的列索引：\n", order.columns)

# Question 1: which product categories sell best?
# Keep only the rows with a positive sales quantity.
bool_id = order.loc[:, '销量'] > 0
# Take an explicit copy so that later in-place changes to `data` work on an
# independent frame rather than on a slice of `order`
# (avoids SettingWithCopyWarning).
data = order.loc[bool_id, :].copy()

print(data.shape)
print("*" * 80)

#删除异常
#bool_id=order.loc[:,'销量']<=0
#index=order.loc[bool_id,:].index
#
#data=order.drop(labels=index,axis=0,inplace=False)

#按照类别进行分组，统计销量的和
#进行dataframe或者series的值排序
#如果是series，sort_values()直接按照series的值进行排序
#如果df那么需要指定按照哪一列进行排序，by=列名

#默认是升序ascending=True
#ascending=False降序
#res=data.groupby(by='类别ID')['销量'].sum().sort_values(ascending=False)
#
#print(res)

#2、哪些商品比较畅销？
#分组聚合实现
#res=data.groupby(by='商品ID')['销量'].sum().sort_values(ascending=False).head(10)
#
#print(res)

#透视表实现
#res=pd.pivot_table(data=data.loc[:,['商品ID','销量']],index='商品ID',values='销量',aggfunc='sum').sort_values(by='销量',
#ascending=False).head(
#10)
#print(res)


#3、求不同门店的销售额占比
#提示：订单中没有销售额字段，所以需要新增一个销售额字段。增加字段后按照门店编号进行分组，然后计算占比。

##先计算销售额
#data.loc[:,'销售额']=data.loc[:,'单价']*data.loc[:,'销量']
#
##按照门店编号进行分组统计销售额的sum
#res=data.groupby(by='门店编号')['销售额'].sum()
##print(res)
##计算所有的销售额总和
#all_=res.sum()
#
##print(all_)
#per_=res/all_
#
#print("各个门店的销售额占比为：\n",per_.apply(lambda x:format(x,".2%")))

#a=100.105
#print("%.2f"%a)

#print("{}%".format(2.0))

#匿名函数
#print(lambda x:x+5)#
#
#def add(x):
##    return x+5

#4、哪段时间段是超市的客流高峰期？
#提示：需要知道每个时间段对应的客流量，但是订单表中既有日期又有时间，我们需要从中提出小时数，这里利用订单ID去重计数代表客流量。

# Count distinct orders per hour of day to find the busiest hours.

# De-duplicate on the order id so that each order is counted once.
# `subset` names the column(s) compared for duplicates; pass a list for
# several columns.
# The result is rebound to an explicit copy instead of using inplace=True:
# `data` was produced by filtering another frame, and mutating such a slice
# in place triggers SettingWithCopyWarning.
data = data.drop_duplicates(subset='订单ID').copy()

# print(data.shape)

# Convert the transaction time to pandas' native datetime type. Plain column
# assignment is used because `data.loc[:, col] = ...` sets values in place on
# pandas >= 2.0 and may keep the column's previous dtype.
data['成交时间'] = pd.to_datetime(data['成交时间'])

# Store the hour of each transaction as a new column.
data['hour'] = data['成交时间'].dt.hour

# print(data)

# Number of orders per hour, busiest hour first.
res = data.groupby(by='hour')['订单ID'].count().sort_values(ascending=False)

print(res)

以上就是本文的全部内容，希望对大家的学习有所帮助，也希望大家多多支持毛票票。

pandas数据处理进阶详解

热门推荐

随机推荐