博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
8-pandas聚合运算
阅读量:7058 次
发布时间:2019-06-28

本文共 7498 字,大约阅读时间需要 24 分钟。

hot3.png

# -*- coding: utf-8 -*-
"""Demonstrate pandas groupby aggregation:

- built-in aggregation functions
- custom aggregation functions
- applying several aggregations at once
- different aggregations per column
- resetting the index
"""
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        'key1': ['a', 'a', 'b', 'b', 'a'],
        'key2': ['one', 'two', 'one', 'two', 'one'],
        'data1': np.random.randint(1, 10, 5),
        'data2': np.random.randint(1, 10, 5),
    })
print(df)

# Built-in aggregation functions.
# Select the numeric columns explicitly: since pandas 2.0, summing/averaging
# non-numeric ("nuisance") columns such as key2 raises a TypeError instead of
# silently dropping them.
print(df.groupby('key1')[['data1', 'data2']].sum())
print(df.groupby('key1')[['data1', 'data2']].mean())
# min/max are defined for strings as well, so key2 can stay included here.
print(df.groupby('key1').min())
print(df.groupby('key1').max())
# Full summary statistics per group.
print(df.groupby('key1').describe())

# Custom aggregation function: the per-group spread.
def peak_range(s):
    """Return the spread (max - min) of Series *s*."""
    return s.max() - s.min()

grouped = df.groupby('key1')[['data1', 'data2']]
print(df)
print(grouped.agg(peak_range))

# Several aggregations at once; a (name, func) tuple renames the result column.
print(grouped.agg(['std', 'mean', 'sum', peak_range]))
print(grouped.agg(['std', 'mean', 'sum', ('range', peak_range)]))

# A different aggregation for each column.
d = {'data1': 'mean', 'data2': 'sum'}
print(grouped.agg(d))
d = {'data1': ['mean', 'sum'], 'data2': ['sum', ('range', peak_range)]}
print(grouped.agg(d))
# reset_index turns the group key back into an ordinary column.
print(grouped.agg(d).reset_index())

# Advanced usage: attach per-group means back onto the original rows.
df = pd.DataFrame(
    {
        'key1': ['a', 'a', 'b', 'b', 'a'],
        'key2': ['one', 'two', 'one', 'two', 'one'],
        'data1': np.random.randint(1, 10, 5),
        'data2': np.random.randint(1, 10, 5),
    })
k1_mean = df.groupby('key1')[['data1', 'data2']].mean()
print(k1_mean)
# Rename the aggregated columns before merging so they do not collide.
k1_mean = df.groupby('key1')[['data1', 'data2']].mean().add_prefix('mean_')
print(k1_mean)
print(pd.merge(df, k1_mean, left_on='key1', right_index=True))

# Alternative: transform broadcasts the group mean back to the original shape.
# Use the string 'mean' — passing np.mean to groupby ops is deprecated.
k1_mean = df.groupby('key1')[['data1', 'data2']].transform('mean').add_prefix('mean_')
print(k1_mean)
df[k1_mean.columns] = k1_mean
print(df)

df = pd.DataFrame(
    np.random.randint(1, 10, (5, 5)),
    columns=list('abcde'),
    index=['Alice', 'Bob', 'Candy', 'Dark', 'Emily'])
print(df)

def demean(s):
    """Center Series *s* on its own mean."""
    return s - s.mean()

key = ['one', 'one', 'two', 'one', 'two']
demeaned = df.groupby(key).transform(demean)
print(demeaned)
# The demeaned group means are zero up to floating-point error.
print(demeaned.groupby(key).mean())

# Return the rows with the largest values in one column.
df = pd.DataFrame(
    {
        'key1': ['a', 'a', 'b', 'b', 'a', 'a', 'a', 'b', 'b', 'a'],
        'key2': ['one', 'two', 'one', 'two', 'one', 'one', 'two', 'one', 'two', 'one'],
        'data1': np.random.randint(1, 10, 10),
        'data2': np.random.randint(1, 10, 10),
    })
print(df)

def top(g, n=2, column='data1'):
    """Return the *n* rows of *g* with the largest values in *column*."""
    return g.sort_values(by=column, ascending=False)[:n]

print(df)
print(top(df))
print(top(df, n=3))
print(df)
print(df.groupby('key1').apply(top))

# Fill NaN values with the mean of the group each entry belongs to.
states = [
    '成都', '贵州', '重庆', '西藏',
    '广州', '杭州', '福州', '海南']
group_key = ['East'] * 4 + ['West'] * 4
data = pd.Series(np.random.randn(8), index=states)
# np.nan (lowercase): the np.NaN alias was removed in NumPy 2.0.
data[['重庆', '福州', '海南']] = np.nan
print(data)
print(data.groupby(group_key).mean())

def fill_mean(g):
    """Replace NaN entries of *g* with g's own mean."""
    return g.fillna(g.mean())

print(data.groupby(group_key).apply(fill_mean))

 

转载于:https://my.oschina.net/eddylinux/blog/1529836

你可能感兴趣的文章
最大子序列和O(N)算法简单分析『神兽必读』
查看>>
elasticsearch之排序
查看>>
Magento表单使用ajax验证
查看>>
mac上安装使用docker
查看>>
MySQL5.7.11压缩包版安装
查看>>
如何提升代码水平
查看>>
Elixir 分布式 Application 故障转移和接管
查看>>
微软为 Chrome 和 Firefox 发布了 Windows Defender 扩展
查看>>
英特尔基于 LLVM 的 SYCL 开源编译器现已发布
查看>>
java B2B2C 多租户电子商城系统
查看>>
(十二)java springboot b2b2c shop 多用户商城系统源码- SSO单点登录之OAuth2.0 登出流程(3)...
查看>>
换个角度看GAN:另一种损失函数
查看>>
连接mysql报错Table ‘performance_schema.session_variables’ ...
查看>>
Linux基础命令---zipinfo
查看>>
关于移动互联网产品的指标分析初探
查看>>
4.2019Android多线程总结
查看>>
Java数据解析之JSON
查看>>
大师级直播程序源码,一次让您玩个够
查看>>
跟我学 K8S--代码: 调试 Kubernetes
查看>>
烟沙浮生 | 此间少年(2018-10-15 第五周记)
查看>>