Grouping and aggregation
1. GroupBy object
·Parameters of the groupby function:
as_index: controls whether the group labels become the index of the aggregated output. The default is True, which produces a (possibly hierarchical) index of the group keys. With False, the group keys are kept as ordinary columns and a fresh default integer index is added — roughly equivalent to calling reset_index on the result.
So what is the difference between the two label/index layouts? === It determines whether the key column(s) serve as the index.
sort_values: sorts the selected column(s) of numeric data in ascending order (note: it returns a new object by default — assign the result back, or use inplace=True, to overwrite the original).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pyplot
%matplotlib inline
dict_obj = {'key1' : ['a', 'b', 'a', 'b',
'a', 'b', 'a', 'a'],
'key2' : ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'data1': np.random.randn(8),
'data2': np.random.randn(8)}
df_obj = pd.DataFrame(dict_obj)
print (df_obj)
key1 key2 data1 data2
0 a one -0.147612 -0.348087
1 b one -0.992986 0.902458
2 a two 0.547541 -0.310040
3 b three 0.458871 -1.895392
4 a two 1.224041 0.220150
5 b two -0.200124 -1.562237
6 a one 1.539144 -0.758716
7 a three 0.385845 0.074309
'''1. dataframe according to key2 Grouping'''
print(df_obj.groupby('key2')['key1'].count())
print (type(df_obj.groupby('key1')))
key2
one 3
three 2
two 3
Name: key1, dtype: int64
<class 'pandas.core.groupby.generic.DataFrameGroupBy'>
'''2. Specify column by key1 Grouping'''
print (type(df_obj['data1'].groupby(df_obj['key1'])))
<class 'pandas.core.groupby.generic.SeriesGroupBy'>
grouped1 = df_obj.groupby('key1',as_index=False)
print (grouped1.mean())
grouped2 = df_obj['data1'].groupby(df_obj['key1'])
print (grouped2.mean())
key1 data1 data2
0 a 0.709792 -0.224477
1 b -0.244746 -0.851723
key1
a 0.709792
b -0.244746
Name: data1, dtype: float64
'''3. Customize key Group, list'''
self_def_key = [1, 1, 2, 2, 2, 1, 1, 1]
df_obj.groupby(self_def_key).mean()
|
data1 |
data2 |
1 |
0.116853 |
-0.338455 |
2 |
0.743484 |
-0.661761 |
df_obj
|
key1 |
key2 |
data1 |
data2 |
0 |
a |
one |
-0.147612 |
-0.348087 |
1 |
b |
one |
-0.992986 |
0.902458 |
2 |
a |
two |
0.547541 |
-0.310040 |
3 |
b |
three |
0.458871 |
-1.895392 |
4 |
a |
two |
1.224041 |
0.220150 |
5 |
b |
two |
-0.200124 |
-1.562237 |
6 |
a |
one |
1.539144 |
-0.758716 |
7 |
a |
three |
0.385845 |
0.074309 |
'''4. Group by multiple columns = = = By list'''
grouped2 = df_obj.groupby(['key1', 'key2'],as_index=False)
print (grouped2.mean())
print('--------compare asindex Difference-------')
grouped2 = df_obj.groupby(['key1', 'key2'],as_index=True)
print (grouped2.mean())
key1 key2 data1 data2
0 a one 0.695766 -0.553401
1 a three 0.385845 0.074309
2 a two 0.885791 -0.044945
3 b one -0.992986 0.902458
4 b three 0.458871 -1.895392
5 b two -0.200124 -1.562237
--------compare asindex Difference-------
data1 data2
key1 key2
a one 0.695766 -0.553401
three 0.385845 0.074309
two 0.885791 -0.044945
b one -0.992986 0.902458
three 0.458871 -1.895392
two -0.200124 -1.562237
grouped3 = df_obj.groupby(['key2', 'key1'])
print (grouped3.mean())
print ('=============================================')
'''PS: If you want to group and aggregate by column === unstack===You can also use transpose'''
print (grouped3.mean().unstack())
data1 data2
key2 key1
one a 0.695766 -0.553401
b -0.992986 0.902458
three a 0.385845 0.074309
b 0.458871 -1.895392
two a 0.885791 -0.044945
b -0.200124 -1.562237
=============================================
data1 data2
key1 a b a b
key2
one 0.695766 -0.992986 -0.553401 0.902458
three 0.385845 0.458871 0.074309 -1.895392
two 0.885791 -0.200124 -0.044945 -1.562237
Iterating over a GroupBy object === "traversal" of the groups
grouped1
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001AF5B5F9088>
print(grouped1.head(5))
print("------------------------------------Dividing line------------------------------------------")
for group_name, group_data in grouped1:
print (group_name)
print (group_data['data1'])
key1 key2 data1 data2
0 a one -0.147612 -0.348087
1 b one -0.992986 0.902458
2 a two 0.547541 -0.310040
3 b three 0.458871 -1.895392
4 a two 1.224041 0.220150
5 b two -0.200124 -1.562237
6 a one 1.539144 -0.758716
7 a three 0.385845 0.074309
------------------------------------Dividing line------------------------------------------
a
0 -0.147612
2 0.547541
4 1.224041
6 1.539144
7 0.385845
Name: data1, dtype: float64
b
1 -0.992986
3 0.458871
5 -0.200124
Name: data1, dtype: float64
for group_name, group_data in grouped2:
print (group_name)
print (group_data)
('a', 'one')
key1 key2 data1 data2
0 a one -0.147612 -0.348087
6 a one 1.539144 -0.758716
('a', 'three')
key1 key2 data1 data2
7 a three 0.385845 0.074309
('a', 'two')
key1 key2 data1 data2
2 a two 0.547541 -0.31004
4 a two 1.224041 0.22015
('b', 'one')
key1 key2 data1 data2
1 b one -0.992986 0.902458
('b', 'three')
key1 key2 data1 data2
3 b three 0.458871 -1.895392
('b', 'two')
key1 key2 data1 data2
5 b two -0.200124 -1.562237
print(grouped1.mean())
list(grouped1)
key1 data1 data2
0 a 0.709792 -0.224477
1 b -0.244746 -0.851723
[('a', key1 key2 data1 data2
0 a one -0.147612 -0.348087
2 a two 0.547541 -0.310040
4 a two 1.224041 0.220150
6 a one 1.539144 -0.758716
7 a three 0.385845 0.074309), ('b', key1 key2 data1 data2
1 b one -0.992986 0.902458
3 b three 0.458871 -1.895392
5 b two -0.200124 -1.562237)]
dict(list(grouped1))
{'a': key1 key2 data1 data2
0 a one -0.147612 -0.348087
2 a two 0.547541 -0.310040
4 a two 1.224041 0.220150
6 a one 1.539144 -0.758716
7 a three 0.385845 0.074309, 'b': key1 key2 data1 data2
1 b one -0.992986 0.902458
3 b three 0.458871 -1.895392
5 b two -0.200124 -1.562237}
print (df_obj.dtypes)
df_obj.groupby(df_obj.dtypes, axis=1).size()
df_obj.groupby(df_obj.dtypes, axis=1).sum()
key1 object
key2 object
data1 float64
data2 float64
dtype: object
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2896 try:
-> 2897 return self._engine.get_loc(key)
2898 except KeyError:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: dtype('float64')
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-15-f2dbc4ebe631> in <module>
4 # Group by data type
5 df_obj.groupby(df_obj.dtypes, axis=1).size()
----> 6 df_obj.groupby(df_obj.dtypes, axis=1).sum()
~\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in f(self, **kwargs)
1380 if isinstance(result, DataFrame):
1381 for col in result.columns:
-> 1382 result[col] = self._try_cast(result[col], self.obj[col])
1383 else:
1384 result = self._try_cast(result, self.obj)
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2978 if self.columns.nlevels > 1:
2979 return self._getitem_multilevel(key)
-> 2980 indexer = self.columns.get_loc(key)
2981 if is_integer(indexer):
2982 indexer = [indexer]
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2897 return self._engine.get_loc(key)
2898 except KeyError:
-> 2899 return self._engine.get_loc(self._maybe_cast_indexer(key))
2900 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
2901 if indexer.ndim > 1 or indexer.size > 1:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: dtype('float64')
Other grouping methods
- In fact, a list can also be used as the grouping key
=== when lists are used, they usually correspond to multi-level indexes
# Build a 5x5 DataFrame of random ints, then knock a few cells out to NaN
# so the later fill/aggregation examples have missing data to work with.
df_obj2 = pd.DataFrame(np.random.randint(1, 10, (5,5)),
columns=['a', 'b', 'c', 'd', 'e'],
index=['A', 'B', 'C', 'D', 'E'])
# .ix is deprecated (emits FutureWarning, removed in pandas 1.0); use the
# purely positional .iloc instead. np.nan replaces the removed np.NaN alias.
df_obj2.iloc[1, 1:4] = np.nan
df_obj2
C:\Users\wztli\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: FutureWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
after removing the cwd from sys.path.
|
a |
b |
c |
d |
e |
A |
4 |
2.0 |
6.0 |
5.0 |
9 |
B |
5 |
NaN |
NaN |
NaN |
6 |
C |
2 |
3.0 |
8.0 |
6.0 |
3 |
D |
9 |
5.0 |
6.0 |
5.0 |
9 |
E |
4 |
1.0 |
6.0 |
2.0 |
1 |
- Group by dictionary
mapping_dict = {'A':'python', 'B':'python', 'C':'java', 'D':'C', 'E':'java'}
print(df_obj2.groupby(mapping_dict, axis=0).sum())
a b c d e
C 9 5.0 6.0 5.0 9
java 6 4.0 14.0 8.0 4
python 9 2.0 6.0 5.0 15
- Group by function
df_obj3 = pd.DataFrame(np.random.randint(1, 10, (5,5)),
columns=['a', 'b', 'c', 'd', 'e'],
index=['AA', 'BBB', 'CC', 'D', 'EE'])
def group_key(idx):
    """Grouping key function: map each row/column index label to its length."""
    return len(idx)
df_obj3.groupby(group_key).size()
1 1
2 3
3 1
dtype: int64
- Group by level index
columns = pd.MultiIndex.from_arrays([['Python', 'Java', 'Python', 'Java', 'Python'],
['A', 'A', 'B', 'C', 'B']], names=['language', 'index'])
df_obj4 = pd.DataFrame(np.random.randint(1, 10, (5, 5)), columns=columns)
df_obj4
language |
Python |
Java |
Python |
Java |
Python |
index |
A |
A |
B |
C |
B |
0 |
4 |
6 |
8 |
8 |
4 |
1 |
1 |
3 |
2 |
3 |
5 |
2 |
3 |
1 |
1 |
5 |
6 |
3 |
2 |
9 |
3 |
1 |
9 |
4 |
4 |
1 |
5 |
6 |
6 |
df_obj4.groupby(level='language', axis=1).sum()
df_obj4.groupby(level='index', axis=1).sum()
index |
A |
B |
C |
0 |
10 |
12 |
8 |
1 |
4 |
7 |
3 |
2 |
4 |
7 |
5 |
3 |
11 |
12 |
1 |
4 |
5 |
11 |
6 |
Aggregation
dict_obj = {'key1' : ['a', 'b', 'a', 'b',
'a', 'b', 'a', 'a'],
'key2' : ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'data1': np.random.randint(1,10, 8),
'data2': np.random.randint(1,10, 8)}
df_obj5 = pd.DataFrame(dict_obj)
print (df_obj5)
key1 key2 data1 data2
0 a one 9 4
1 b one 6 7
2 a two 9 4
3 b three 9 6
4 a two 6 2
5 b two 3 3
6 a one 1 1
7 a three 2 6
- Built in aggregate functions
df_obj5
|
key1 |
key2 |
data1 |
data2 |
0 |
a |
one |
9 |
4 |
1 |
b |
one |
6 |
7 |
2 |
a |
two |
9 |
4 |
3 |
b |
three |
9 |
6 |
4 |
a |
two |
6 |
2 |
5 |
b |
two |
3 |
3 |
6 |
a |
one |
1 |
1 |
7 |
a |
three |
2 |
6 |
print (df_obj5.groupby('key1').mean())
'''
count: number of non-NA values in the group
std: standard deviation
var: variance
median: median of the non-NA values
mean: mean of the non-NA values
25% / 50% / 75%: percentiles (quartiles) — what do they mean here? == not computed in this example
'''
data1 data2
key1
a 5.4 3.400000
b 6.0 5.333333
'\ncount: In group NA Value\nstd: standard deviation\nvar: variance\nmedian: wrong NA Median in\nmean: wrong NA Average value\n25%||50%||75%What do you mean?==Not made?\n'
- Custom aggregate function
def peak_range(df):
    """Return the range of the values, i.e. max - min."""
    return df.max() - df.min()
print (df_obj5.groupby('key1').agg(peak_range))
data1 data2
key1
a 8 5
b 6 4
- Applying multiple aggregate functions at the same time: agg
print (df_obj.groupby('key1').agg(['mean', 'std', 'count']))
data1 data2
mean std count mean std count
key1
a 0.709792 0.674293 5 -0.224477 0.385674 5
b -0.244746 0.726957 3 -0.851723 1.528271 3
print (df_obj.groupby('key1').agg(['mean', 'std', 'count', ('range', peak_range)]))
data1 data2
mean std count range mean std count range
key1
a 0.709792 0.674293 5 1.686756 -0.224477 0.385674 5 0.978865
b -0.244746 0.726957 3 1.451857 -0.851723 1.528271 3 2.797850
dict_mapping = {'data1':'mean',
'data2':'sum'}
print (df_obj.groupby('key1').agg(dict_mapping))
data1 data2
key1
a 0.709792 -1.122384
b -0.244746 -2.555170
dict_mapping = {'data1':['mean','max'],
'data2':'sum'}
print (df_obj.groupby('key1').agg(dict_mapping))
data1 data2
mean max sum
key1
a 0.709792 1.539144 -1.122384
b -0.244746 0.458871 -2.555170
Grouping operation
import pandas as pd
import numpy as np
Arithmetic and alignment
s1 = pd.Series(range(10, 20), index = range(10))
s2 = pd.Series(range(20, 25), index = range(5))
print ('s1: ' )
print (s1)
print('===========================')
print ('s2: ')
print (s2)
s1:
0 10
1 11
2 12
3 13
4 14
5 15
6 16
7 17
8 18
9 19
dtype: int64
===========================
s2:
0 20
1 21
2 22
3 23
4 24
dtype: int64
s1 + s2
print(s1+s2)
0 30.0
1 32.0
2 34.0
3 36.0
4 38.0
5 NaN
6 NaN
7 NaN
8 NaN
9 NaN
dtype: float64
df1 = pd.DataFrame(np.ones((2,2)), columns = ['a', 'b'])
df2 = pd.DataFrame(np.ones((3,3)), columns = ['a', 'b', 'c'])
print ('df1: ')
print (df1)
print ('=================')
print ('df2: ')
print (df2)
df1:
a b
0 1.0 1.0
1 1.0 1.0
=================
df2:
a b c
0 1.0 1.0 1.0
1 1.0 1.0 1.0
2 1.0 1.0 1.0
print(df1 + df2)
a b c
0 2.0 2.0 NaN
1 2.0 2.0 NaN
2 NaN NaN NaN
① Common arithmetic functions
print(s1.add(s2, fill_value = -1))
0 30.0
1 32.0
2 34.0
3 36.0
4 38.0
5 14.0
6 15.0
7 16.0
8 17.0
9 18.0
dtype: float64
df1.sub(df2, fill_value = 2.)
|
a |
b |
c |
0 |
0.0 |
0.0 |
1.0 |
1 |
0.0 |
0.0 |
1.0 |
2 |
1.0 |
1.0 |
1.0 |
s3 = s1 + s2
print (s3)
0 30.0
1 32.0
2 34.0
3 36.0
4 38.0
5 NaN
6 NaN
7 NaN
8 NaN
9 NaN
dtype: float64
s3_filled = s3.fillna(-1)
print (s3)
0 30.0
1 32.0
2 34.0
3 36.0
4 38.0
5 NaN
6 NaN
7 NaN
8 NaN
9 NaN
dtype: float64
df3 = df1 + df2
print (df3)
a b c
0 2.0 2.0 NaN
1 2.0 2.0 NaN
2 NaN NaN NaN
df3.fillna(100, inplace = True)
print (df3)
a b c
0 2.0 2.0 100.0
1 2.0 2.0 100.0
2 100.0 100.0 100.0
Statistical calculations should be distinguished from aggregation: aggregation requires grouping first.
df_obj1 = pd.DataFrame(np.random.randn(5,4), columns = ['a', 'b', 'c', 'd'])
print(df_obj1)
a b c d
0 -0.542708 0.201376 1.111431 1.784324
1 0.583422 0.231096 -2.801967 0.568497
2 -0.577329 -1.668581 -0.842126 1.803080
3 -0.128431 -1.769619 2.089983 0.209761
4 0.493981 -1.571405 0.690019 -0.215292
print(df_obj1.sum(axis=1))
print('=====================================')
print(df_obj1.max())
print('=====================================')
print(df_obj1.min(axis=1))
0 2.554423
1 -1.418952
2 -1.284956
3 0.401694
4 -0.602698
dtype: float64
=====================================
a 0.583422
b 0.231096
c 2.089983
d 1.803080
dtype: float64
=====================================
0 -0.542708
1 -2.801967
2 -1.668581
3 -1.769619
4 -1.571405
dtype: float64
Data grouping operation
dict_obj = {'key1' : ['a', 'b', 'a', 'b',
'a', 'b', 'a', 'a'],
'key2' : ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'data1': np.random.randint(1, 10, 8),
'data2': np.random.randint(1, 10, 8)}
df_obj = pd.DataFrame(dict_obj)
df_obj
|
key1 |
key2 |
data1 |
data2 |
0 |
a |
one |
4 |
3 |
1 |
b |
one |
4 |
4 |
2 |
a |
two |
9 |
6 |
3 |
b |
three |
8 |
2 |
4 |
a |
two |
3 |
3 |
5 |
b |
two |
6 |
2 |
6 |
a |
one |
4 |
1 |
7 |
a |
three |
2 |
2 |
k1_sum = df_obj.groupby('key1').sum().add_prefix('sum_')
print(k1_sum)
print('================================')
print(df_obj)
sum_data1 sum_data2
key1
a 22 15
b 18 8
================================
key1 key2 data1 data2
0 a one 4 3
1 b one 4 4
2 a two 9 6
3 b three 8 2
4 a two 3 3
5 b two 6 2
6 a one 4 1
7 a three 2 2
- merge method
pd.merge(df_obj, k1_sum, left_on='key1', right_index=True)
|
key1 |
key2 |
data1 |
data2 |
sum_data1 |
sum_data2 |
0 |
a |
one |
4 |
3 |
22 |
15 |
2 |
a |
two |
9 |
6 |
22 |
15 |
4 |
a |
two |
3 |
3 |
22 |
15 |
6 |
a |
one |
4 |
1 |
22 |
15 |
7 |
a |
three |
2 |
2 |
22 |
15 |
1 |
b |
one |
4 |
4 |
18 |
8 |
3 |
b |
three |
8 |
2 |
18 |
8 |
5 |
b |
two |
6 |
2 |
18 |
8 |
- transform method
k1_sum_tf = df_obj.groupby('key1').transform(np.sum).add_prefix('sum_')
df_obj[k1_sum_tf.columns] = k1_sum_tf
df_obj
|
key1 |
key2 |
data1 |
data2 |
sum_key2 |
sum_data1 |
sum_data2 |
0 |
a |
one |
4 |
3 |
onetwotwoonethree |
22 |
15 |
1 |
b |
one |
4 |
4 |
onethreetwo |
18 |
8 |
2 |
a |
two |
9 |
6 |
onetwotwoonethree |
22 |
15 |
3 |
b |
three |
8 |
2 |
onethreetwo |
18 |
8 |
4 |
a |
two |
3 |
3 |
onetwotwoonethree |
22 |
15 |
5 |
b |
two |
6 |
2 |
onethreetwo |
18 |
8 |
6 |
a |
one |
4 |
1 |
onetwotwoonethree |
22 |
15 |
7 |
a |
three |
2 |
2 |
onetwotwoonethree |
22 |
15 |
- Custom function
#Custom function passed in transform
def diff_mean(s):
    """Return each value's deviation from the (group) mean of the data."""
    return s - s.mean()
df_obj.groupby('key1').transform(diff_mean)
|
data1 |
data2 |
sum_data1 |
sum_data2 |
0 |
-0.4 |
0.000000 |
0 |
0 |
1 |
-2.0 |
1.333333 |
0 |
0 |
2 |
4.6 |
3.000000 |
0 |
0 |
3 |
2.0 |
-0.666667 |
0 |
0 |
4 |
-1.4 |
0.000000 |
0 |
0 |
5 |
0.0 |
-0.666667 |
0 |
0 |
6 |
-0.4 |
-2.000000 |
0 |
0 |
7 |
-2.4 |
-1.000000 |
0 |
0 |
dataset_path = './data/starcraft.csv'
df_data = pd.read_csv(dataset_path, usecols=['LeagueIndex', 'Age', 'HoursPerWeek',
'TotalHours', 'APM'])
def top_n(df, n=3, column='APM'):
    """
    Return the top *n* rows of *df*, ranked by *column* in descending order.
    """
    ranked = df.sort_values(by=column, ascending=False)
    return ranked.head(n)
df_data.groupby('LeagueIndex').apply(top_n)
|
|
LeagueIndex |
Age |
HoursPerWeek |
TotalHours |
APM |
LeagueIndex |
|
|
|
|
|
|
1 |
2214 |
1 |
20.0 |
12.0 |
730.0 |
172.9530 |
2246 |
1 |
27.0 |
8.0 |
250.0 |
141.6282 |
1753 |
1 |
20.0 |
28.0 |
100.0 |
139.6362 |
2 |
3062 |
2 |
20.0 |
6.0 |
100.0 |
179.6250 |
3229 |
2 |
16.0 |
24.0 |
110.0 |
156.7380 |
1520 |
2 |
29.0 |
6.0 |
250.0 |
151.6470 |
3 |
1557 |
3 |
22.0 |
6.0 |
200.0 |
226.6554 |
484 |
3 |
19.0 |
42.0 |
450.0 |
220.0692 |
2883 |
3 |
16.0 |
8.0 |
800.0 |
208.9500 |
4 |
2688 |
4 |
26.0 |
24.0 |
990.0 |
249.0210 |
1759 |
4 |
16.0 |
6.0 |
75.0 |
229.9122 |
2637 |
4 |
23.0 |
24.0 |
650.0 |
227.2272 |
5 |
3277 |
5 |
18.0 |
16.0 |
950.0 |
372.6426 |
93 |
5 |
17.0 |
36.0 |
720.0 |
335.4990 |
202 |
5 |
37.0 |
14.0 |
800.0 |
327.7218 |
6 |
734 |
6 |
16.0 |
28.0 |
730.0 |
389.8314 |
2746 |
6 |
16.0 |
28.0 |
4000.0 |
350.4114 |
1810 |
6 |
21.0 |
14.0 |
730.0 |
323.2506 |
7 |
3127 |
7 |
23.0 |
42.0 |
2000.0 |
298.7952 |
104 |
7 |
21.0 |
24.0 |
1000.0 |
286.4538 |
1654 |
7 |
18.0 |
98.0 |
700.0 |
236.0316 |
8 |
3393 |
8 |
NaN |
NaN |
NaN |
375.8664 |
3373 |
8 |
NaN |
NaN |
NaN |
364.8504 |
3372 |
8 |
NaN |
NaN |
NaN |
355.3518 |
df_data.groupby('LeagueIndex').apply(top_n, n=2, column='Age')
|
|
LeagueIndex |
Age |
HoursPerWeek |
TotalHours |
APM |
LeagueIndex |
|
|
|
|
|
|
1 |
3146 |
1 |
40.0 |
12.0 |
150.0 |
38.5590 |
3040 |
1 |
39.0 |
10.0 |
500.0 |
29.8764 |
2 |
920 |
2 |
43.0 |
10.0 |
730.0 |
86.0586 |
2437 |
2 |
41.0 |
4.0 |
200.0 |
54.2166 |
3 |
1258 |
3 |
41.0 |
14.0 |
800.0 |
77.6472 |
2972 |
3 |
40.0 |
10.0 |
500.0 |
60.5970 |
4 |
1696 |
4 |
44.0 |
6.0 |
500.0 |
89.5266 |
1729 |
4 |
39.0 |
8.0 |
500.0 |
86.7246 |
5 |
202 |
5 |
37.0 |
14.0 |
800.0 |
327.7218 |
2745 |
5 |
37.0 |
18.0 |
1000.0 |
123.4098 |
6 |
3069 |
6 |
31.0 |
8.0 |
800.0 |
133.1790 |
2706 |
6 |
31.0 |
8.0 |
700.0 |
66.9918 |
7 |
2813 |
7 |
26.0 |
36.0 |
1300.0 |
188.5512 |
1992 |
7 |
26.0 |
24.0 |
1000.0 |
219.6690 |
8 |
3340 |
8 |
NaN |
NaN |
NaN |
189.7404 |
3341 |
8 |
NaN |
NaN |
NaN |
287.8128 |
df_data.groupby('LeagueIndex', group_keys=False).apply(top_n)
|
LeagueIndex |
Age |
HoursPerWeek |
TotalHours |
APM |
2214 |
1 |
20.0 |
12.0 |
730.0 |
172.9530 |
2246 |
1 |
27.0 |
8.0 |
250.0 |
141.6282 |
1753 |
1 |
20.0 |
28.0 |
100.0 |
139.6362 |
3062 |
2 |
20.0 |
6.0 |
100.0 |
179.6250 |
3229 |
2 |
16.0 |
24.0 |
110.0 |
156.7380 |
1520 |
2 |
29.0 |
6.0 |
250.0 |
151.6470 |
1557 |
3 |
22.0 |
6.0 |
200.0 |
226.6554 |
484 |
3 |
19.0 |
42.0 |
450.0 |
220.0692 |
2883 |
3 |
16.0 |
8.0 |
800.0 |
208.9500 |
2688 |
4 |
26.0 |
24.0 |
990.0 |
249.0210 |
1759 |
4 |
16.0 |
6.0 |
75.0 |
229.9122 |
2637 |
4 |
23.0 |
24.0 |
650.0 |
227.2272 |
3277 |
5 |
18.0 |
16.0 |
950.0 |
372.6426 |
93 |
5 |
17.0 |
36.0 |
720.0 |
335.4990 |
202 |
5 |
37.0 |
14.0 |
800.0 |
327.7218 |
734 |
6 |
16.0 |
28.0 |
730.0 |
389.8314 |
2746 |
6 |
16.0 |
28.0 |
4000.0 |
350.4114 |
1810 |
6 |
21.0 |
14.0 |
730.0 |
323.2506 |
3127 |
7 |
23.0 |
42.0 |
2000.0 |
298.7952 |
104 |
7 |
21.0 |
24.0 |
1000.0 |
286.4538 |
1654 |
7 |
18.0 |
98.0 |
700.0 |
236.0316 |
3393 |
8 |
NaN |
NaN |
NaN |
375.8664 |
3373 |
8 |
NaN |
NaN |
NaN |
364.8504 |
3372 |
8 |
NaN |
NaN |
NaN |
355.3518 |