Python Data Analysis Lesson 06: Data Cleaning and Preliminary Analysis Notes

Articles Catalogue

Lagrange Interpolation Method for Missing Value Processing
Data frame merge
Merge on Index
Axial connection
Merge overlapping data
Reshaping Hierarchical Index
Conversion of Length-Width Format
Remove duplicate data
Data Conversion Using Functions or Maps
Data standardization

Minimum-Maximum Normalization
Zero-Mean Normalization
Normalization by decimal scaling

Replacement value
Renamed axis index
Discretization and panel partition
Detection and filtering of outliers
Arrangement and Random Sampling
Computational Indicators and Dumb Variables
Attribute Construction
String Object Method
regular expression
Vectorized string functions in pandas
Example: USDA Food Database

Lagrange Interpolation Method for Missing Value Processing

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from scipy.interpolate import lagrange  # Import Lagrange interpolation function

np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))

inputfile = 'data/catering_sale.xls'  # Sales Data Path
outputfile = 'data/sales.xls'  # Output data path

data = pd.read_excel(inputfile)  # read in data

# Filter outliers to null values
"""
data[u'Sales volume'][(data[u'Sales volume'] < 400) | (data[u'Sales volume'] > 5000)] = None  
//There will be a warning if it is written as follows:
SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

//If you want to change the original data, use a single assignment operation (loc):
data.loc[(data[u'Sales volume'] < 400) | (data[u'Sales volume'] > 5000), u'Sales volume'] = None

//If you want a copy, be sure to force Pandas to create a copy:
error_data = data.copy() 
error_data.loc[(error_data[u'Sales volume'] < 400) | (error_data[u'Sales volume'] > 5000), u'Sales volume'] = None

//Reference: https://www.jianshu.com/p/72274 ccb647a
"""
data.loc[(data[u'Sales volume'] < 400) | (data[u'Sales volume'] > 5000), u'Sales volume'] = None


# Custom Column Vector Interpolation Function
# s is the column vector, n is the interpolated position, k is the number of data before and after taking, the default is 5.
def ployinterp_column(s, n, k=5):
    y = s[list(range(n - k, n)) + list(range(n + 1, n + 1 + k))]  # Fetch number
    y = y[y.notnull()]  # Exclude null values
    return lagrange(y.index, list(y))(n)  # Interpolate and return the result of the interpolation


# Determine whether interpolation is necessary by element
for i in data.columns:
    for j in range(len(data)):
        if (data[i].isnull())[j]:  # If null, interpolate.
            """
            data[i][j] = ployinterp_column(data[i], j)  
            //In this way, there will be warnings:
            SettingWithCopyWarning: 
            A value is trying to be set on a copy of a slice from a DataFrame
            """
            data.loc[j, i] = ployinterp_column(data[i], j)

data.to_excel(outputfile)  # Output result, write to file

Data frame merge

#Data frame merge
# 1
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df2 = DataFrame({'key': ['a', 'b', 'd'],
                 'data2': range(3)})
print(df1)
print(df2)

   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   a
6      6   b
   data2 key
0      0   a
1      1   b
2      2   d

pd.merge(df1, df2)

	data1	key	data2
0	0	b	1
1	1	b	1
2	6	b	1
3	2	a	0
4	4	a	0
5	5	a	0

pd.merge(df1, df2, on='key')

	data1	key	data2
0	0	b	1
1	1	b	1
2	6	b	1
3	2	a	0
4	4	a	0
5	5	a	0

# 2
df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df4 = DataFrame({'rkey': ['a', 'b', 'd'],
                 'data2': range(3)})

pd.merge(df3, df4, left_on='lkey', right_on='rkey')

	data1	lkey	data2	rkey
0	0	b	1	b
1	1	b	1	b
2	6	b	1	b
3	2	a	0	a
4	4	a	0	a
5	5	a	0	a

pd.merge(df1, df2, how='outer')

	data1	key	data2
0	0.0	b	1.0
1	1.0	b	1.0
2	6.0	b	1.0
3	2.0	a	0.0
4	4.0	a	0.0
5	5.0	a	0.0
6	3.0	c	NaN
7	NaN	d	2.0

# 3
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                 'data1': range(6)})
df2 = DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
                 'data2': range(5)})
print(df1)
print(df2)

   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   b
   data2 key
0      0   a
1      1   b
2      2   a
3      3   b
4      4   d

pd.merge(df1, df2, on='key', how='left')

	data1	key	data2
0	0	b	1.0
1	0	b	3.0
2	1	b	1.0
3	1	b	3.0
4	2	a	0.0
5	2	a	2.0
6	3	c	NaN
7	4	a	0.0
8	4	a	2.0
9	5	b	1.0
10	5	b	3.0

pd.merge(df1, df2, how='inner')

	data1	key	data2
0	0	b	1
1	0	b	3
2	1	b	1
3	1	b	3
4	5	b	1
5	5	b	3
6	2	a	0
7	2	a	2
8	4	a	0
9	4	a	2

# 4
left = DataFrame({'key1': ['foo', 'foo', 'bar'],
                  'key2': ['one', 'two', 'one'],
                  'lval': [1, 2, 3]})
right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                   'key2': ['one', 'one', 'one', 'two'],
                   'rval': [4, 5, 6, 7]})
pd.merge(left, right, on=['key1', 'key2'], how='outer')

	key1	key2	lval	rval
0	foo	one	1.0	4.0
1	foo	one	1.0	5.0
2	foo	two	2.0	NaN
3	bar	one	3.0	6.0
4	bar	two	NaN	7.0

# 5
pd.merge(left, right, on='key1')

	key1	key2_x	lval	key2_y	rval
0	foo	one	1	one	4
1	foo	one	1	one	5
2	foo	two	2	one	4
3	foo	two	2	one	5
4	bar	one	3	one	6
5	bar	one	3	two	7

pd.merge(left, right, on='key1', suffixes=('_left', '_right'))

	key1	key2_left	lval	key2_right	rval
0	foo	one	1	one	4
1	foo	one	1	one	5
2	foo	two	2	one	4
3	foo	two	2	one	5
4	bar	one	3	one	6
5	bar	one	3	two	7

Merge on Index

# 1
left1 = DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'], 'value': range(6)})
right1 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
print(left1)
print(right1)

  key  value
0   a      0
1   b      1
2   a      2
3   a      3
4   b      4
5   c      5
   group_val
a        3.5
b        7.0

pd.merge(left1, right1, left_on='key', right_index=True)

	key	value	group_val
0	a	0	3.5
2	a	2	3.5
3	a	3	3.5
1	b	1	7.0
4	b	4	7.0

pd.merge(left1, right1, left_on='key', right_index=True, how='outer')

	key	value	group_val
0	a	0	3.5
2	a	2	3.5
3	a	3	3.5
1	b	1	7.0
4	b	4	7.0
5	c	5	NaN

# 2
lefth = DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
                   'key2': [2000, 2001, 2002, 2001, 2002],
                   'data': np.arange(5.)})
righth = DataFrame(np.arange(12).reshape((6, 2)),
                   index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
                          [2001, 2000, 2000, 2000, 2001, 2002]],
                   columns=['event1', 'event2'])
print(lefth)
print(righth)

   data    key1  key2
0   0.0    Ohio  2000
1   1.0    Ohio  2001
2   2.0    Ohio  2002
3   3.0  Nevada  2001
4   4.0  Nevada  2002
             event1  event2
Nevada 2001       0       1
       2000       2       3
Ohio   2000       4       5
       2000       6       7
       2001       8       9
       2002      10      11

pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)

	data	key1	key2	event1	event2
0	0.0	Ohio	2000	4	5
0	0.0	Ohio	2000	6	7
1	1.0	Ohio	2001	8	9
2	2.0	Ohio	2002	10	11
3	3.0	Nevada	2001	0	1

pd.merge(lefth, righth, left_on=['key1', 'key2'],
         right_index=True, how='outer')

	data	key1	key2	event1	event2
0	0.0	Ohio	2000	4.0	5.0
0	0.0	Ohio	2000	6.0	7.0
1	1.0	Ohio	2001	8.0	9.0
2	2.0	Ohio	2002	10.0	11.0
3	3.0	Nevada	2001	0.0	1.0
4	4.0	Nevada	2002	NaN	NaN
4	NaN	Nevada	2000	2.0	3.0

left2 = DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=['a', 'c', 'e'],
                  columns=['Ohio', 'Nevada'])
right2 = DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                   index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])
print(left2)
print(right2)

   Ohio  Nevada
a   1.0     2.0
c   3.0     4.0
e   5.0     6.0
   Missouri  Alabama
b       7.0      8.0
c       9.0     10.0
d      11.0     12.0
e      13.0     14.0

pd.merge(left2, right2, how='outer', left_index=True, right_index=True)

	Ohio	Nevada	Missouri	Alabama
a	1.0	2.0	NaN	NaN
b	NaN	NaN	7.0	8.0
c	3.0	4.0	9.0	10.0
d	NaN	NaN	11.0	12.0
e	5.0	6.0	13.0	14.0

# 3
left2.join(right2, how='outer')

	Ohio	Nevada	Missouri	Alabama
a	1.0	2.0	NaN	NaN
b	NaN	NaN	7.0	8.0
c	3.0	4.0	9.0	10.0
d	NaN	NaN	11.0	12.0
e	5.0	6.0	13.0	14.0

left1.join(right1, on='key')

	key	value	group_val
0	a	0	3.5
1	b	1	7.0
2	a	2	3.5
3	a	3	3.5
4	b	4	7.0
5	c	5	NaN

# 4
another = DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                    index=['a', 'c', 'e', 'f'], columns=['New York', 'Oregon'])
left2.join([right2, another])

	Ohio	Nevada	Missouri	Alabama	New York	Oregon
a	1.0	2.0	NaN	NaN	7.0	8.0
c	3.0	4.0	9.0	10.0	9.0	10.0
e	5.0	6.0	13.0	14.0	11.0	12.0

left2.join([right2, another], how='outer')

	Ohio	Nevada	Missouri	Alabama	New York	Oregon
a	1.0	2.0	NaN	NaN	7.0	8.0
b	NaN	NaN	7.0	8.0	NaN	NaN
c	3.0	4.0	9.0	10.0	9.0	10.0
d	NaN	NaN	11.0	12.0	NaN	NaN
e	5.0	6.0	13.0	14.0	11.0	12.0
f	NaN	NaN	NaN	NaN	16.0	17.0

Axial connection

# 1
arr = np.arange(12).reshape((3, 4))
print(arr)

np.concatenate([arr, arr], axis=1)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]





array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

# 2
s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = Series([5, 6], index=['f', 'g'])

pd.concat([s1, s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

pd.concat([s1, s2, s3], axis=0)

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

pd.concat([s1, s2, s3], axis=1)

	0	1	2
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0

s4 = pd.concat([s1 * 5, s3])
print(s4)

a    0
b    5
f    5
g    6
dtype: int64

pd.concat([s1, s4], axis=1)

	0	1
a	0.0	0
b	1.0	5
f	NaN	5
g	NaN	6

pd.concat([s1, s4], axis=1, join='inner')

	0	1
a	0	0
b	1	5

pd.concat([s1, s4], axis=1, join_axes=[['a', 'c', 'b', 'e']])

	0	1
a	0.0	0.0
c	NaN	NaN
b	1.0	5.0
e	NaN	NaN

# 3
result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])
print(result)

one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64

result.unstack()

	a	b	f	g
one	0.0	1.0	NaN	NaN
two	0.0	1.0	NaN	NaN
three	NaN	NaN	5.0	6.0

# 4
pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])

	one	two	three
a	0.0	NaN	NaN
b	1.0	NaN	NaN
c	NaN	2.0	NaN
d	NaN	3.0	NaN
e	NaN	4.0	NaN
f	NaN	NaN	5.0
g	NaN	NaN	6.0

df1 = DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
                columns=['one', 'two'])
df2 = DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
                columns=['three', 'four'])

pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])

	level1		level2
	one	two	three	four
a	0	1	5.0	6.0
b	2	3	NaN	NaN
c	4	5	7.0	8.0

pd.concat({'level1': df1, 'level2': df2}, axis=1)

	level1		level2
	one	two	three	four
a	0	1	5.0	6.0
b	2	3	NaN	NaN
c	4	5	7.0	8.0

pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],
          names=['upper', 'lower'])

upper	level1		level2
lower	one	two	three	four
a	0	1	5.0	6.0
b	2	3	NaN	NaN
c	4	5	7.0	8.0

# 5
df1 = DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
print(df1)
print(df2)

          a         b         c         d
0 -0.204708  0.478943 -0.519439 -0.555730
1  1.965781  1.393406  0.092908  0.281746
2  0.769023  1.246435  1.007189 -1.296221
          b         d         a
0  0.274992  0.228913  1.352917
1  0.886429 -2.001637 -0.371843

pd.concat([df1, df2], ignore_index=True)

	a	b	c	d
0	-0.204708	0.478943	-0.519439	-0.555730
1	1.965781	1.393406	0.092908	0.281746
2	0.769023	1.246435	1.007189	-1.296221
3	1.352917	0.274992	NaN	0.228913
4	-0.371843	0.886429	NaN	-2.001637

Merge overlapping data

# 1
a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b = Series(np.arange(len(a), dtype=np.float64),
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b[-1] = np.nan

print(a)
print(b)

np.where(pd.isnull(a), b, a)

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64
f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64





array([ 0. ,  2.5,  2. ,  3.5,  4.5,  nan])

# 2
b[:-2].combine_first(a[2:])

a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64

# 3
df1 = DataFrame({'a': [1., np.nan, 5., np.nan],
                 'b': [np.nan, 2., np.nan, 6.],
                 'c': range(2, 18, 4)})
df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],
                 'b': [np.nan, 3., 4., 6., 8.]})
df1.combine_first(df2)

	a	b	c
0	1.0	NaN	2.0
1	4.0	2.0	6.0
2	5.0	4.0	10.0
3	3.0	6.0	14.0
4	7.0	8.0	NaN

Reshaping Hierarchical Index

# 1
data = DataFrame(np.arange(6).reshape((2, 3)),
                 index=pd.Index(['Ohio', 'Colorado'], name='state'),
                 columns=pd.Index(['one', 'two', 'three'], name='number'))
print(data)

number    one  two  three
state                    
Ohio        0    1      2
Colorado    3    4      5

# stack converts column to index
result = data.stack()
print(result)

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

[External Link Picture Transfer Failure (img-RFupWnTz-1565689273685)(data/stack1.png)]

# unstack converts index to column
result.unstack()

number	one	two	three
state
Ohio	0	1	2
Colorado	3	4	5

[External Link Picture Transfer Failure (img-zYzZClXX-1565689273686)(data/unstack1.png)]

result.unstack(0)

state	Ohio	Colorado
number
one	0	3
two	1	4
three	2	5

result.unstack('state')

state	Ohio	Colorado
number
one	0	3
two	1	4
three	2	5

# 2
s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
print(data2)

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64

# data2.stack() # No column execution will report errors

data2.unstack()

	a	b	c	d	e
one	0.0	1.0	2.0	3.0	NaN
two	NaN	NaN	4.0	5.0	6.0

[External Link Picture Transfer Failure (img-HNbyj5MF-1565689273687)(data/unstack2.png)]

data2.unstack().stack()

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64

[External Link Picture Transfer Failure (img-zQBjoEri-1565689273688)(data/stack2.png)]

data2.unstack().stack(dropna=False)

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64

# 3
df = DataFrame({'left': result, 'right': result + 5},
               columns=pd.Index(['left', 'right'], name='side'))
print(df)

side             left  right
state    number             
Ohio     one        0      5
         two        1      6
         three      2      7
Colorado one        3      8
         two        4      9
         three      5     10

df.unstack('state')

side	left		right
state	Ohio	Colorado	Ohio	Colorado
number
one	0	3	5	8
two	1	4	6	9
three	2	5	7	10

df.unstack('state').stack('side')

	state	Colorado	Ohio
number	side
one	left	3	0
one	right	8	5
two	left	4	1
two	right	9	6
three	left	5	2
three	right	10	7

Conversion of Length-Width Format

data = pd.read_csv('data/macrodata.csv')
print(data)

       year  quarter    realgdp  realcons   realinv  realgovt  realdpi  \
0    1959.0      1.0   2710.349    1707.4   286.898   470.045   1886.9   
1    1959.0      2.0   2778.801    1733.7   310.859   481.301   1919.7   
2    1959.0      3.0   2775.488    1751.8   289.226   491.260   1916.4   
3    1959.0      4.0   2785.204    1753.7   299.356   484.052   1931.3   
4    1960.0      1.0   2847.699    1770.5   331.722   462.199   1955.5   
5    1960.0      2.0   2834.390    1792.9   298.152   460.400   1966.1   
6    1960.0      3.0   2839.022    1785.8   296.375   474.676   1967.8   
7    1960.0      4.0   2802.616    1788.2   259.764   476.434   1966.6   
8    1961.0      1.0   2819.264    1787.7   266.405   475.854   1984.5   
9    1961.0      2.0   2872.005    1814.3   286.246   480.328   2014.4   
10   1961.0      3.0   2918.419    1823.1   310.227   493.828   2041.9   
11   1961.0      4.0   2977.830    1859.6   315.463   502.521   2082.0   
12   1962.0      1.0   3031.241    1879.4   334.271   520.960   2101.7   
13   1962.0      2.0   3064.709    1902.5   331.039   523.066   2125.2   
14   1962.0      3.0   3093.047    1917.9   336.962   538.838   2137.0   
15   1962.0      4.0   3100.563    1945.1   325.650   535.912   2154.6   
16   1963.0      1.0   3141.087    1958.2   343.721   522.917   2172.5   
17   1963.0      2.0   3180.447    1976.9   348.730   518.108   2193.1   
18   1963.0      3.0   3240.332    2003.8   360.102   546.893   2217.9   
19   1963.0      4.0   3264.967    2020.6   364.534   532.383   2254.6   
20   1964.0      1.0   3338.246    2060.5   379.523   529.686   2299.6   
21   1964.0      2.0   3376.587    2096.7   377.778   526.175   2362.1   
22   1964.0      3.0   3422.469    2135.2   386.754   522.008   2392.7   
23   1964.0      4.0   3431.957    2141.2   389.910   514.603   2420.4   
24   1965.0      1.0   3516.251    2188.8   429.145   508.006   2447.4   
25   1965.0      2.0   3563.960    2213.0   429.119   508.931   2474.5   
26   1965.0      3.0   3636.285    2251.0   444.444   529.446   2542.6   
27   1965.0      4.0   3724.014    2314.3   446.493   544.121   2594.1   
28   1966.0      1.0   3815.423    2348.5   484.244   556.593   2618.4   
29   1966.0      2.0   3828.124    2354.5   475.408   571.371   2624.7   
..      ...      ...        ...       ...       ...       ...      ...   
173  2002.0      2.0  11538.770    7997.8  1810.779   774.408   8658.9   
174  2002.0      3.0  11596.430    8052.0  1814.531   786.673   8629.2   
175  2002.0      4.0  11598.824    8080.6  1813.219   799.967   8649.6   
176  2003.0      1.0  11645.819    8122.3  1813.141   800.196   8681.3   
177  2003.0      2.0  11738.706    8197.8  1823.698   838.775   8812.5   
178  2003.0      3.0  11935.461    8312.1  1889.883   839.598   8935.4   
179  2003.0      4.0  12042.817    8358.0  1959.783   845.722   8986.4   
180  2004.0      1.0  12127.623    8437.6  1970.015   856.570   9025.9   
181  2004.0      2.0  12213.818    8483.2  2055.580   861.440   9115.0   
182  2004.0      3.0  12303.533    8555.8  2082.231   876.385   9175.9   
183  2004.0      4.0  12410.282    8654.2  2125.152   865.596   9303.4   
184  2005.0      1.0  12534.113    8719.0  2170.299   869.204   9189.6   
185  2005.0      2.0  12587.535    8802.9  2131.468   870.044   9253.0   
186  2005.0      3.0  12683.153    8865.6  2154.949   890.394   9308.0   
187  2005.0      4.0  12748.699    8888.5  2232.193   875.557   9358.7   
188  2006.0      1.0  12915.938    8986.6  2264.721   900.511   9533.8   
189  2006.0      2.0  12962.462    9035.0  2261.247   892.839   9617.3   
190  2006.0      3.0  12965.916    9090.7  2229.636   892.002   9662.5   
191  2006.0      4.0  13060.679    9181.6  2165.966   894.404   9788.8   
192  2007.0      1.0  13099.901    9265.1  2132.609   882.766   9830.2   
193  2007.0      2.0  13203.977    9291.5  2162.214   898.713   9842.7   
194  2007.0      3.0  13321.109    9335.6  2166.491   918.983   9883.9   
195  2007.0      4.0  13391.249    9363.6  2123.426   925.110   9886.2   
196  2008.0      1.0  13366.865    9349.6  2082.886   943.372   9826.8   
197  2008.0      2.0  13415.266    9351.0  2026.518   961.280  10059.0   
198  2008.0      3.0  13324.600    9267.7  1990.693   991.551   9838.3   
199  2008.0      4.0  13141.920    9195.3  1857.661  1007.273   9920.4   
200  2009.0      1.0  12925.410    9209.2  1558.494   996.287   9926.4   
201  2009.0      2.0  12901.504    9189.0  1456.678  1023.528  10077.5   
202  2009.0      3.0  12990.341    9256.0  1486.398  1044.088  10040.6   

         cpi      m1  tbilrate  unemp      pop  infl  realint  
0     28.980   139.7      2.82    5.8  177.146  0.00     0.00  
1     29.150   141.7      3.08    5.1  177.830  2.34     0.74  
2     29.350   140.5      3.82    5.3  178.657  2.74     1.09  
3     29.370   140.0      4.33    5.6  179.386  0.27     4.06  
4     29.540   139.6      3.50    5.2  180.007  2.31     1.19  
5     29.550   140.2      2.68    5.2  180.671  0.14     2.55  
6     29.750   140.9      2.36    5.6  181.528  2.70    -0.34  
7     29.840   141.1      2.29    6.3  182.287  1.21     1.08  
8     29.810   142.1      2.37    6.8  182.992 -0.40     2.77  
9     29.920   142.9      2.29    7.0  183.691  1.47     0.81  
10    29.980   144.1      2.32    6.8  184.524  0.80     1.52  
11    30.040   145.2      2.60    6.2  185.242  0.80     1.80  
12    30.210   146.4      2.73    5.6  185.874  2.26     0.47  
13    30.220   146.5      2.78    5.5  186.538  0.13     2.65  
14    30.380   146.7      2.78    5.6  187.323  2.11     0.67  
15    30.440   148.3      2.87    5.5  188.013  0.79     2.08  
16    30.480   149.7      2.90    5.8  188.580  0.53     2.38  
17    30.690   151.3      3.03    5.7  189.242  2.75     0.29  
18    30.750   152.6      3.38    5.5  190.028  0.78     2.60  
19    30.940   153.7      3.52    5.6  190.668  2.46     1.06  
20    30.950   154.8      3.51    5.5  191.245  0.13     3.38  
21    31.020   156.8      3.47    5.2  191.889  0.90     2.57  
22    31.120   159.2      3.53    5.0  192.631  1.29     2.25  
23    31.280   160.7      3.76    5.0  193.223  2.05     1.71  
24    31.380   162.0      3.93    4.9  193.709  1.28     2.65  
25    31.580   163.1      3.84    4.7  194.303  2.54     1.30  
26    31.650   166.0      3.93    4.4  194.997  0.89     3.04  
27    31.880   169.1      4.35    4.1  195.539  2.90     1.46  
28    32.280   171.8      4.62    3.9  195.999  4.99    -0.37  
29    32.450   170.3      4.65    3.8  196.560  2.10     2.55  
..       ...     ...       ...    ...      ...   ...      ...  
173  180.000  1199.5      1.70    5.8  288.028  1.56     0.14  
174  181.200  1204.0      1.61    5.7  288.783  2.66    -1.05  
175  182.600  1226.8      1.20    5.8  289.421  3.08    -1.88  
176  183.200  1248.4      1.14    5.9  290.019  1.31    -0.17  
177  183.700  1287.9      0.96    6.2  290.704  1.09    -0.13  
178  184.900  1297.3      0.94    6.1  291.449  2.60    -1.67  
179  186.300  1306.1      0.90    5.8  292.057  3.02    -2.11  
180  187.400  1332.1      0.94    5.7  292.635  2.35    -1.42  
181  189.100  1340.5      1.21    5.6  293.310  3.61    -2.41  
182  190.800  1361.0      1.63    5.4  294.066  3.58    -1.95  
183  191.800  1366.6      2.20    5.4  294.741  2.09     0.11  
184  193.800  1357.8      2.69    5.3  295.308  4.15    -1.46  
185  194.700  1366.6      3.01    5.1  295.994  1.85     1.16  
186  199.200  1375.0      3.52    5.0  296.770  9.14    -5.62  
187  199.400  1380.6      4.00    4.9  297.435  0.40     3.60  
188  200.700  1380.5      4.51    4.7  298.061  2.60     1.91  
189  202.700  1369.2      4.82    4.7  298.766  3.97     0.85  
190  201.900  1369.4      4.90    4.7  299.593 -1.58     6.48  
191  203.574  1373.6      4.92    4.4  300.320  3.30     1.62  
192  205.920  1379.7      4.95    4.5  300.977  4.58     0.36  
193  207.338  1370.0      4.72    4.5  301.714  2.75     1.97  
194  209.133  1379.2      4.00    4.7  302.509  3.45     0.55  
195  212.495  1377.4      3.01    4.8  303.204  6.38    -3.37  
196  213.997  1384.0      1.56    4.9  303.803  2.82    -1.26  
197  218.610  1409.3      1.74    5.4  304.483  8.53    -6.79  
198  216.889  1474.7      1.17    6.0  305.270 -3.16     4.33  
199  212.174  1576.5      0.12    6.9  305.952 -8.79     8.91  
200  212.671  1592.8      0.22    8.1  306.547  0.94    -0.71  
201  214.469  1653.6      0.18    9.2  307.226  3.37    -3.19  
202  216.385  1673.9      0.12    9.6  308.013  3.56    -3.44  

[203 rows x 14 columns]

periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
print(periods)

PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', name='date', length=203, freq='Q-DEC')

data = DataFrame(data.to_records(),
                 columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'),
                 index=periods.to_timestamp('D', 'end'))
print(data)

item          realgdp  infl  unemp
date                              
1959-03-31   2710.349  0.00    5.8
1959-06-30   2778.801  2.34    5.1
1959-09-30   2775.488  2.74    5.3
1959-12-31   2785.204  0.27    5.6
1960-03-31   2847.699  2.31    5.2
1960-06-30   2834.390  0.14    5.2
1960-09-30   2839.022  2.70    5.6
1960-12-31   2802.616  1.21    6.3
1961-03-31   2819.264 -0.40    6.8
1961-06-30   2872.005  1.47    7.0
1961-09-30   2918.419  0.80    6.8
1961-12-31   2977.830  0.80    6.2
1962-03-31   3031.241  2.26    5.6
1962-06-30   3064.709  0.13    5.5
1962-09-30   3093.047  2.11    5.6
1962-12-31   3100.563  0.79    5.5
1963-03-31   3141.087  0.53    5.8
1963-06-30   3180.447  2.75    5.7
1963-09-30   3240.332  0.78    5.5
1963-12-31   3264.967  2.46    5.6
1964-03-31   3338.246  0.13    5.5
1964-06-30   3376.587  0.90    5.2
1964-09-30   3422.469  1.29    5.0
1964-12-31   3431.957  2.05    5.0
1965-03-31   3516.251  1.28    4.9
1965-06-30   3563.960  2.54    4.7
1965-09-30   3636.285  0.89    4.4
1965-12-31   3724.014  2.90    4.1
1966-03-31   3815.423  4.99    3.9
1966-06-30   3828.124  2.10    3.8
...               ...   ...    ...
2002-06-30  11538.770  1.56    5.8
2002-09-30  11596.430  2.66    5.7
2002-12-31  11598.824  3.08    5.8
2003-03-31  11645.819  1.31    5.9
2003-06-30  11738.706  1.09    6.2
2003-09-30  11935.461  2.60    6.1
2003-12-31  12042.817  3.02    5.8
2004-03-31  12127.623  2.35    5.7
2004-06-30  12213.818  3.61    5.6
2004-09-30  12303.533  3.58    5.4
2004-12-31  12410.282  2.09    5.4
2005-03-31  12534.113  4.15    5.3
2005-06-30  12587.535  1.85    5.1
2005-09-30  12683.153  9.14    5.0
2005-12-31  12748.699  0.40    4.9
2006-03-31  12915.938  2.60    4.7
2006-06-30  12962.462  3.97    4.7
2006-09-30  12965.916 -1.58    4.7
2006-12-31  13060.679  3.30    4.4
2007-03-31  13099.901  4.58    4.5
2007-06-30  13203.977  2.75    4.5
2007-09-30  13321.109  3.45    4.7
2007-12-31  13391.249  6.38    4.8
2008-03-31  13366.865  2.82    4.9
2008-06-30  13415.266  8.53    5.4
2008-09-30  13324.600 -3.16    6.0
2008-12-31  13141.920 -8.79    6.9
2009-03-31  12925.410  0.94    8.1
2009-06-30  12901.504  3.37    9.2
2009-09-30  12990.341  3.56    9.6

[203 rows x 3 columns]

ldata = data.stack().reset_index().rename(columns={0: 'value'})
print(ldata)

          date     item      value
0   1959-03-31  realgdp   2710.349
1   1959-03-31     infl      0.000
2   1959-03-31    unemp      5.800
3   1959-06-30  realgdp   2778.801
4   1959-06-30     infl      2.340
5   1959-06-30    unemp      5.100
6   1959-09-30  realgdp   2775.488
7   1959-09-30     infl      2.740
8   1959-09-30    unemp      5.300
9   1959-12-31  realgdp   2785.204
10  1959-12-31     infl      0.270
11  1959-12-31    unemp      5.600
12  1960-03-31  realgdp   2847.699
13  1960-03-31     infl      2.310
14  1960-03-31    unemp      5.200
15  1960-06-30  realgdp   2834.390
16  1960-06-30     infl      0.140
17  1960-06-30    unemp      5.200
18  1960-09-30  realgdp   2839.022
19  1960-09-30     infl      2.700
20  1960-09-30    unemp      5.600
21  1960-12-31  realgdp   2802.616
22  1960-12-31     infl      1.210
23  1960-12-31    unemp      6.300
24  1961-03-31  realgdp   2819.264
25  1961-03-31     infl     -0.400
26  1961-03-31    unemp      6.800
27  1961-06-30  realgdp   2872.005
28  1961-06-30     infl      1.470
29  1961-06-30    unemp      7.000
..         ...      ...        ...
579 2007-06-30  realgdp  13203.977
580 2007-06-30     infl      2.750
581 2007-06-30    unemp      4.500
582 2007-09-30  realgdp  13321.109
583 2007-09-30     infl      3.450
584 2007-09-30    unemp      4.700
585 2007-12-31  realgdp  13391.249
586 2007-12-31     infl      6.380
587 2007-12-31    unemp      4.800
588 2008-03-31  realgdp  13366.865
589 2008-03-31     infl      2.820
590 2008-03-31    unemp      4.900
591 2008-06-30  realgdp  13415.266
592 2008-06-30     infl      8.530
593 2008-06-30    unemp      5.400
594 2008-09-30  realgdp  13324.600
595 2008-09-30     infl     -3.160
596 2008-09-30    unemp      6.000
597 2008-12-31  realgdp  13141.920
598 2008-12-31     infl     -8.790
599 2008-12-31    unemp      6.900
600 2009-03-31  realgdp  12925.410
601 2009-03-31     infl      0.940
602 2009-03-31    unemp      8.100
603 2009-06-30  realgdp  12901.504
604 2009-06-30     infl      3.370
605 2009-06-30    unemp      9.200
606 2009-09-30  realgdp  12990.341
607 2009-09-30     infl      3.560
608 2009-09-30    unemp      9.600

[609 rows x 3 columns]

wdata = ldata.pivot(index='date',columns= 'item',values= 'value')
print(wdata)

item        infl    realgdp  unemp
date                              
1959-03-31  0.00   2710.349    5.8
1959-06-30  2.34   2778.801    5.1
1959-09-30  2.74   2775.488    5.3
1959-12-31  0.27   2785.204    5.6
1960-03-31  2.31   2847.699    5.2
1960-06-30  0.14   2834.390    5.2
1960-09-30  2.70   2839.022    5.6
1960-12-31  1.21   2802.616    6.3
1961-03-31 -0.40   2819.264    6.8
1961-06-30  1.47   2872.005    7.0
1961-09-30  0.80   2918.419    6.8
1961-12-31  0.80   2977.830    6.2
1962-03-31  2.26   3031.241    5.6
1962-06-30  0.13   3064.709    5.5
1962-09-30  2.11   3093.047    5.6
1962-12-31  0.79   3100.563    5.5
1963-03-31  0.53   3141.087    5.8
1963-06-30  2.75   3180.447    5.7
1963-09-30  0.78   3240.332    5.5
1963-12-31  2.46   3264.967    5.6
1964-03-31  0.13   3338.246    5.5
1964-06-30  0.90   3376.587    5.2
1964-09-30  1.29   3422.469    5.0
1964-12-31  2.05   3431.957    5.0
1965-03-31  1.28   3516.251    4.9
1965-06-30  2.54   3563.960    4.7
1965-09-30  0.89   3636.285    4.4
1965-12-31  2.90   3724.014    4.1
1966-03-31  4.99   3815.423    3.9
1966-06-30  2.10   3828.124    3.8
...          ...        ...    ...
2002-06-30  1.56  11538.770    5.8
2002-09-30  2.66  11596.430    5.7
2002-12-31  3.08  11598.824    5.8
2003-03-31  1.31  11645.819    5.9
2003-06-30  1.09  11738.706    6.2
2003-09-30  2.60  11935.461    6.1
2003-12-31  3.02  12042.817    5.8
2004-03-31  2.35  12127.623    5.7
2004-06-30  3.61  12213.818    5.6
2004-09-30  3.58  12303.533    5.4
2004-12-31  2.09  12410.282    5.4
2005-03-31  4.15  12534.113    5.3
2005-06-30  1.85  12587.535    5.1
2005-09-30  9.14  12683.153    5.0
2005-12-31  0.40  12748.699    4.9
2006-03-31  2.60  12915.938    4.7
2006-06-30  3.97  12962.462    4.7
2006-09-30 -1.58  12965.916    4.7
2006-12-31  3.30  13060.679    4.4
2007-03-31  4.58  13099.901    4.5
2007-06-30  2.75  13203.977    4.5
2007-09-30  3.45  13321.109    4.7
2007-12-31  6.38  13391.249    4.8
2008-03-31  2.82  13366.865    4.9
2008-06-30  8.53  13415.266    5.4
2008-09-30 -3.16  13324.600    6.0
2008-12-31 -8.79  13141.920    6.9
2009-03-31  0.94  12925.410    8.1
2009-06-30  3.37  12901.504    9.2
2009-09-30  3.56  12990.341    9.6

[203 rows x 3 columns]

# 2
ldata[:10]

	date	item	value
0	1959-03-31	realgdp	2710.349
1	1959-03-31	infl	0.000
2	1959-03-31	unemp	5.800
3	1959-06-30	realgdp	2778.801
4	1959-06-30	infl	2.340
5	1959-06-30	unemp	5.100
6	1959-09-30	realgdp	2775.488
7	1959-09-30	infl	2.740
8	1959-09-30	unemp	5.300
9	1959-12-31	realgdp	2785.204

pivoted = ldata.pivot('date', 'item', 'value')
pivoted.head()

item	infl	realgdp	unemp
date
1959-03-31	0.00	2710.349	5.8
1959-06-30	2.34	2778.801	5.1
1959-09-30	2.74	2775.488	5.3
1959-12-31	0.27	2785.204	5.6
1960-03-31	2.31	2847.699	5.2

ldata['value2'] = np.random.randn(len(ldata))
ldata[:10]

	date	item	value	value2
0	1959-03-31	realgdp	2710.349	1.669025
1	1959-03-31	infl	0.000	-0.438570
2	1959-03-31	unemp	5.800	-0.539741
3	1959-06-30	realgdp	2778.801	0.476985
4	1959-06-30	infl	2.340	3.248944
5	1959-06-30	unemp	5.100	-1.021228
6	1959-09-30	realgdp	2775.488	-0.577087
7	1959-09-30	infl	2.740	0.124121
8	1959-09-30	unemp	5.300	0.302614
9	1959-12-31	realgdp	2785.204	0.523772

pivoted = ldata.pivot('date', 'item')
pivoted[:5]

	value			value2
item	infl	realgdp	unemp	infl	realgdp	unemp
date
1959-03-31	0.00	2710.349	5.8	-0.438570	1.669025	-0.539741
1959-06-30	2.34	2778.801	5.1	3.248944	0.476985	-1.021228
1959-09-30	2.74	2775.488	5.3	0.124121	-0.577087	0.302614
1959-12-31	0.27	2785.204	5.6	0.000940	0.523772	1.343810
1960-03-31	2.31	2847.699	5.2	-0.831154	-0.713544	-2.370232

pivoted['value'][:5]

item	infl	realgdp	unemp
date
1959-03-31	0.00	2710.349	5.8
1959-06-30	2.34	2778.801	5.1
1959-09-30	2.74	2775.488	5.3
1959-12-31	0.27	2785.204	5.6
1960-03-31	2.31	2847.699	5.2

unstacked = ldata.set_index(['date', 'item']).unstack('item')
unstacked[:7]

	value			value2
item	infl	realgdp	unemp	infl	realgdp	unemp
date
1959-03-31	0.00	2710.349	5.8	-0.438570	1.669025	-0.539741
1959-06-30	2.34	2778.801	5.1	3.248944	0.476985	-1.021228
1959-09-30	2.74	2775.488	5.3	0.124121	-0.577087	0.302614
1959-12-31	0.27	2785.204	5.6	0.000940	0.523772	1.343810
1960-03-31	2.31	2847.699	5.2	-0.831154	-0.713544	-2.370232
1960-06-30	0.14	2834.390	5.2	-0.860757	-1.860761	0.560145
1960-09-30	2.70	2839.022	5.6	0.119827	-1.265934	-1.063512

Remove duplicate data

data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
print(data)

    k1  k2
0  one   1
1  one   1
2  one   2
3  two   3
4  two   3
5  two   4
6  two   4

data.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

data.drop_duplicates()

	k1	k2
0	one	1
2	one	2
3	two	3
5	two	4

data['v1'] = range(7)
print(data)

    k1  k2  v1
0  one   1   0
1  one   1   1
2  one   2   2
3  two   3   3
4  two   3   4
5  two   4   5
6  two   4   6

data.drop_duplicates(['k1'])

	k1	k2	v1
0	one	1	0
3	two	3	3

data.drop_duplicates(['k1', 'k2'], keep='last')

	k1	k2	v1
1	one	1	1
2	one	2	2
4	two	3	4
6	two	4	6

Data Conversion Using Functions or Maps

# 1
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                           'corned beef', 'Bacon', 'pastrami', 'honey ham',
                           'nova lox'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
print(data)

          food  ounces
0        bacon     4.0
1  pulled pork     3.0
2        bacon    12.0
3     Pastrami     6.0
4  corned beef     7.5
5        Bacon     8.0
6     pastrami     3.0
7    honey ham     5.0
8     nova lox     6.0

meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}

data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
print(data)

          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     pig
2        bacon    12.0     pig
3     Pastrami     6.0     cow
4  corned beef     7.5     cow
5        Bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon

data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

Data standardization

datafile = 'data/normalization_data.xls'  # Parameter initialization
data = pd.read_excel(datafile, header=None)  # Read data
print(data)

     0    1    2     3
0   78  521  602  2863
1  144 -600 -521  2245
2   95 -457  468 -1283
3   69  596  695  1054
4  190  527  691  2051
5  101  403  470  2487
6  146  413  435  2571

Minimum-Maximum Normalization

(data - data.min()) / (data.max() - data.min())  # Minimum-Maximum Normalization

	0	1	2	3
0	0.074380	0.937291	0.923520	1.000000
1	0.619835	0.000000	0.000000	0.850941
2	0.214876	0.119565	0.813322	0.000000
3	0.000000	1.000000	1.000000	0.563676
4	1.000000	0.942308	0.996711	0.804149
5	0.264463	0.838629	0.814967	0.909310
6	0.636364	0.846990	0.786184	0.929571

Zero-Mean Normalization

(data - data.mean()) / data.std()  # Zero-Mean Normalization

	0	1	2	3
0	-0.905383	0.635863	0.464531	0.798149
1	0.604678	-1.587675	-2.193167	0.369390
2	-0.516428	-1.304030	0.147406	-2.078279
3	-1.111301	0.784628	0.684625	-0.456906
4	1.657146	0.647765	0.675159	0.234796
5	-0.379150	0.401807	0.152139	0.537286
6	0.650438	0.421642	0.069308	0.595564

Normalization by decimal scaling

data / 10 ** np.ceil(np.log10(data.abs().max()))  # Normalization by decimal scaling

	0	1	2	3
0	0.078	0.521	0.602	0.2863
1	0.144	-0.600	-0.521	0.2245
2	0.095	-0.457	0.468	-0.1283
3	0.069	0.596	0.695	0.1054
4	0.190	0.527	0.691	0.2051
5	0.101	0.403	0.470	0.2487
6	0.146	0.413	0.435	0.2571

Replacement value

data = Series([1., -999., 2., -999., -1000., 3.])
print(data)

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

Renamed axis index

data = DataFrame(np.arange(12).reshape((3, 4)),
                 index=['Ohio', 'Colorado', 'New York'],
                 columns=['one', 'two', 'three', 'four'])
print(data)

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
New York    8    9     10    11

data.index.map(str.upper)

Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')

data.index = data.index.map(str.upper)
print(data)

          one  two  three  four
OHIO        0    1      2     3
COLORADO    4    5      6     7
NEW YORK    8    9     10    11

data.rename(index=str.title, columns=str.upper)

	ONE	TWO	THREE	FOUR
Ohio	0	1	2	3
Colorado	4	5	6	7
New York	8	9	10	11

data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

	one	two	peekaboo	four
INDIANA	0	1	2	3
COLORADO	4	5	6	7
NEW YORK	8	9	10	11

# Always return a reference to the DataFrame
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
print(data)

          one  two  three  four
INDIANA     0    1      2     3
COLORADO    4    5      6     7
NEW YORK    8    9     10    11

Discretization and panel partition

# 1
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
print(cats)

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [MiddleAged < Senior < YoungAdult < Youth]

data = np.random.rand(20)
pd.cut(data, 4, precision=2)

[(0.45, 0.67], (0.23, 0.45], (0.0037, 0.23], (0.45, 0.67], (0.67, 0.9], ..., (0.67, 0.9], (0.0037, 0.23], (0.0037, 0.23], (0.23, 0.45], (0.23, 0.45]]
Length: 20
Categories (4, interval[float64]): [(0.0037, 0.23] < (0.23, 0.45] < (0.45, 0.67] < (0.67, 0.9]]

# 2
data = np.random.randn(1000)  # Normally distributed
cats = pd.qcut(data, 4)  # Cut into quartiles
print(cats)

[(-0.022, 0.641], (-3.746, -0.635], (0.641, 3.26], (-3.746, -0.635], (-0.022, 0.641], ..., (-0.022, 0.641], (0.641, 3.26], (-0.635, -0.022], (0.641, 3.26], (-0.635, -0.022]]
Length: 1000
Categories (4, interval[float64]): [(-3.746, -0.635] < (-0.635, -0.022] < (-0.022, 0.641] < (0.641, 3.26]]

pd.value_counts(cats)

(0.641, 3.26]       250
(-0.022, 0.641]     250
(-0.635, -0.022]    250
(-3.746, -0.635]    250
dtype: int64

pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(-0.022, 1.302], (-1.266, -0.022], (-0.022, 1.302], (-3.746, -1.266], (-0.022, 1.302], ..., (-0.022, 1.302], (-0.022, 1.302], (-1.266, -0.022], (-0.022, 1.302], (-1.266, -0.022]]
Length: 1000
Categories (4, interval[float64]): [(-3.746, -1.266] < (-1.266, -0.022] < (-0.022, 1.302] < (1.302, 3.26]]

Detection and filtering of outliers

# 1
np.random.seed(12345)
data = DataFrame(np.random.randn(1000, 4))
data.describe()

	0	1	2	3
count	1000.000000	1000.000000	1000.000000	1000.000000
mean	-0.067684	0.067924	0.025598	-0.002298
std	0.998035	0.992106	1.006835	0.996794
min	-3.428254	-3.548824	-3.184377	-3.745356
25%	-0.774890	-0.591841	-0.641675	-0.644144
50%	-0.116401	0.101143	0.002073	-0.013611
75%	0.616366	0.780282	0.680391	0.654328
max	3.366626	2.653656	3.260383	3.927528

col = data[3]
col[np.abs(col) > 3]

97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64

data[(np.abs(data) > 3).any(1)]

	0	1	2	3
5	-0.539741	0.476985	3.248944	-1.021228
97	-0.774363	0.552936	0.106061	3.927528
102	-0.655054	-0.565230	3.176873	0.959533
305	-2.315555	0.457246	-0.025907	-3.399312
324	0.050188	1.951312	3.260383	0.963301
400	0.146326	0.508391	-0.196713	-3.745356
499	-0.293333	-0.242459	-3.056990	1.918403
523	-3.428254	-0.296336	-0.439938	-0.867165
586	0.275144	1.179227	-3.184377	1.369891
808	-0.362528	-3.548824	1.553205	-2.186301
900	3.366626	-2.372214	0.851010	1.332846

# 2
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

	0	1	2	3
count	1000.000000	1000.000000	1000.000000	1000.000000
mean	-0.067623	0.068473	0.025153	-0.002081
std	0.995485	0.990253	1.003977	0.989736
min	-3.000000	-3.000000	-3.000000	-3.000000
25%	-0.774890	-0.591841	-0.641675	-0.644144
50%	-0.116401	0.101143	0.002073	-0.013611
75%	0.616366	0.780282	0.680391	0.654328
max	3.000000	2.653656	3.000000	3.000000

Arrangement and Random Sampling

df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
print(df)

    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19

sampler = np.random.permutation(5)
print(sampler)

[1 0 2 3 4]

df.take(sampler)

	0	1	2	3
1	4	5	6	7
0	0	1	2	3
2	8	9	10	11
3	12	13	14	15
4	16	17	18	19

# 2
df.take(np.random.permutation(len(df))[:3])

	0	1	2	3
1	4	5	6	7
3	12	13	14	15
4	16	17	18	19

# 3
bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(0, len(bag), size=10)
print(sampler)

[4 4 2 2 2 0 3 0 4 1]

draws = bag.take(sampler)
print(draws)

[ 4  4 -1 -1 -1  5  6  5  4  7]

Computational Indicators and Dumb Variables

# 1
df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                'data1': range(6)})
print(df)

   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   b

pd.get_dummies(df['key'])

	a	b	c
0	0	1	0
1	0	1	0
2	1	0	0
3	0	0	1
4	1	0	0
5	0	1	0

dummies = pd.get_dummies(df['key'], prefix='key')
print(dummies)

   key_a  key_b  key_c
0      0      1      0
1      0      1      0
2      1      0      0
3      0      0      1
4      1      0      0
5      0      1      0

df_with_dummy = df[['data1']].join(dummies)
print(df_with_dummy)

   data1  key_a  key_b  key_c
0      0      0      1      0
1      1      0      1      0
2      2      1      0      0
3      3      0      0      1
4      4      1      0      0
5      5      0      1      0

# 2
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('data/movies.dat', sep='::', header=None,
                       names=mnames)
movies[:10]

C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  after removing the cwd from sys.path.

	movie_id	title	genres
0	1	Toy Story (1995)	Animation\|Children's\|Comedy
1	2	Jumanji (1995)	Adventure\|Children's\|Fantasy
2	3	Grumpier Old Men (1995)	Comedy\|Romance
3	4	Waiting to Exhale (1995)	Comedy\|Drama
4	5	Father of the Bride Part II (1995)	Comedy
5	6	Heat (1995)	Action\|Crime\|Thriller
6	7	Sabrina (1995)	Comedy\|Romance
7	8	Tom and Huck (1995)	Adventure\|Children's
8	9	Sudden Death (1995)	Action
9	10	GoldenEye (1995)	Action\|Adventure\|Thriller

genre_iter = (set(x.split('|')) for x in movies.genres)
genres = sorted(set.union(*genre_iter))
print(genres)

['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
print(dummies)

      Action  Adventure  Animation  Children's  Comedy  Crime  Documentary  \
0        0.0        0.0        0.0         0.0     0.0    0.0          0.0   
1        0.0        0.0        0.0         0.0     0.0    0.0          0.0   
2        0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3        0.0        0.0        0.0         0.0     0.0    0.0          0.0   
4        0.0        0.0        0.0         0.0     0.0    0.0          0.0   
5        0.0        0.0        0.0         0.0     0.0    0.0          0.0   
6        0.0        0.0        0.0         0.0     0.0    0.0          0.0   
7        0.0        0.0        0.0         0.0     0.0    0.0          0.0   
8        0.0        0.0        0.0         0.0     0.0    0.0          0.0   
9        0.0        0.0        0.0         0.0     0.0    0.0          0.0   
10       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
11       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
12       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
13       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
14       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
15       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
16       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
17       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
18       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
19       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
20       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
21       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
22       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
23       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
24       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
25       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
26       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
27       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
28       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
29       0.0        0.0        0.0         0.0     0.0    0.0          0.0   
...      ...        ...        ...         ...     ...    ...          ...   
3853     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3854     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3855     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3856     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3857     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3858     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3859     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3860     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3861     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3862     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3863     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3864     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3865     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3866     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3867     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3868     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3869     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3870     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3871     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3872     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3873     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3874     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3875     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3876     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3877     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3878     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3879     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3880     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3881     0.0        0.0        0.0         0.0     0.0    0.0          0.0   
3882     0.0        0.0        0.0         0.0     0.0    0.0          0.0   

      Drama  Fantasy  Film-Noir  Horror  Musical  Mystery  Romance  Sci-Fi  \
0       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
1       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
2       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
4       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
5       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
6       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
7       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
8       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
9       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
10      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
11      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
12      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
13      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
14      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
15      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
16      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
17      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
18      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
19      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
20      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
21      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
22      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
23      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
24      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
25      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
26      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
27      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
28      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
29      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
...     ...      ...        ...     ...      ...      ...      ...     ...   
3853    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3854    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3855    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3856    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3857    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3858    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3859    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3860    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3861    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3862    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3863    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3864    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3865    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3866    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3867    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3868    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3869    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3870    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3871    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3872    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3873    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3874    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3875    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3876    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3877    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3878    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3879    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3880    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3881    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   
3882    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0   

      Thriller  War  Western  
0          0.0  0.0      0.0  
1          0.0  0.0      0.0  
2          0.0  0.0      0.0  
3          0.0  0.0      0.0  
4          0.0  0.0      0.0  
5          0.0  0.0      0.0  
6          0.0  0.0      0.0  
7          0.0  0.0      0.0  
8          0.0  0.0      0.0  
9          0.0  0.0      0.0  
10         0.0  0.0      0.0  
11         0.0  0.0      0.0  
12         0.0  0.0      0.0  
13         0.0  0.0      0.0  
14         0.0  0.0      0.0  
15         0.0  0.0      0.0  
16         0.0  0.0      0.0  
17         0.0  0.0      0.0  
18         0.0  0.0      0.0  
19         0.0  0.0      0.0  
20         0.0  0.0      0.0  
21         0.0  0.0      0.0  
22         0.0  0.0      0.0  
23         0.0  0.0      0.0  
24         0.0  0.0      0.0  
25         0.0  0.0      0.0  
26         0.0  0.0      0.0  
27         0.0  0.0      0.0  
28         0.0  0.0      0.0  
29         0.0  0.0      0.0  
...        ...  ...      ...  
3853       0.0  0.0      0.0  
3854       0.0  0.0      0.0  
3855       0.0  0.0      0.0  
3856       0.0  0.0      0.0  
3857       0.0  0.0      0.0  
3858       0.0  0.0      0.0  
3859       0.0  0.0      0.0  
3860       0.0  0.0      0.0  
3861       0.0  0.0      0.0  
3862       0.0  0.0      0.0  
3863       0.0  0.0      0.0  
3864       0.0  0.0      0.0  
3865       0.0  0.0      0.0  
3866       0.0  0.0      0.0  
3867       0.0  0.0      0.0  
3868       0.0  0.0      0.0  
3869       0.0  0.0      0.0  
3870       0.0  0.0      0.0  
3871       0.0  0.0      0.0  
3872       0.0  0.0      0.0  
3873       0.0  0.0      0.0  
3874       0.0  0.0      0.0  
3875       0.0  0.0      0.0  
3876       0.0  0.0      0.0  
3877       0.0  0.0      0.0  
3878       0.0  0.0      0.0  
3879       0.0  0.0      0.0  
3880       0.0  0.0      0.0  
3881       0.0  0.0      0.0  
3882       0.0  0.0      0.0  

[3883 rows x 18 columns]

for i, gen in enumerate(movies.genres):
    dummies.loc[i, gen.split('|')] = 1

movies_windic = movies.join(dummies.add_prefix('Genre_'))
print(movies_windic.loc[0])

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Action                                   0
Genre_Adventure                                0
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Crime                                    0
Genre_Documentary                              0
Genre_Drama                                    0
Genre_Fantasy                                  0
Genre_Film-Noir                                0
Genre_Horror                                   0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Romance                                  0
Genre_Sci-Fi                                   0
Genre_Thriller                                 0
Genre_War                                      0
Genre_Western                                  0
Name: 0, dtype: object

# 3
np.random.seed(12345)
values = np.random.rand(10)
print(values)

[ 0.92961609  0.31637555  0.18391881  0.20456028  0.56772503  0.5955447
  0.96451452  0.6531771   0.74890664  0.65356987]

bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))

	(0.0, 0.2]	(0.2, 0.4]	(0.4, 0.6]	(0.6, 0.8]	(0.8, 1.0]
0	0	0	0	0	1
1	0	1	0	0	0
2	1	0	0	0	0
3	0	1	0	0	0
4	0	0	1	0	0
5	0	0	1	0	0
6	0	0	0	0	1
7	0	0	0	1	0
8	0	0	0	1	0
9	0	0	0	1	0

Attribute Construction

# Parameter initialization
inputfile = 'data/electricity_data.xls'  # Power supply and output data
outputfile = 'data/electricity_data.xls'  # Data file after attribute construction

data = pd.read_excel(inputfile)  # read in data
data[u'Line loss rate'] = (data[u'Electricity supply and input'] - data[u'Power supply and output']) / data[u'Electricity supply and input']

data.to_excel(outputfile, index=False)  # Preserve the results

String Object Method

val = 'a,b,  guido'
val.split(',')

['a', 'b', '  guido']

pieces = [x.strip() for x in val.split(',')]
print(pieces)

['a', 'b', 'guido']

first, second, third = pieces
first + '::' + second + '::' + third

'a::b::guido'

'::'.join(pieces)

'a::b::guido'

'guido' in val

True

val.index(',')

val.find(':')

-1

val.index(':')

---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-138-280f8b2856ce> in <module>()
----> 1 val.index(':')


ValueError: substring not found

val.count('a')

val.replace(',', '::')

'a::b::  guido'

val.replace(',', '')

'ab  guido'

regular expression

# 1
import re

text = "foo    bar\t baz  \tqux"
re.split('\s+', text)

['foo', 'bar', 'baz', 'qux']

regex = re.compile('\s+')
regex.split(text)

['foo', 'bar', 'baz', 'qux']

regex.findall(text)

['    ', '\t ', '  \t']

# 2
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# The function of re.IGNORECASE is to make regular expressions insensitive to case
regex = re.compile(pattern, flags=re.IGNORECASE)

regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

m = regex.search(text)
print(m)

<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>

text[m.start():m.end()]

'dave@google.com'

print(regex.match(text))

None

print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED

# 3
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)

m = regex.match('wesm@bright.net')
m.groups()

('wesm', 'bright', 'net')

regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com

# 4
regex = re.compile(r"""
    (?P<username>[A-Z0-9._%+-]+)
    @
    (?P<domain>[A-Z0-9.-]+)
    \.
    (?P<suffix>[A-Z]{2,4})""", flags=re.IGNORECASE | re.VERBOSE)

m = regex.match('wesm@bright.net')
m.groupdict()

{'domain': 'bright', 'suffix': 'net', 'username': 'wesm'}

Vectorized string functions in pandas

data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = Series(data)
print(data)

Dave     dave@google.com
Rob        rob@gmail.com
Steve    steve@gmail.com
Wes                  NaN
dtype: object

data.isnull()

Dave     False
Rob      False
Steve    False
Wes       True
dtype: bool

data.str.contains('gmail')

Dave     False
Rob       True
Steve     True
Wes        NaN
dtype: object

data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Rob        [(rob, gmail, com)]
Steve    [(steve, gmail, com)]
Wes                        NaN
dtype: object

matches = data.str.match(pattern, flags=re.IGNORECASE)
print(matches)

Dave     True
Rob      True
Steve    True
Wes       NaN
dtype: object

matches.str.get(1)

Dave    NaN
Rob     NaN
Steve   NaN
Wes     NaN
dtype: float64

matches.str[0]

Dave    NaN
Rob     NaN
Steve   NaN
Wes     NaN
dtype: float64

data.str[:5]

Dave     dave@
Rob      rob@g
Steve    steve
Wes        NaN
dtype: object

Example: USDA Food Database

'''
{
  "id": 21441,
  "description": "KENTUCKY FRIED CHICKEN, Fried Chicken, EXTRA CRISPY,
Wing, meat and skin with breading",
  "tags": ["KFC"],
  "manufacturer": "Kentucky Fried Chicken",
  "group": "Fast Foods",
  "portions": [
    {
      "amount": 1,
      "unit": "wing, with skin",
      "grams": 68.0
    },

    ...
  ],
  "nutrients": [
    {
      "value": 20.8,
      "units": "g",
      "description": "Protein",
      "group": "Composition"
    },

    ...
  ]
}
'''

import json

db = json.load(open('data/foods-2011-10-03.json'))
len(db)

db[0].keys()

dict_keys(['id', 'description', 'tags', 'manufacturer', 'group', 'portions', 'nutrients'])

db[0]['nutrients'][0]

{'description': 'Protein',
 'group': 'Composition',
 'units': 'g',
 'value': 25.18}

nutrients = DataFrame(db[0]['nutrients'])
nutrients[:7]

	description	group	units	value
0	Protein	Composition	g	25.18
1	Total lipid (fat)	Composition	g	29.20
2	Carbohydrate, by difference	Composition	g	3.06
3	Ash	Other	g	3.28
4	Energy	Energy	kcal	376.00
5	Water	Composition	g	39.28
6	Energy	Energy	kJ	1573.00

info_keys = ['description', 'group', 'id', 'manufacturer']
info = DataFrame(db, columns=info_keys)

info[:5]

	description	group	id
0	Cheese, caraway	Dairy and Egg Products	1008
1	Cheese, cheddar	Dairy and Egg Products	1009
2	Cheese, edam	Dairy and Egg Products	1018
3	Cheese, feta	Dairy and Egg Products	1019
4	Cheese, mozzarella, part skim milk	Dairy and Egg Products	1028

pd.value_counts(info.group)[:10]

Vegetables and Vegetable Products    812
Beef Products                        618
Baked Products                       496
Breakfast Cereals                    403
Fast Foods                           365
Legumes and Legume Products          365
Lamb, Veal, and Game Products        345
Sweets                               341
Fruits and Fruit Juices              328
Pork Products                        328
Name: group, dtype: int64

nutrients = []

for rec in db:
    fnuts = DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)

nutrients = pd.concat(nutrients, ignore_index=True)

print(nutrients)

                               description        group    units     value  \
0                                  Protein  Composition        g    25.180   
1                        Total lipid (fat)  Composition        g    29.200   
2              Carbohydrate, by difference  Composition        g     3.060   
3                                      Ash        Other        g     3.280   
4                                   Energy       Energy     kcal   376.000   
5                                    Water  Composition        g    39.280   
6                                   Energy       Energy       kJ  1573.000   
7                     Fiber, total dietary  Composition        g     0.000   
8                              Calcium, Ca     Elements       mg   673.000   
9                                 Iron, Fe     Elements       mg     0.640   
10                           Magnesium, Mg     Elements       mg    22.000   
11                           Phosphorus, P     Elements       mg   490.000   
12                            Potassium, K     Elements       mg    93.000   
13                              Sodium, Na     Elements       mg   690.000   
14                                Zinc, Zn     Elements       mg     2.940   
15                              Copper, Cu     Elements       mg     0.024   
16                           Manganese, Mn     Elements       mg     0.021   
17                            Selenium, Se     Elements      mcg    14.500   
18                           Vitamin A, IU     Vitamins       IU  1054.000   
19                                 Retinol     Vitamins      mcg   262.000   
20                          Vitamin A, RAE     Vitamins  mcg_RAE   271.000   
21          Vitamin C, total ascorbic acid     Vitamins       mg     0.000   
22                                 Thiamin     Vitamins       mg     0.031   
23                              Riboflavin     Vitamins       mg     0.450   
24                                  Niacin     Vitamins       mg     0.180   
25                        Pantothenic acid     Vitamins       mg     0.190   
26                             Vitamin B-6     Vitamins       mg     0.074   
27                           Folate, total     Vitamins      mcg    18.000   
28                            Vitamin B-12     Vitamins      mcg     0.270   
29                              Folic acid     Vitamins      mcg     0.000   
...                                    ...          ...      ...       ...   
389325                        Selenium, Se     Elements      mcg     1.100   
389326                       Vitamin A, IU     Vitamins       IU     5.000   
389327                             Retinol     Vitamins      mcg     0.000   
389328                      Vitamin A, RAE     Vitamins  mcg_RAE     0.000   
389329                      Carotene, beta     Vitamins      mcg     2.000   
389330                     Carotene, alpha     Vitamins      mcg     2.000   
389331        Vitamin E (alpha-tocopherol)     Vitamins       mg     0.250   
389332                           Vitamin D     Vitamins       IU     0.000   
389333                 Vitamin D (D2 + D3)     Vitamins      mcg     0.000   
389334                 Cryptoxanthin, beta     Vitamins      mcg     0.000   
389335                            Lycopene     Vitamins      mcg     0.000   
389336                 Lutein + zeaxanthin     Vitamins      mcg    20.000   
389337      Vitamin C, total ascorbic acid     Vitamins       mg    21.900   
389338                             Thiamin     Vitamins       mg     0.020   
389339                          Riboflavin     Vitamins       mg     0.060   
389340                              Niacin     Vitamins       mg     0.540   
389341                         Vitamin B-6     Vitamins       mg     0.260   
389342                       Folate, total     Vitamins      mcg    17.000   
389343                        Vitamin B-12     Vitamins      mcg     0.000   
389344                      Choline, total     Vitamins       mg     4.100   
389345           Vitamin K (phylloquinone)     Vitamins      mcg     0.500   
389346                          Folic acid     Vitamins      mcg     0.000   
389347                        Folate, food     Vitamins      mcg    17.000   
389348                         Folate, DFE     Vitamins  mcg_DFE    17.000   
389349                    Vitamin E, added     Vitamins       mg     0.000   
389350                 Vitamin B-12, added     Vitamins      mcg     0.000   
389351                         Cholesterol        Other       mg     0.000   
389352        Fatty acids, total saturated        Other        g     0.072   
389353  Fatty acids, total monounsaturated        Other        g     0.028   
389354  Fatty acids, total polyunsaturated        Other        g     0.041   

           id  
0        1008  
1        1008  
2        1008  
3        1008  
4        1008  
5        1008  
6        1008  
7        1008  
8        1008  
9        1008  
10       1008  
11       1008  
12       1008  
13       1008  
14       1008  
15       1008  
16       1008  
17       1008  
18       1008  
19       1008  
20       1008  
21       1008  
22       1008  
23       1008  
24       1008  
25       1008  
26       1008  
27       1008  
28       1008  
29       1008  
...       ...  
389325  43546  
389326  43546  
389327  43546  
389328  43546  
389329  43546  
389330  43546  
389331  43546  
389332  43546  
389333  43546  
389334  43546  
389335  43546  
389336  43546  
389337  43546  
389338  43546  
389339  43546  
389340  43546  
389341  43546  
389342  43546  
389343  43546  
389344  43546  
389345  43546  
389346  43546  
389347  43546  
389348  43546  
389349  43546  
389350  43546  
389351  43546  
389352  43546  
389353  43546  
389354  43546  

[389355 rows x 5 columns]

# There is duplicate data in the data.
nutrients.duplicated().sum()

# Duplicate removal
nutrients = nutrients.drop_duplicates()

col_mapping = {'description': 'food',
               'group': 'fgroup'}
info = info.rename(columns=col_mapping, copy=False)
print(info)

                                                   food  \
0                                       Cheese, caraway   
1                                       Cheese, cheddar   
2                                          Cheese, edam   
3                                          Cheese, feta   
4                    Cheese, mozzarella, part skim milk   
5      Cheese, mozzarella, part skim milk, low moisture   
6                                        Cheese, romano   
7                                     Cheese, roquefort   
8     Cheese spread, pasteurized process, american, ...   
9                           Cream, fluid, half and half   
10    Sour dressing, non-butterfat, cultured, filled...   
11    Milk, filled, fluid, with blend of hydrogenate...   
12    Cream substitute, liquid, with lauric acid oil...   
13                           Cream substitute, powdered   
14                  Milk, producer, fluid, 3.7% milkfat   
15    Milk, reduced fat, fluid, 2% milkfat, with add...   
16    Milk, reduced fat, fluid, 2% milkfat, with add...   
17    Milk, reduced fat, fluid, 2% milkfat, protein ...   
18    Milk, lowfat, fluid, 1% milkfat, with added vi...   
19    Milk, lowfat, fluid, 1% milkfat, with added no...   
20    Milk, lowfat, fluid, 1% milkfat, protein forti...   
21    Milk, nonfat, fluid, with added vitamin A and ...   
22    Milk, nonfat, fluid, with added nonfat milk so...   
23    Milk, nonfat, fluid, protein fortified, with a...   
24            Milk, buttermilk, fluid, cultured, lowfat   
25                              Milk, low sodium, fluid   
26               Milk, dry, whole, with added vitamin D   
27    Milk, dry, nonfat, regular, without added vita...   
28    Milk, dry, nonfat, instant, with added vitamin...   
29                   Milk, dry, nonfat, calcium reduced   
...                                                 ...   
6606  Beef, tenderloin, steak, separable lean only, ...   
6607  Beef, top sirloin, steak, separable lean only,...   
6608  Beef, short loin, top loin, steak, separable l...   
6609  Beef, chuck, arm pot roast, separable lean onl...   
6610  Beef, brisket, flat half, separable lean only,...   
6611  Beef, chuck, arm pot roast, separable lean onl...   
6612  Beef, brisket, flat half, separable lean only,...   
6613  Beef, round, eye of round, roast, separable le...   
6614  Beef, round, top round, steak, separable lean ...   
6615  Beef, round, bottom round, roast, separable le...   
6616  Beef, rib, small end (ribs 10-12), separable l...   
6617  CAMPBELL Soup Company, CAMPBELL'S Red and Whit...   
6618  CAMPBELL Soup Company, CAMPBELL's Red and Whit...   
6619  CAMPBELL Soup Company, CAMPBELL'S SELECT Soups...   
6620  CAMPBELL Soup Company, CAMPBELL'S SOUP AT HAND...   
6621  CAMPBELL Soup Company, CAMPBELL'S SOUP AT HAND...   
6622  CAMPBELL Soup Company, CAMPBELL'S SELECT Gold ...   
6623  CAMPBELL Soup Company, CAMPBELL'S SELECT Gold ...   
6624  CAMPBELL Soup Company, CAMPBELL'S SELECT Gold ...   
6625  CAMPBELL Soup Company, CAMPBELL'S Red and Whit...   
6626  CAMPBELL Soup Company, V8 Vegetable Juice, Ess...   
6627  CAMPBELL Soup Company, V8 Vegetable Juice, Spi...   
6628  CAMPBELL Soup Company, PACE, Jalapenos Nacho S...   
6629  CAMPBELL Soup Company, V8 60% Vegetable Juice,...   
6630  CAMPBELL Soup Company, V8 Vegetable Juice, Low...   
6631                             Bologna, beef, low fat   
6632  Turkey and pork sausage, fresh, bulk, patty or...   
6633                              Babyfood, juice, pear   
6634         Babyfood, dessert, banana yogurt, strained   
6635              Babyfood, banana no tapioca, strained   

                                 fgroup     id       manufacturer  
0                Dairy and Egg Products   1008                     
1                Dairy and Egg Products   1009                     
2                Dairy and Egg Products   1018                     
3                Dairy and Egg Products   1019                     
4                Dairy and Egg Products   1028                     
5                Dairy and Egg Products   1029                     
6                Dairy and Egg Products   1038                     
7                Dairy and Egg Products   1039                     
8                Dairy and Egg Products   1048                     
9                Dairy and Egg Products   1049                     
10               Dairy and Egg Products   1058                     
11               Dairy and Egg Products   1059                     
12               Dairy and Egg Products   1068                     
13               Dairy and Egg Products   1069                     
14               Dairy and Egg Products   1078                     
15               Dairy and Egg Products   1079               None  
16               Dairy and Egg Products   1080                     
17               Dairy and Egg Products   1081                     
18               Dairy and Egg Products   1082                     
19               Dairy and Egg Products   1083                     
20               Dairy and Egg Products   1084                     
21               Dairy and Egg Products   1085                     
22               Dairy and Egg Products   1086                     
23               Dairy and Egg Products   1087                     
24               Dairy and Egg Products   1088                     
25               Dairy and Egg Products   1089                     
26               Dairy and Egg Products   1090                     
27               Dairy and Egg Products   1091                     
28               Dairy and Egg Products   1092                     
29               Dairy and Egg Products   1093                     
...                                 ...    ...                ...  
6606                      Beef Products  23628                     
6607                      Beef Products  23629                     
6608                      Beef Products  23630                     
6609                      Beef Products  23631                     
6610                      Beef Products  23632                     
6611                      Beef Products  23633                     
6612                      Beef Products  23634                     
6613                      Beef Products  23635                     
6614                      Beef Products  23636                     
6615                      Beef Products  23637                     
6616                      Beef Products  23638                     
6617         Soups, Sauces, and Gravies  27015  Campbell Soup Co.  
6618         Soups, Sauces, and Gravies  27016  Campbell Soup Co.  
6619         Soups, Sauces, and Gravies  27021  Campbell Soup Co.  
6620         Soups, Sauces, and Gravies  27022  Campbell Soup Co.  
6621         Soups, Sauces, and Gravies  27023  Campbell Soup Co.  
6622         Soups, Sauces, and Gravies  27024  Campbell Soup Co.  
6623         Soups, Sauces, and Gravies  27025  Campbell Soup Co.  
6624         Soups, Sauces, and Gravies  27026  Campbell Soup Co.  
6625         Soups, Sauces, and Gravies  27032  Campbell Soup Co.  
6626  Vegetables and Vegetable Products  31010  Campbell Soup Co.  
6627  Vegetables and Vegetable Products  31013  Campbell Soup Co.  
6628  Vegetables and Vegetable Products  31014  Campbell Soup Co.  
6629  Vegetables and Vegetable Products  31016  Campbell Soup Co.  
6630  Vegetables and Vegetable Products  31017  Campbell Soup Co.  
6631        Sausages and Luncheon Meats  42161                     
6632        Sausages and Luncheon Meats  42173                     
6633                         Baby Foods  43408               None  
6634                         Baby Foods  43539               None  
6635                         Baby Foods  43546               None  

[6636 rows x 4 columns]

col_mapping = {'description': 'nutrient',
               'group': 'nutgroup'}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
print(nutrients)

                                  nutrient     nutgroup    units     value  \
0                                  Protein  Composition        g    25.180   
1                        Total lipid (fat)  Composition        g    29.200   
2              Carbohydrate, by difference  Composition        g     3.060   
3                                      Ash        Other        g     3.280   
4                                   Energy       Energy     kcal   376.000   
5                                    Water  Composition        g    39.280   
6                                   Energy       Energy       kJ  1573.000   
7                     Fiber, total dietary  Composition        g     0.000   
8                              Calcium, Ca     Elements       mg   673.000   
9                                 Iron, Fe     Elements       mg     0.640   
10                           Magnesium, Mg     Elements       mg    22.000   
11                           Phosphorus, P     Elements       mg   490.000   
12                            Potassium, K     Elements       mg    93.000   
13                              Sodium, Na     Elements       mg   690.000   
14                                Zinc, Zn     Elements       mg     2.940   
15                              Copper, Cu     Elements       mg     0.024   
16                           Manganese, Mn     Elements       mg     0.021   
17                            Selenium, Se     Elements      mcg    14.500   
18                           Vitamin A, IU     Vitamins       IU  1054.000   
19                                 Retinol     Vitamins      mcg   262.000   
20                          Vitamin A, RAE     Vitamins  mcg_RAE   271.000   
21          Vitamin C, total ascorbic acid     Vitamins       mg     0.000   
22                                 Thiamin     Vitamins       mg     0.031   
23                              Riboflavin     Vitamins       mg     0.450   
24                                  Niacin     Vitamins       mg     0.180   
25                        Pantothenic acid     Vitamins       mg     0.190   
26                             Vitamin B-6     Vitamins       mg     0.074   
27                           Folate, total     Vitamins      mcg    18.000   
28                            Vitamin B-12     Vitamins      mcg     0.270   
29                              Folic acid     Vitamins      mcg     0.000   
...                                    ...          ...      ...       ...   
389325                        Selenium, Se     Elements      mcg     1.100   
389326                       Vitamin A, IU     Vitamins       IU     5.000   
389327                             Retinol     Vitamins      mcg     0.000   
389328                      Vitamin A, RAE     Vitamins  mcg_RAE     0.000   
389329                      Carotene, beta     Vitamins      mcg     2.000   
389330                     Carotene, alpha     Vitamins      mcg     2.000   
389331        Vitamin E (alpha-tocopherol)     Vitamins       mg     0.250   
389332                           Vitamin D     Vitamins       IU     0.000   
389333                 Vitamin D (D2 + D3)     Vitamins      mcg     0.000   
389334                 Cryptoxanthin, beta     Vitamins      mcg     0.000   
389335                            Lycopene     Vitamins      mcg     0.000   
389336                 Lutein + zeaxanthin     Vitamins      mcg    20.000   
389337      Vitamin C, total ascorbic acid     Vitamins       mg    21.900   
389338                             Thiamin     Vitamins       mg     0.020   
389339                          Riboflavin     Vitamins       mg     0.060   
389340                              Niacin     Vitamins       mg     0.540   
389341                         Vitamin B-6     Vitamins       mg     0.260   
389342                       Folate, total     Vitamins      mcg    17.000   
389343                        Vitamin B-12     Vitamins      mcg     0.000   
389344                      Choline, total     Vitamins       mg     4.100   
389345           Vitamin K (phylloquinone)     Vitamins      mcg     0.500   
389346                          Folic acid     Vitamins      mcg     0.000   
389347                        Folate, food     Vitamins      mcg    17.000   
389348                         Folate, DFE     Vitamins  mcg_DFE    17.000   
389349                    Vitamin E, added     Vitamins       mg     0.000   
389350                 Vitamin B-12, added     Vitamins      mcg     0.000   
389351                         Cholesterol        Other       mg     0.000   
389352        Fatty acids, total saturated        Other        g     0.072   
389353  Fatty acids, total monounsaturated        Other        g     0.028   
389354  Fatty acids, total polyunsaturated        Other        g     0.041   

           id  
0        1008  
1        1008  
2        1008  
3        1008  
4        1008  
5        1008  
6        1008  
7        1008  
8        1008  
9        1008  
10       1008  
11       1008  
12       1008  
13       1008  
14       1008  
15       1008  
16       1008  
17       1008  
18       1008  
19       1008  
20       1008  
21       1008  
22       1008  
23       1008  
24       1008  
25       1008  
26       1008  
27       1008  
28       1008  
29       1008  
...       ...  
389325  43546  
389326  43546  
389327  43546  
389328  43546  
389329  43546  
389330  43546  
389331  43546  
389332  43546  
389333  43546  
389334  43546  
389335  43546  
389336  43546  
389337  43546  
389338  43546  
389339  43546  
389340  43546  
389341  43546  
389342  43546  
389343  43546  
389344  43546  
389345  43546  
389346  43546  
389347  43546  
389348  43546  
389349  43546  
389350  43546  
389351  43546  
389352  43546  
389353  43546  
389354  43546  

[375176 rows x 5 columns]

ndata = pd.merge(nutrients, info, on='id', how='outer')
print(ndata)

                                  nutrient     nutgroup    units     value  \
0                                  Protein  Composition        g    25.180   
1                        Total lipid (fat)  Composition        g    29.200   
2              Carbohydrate, by difference  Composition        g     3.060   
3                                      Ash        Other        g     3.280   
4                                   Energy       Energy     kcal   376.000   
5                                    Water  Composition        g    39.280   
6                                   Energy       Energy       kJ  1573.000   
7                     Fiber, total dietary  Composition        g     0.000   
8                              Calcium, Ca     Elements       mg   673.000   
9                                 Iron, Fe     Elements       mg     0.640   
10                           Magnesium, Mg     Elements       mg    22.000   
11                           Phosphorus, P     Elements       mg   490.000   
12                            Potassium, K     Elements       mg    93.000   
13                              Sodium, Na     Elements       mg   690.000   
14                                Zinc, Zn     Elements       mg     2.940   
15                              Copper, Cu     Elements       mg     0.024   
16                           Manganese, Mn     Elements       mg     0.021   
17                            Selenium, Se     Elements      mcg    14.500   
18                           Vitamin A, IU     Vitamins       IU  1054.000   
19                                 Retinol     Vitamins      mcg   262.000   
20                          Vitamin A, RAE     Vitamins  mcg_RAE   271.000   
21          Vitamin C, total ascorbic acid     Vitamins       mg     0.000   
22                                 Thiamin     Vitamins       mg     0.031   
23                              Riboflavin     Vitamins       mg     0.450   
24                                  Niacin     Vitamins       mg     0.180   
25                        Pantothenic acid     Vitamins       mg     0.190   
26                             Vitamin B-6     Vitamins       mg     0.074   
27                           Folate, total     Vitamins      mcg    18.000   
28                            Vitamin B-12     Vitamins      mcg     0.270   
29                              Folic acid     Vitamins      mcg     0.000   
...                                    ...          ...      ...       ...   
375146                        Selenium, Se     Elements      mcg     1.100   
375147                       Vitamin A, IU     Vitamins       IU     5.000   
375148                             Retinol     Vitamins      mcg     0.000   
375149                      Vitamin A, RAE     Vitamins  mcg_RAE     0.000   
375150                      Carotene, beta     Vitamins      mcg     2.000   
375151                     Carotene, alpha     Vitamins      mcg     2.000   
375152        Vitamin E (alpha-tocopherol)     Vitamins       mg     0.250   
375153                           Vitamin D     Vitamins       IU     0.000   
375154                 Vitamin D (D2 + D3)     Vitamins      mcg     0.000   
375155                 Cryptoxanthin, beta     Vitamins      mcg     0.000   
375156                            Lycopene     Vitamins      mcg     0.000   
375157                 Lutein + zeaxanthin     Vitamins      mcg    20.000   
375158      Vitamin C, total ascorbic acid     Vitamins       mg    21.900   
375159                             Thiamin     Vitamins       mg     0.020   
375160                          Riboflavin     Vitamins       mg     0.060   
375161                              Niacin     Vitamins       mg     0.540   
375162                         Vitamin B-6     Vitamins       mg     0.260   
375163                       Folate, total     Vitamins      mcg    17.000   
375164                        Vitamin B-12     Vitamins      mcg     0.000   
375165                      Choline, total     Vitamins       mg     4.100   
375166           Vitamin K (phylloquinone)     Vitamins      mcg     0.500   
375167                          Folic acid     Vitamins      mcg     0.000   
375168                        Folate, food     Vitamins      mcg    17.000   
375169                         Folate, DFE     Vitamins  mcg_DFE    17.000   
375170                    Vitamin E, added     Vitamins       mg     0.000   
375171                 Vitamin B-12, added     Vitamins      mcg     0.000   
375172                         Cholesterol        Other       mg     0.000   
375173        Fatty acids, total saturated        Other        g     0.072   
375174  Fatty acids, total monounsaturated        Other        g     0.028   
375175  Fatty acids, total polyunsaturated        Other        g     0.041   

           id                                   food                  fgroup  \
0        1008                        Cheese, caraway  Dairy and Egg Products   
1        1008                        Cheese, caraway  Dairy and Egg Products   
2        1008                        Cheese, caraway  Dairy and Egg Products   
3        1008                        Cheese, caraway  Dairy and Egg Products   
4        1008                        Cheese, caraway  Dairy and Egg Products   
5        1008                        Cheese, caraway  Dairy and Egg Products   
6        1008                        Cheese, caraway  Dairy and Egg Products   
7        1008                        Cheese, caraway  Dairy and Egg Products   
8        1008                        Cheese, caraway  Dairy and Egg Products   
9        1008                        Cheese, caraway  Dairy and Egg Products   
10       1008                        Cheese, caraway  Dairy and Egg Products   
11       1008                        Cheese, caraway  Dairy and Egg Products   
12       1008                        Cheese, caraway  Dairy and Egg Products   
13       1008                        Cheese, caraway  Dairy and Egg Products   
14       1008                        Cheese, caraway  Dairy and Egg Products   
15       1008                        Cheese, caraway  Dairy and Egg Products   
16       1008                        Cheese, caraway  Dairy and Egg Products   
17       1008                        Cheese, caraway  Dairy and Egg Products   
18       1008                        Cheese, caraway  Dairy and Egg Products   
19       1008                        Cheese, caraway  Dairy and Egg Products   
20       1008                        Cheese, caraway  Dairy and Egg Products   
21       1008                        Cheese, caraway  Dairy and Egg Products   
22       1008                        Cheese, caraway  Dairy and Egg Products   
23       1008                        Cheese, caraway  Dairy and Egg Products   
24       1008                        Cheese, caraway  Dairy and Egg Products   
25       1008                        Cheese, caraway  Dairy and Egg Products   
26       1008                        Cheese, caraway  Dairy and Egg Products   
27       1008                        Cheese, caraway  Dairy and Egg Products   
28       1008                        Cheese, caraway  Dairy and Egg Products   
29       1008                        Cheese, caraway  Dairy and Egg Products   
...       ...                                    ...                     ...   
375146  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375147  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375148  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375149  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375150  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375151  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375152  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375153  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375154  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375155  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375156  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375157  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375158  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375159  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375160  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375161  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375162  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375163  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375164  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375165  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375166  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375167  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375168  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375169  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375170  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375171  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375172  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375173  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375174  43546  Babyfood, banana no tapioca, strained              Baby Foods   
375175  43546  Babyfood, banana no tapioca, strained              Baby Foods   

       manufacturer  
0                    
1                    
2                    
3                    
4                    
5                    
6                    
7                    
8                    
9                    
10                   
11                   
12                   
13                   
14                   
15                   
16                   
17                   
18                   
19                   
20                   
21                   
22                   
23                   
24                   
25                   
26                   
27                   
28                   
29                   
...             ...  
375146         None  
375147         None  
375148         None  
375149         None  
375150         None  
375151         None  
375152         None  
375153         None  
375154         None  
375155         None  
375156         None  
375157         None  
375158         None  
375159         None  
375160         None  
375161         None  
375162         None  
375163         None  
375164         None  
375165         None  
375166         None  
375167         None  
375168         None  
375169         None  
375170         None  
375171         None  
375172         None  
375173         None  
375174         None  
375175         None  

[375176 rows x 8 columns]

ndata.loc[30000]

nutrient                                       Glycine
nutgroup                                   Amino Acids
units                                                g
value                                             0.04
id                                                6158
food            Soup, tomato bisque, canned, condensed
fgroup                      Soups, Sauces, and Gravies
manufacturer                                          
Name: 30000, dtype: object

result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
print(result)

nutrient          fgroup                           
Adjusted Protein  Sweets                               12.9000
                  Vegetables and Vegetable Products     2.1800
Alanine           Baby Foods                            0.0850
                  Baked Products                        0.2480
                  Beef Products                         1.5500
                  Beverages                             0.0030
                  Breakfast Cereals                     0.3110
                  Cereal Grains and Pasta               0.3730
                  Dairy and Egg Products                0.2710
                  Ethnic Foods                          1.2900
                  Fast Foods                            0.5140
                  Fats and Oils                         0.0000
                  Finfish and Shellfish Products        1.2180
                  Fruits and Fruit Juices               0.0270
                  Lamb, Veal, and Game Products         1.4080
                  Legumes and Legume Products           0.4100
                  Meals, Entrees, and Sidedishes        0.3270
                  Nut and Seed Products                 0.7345
                  Pork Products                         1.3070
                  Poultry Products                      1.3940
                  Restaurant Foods                      0.4650
                  Sausages and Luncheon Meats           0.9420
                  Snacks                                0.4335
                  Soups, Sauces, and Gravies            0.0650
                  Spices and Herbs                      0.5550
                  Sweets                                0.1020
                  Vegetables and Vegetable Products     0.0840
Alcohol, ethyl    Baby Foods                            0.0000
                  Baked Products                        0.0000
                  Beef Products                         0.0000
                                                        ...   
Water             Snacks                                3.5200
                  Soups, Sauces, and Gravies           85.9000
                  Spices and Herbs                     43.6700
                  Sweets                                9.0500
                  Vegetables and Vegetable Products    89.1950
Zinc, Zn          Baby Foods                            0.5900
                  Baked Products                        0.6600
                  Beef Products                         5.3900
                  Beverages                             0.0400
                  Breakfast Cereals                     2.8850
                  Cereal Grains and Pasta               1.0900
                  Dairy and Egg Products                1.3900
                  Ethnic Foods                          1.0450
                  Fast Foods                            1.2500
                  Fats and Oils                         0.0200
                  Finfish and Shellfish Products        0.6700
                  Fruits and Fruit Juices               0.1000
                  Lamb, Veal, and Game Products         3.9400
                  Legumes and Legume Products           1.1400
                  Meals, Entrees, and Sidedishes        0.6300
                  Nut and Seed Products                 3.2900
                  Pork Products                         2.3200
                  Poultry Products                      2.5000
                  Restaurant Foods                      0.8000
                  Sausages and Luncheon Meats           2.1300
                  Snacks                                1.4700
                  Soups, Sauces, and Gravies            0.2000
                  Spices and Herbs                      2.7500
                  Sweets                                0.3600
                  Vegetables and Vegetable Products     0.3300
Name: value, Length: 2246, dtype: float64

# result['Zinc, Zn'].order().plot(kind='barh') #AttributeError: 'Series' object has no attribute 'order'
result['Zinc, Zn'].sort_values().plot(kind='barh')

<matplotlib.axes._subplots.AxesSubplot at 0x2156ce3c748>

by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])

get_maximum = lambda x: x.xs(x.value.idxmax())
get_minimum = lambda x: x.xs(x.value.idxmin())

max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]
print(max_foods)

                                                 value  \
nutgroup    nutrient                                     
Amino Acids Alanine                              8.009   
            Arginine                             7.436   
            Aspartic acid                       10.203   
            Cystine                              1.307   
            Glutamic acid                       17.452   
            Glycine                             19.049   
            Histidine                            2.999   
            Hydroxyproline                       0.803   
            Isoleucine                           4.300   
            Leucine                              7.200   
            Lysine                               6.690   
            Methionine                           1.859   
            Phenylalanine                        4.600   
            Proline                             12.295   
            Serine                               4.600   
            Threonine                            3.300   
            Tryptophan                           1.600   
            Tyrosine                             3.300   
            Valine                               4.500   
Composition Adjusted Protein                    12.900   
            Carbohydrate, by difference        100.000   
            Fiber, total dietary                79.000   
            Protein                             88.320   
            Sugars, total                       99.800   
            Total lipid (fat)                  100.000   
            Water                              100.000   
Elements    Calcium, Ca                       7364.000   
            Copper, Cu                          15.050   
            Fluoride, F                        584.000   
            Iron, Fe                            87.470   
...                                                ...   
Vitamins    Cryptoxanthin, beta               6186.000   
            Dihydrophylloquinone               103.800   
            Folate, DFE                       2630.000   
            Folate, food                      2340.000   
            Folate, total                     2340.000   
            Folic acid                        1538.000   
            Lutein + zeaxanthin              39550.000   
            Lycopene                         46260.000   
            Menaquinone-4                       33.200   
            Niacin                              97.000   
            Pantothenic acid                    35.000   
            Retinol                          30000.000   
            Riboflavin                          14.300   
            Thiamin                             20.000   
            Tocopherol, beta                     6.490   
            Tocopherol, delta                   30.880   
            Tocopherol, gamma                  100.880   
            Vitamin A, IU                   100000.000   
            Vitamin A, RAE                   30000.000   
            Vitamin B-12                        98.890   
            Vitamin B-12, added                 24.000   
            Vitamin B-6                         12.000   
            Vitamin C, total ascorbic acid    2400.000   
            Vitamin D                        10000.000   
            Vitamin D (D2 + D3)                250.000   
            Vitamin D2 (ergocalciferol)         28.100   
            Vitamin D3 (cholecalciferol)        27.400   
            Vitamin E (alpha-tocopherol)       149.400   
            Vitamin E, added                    46.550   
            Vitamin K (phylloquinone)         1714.500   

                                                                                         food  
nutgroup    nutrient                                                                           
Amino Acids Alanine                                         Gelatins, dry powder, unsweetened  
            Arginine                                             Seeds, sesame flour, low-fat  
            Aspartic acid                                                 Soy protein isolate  
            Cystine                              Seeds, cottonseed flour, low fat (glandless)  
            Glutamic acid                                                 Soy protein isolate  
            Glycine                                         Gelatins, dry powder, unsweetened  
            Histidine                              Whale, beluga, meat, dried (Alaska Native)  
            Hydroxyproline                  KENTUCKY FRIED CHICKEN, Fried Chicken, ORIGINA...  
            Isoleucine                      Soy protein isolate, PROTEIN TECHNOLOGIES INTE...  
            Leucine                         Soy protein isolate, PROTEIN TECHNOLOGIES INTE...  
            Lysine                          Seal, bearded (Oogruk), meat, dried (Alaska Na...  
            Methionine                                  Fish, cod, Atlantic, dried and salted  
            Phenylalanine                   Soy protein isolate, PROTEIN TECHNOLOGIES INTE...  
            Proline                                         Gelatins, dry powder, unsweetened  
            Serine                          Soy protein isolate, PROTEIN TECHNOLOGIES INTE...  
            Threonine                       Soy protein isolate, PROTEIN TECHNOLOGIES INTE...  
            Tryptophan                       Sea lion, Steller, meat with fat (Alaska Native)  
            Tyrosine                        Soy protein isolate, PROTEIN TECHNOLOGIES INTE...  
            Valine                          Soy protein isolate, PROTEIN TECHNOLOGIES INTE...  
Composition Adjusted Protein                           Baking chocolate, unsweetened, squares  
            Carbohydrate, by difference           Sweeteners, tabletop, fructose, dry, powder  
            Fiber, total dietary                                             Corn bran, crude  
            Protein                         Soy protein isolate, potassium type, crude pro...  
            Sugars, total                                                  Sugars, granulated  
            Total lipid (fat)                                                 Oil, wheat germ  
            Water                                               Water, bottled, POLAND SPRING  
Elements    Calcium, Ca                     Leavening agents, baking powder, double-acting...  
            Copper, Cu                      Veal, variety meats and by-products, liver, co...  
            Fluoride, F                     Tea, instant, sweetened with sugar, lemon-flav...  
            Iron, Fe                            Salad dressing, russian dressing, low calorie  
...                                                                                       ...  
Vitamins    Cryptoxanthin, beta                                               Spices, paprika  
            Dihydrophylloquinone            Margarine, 80% fat, stick, includes regular an...  
            Folate, DFE                            Cereals ready-to-eat, QUAKER, CAP'N CRUNCH  
            Folate, food                         Leavening agents, yeast, baker's, active dry  
            Folate, total                        Leavening agents, yeast, baker's, active dry  
            Folic acid                             Cereals ready-to-eat, QUAKER, CAP'N CRUNCH  
            Lutein + zeaxanthin                                                     Kale, raw  
            Lycopene                                                            Tomato powder  
            Menaquinone-4                   Chicken, broilers or fryers, drumstick, meat a...  
            Niacin                                                       Yeast extract spread  
            Pantothenic acid                Cereals ready-to-eat, KELLOGG, KELLOGG'S Compl...  
            Retinol                                                       Fish oil, cod liver  
            Riboflavin                                                   Yeast extract spread  
            Thiamin                         MORNINGSTAR FARMS Hot and Spicy Veggie Sausage...  
            Tocopherol, beta                Yellow pond lily, Wocas, dried seeds (Pacific ...  
            Tocopherol, delta                 Oil, cooking and salad, ENOVA, 80% diglycerides  
            Tocopherol, gamma                 Oil, cooking and salad, ENOVA, 80% diglycerides  
            Vitamin A, IU                                                 Fish oil, cod liver  
            Vitamin A, RAE                                                Fish oil, cod liver  
            Vitamin B-12                    Mollusks, clam, mixed species, cooked, moist heat  
            Vitamin B-12, added             Cereals ready-to-eat, KELLOGG, KELLOGG'S ALL-B...  
            Vitamin B-6                     Cereals ready-to-eat, KELLOGG, KELLOGG'S ALL-B...  
            Vitamin C, total ascorbic acid  Orange-flavor drink, breakfast type, low calor...  
            Vitamin D                                                     Fish oil, cod liver  
            Vitamin D (D2 + D3)                                           Fish oil, cod liver  
            Vitamin D2 (ergocalciferol)                               Mushrooms, maitake, raw  
            Vitamin D3 (cholecalciferol)                        Fish, halibut, Greenland, raw  
            Vitamin E (alpha-tocopherol)                                      Oil, wheat germ  
            Vitamin E, added                Cereals ready-to-eat, GENERAL MILLS, Multi-Gra...  
            Vitamin K (phylloquinone)                                    Spices, sage, ground  

[94 rows x 2 columns]

max_foods.loc['Amino Acids']['food']

nutrient
Alanine                           Gelatins, dry powder, unsweetened
Arginine                               Seeds, sesame flour, low-fat
Aspartic acid                                   Soy protein isolate
Cystine                Seeds, cottonseed flour, low fat (glandless)
Glutamic acid                                   Soy protein isolate
Glycine                           Gelatins, dry powder, unsweetened
Histidine                Whale, beluga, meat, dried (Alaska Native)
Hydroxyproline    KENTUCKY FRIED CHICKEN, Fried Chicken, ORIGINA...
Isoleucine        Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Leucine           Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Lysine            Seal, bearded (Oogruk), meat, dried (Alaska Na...
Methionine                    Fish, cod, Atlantic, dried and salted
Phenylalanine     Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Proline                           Gelatins, dry powder, unsweetened
Serine            Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Threonine         Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Tryptophan         Sea lion, Steller, meat with fat (Alaska Native)
Tyrosine          Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Valine            Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Name: food, dtype: object

Reference: Python Data Analysis Course

Keywords: Google Selenium Attribute Lambda

Added by robche on Tue, 13 Aug 2019 12:45:36 +0300

Programming VIP