# Lagrange Interpolation Method for Missing Value Processing

```import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from scipy.interpolate import lagrange  # Import Lagrange interpolation function

np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))

inputfile = 'data/catering_sale.xls'  # Sales Data Path
outputfile = 'data/sales.xls'  # Output data path

# Filter outliers to null values
"""
data[u'Sales volume'][(data[u'Sales volume'] < 400) | (data[u'Sales volume'] > 5000)] = None
//There will be a warning if it is written as follows:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

//If you want to change the original data, use a single assignment operation (loc):
data.loc[(data[u'Sales volume'] < 400) | (data[u'Sales volume'] > 5000), u'Sales volume'] = None

//If you want a copy, be sure to force Pandas to create a copy:
error_data = data.copy()
error_data.loc[(error_data[u'Sales volume'] < 400) | (error_data[u'Sales volume'] > 5000), u'Sales volume'] = None

//Reference: https://www.jianshu.com/p/72274 ccb647a
"""
data.loc[(data[u'Sales volume'] < 400) | (data[u'Sales volume'] > 5000), u'Sales volume'] = None

# Custom Column Vector Interpolation Function
# s is the column vector, n is the interpolated position, k is the number of data before and after taking, the default is 5.
def ployinterp_column(s, n, k=5):
y = s[list(range(n - k, n)) + list(range(n + 1, n + 1 + k))]  # Fetch number
y = y[y.notnull()]  # Exclude null values
return lagrange(y.index, list(y))(n)  # Interpolate and return the result of the interpolation

# Determine whether interpolation is necessary by element
for i in data.columns:
for j in range(len(data)):
if (data[i].isnull())[j]:  # If null, interpolate.
"""
data[i][j] = ployinterp_column(data[i], j)
//In this way, there will be warnings:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
"""
data.loc[j, i] = ployinterp_column(data[i], j)

data.to_excel(outputfile)  # Output result, write to file
```

# Data frame merge

```#Data frame merge
# 1
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1': range(7)})
df2 = DataFrame({'key': ['a', 'b', 'd'],
'data2': range(3)})
print(df1)
print(df2)
```
```   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   a
6      6   b
data2 key
0      0   a
1      1   b
2      2   d
```
```pd.merge(df1, df2)
```
data1 key data2
0 0 b 1
1 1 b 1
2 6 b 1
3 2 a 0
4 4 a 0
5 5 a 0
```pd.merge(df1, df2, on='key')
```
data1 key data2
0 0 b 1
1 1 b 1
2 6 b 1
3 2 a 0
4 4 a 0
5 5 a 0
```# 2
df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1': range(7)})
df4 = DataFrame({'rkey': ['a', 'b', 'd'],
'data2': range(3)})
```
```pd.merge(df3, df4, left_on='lkey', right_on='rkey')
```
data1 lkey data2 rkey
0 0 b 1 b
1 1 b 1 b
2 6 b 1 b
3 2 a 0 a
4 4 a 0 a
5 5 a 0 a
```pd.merge(df1, df2, how='outer')
```
data1 key data2
0 0.0 b 1.0
1 1.0 b 1.0
2 6.0 b 1.0
3 2.0 a 0.0
4 4.0 a 0.0
5 5.0 a 0.0
6 3.0 c NaN
7 NaN d 2.0
```# 3
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
'data1': range(6)})
df2 = DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
'data2': range(5)})
print(df1)
print(df2)
```
```   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   b
data2 key
0      0   a
1      1   b
2      2   a
3      3   b
4      4   d
```
```pd.merge(df1, df2, on='key', how='left')
```
data1 key data2
0 0 b 1.0
1 0 b 3.0
2 1 b 1.0
3 1 b 3.0
4 2 a 0.0
5 2 a 2.0
6 3 c NaN
7 4 a 0.0
8 4 a 2.0
9 5 b 1.0
10 5 b 3.0
```pd.merge(df1, df2, how='inner')
```
data1 key data2
0 0 b 1
1 0 b 3
2 1 b 1
3 1 b 3
4 5 b 1
5 5 b 3
6 2 a 0
7 2 a 2
8 4 a 0
9 4 a 2
```# 4
left = DataFrame({'key1': ['foo', 'foo', 'bar'],
'key2': ['one', 'two', 'one'],
'lval': [1, 2, 3]})
right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
'key2': ['one', 'one', 'one', 'two'],
'rval': [4, 5, 6, 7]})
pd.merge(left, right, on=['key1', 'key2'], how='outer')
```
key1 key2 lval rval
0 foo one 1.0 4.0
1 foo one 1.0 5.0
2 foo two 2.0 NaN
3 bar one 3.0 6.0
4 bar two NaN 7.0
```# 5
pd.merge(left, right, on='key1')
```
key1 key2_x lval key2_y rval
0 foo one 1 one 4
1 foo one 1 one 5
2 foo two 2 one 4
3 foo two 2 one 5
4 bar one 3 one 6
5 bar one 3 two 7
```pd.merge(left, right, on='key1', suffixes=('_left', '_right'))
```
key1 key2_left lval key2_right rval
0 foo one 1 one 4
1 foo one 1 one 5
2 foo two 2 one 4
3 foo two 2 one 5
4 bar one 3 one 6
5 bar one 3 two 7

# Merge on Index

```# 1
left1 = DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'], 'value': range(6)})
right1 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
print(left1)
print(right1)
```
```  key  value
0   a      0
1   b      1
2   a      2
3   a      3
4   b      4
5   c      5
group_val
a        3.5
b        7.0
```
```pd.merge(left1, right1, left_on='key', right_index=True)
```
key value group_val
0 a 0 3.5
2 a 2 3.5
3 a 3 3.5
1 b 1 7.0
4 b 4 7.0
```pd.merge(left1, right1, left_on='key', right_index=True, how='outer')
```
key value group_val
0 a 0 3.5
2 a 2 3.5
3 a 3 3.5
1 b 1 7.0
4 b 4 7.0
5 c 5 NaN
```# 2
'key2': [2000, 2001, 2002, 2001, 2002],
'data': np.arange(5.)})
righth = DataFrame(np.arange(12).reshape((6, 2)),
[2001, 2000, 2000, 2000, 2001, 2002]],
columns=['event1', 'event2'])
print(lefth)
print(righth)
```
```   data    key1  key2
0   0.0    Ohio  2000
1   1.0    Ohio  2001
2   2.0    Ohio  2002
event1  event2
2000       2       3
Ohio   2000       4       5
2000       6       7
2001       8       9
2002      10      11
```
```pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)
```
data key1 key2 event1 event2
0 0.0 Ohio 2000 4 5
0 0.0 Ohio 2000 6 7
1 1.0 Ohio 2001 8 9
2 2.0 Ohio 2002 10 11
3 3.0 Nevada 2001 0 1
```pd.merge(lefth, righth, left_on=['key1', 'key2'],
right_index=True, how='outer')
```
data key1 key2 event1 event2
0 0.0 Ohio 2000 4.0 5.0
0 0.0 Ohio 2000 6.0 7.0
1 1.0 Ohio 2001 8.0 9.0
2 2.0 Ohio 2002 10.0 11.0
3 3.0 Nevada 2001 0.0 1.0
4 4.0 Nevada 2002 NaN NaN
4 NaN Nevada 2000 2.0 3.0
```left2 = DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=['a', 'c', 'e'],
right2 = DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])
print(left2)
print(right2)
```
```   Ohio  Nevada
a   1.0     2.0
c   3.0     4.0
e   5.0     6.0
Missouri  Alabama
b       7.0      8.0
c       9.0     10.0
d      11.0     12.0
e      13.0     14.0
```
```pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
```
a 1.0 2.0 NaN NaN
b NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0
d NaN NaN 11.0 12.0
e 5.0 6.0 13.0 14.0
```# 3
left2.join(right2, how='outer')
```
a 1.0 2.0 NaN NaN
b NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0
d NaN NaN 11.0 12.0
e 5.0 6.0 13.0 14.0
```left1.join(right1, on='key')
```
key value group_val
0 a 0 3.5
1 b 1 7.0
2 a 2 3.5
3 a 3 3.5
4 b 4 7.0
5 c 5 NaN
```# 4
another = DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
index=['a', 'c', 'e', 'f'], columns=['New York', 'Oregon'])
left2.join([right2, another])
```
Ohio Nevada Missouri Alabama New York Oregon
a 1.0 2.0 NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0 9.0 10.0
e 5.0 6.0 13.0 14.0 11.0 12.0
```left2.join([right2, another], how='outer')
```
Ohio Nevada Missouri Alabama New York Oregon
a 1.0 2.0 NaN NaN 7.0 8.0
b NaN NaN 7.0 8.0 NaN NaN
c 3.0 4.0 9.0 10.0 9.0 10.0
d NaN NaN 11.0 12.0 NaN NaN
e 5.0 6.0 13.0 14.0 11.0 12.0
f NaN NaN NaN NaN 16.0 17.0

# Axial connection

```# 1
arr = np.arange(12).reshape((3, 4))
print(arr)

np.concatenate([arr, arr], axis=1)
```
```[[ 0  1  2  3]
[ 4  5  6  7]
[ 8  9 10 11]]

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
[ 4,  5,  6,  7,  4,  5,  6,  7],
[ 8,  9, 10, 11,  8,  9, 10, 11]])
```
```# 2
s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = Series([5, 6], index=['f', 'g'])

pd.concat([s1, s2, s3])
```
```a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
```
```pd.concat([s1, s2, s3], axis=0)
```
```a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
```
```pd.concat([s1, s2, s3], axis=1)
```
0 1 2
a 0.0 NaN NaN
b 1.0 NaN NaN
c NaN 2.0 NaN
d NaN 3.0 NaN
e NaN 4.0 NaN
f NaN NaN 5.0
g NaN NaN 6.0
```s4 = pd.concat([s1 * 5, s3])
print(s4)
```
```a    0
b    5
f    5
g    6
dtype: int64
```
```pd.concat([s1, s4], axis=1)
```
0 1
a 0.0 0
b 1.0 5
f NaN 5
g NaN 6
```pd.concat([s1, s4], axis=1, join='inner')
```
0 1
a 0 0
b 1 5
```pd.concat([s1, s4], axis=1, join_axes=[['a', 'c', 'b', 'e']])
```
0 1
a 0.0 0.0
c NaN NaN
b 1.0 5.0
e NaN NaN
```# 3
result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])
print(result)
```
```one    a    0
b    1
two    a    0
b    1
three  f    5
g    6
dtype: int64
```
```result.unstack()
```
a b f g
one 0.0 1.0 NaN NaN
two 0.0 1.0 NaN NaN
three NaN NaN 5.0 6.0
```# 4
pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])
```
one two three
a 0.0 NaN NaN
b 1.0 NaN NaN
c NaN 2.0 NaN
d NaN 3.0 NaN
e NaN 4.0 NaN
f NaN NaN 5.0
g NaN NaN 6.0
```df1 = DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
columns=['one', 'two'])
df2 = DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
columns=['three', 'four'])
```
```pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])
```
level1 level2
one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0
```pd.concat({'level1': df1, 'level2': df2}, axis=1)
```
level1 level2
one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0
```pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],
names=['upper', 'lower'])
```
upper level1 level2
lower one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0
```# 5
df1 = DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
print(df1)
print(df2)
```
```          a         b         c         d
0 -0.204708  0.478943 -0.519439 -0.555730
1  1.965781  1.393406  0.092908  0.281746
2  0.769023  1.246435  1.007189 -1.296221
b         d         a
0  0.274992  0.228913  1.352917
1  0.886429 -2.001637 -0.371843
```
```pd.concat([df1, df2], ignore_index=True)
```
a b c d
0 -0.204708 0.478943 -0.519439 -0.555730
1 1.965781 1.393406 0.092908 0.281746
2 0.769023 1.246435 1.007189 -1.296221
3 1.352917 0.274992 NaN 0.228913
4 -0.371843 0.886429 NaN -2.001637

# Merge overlapping data

```# 1
a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
index=['f', 'e', 'd', 'c', 'b', 'a'])
b = Series(np.arange(len(a), dtype=np.float64),
index=['f', 'e', 'd', 'c', 'b', 'a'])
b[-1] = np.nan

print(a)
print(b)

np.where(pd.isnull(a), b, a)
```
```f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64
f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64

array([ 0. ,  2.5,  2. ,  3.5,  4.5,  nan])
```
```# 2
b[:-2].combine_first(a[2:])
```
```a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64
```
```# 3
df1 = DataFrame({'a': [1., np.nan, 5., np.nan],
'b': [np.nan, 2., np.nan, 6.],
'c': range(2, 18, 4)})
df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],
'b': [np.nan, 3., 4., 6., 8.]})
df1.combine_first(df2)
```
a b c
0 1.0 NaN 2.0
1 4.0 2.0 6.0
2 5.0 4.0 10.0
3 3.0 6.0 14.0
4 7.0 8.0 NaN

# Reshaping Hierarchical Index

```# 1
data = DataFrame(np.arange(6).reshape((2, 3)),
columns=pd.Index(['one', 'two', 'three'], name='number'))
print(data)
```
```number    one  two  three
state
Ohio        0    1      2
```
```# stack converts column to index
result = data.stack()
print(result)
```
```state     number
Ohio      one       0
two       1
three     2
two       4
three     5
dtype: int32
```

[External Link Picture Transfer Failure (img-RFupWnTz-1565689273685)(data/stack1.png)]

```# unstack converts index to column
result.unstack()
```
number one two three
state
Ohio 0 1 2

[External Link Picture Transfer Failure (img-zYzZClXX-1565689273686)(data/unstack1.png)]

```result.unstack(0)
```
number
one 0 3
two 1 4
three 2 5
```result.unstack('state')
```
number
one 0 3
two 1 4
three 2 5
```# 2
s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
print(data2)
```
```one  a    0
b    1
c    2
d    3
two  c    4
d    5
e    6
dtype: int64
```
```# data2.stack() # No column execution will report errors
```
```data2.unstack()
```
a b c d e
one 0.0 1.0 2.0 3.0 NaN
two NaN NaN 4.0 5.0 6.0

[External Link Picture Transfer Failure (img-HNbyj5MF-1565689273687)(data/unstack2.png)]

```data2.unstack().stack()
```
```one  a    0.0
b    1.0
c    2.0
d    3.0
two  c    4.0
d    5.0
e    6.0
dtype: float64
```

[External Link Picture Transfer Failure (img-zQBjoEri-1565689273688)(data/stack2.png)]

```data2.unstack().stack(dropna=False)
```
```one  a    0.0
b    1.0
c    2.0
d    3.0
e    NaN
two  a    NaN
b    NaN
c    4.0
d    5.0
e    6.0
dtype: float64
```
```# 3
df = DataFrame({'left': result, 'right': result + 5},
columns=pd.Index(['left', 'right'], name='side'))
print(df)
```
```side             left  right
state    number
Ohio     one        0      5
two        1      6
three      2      7
two        4      9
three      5     10
```
```df.unstack('state')
```
side left right
number
one 0 3 5 8
two 1 4 6 9
three 2 5 7 10
```df.unstack('state').stack('side')
```
number side
one left 3 0
right 8 5
two left 4 1
right 9 6
three left 5 2
right 10 7

# Conversion of Length-Width Format

```data = pd.read_csv('data/macrodata.csv')
print(data)
```
```       year  quarter    realgdp  realcons   realinv  realgovt  realdpi  \
0    1959.0      1.0   2710.349    1707.4   286.898   470.045   1886.9
1    1959.0      2.0   2778.801    1733.7   310.859   481.301   1919.7
2    1959.0      3.0   2775.488    1751.8   289.226   491.260   1916.4
3    1959.0      4.0   2785.204    1753.7   299.356   484.052   1931.3
4    1960.0      1.0   2847.699    1770.5   331.722   462.199   1955.5
5    1960.0      2.0   2834.390    1792.9   298.152   460.400   1966.1
6    1960.0      3.0   2839.022    1785.8   296.375   474.676   1967.8
7    1960.0      4.0   2802.616    1788.2   259.764   476.434   1966.6
8    1961.0      1.0   2819.264    1787.7   266.405   475.854   1984.5
9    1961.0      2.0   2872.005    1814.3   286.246   480.328   2014.4
10   1961.0      3.0   2918.419    1823.1   310.227   493.828   2041.9
11   1961.0      4.0   2977.830    1859.6   315.463   502.521   2082.0
12   1962.0      1.0   3031.241    1879.4   334.271   520.960   2101.7
13   1962.0      2.0   3064.709    1902.5   331.039   523.066   2125.2
14   1962.0      3.0   3093.047    1917.9   336.962   538.838   2137.0
15   1962.0      4.0   3100.563    1945.1   325.650   535.912   2154.6
16   1963.0      1.0   3141.087    1958.2   343.721   522.917   2172.5
17   1963.0      2.0   3180.447    1976.9   348.730   518.108   2193.1
18   1963.0      3.0   3240.332    2003.8   360.102   546.893   2217.9
19   1963.0      4.0   3264.967    2020.6   364.534   532.383   2254.6
20   1964.0      1.0   3338.246    2060.5   379.523   529.686   2299.6
21   1964.0      2.0   3376.587    2096.7   377.778   526.175   2362.1
22   1964.0      3.0   3422.469    2135.2   386.754   522.008   2392.7
23   1964.0      4.0   3431.957    2141.2   389.910   514.603   2420.4
24   1965.0      1.0   3516.251    2188.8   429.145   508.006   2447.4
25   1965.0      2.0   3563.960    2213.0   429.119   508.931   2474.5
26   1965.0      3.0   3636.285    2251.0   444.444   529.446   2542.6
27   1965.0      4.0   3724.014    2314.3   446.493   544.121   2594.1
28   1966.0      1.0   3815.423    2348.5   484.244   556.593   2618.4
29   1966.0      2.0   3828.124    2354.5   475.408   571.371   2624.7
..      ...      ...        ...       ...       ...       ...      ...
173  2002.0      2.0  11538.770    7997.8  1810.779   774.408   8658.9
174  2002.0      3.0  11596.430    8052.0  1814.531   786.673   8629.2
175  2002.0      4.0  11598.824    8080.6  1813.219   799.967   8649.6
176  2003.0      1.0  11645.819    8122.3  1813.141   800.196   8681.3
177  2003.0      2.0  11738.706    8197.8  1823.698   838.775   8812.5
178  2003.0      3.0  11935.461    8312.1  1889.883   839.598   8935.4
179  2003.0      4.0  12042.817    8358.0  1959.783   845.722   8986.4
180  2004.0      1.0  12127.623    8437.6  1970.015   856.570   9025.9
181  2004.0      2.0  12213.818    8483.2  2055.580   861.440   9115.0
182  2004.0      3.0  12303.533    8555.8  2082.231   876.385   9175.9
183  2004.0      4.0  12410.282    8654.2  2125.152   865.596   9303.4
184  2005.0      1.0  12534.113    8719.0  2170.299   869.204   9189.6
185  2005.0      2.0  12587.535    8802.9  2131.468   870.044   9253.0
186  2005.0      3.0  12683.153    8865.6  2154.949   890.394   9308.0
187  2005.0      4.0  12748.699    8888.5  2232.193   875.557   9358.7
188  2006.0      1.0  12915.938    8986.6  2264.721   900.511   9533.8
189  2006.0      2.0  12962.462    9035.0  2261.247   892.839   9617.3
190  2006.0      3.0  12965.916    9090.7  2229.636   892.002   9662.5
191  2006.0      4.0  13060.679    9181.6  2165.966   894.404   9788.8
192  2007.0      1.0  13099.901    9265.1  2132.609   882.766   9830.2
193  2007.0      2.0  13203.977    9291.5  2162.214   898.713   9842.7
194  2007.0      3.0  13321.109    9335.6  2166.491   918.983   9883.9
195  2007.0      4.0  13391.249    9363.6  2123.426   925.110   9886.2
196  2008.0      1.0  13366.865    9349.6  2082.886   943.372   9826.8
197  2008.0      2.0  13415.266    9351.0  2026.518   961.280  10059.0
198  2008.0      3.0  13324.600    9267.7  1990.693   991.551   9838.3
199  2008.0      4.0  13141.920    9195.3  1857.661  1007.273   9920.4
200  2009.0      1.0  12925.410    9209.2  1558.494   996.287   9926.4
201  2009.0      2.0  12901.504    9189.0  1456.678  1023.528  10077.5
202  2009.0      3.0  12990.341    9256.0  1486.398  1044.088  10040.6

cpi      m1  tbilrate  unemp      pop  infl  realint
0     28.980   139.7      2.82    5.8  177.146  0.00     0.00
1     29.150   141.7      3.08    5.1  177.830  2.34     0.74
2     29.350   140.5      3.82    5.3  178.657  2.74     1.09
3     29.370   140.0      4.33    5.6  179.386  0.27     4.06
4     29.540   139.6      3.50    5.2  180.007  2.31     1.19
5     29.550   140.2      2.68    5.2  180.671  0.14     2.55
6     29.750   140.9      2.36    5.6  181.528  2.70    -0.34
7     29.840   141.1      2.29    6.3  182.287  1.21     1.08
8     29.810   142.1      2.37    6.8  182.992 -0.40     2.77
9     29.920   142.9      2.29    7.0  183.691  1.47     0.81
10    29.980   144.1      2.32    6.8  184.524  0.80     1.52
11    30.040   145.2      2.60    6.2  185.242  0.80     1.80
12    30.210   146.4      2.73    5.6  185.874  2.26     0.47
13    30.220   146.5      2.78    5.5  186.538  0.13     2.65
14    30.380   146.7      2.78    5.6  187.323  2.11     0.67
15    30.440   148.3      2.87    5.5  188.013  0.79     2.08
16    30.480   149.7      2.90    5.8  188.580  0.53     2.38
17    30.690   151.3      3.03    5.7  189.242  2.75     0.29
18    30.750   152.6      3.38    5.5  190.028  0.78     2.60
19    30.940   153.7      3.52    5.6  190.668  2.46     1.06
20    30.950   154.8      3.51    5.5  191.245  0.13     3.38
21    31.020   156.8      3.47    5.2  191.889  0.90     2.57
22    31.120   159.2      3.53    5.0  192.631  1.29     2.25
23    31.280   160.7      3.76    5.0  193.223  2.05     1.71
24    31.380   162.0      3.93    4.9  193.709  1.28     2.65
25    31.580   163.1      3.84    4.7  194.303  2.54     1.30
26    31.650   166.0      3.93    4.4  194.997  0.89     3.04
27    31.880   169.1      4.35    4.1  195.539  2.90     1.46
28    32.280   171.8      4.62    3.9  195.999  4.99    -0.37
29    32.450   170.3      4.65    3.8  196.560  2.10     2.55
..       ...     ...       ...    ...      ...   ...      ...
173  180.000  1199.5      1.70    5.8  288.028  1.56     0.14
174  181.200  1204.0      1.61    5.7  288.783  2.66    -1.05
175  182.600  1226.8      1.20    5.8  289.421  3.08    -1.88
176  183.200  1248.4      1.14    5.9  290.019  1.31    -0.17
177  183.700  1287.9      0.96    6.2  290.704  1.09    -0.13
178  184.900  1297.3      0.94    6.1  291.449  2.60    -1.67
179  186.300  1306.1      0.90    5.8  292.057  3.02    -2.11
180  187.400  1332.1      0.94    5.7  292.635  2.35    -1.42
181  189.100  1340.5      1.21    5.6  293.310  3.61    -2.41
182  190.800  1361.0      1.63    5.4  294.066  3.58    -1.95
183  191.800  1366.6      2.20    5.4  294.741  2.09     0.11
184  193.800  1357.8      2.69    5.3  295.308  4.15    -1.46
185  194.700  1366.6      3.01    5.1  295.994  1.85     1.16
186  199.200  1375.0      3.52    5.0  296.770  9.14    -5.62
187  199.400  1380.6      4.00    4.9  297.435  0.40     3.60
188  200.700  1380.5      4.51    4.7  298.061  2.60     1.91
189  202.700  1369.2      4.82    4.7  298.766  3.97     0.85
190  201.900  1369.4      4.90    4.7  299.593 -1.58     6.48
191  203.574  1373.6      4.92    4.4  300.320  3.30     1.62
192  205.920  1379.7      4.95    4.5  300.977  4.58     0.36
193  207.338  1370.0      4.72    4.5  301.714  2.75     1.97
194  209.133  1379.2      4.00    4.7  302.509  3.45     0.55
195  212.495  1377.4      3.01    4.8  303.204  6.38    -3.37
196  213.997  1384.0      1.56    4.9  303.803  2.82    -1.26
197  218.610  1409.3      1.74    5.4  304.483  8.53    -6.79
198  216.889  1474.7      1.17    6.0  305.270 -3.16     4.33
199  212.174  1576.5      0.12    6.9  305.952 -8.79     8.91
200  212.671  1592.8      0.22    8.1  306.547  0.94    -0.71
201  214.469  1653.6      0.18    9.2  307.226  3.37    -3.19
202  216.385  1673.9      0.12    9.6  308.013  3.56    -3.44

[203 rows x 14 columns]
```
```periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
print(periods)
```
```PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
'1960Q3', '1960Q4', '1961Q1', '1961Q2',
...
'2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
'2008Q4', '2009Q1', '2009Q2', '2009Q3'],
dtype='period[Q-DEC]', name='date', length=203, freq='Q-DEC')
```
```data = DataFrame(data.to_records(),
columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'),
index=periods.to_timestamp('D', 'end'))
print(data)
```
```item          realgdp  infl  unemp
date
1959-03-31   2710.349  0.00    5.8
1959-06-30   2778.801  2.34    5.1
1959-09-30   2775.488  2.74    5.3
1959-12-31   2785.204  0.27    5.6
1960-03-31   2847.699  2.31    5.2
1960-06-30   2834.390  0.14    5.2
1960-09-30   2839.022  2.70    5.6
1960-12-31   2802.616  1.21    6.3
1961-03-31   2819.264 -0.40    6.8
1961-06-30   2872.005  1.47    7.0
1961-09-30   2918.419  0.80    6.8
1961-12-31   2977.830  0.80    6.2
1962-03-31   3031.241  2.26    5.6
1962-06-30   3064.709  0.13    5.5
1962-09-30   3093.047  2.11    5.6
1962-12-31   3100.563  0.79    5.5
1963-03-31   3141.087  0.53    5.8
1963-06-30   3180.447  2.75    5.7
1963-09-30   3240.332  0.78    5.5
1963-12-31   3264.967  2.46    5.6
1964-03-31   3338.246  0.13    5.5
1964-06-30   3376.587  0.90    5.2
1964-09-30   3422.469  1.29    5.0
1964-12-31   3431.957  2.05    5.0
1965-03-31   3516.251  1.28    4.9
1965-06-30   3563.960  2.54    4.7
1965-09-30   3636.285  0.89    4.4
1965-12-31   3724.014  2.90    4.1
1966-03-31   3815.423  4.99    3.9
1966-06-30   3828.124  2.10    3.8
...               ...   ...    ...
2002-06-30  11538.770  1.56    5.8
2002-09-30  11596.430  2.66    5.7
2002-12-31  11598.824  3.08    5.8
2003-03-31  11645.819  1.31    5.9
2003-06-30  11738.706  1.09    6.2
2003-09-30  11935.461  2.60    6.1
2003-12-31  12042.817  3.02    5.8
2004-03-31  12127.623  2.35    5.7
2004-06-30  12213.818  3.61    5.6
2004-09-30  12303.533  3.58    5.4
2004-12-31  12410.282  2.09    5.4
2005-03-31  12534.113  4.15    5.3
2005-06-30  12587.535  1.85    5.1
2005-09-30  12683.153  9.14    5.0
2005-12-31  12748.699  0.40    4.9
2006-03-31  12915.938  2.60    4.7
2006-06-30  12962.462  3.97    4.7
2006-09-30  12965.916 -1.58    4.7
2006-12-31  13060.679  3.30    4.4
2007-03-31  13099.901  4.58    4.5
2007-06-30  13203.977  2.75    4.5
2007-09-30  13321.109  3.45    4.7
2007-12-31  13391.249  6.38    4.8
2008-03-31  13366.865  2.82    4.9
2008-06-30  13415.266  8.53    5.4
2008-09-30  13324.600 -3.16    6.0
2008-12-31  13141.920 -8.79    6.9
2009-03-31  12925.410  0.94    8.1
2009-06-30  12901.504  3.37    9.2
2009-09-30  12990.341  3.56    9.6

[203 rows x 3 columns]
```
```ldata = data.stack().reset_index().rename(columns={0: 'value'})
print(ldata)
```
```          date     item      value
0   1959-03-31  realgdp   2710.349
1   1959-03-31     infl      0.000
2   1959-03-31    unemp      5.800
3   1959-06-30  realgdp   2778.801
4   1959-06-30     infl      2.340
5   1959-06-30    unemp      5.100
6   1959-09-30  realgdp   2775.488
7   1959-09-30     infl      2.740
8   1959-09-30    unemp      5.300
9   1959-12-31  realgdp   2785.204
10  1959-12-31     infl      0.270
11  1959-12-31    unemp      5.600
12  1960-03-31  realgdp   2847.699
13  1960-03-31     infl      2.310
14  1960-03-31    unemp      5.200
15  1960-06-30  realgdp   2834.390
16  1960-06-30     infl      0.140
17  1960-06-30    unemp      5.200
18  1960-09-30  realgdp   2839.022
19  1960-09-30     infl      2.700
20  1960-09-30    unemp      5.600
21  1960-12-31  realgdp   2802.616
22  1960-12-31     infl      1.210
23  1960-12-31    unemp      6.300
24  1961-03-31  realgdp   2819.264
25  1961-03-31     infl     -0.400
26  1961-03-31    unemp      6.800
27  1961-06-30  realgdp   2872.005
28  1961-06-30     infl      1.470
29  1961-06-30    unemp      7.000
..         ...      ...        ...
579 2007-06-30  realgdp  13203.977
580 2007-06-30     infl      2.750
581 2007-06-30    unemp      4.500
582 2007-09-30  realgdp  13321.109
583 2007-09-30     infl      3.450
584 2007-09-30    unemp      4.700
585 2007-12-31  realgdp  13391.249
586 2007-12-31     infl      6.380
587 2007-12-31    unemp      4.800
588 2008-03-31  realgdp  13366.865
589 2008-03-31     infl      2.820
590 2008-03-31    unemp      4.900
591 2008-06-30  realgdp  13415.266
592 2008-06-30     infl      8.530
593 2008-06-30    unemp      5.400
594 2008-09-30  realgdp  13324.600
595 2008-09-30     infl     -3.160
596 2008-09-30    unemp      6.000
597 2008-12-31  realgdp  13141.920
598 2008-12-31     infl     -8.790
599 2008-12-31    unemp      6.900
600 2009-03-31  realgdp  12925.410
601 2009-03-31     infl      0.940
602 2009-03-31    unemp      8.100
603 2009-06-30  realgdp  12901.504
604 2009-06-30     infl      3.370
605 2009-06-30    unemp      9.200
606 2009-09-30  realgdp  12990.341
607 2009-09-30     infl      3.560
608 2009-09-30    unemp      9.600

[609 rows x 3 columns]
```
```wdata = ldata.pivot(index='date',columns= 'item',values= 'value')
print(wdata)
```
```item        infl    realgdp  unemp
date
1959-03-31  0.00   2710.349    5.8
1959-06-30  2.34   2778.801    5.1
1959-09-30  2.74   2775.488    5.3
1959-12-31  0.27   2785.204    5.6
1960-03-31  2.31   2847.699    5.2
1960-06-30  0.14   2834.390    5.2
1960-09-30  2.70   2839.022    5.6
1960-12-31  1.21   2802.616    6.3
1961-03-31 -0.40   2819.264    6.8
1961-06-30  1.47   2872.005    7.0
1961-09-30  0.80   2918.419    6.8
1961-12-31  0.80   2977.830    6.2
1962-03-31  2.26   3031.241    5.6
1962-06-30  0.13   3064.709    5.5
1962-09-30  2.11   3093.047    5.6
1962-12-31  0.79   3100.563    5.5
1963-03-31  0.53   3141.087    5.8
1963-06-30  2.75   3180.447    5.7
1963-09-30  0.78   3240.332    5.5
1963-12-31  2.46   3264.967    5.6
1964-03-31  0.13   3338.246    5.5
1964-06-30  0.90   3376.587    5.2
1964-09-30  1.29   3422.469    5.0
1964-12-31  2.05   3431.957    5.0
1965-03-31  1.28   3516.251    4.9
1965-06-30  2.54   3563.960    4.7
1965-09-30  0.89   3636.285    4.4
1965-12-31  2.90   3724.014    4.1
1966-03-31  4.99   3815.423    3.9
1966-06-30  2.10   3828.124    3.8
...          ...        ...    ...
2002-06-30  1.56  11538.770    5.8
2002-09-30  2.66  11596.430    5.7
2002-12-31  3.08  11598.824    5.8
2003-03-31  1.31  11645.819    5.9
2003-06-30  1.09  11738.706    6.2
2003-09-30  2.60  11935.461    6.1
2003-12-31  3.02  12042.817    5.8
2004-03-31  2.35  12127.623    5.7
2004-06-30  3.61  12213.818    5.6
2004-09-30  3.58  12303.533    5.4
2004-12-31  2.09  12410.282    5.4
2005-03-31  4.15  12534.113    5.3
2005-06-30  1.85  12587.535    5.1
2005-09-30  9.14  12683.153    5.0
2005-12-31  0.40  12748.699    4.9
2006-03-31  2.60  12915.938    4.7
2006-06-30  3.97  12962.462    4.7
2006-09-30 -1.58  12965.916    4.7
2006-12-31  3.30  13060.679    4.4
2007-03-31  4.58  13099.901    4.5
2007-06-30  2.75  13203.977    4.5
2007-09-30  3.45  13321.109    4.7
2007-12-31  6.38  13391.249    4.8
2008-03-31  2.82  13366.865    4.9
2008-06-30  8.53  13415.266    5.4
2008-09-30 -3.16  13324.600    6.0
2008-12-31 -8.79  13141.920    6.9
2009-03-31  0.94  12925.410    8.1
2009-06-30  3.37  12901.504    9.2
2009-09-30  3.56  12990.341    9.6

[203 rows x 3 columns]
```
```# 2
ldata[:10]
```
date item value
0 1959-03-31 realgdp 2710.349
1 1959-03-31 infl 0.000
2 1959-03-31 unemp 5.800
3 1959-06-30 realgdp 2778.801
4 1959-06-30 infl 2.340
5 1959-06-30 unemp 5.100
6 1959-09-30 realgdp 2775.488
7 1959-09-30 infl 2.740
8 1959-09-30 unemp 5.300
9 1959-12-31 realgdp 2785.204
```pivoted = ldata.pivot('date', 'item', 'value')
```
item infl realgdp unemp
date
1959-03-31 0.00 2710.349 5.8
1959-06-30 2.34 2778.801 5.1
1959-09-30 2.74 2775.488 5.3
1959-12-31 0.27 2785.204 5.6
1960-03-31 2.31 2847.699 5.2
```ldata['value2'] = np.random.randn(len(ldata))
ldata[:10]
```
date item value value2
0 1959-03-31 realgdp 2710.349 1.669025
1 1959-03-31 infl 0.000 -0.438570
2 1959-03-31 unemp 5.800 -0.539741
3 1959-06-30 realgdp 2778.801 0.476985
4 1959-06-30 infl 2.340 3.248944
5 1959-06-30 unemp 5.100 -1.021228
6 1959-09-30 realgdp 2775.488 -0.577087
7 1959-09-30 infl 2.740 0.124121
8 1959-09-30 unemp 5.300 0.302614
9 1959-12-31 realgdp 2785.204 0.523772
```pivoted = ldata.pivot('date', 'item')
pivoted[:5]
```
value value2
item infl realgdp unemp infl realgdp unemp
date
1959-03-31 0.00 2710.349 5.8 -0.438570 1.669025 -0.539741
1959-06-30 2.34 2778.801 5.1 3.248944 0.476985 -1.021228
1959-09-30 2.74 2775.488 5.3 0.124121 -0.577087 0.302614
1959-12-31 0.27 2785.204 5.6 0.000940 0.523772 1.343810
1960-03-31 2.31 2847.699 5.2 -0.831154 -0.713544 -2.370232
```pivoted['value'][:5]
```
item infl realgdp unemp
date
1959-03-31 0.00 2710.349 5.8
1959-06-30 2.34 2778.801 5.1
1959-09-30 2.74 2775.488 5.3
1959-12-31 0.27 2785.204 5.6
1960-03-31 2.31 2847.699 5.2
```unstacked = ldata.set_index(['date', 'item']).unstack('item')
unstacked[:7]
```
value value2
item infl realgdp unemp infl realgdp unemp
date
1959-03-31 0.00 2710.349 5.8 -0.438570 1.669025 -0.539741
1959-06-30 2.34 2778.801 5.1 3.248944 0.476985 -1.021228
1959-09-30 2.74 2775.488 5.3 0.124121 -0.577087 0.302614
1959-12-31 0.27 2785.204 5.6 0.000940 0.523772 1.343810
1960-03-31 2.31 2847.699 5.2 -0.831154 -0.713544 -2.370232
1960-06-30 0.14 2834.390 5.2 -0.860757 -1.860761 0.560145
1960-09-30 2.70 2839.022 5.6 0.119827 -1.265934 -1.063512

# Remove duplicate data

```data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
'k2': [1, 1, 2, 3, 3, 4, 4]})
print(data)
```
```    k1  k2
0  one   1
1  one   1
2  one   2
3  two   3
4  two   3
5  two   4
6  two   4
```
```data.duplicated()
```
```0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool
```
```data.drop_duplicates()
```
k1 k2
0 one 1
2 one 2
3 two 3
5 two 4
```data['v1'] = range(7)
print(data)
```
```    k1  k2  v1
0  one   1   0
1  one   1   1
2  one   2   2
3  two   3   3
4  two   3   4
5  two   4   5
6  two   4   6
```
```data.drop_duplicates(['k1'])
```
k1 k2 v1
0 one 1 0
3 two 3 3
```data.drop_duplicates(['k1', 'k2'], keep='last')
```
k1 k2 v1
1 one 1 1
2 one 2 2
4 two 3 4
6 two 4 6

# Data Conversion Using Functions or Maps

```# 1
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
'corned beef', 'Bacon', 'pastrami', 'honey ham',
'nova lox'],
'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
print(data)
```
```          food  ounces
0        bacon     4.0
1  pulled pork     3.0
2        bacon    12.0
3     Pastrami     6.0
4  corned beef     7.5
5        Bacon     8.0
6     pastrami     3.0
7    honey ham     5.0
8     nova lox     6.0
```
```meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}

data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
print(data)
```
```          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     pig
2        bacon    12.0     pig
3     Pastrami     6.0     cow
4  corned beef     7.5     cow
5        Bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon
```
```data['food'].map(lambda x: meat_to_animal[x.lower()])
```
```0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object
```

# Data standardization

```datafile = 'data/normalization_data.xls'  # Parameter initialization
print(data)
```
```     0    1    2     3
0   78  521  602  2863
1  144 -600 -521  2245
2   95 -457  468 -1283
3   69  596  695  1054
4  190  527  691  2051
5  101  403  470  2487
6  146  413  435  2571
```

## Minimum-Maximum Normalization

```(data - data.min()) / (data.max() - data.min())  # Minimum-Maximum Normalization
```
0 1 2 3
0 0.074380 0.937291 0.923520 1.000000
1 0.619835 0.000000 0.000000 0.850941
2 0.214876 0.119565 0.813322 0.000000
3 0.000000 1.000000 1.000000 0.563676
4 1.000000 0.942308 0.996711 0.804149
5 0.264463 0.838629 0.814967 0.909310
6 0.636364 0.846990 0.786184 0.929571

## Zero-Mean Normalization

```(data - data.mean()) / data.std()  # Zero-Mean Normalization
```
0 1 2 3
0 -0.905383 0.635863 0.464531 0.798149
1 0.604678 -1.587675 -2.193167 0.369390
2 -0.516428 -1.304030 0.147406 -2.078279
3 -1.111301 0.784628 0.684625 -0.456906
4 1.657146 0.647765 0.675159 0.234796
5 -0.379150 0.401807 0.152139 0.537286
6 0.650438 0.421642 0.069308 0.595564

## Normalization by decimal scaling

```data / 10 ** np.ceil(np.log10(data.abs().max()))  # Normalization by decimal scaling
```
0 1 2 3
0 0.078 0.521 0.602 0.2863
1 0.144 -0.600 -0.521 0.2245
2 0.095 -0.457 0.468 -0.1283
3 0.069 0.596 0.695 0.1054
4 0.190 0.527 0.691 0.2051
5 0.101 0.403 0.470 0.2487
6 0.146 0.413 0.435 0.2571

# Replacement value

```data = Series([1., -999., 2., -999., -1000., 3.])
print(data)
```
```0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64
```
```data.replace(-999, np.nan)
```
```0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
```
```data.replace([-999, -1000], np.nan)
```
```0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
```
```data.replace([-999, -1000], [np.nan, 0])
```
```0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
```
```data.replace({-999: np.nan, -1000: 0})
```
```0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
```

# Renamed axis index

```data = DataFrame(np.arange(12).reshape((3, 4)),
columns=['one', 'two', 'three', 'four'])
print(data)
```
```          one  two  three  four
Ohio        0    1      2     3
New York    8    9     10    11
```
```data.index.map(str.upper)
```
```Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')
```
```data.index = data.index.map(str.upper)
print(data)
```
```          one  two  three  four
OHIO        0    1      2     3
NEW YORK    8    9     10    11
```
```data.rename(index=str.title, columns=str.upper)
```
ONE TWO THREE FOUR
Ohio 0 1 2 3
New York 8 9 10 11
```data.rename(index={'OHIO': 'INDIANA'},
columns={'three': 'peekaboo'})
```
one two peekaboo four
INDIANA 0 1 2 3
NEW YORK 8 9 10 11
```# Always return a reference to the DataFrame
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
print(data)
```
```          one  two  three  four
INDIANA     0    1      2     3
NEW YORK    8    9     10    11
```

# Discretization and panel partition

```# 1
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
print(cats)
```
```[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
```
```cats.codes
```
```array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
```
```cats.categories
```
```IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
closed='right',
dtype='interval[int64]')
```
```pd.value_counts(cats)
```
```(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64
```
```pd.cut(ages, [18, 26, 36, 61, 100], right=False)
```
```[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
```
```group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)
```
```[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [MiddleAged < Senior < YoungAdult < Youth]
```
```data = np.random.rand(20)
pd.cut(data, 4, precision=2)
```
```[(0.45, 0.67], (0.23, 0.45], (0.0037, 0.23], (0.45, 0.67], (0.67, 0.9], ..., (0.67, 0.9], (0.0037, 0.23], (0.0037, 0.23], (0.23, 0.45], (0.23, 0.45]]
Length: 20
Categories (4, interval[float64]): [(0.0037, 0.23] < (0.23, 0.45] < (0.45, 0.67] < (0.67, 0.9]]
```
```# 2
data = np.random.randn(1000)  # Normally distributed
cats = pd.qcut(data, 4)  # Cut into quartiles
print(cats)
```
```[(-0.022, 0.641], (-3.746, -0.635], (0.641, 3.26], (-3.746, -0.635], (-0.022, 0.641], ..., (-0.022, 0.641], (0.641, 3.26], (-0.635, -0.022], (0.641, 3.26], (-0.635, -0.022]]
Length: 1000
Categories (4, interval[float64]): [(-3.746, -0.635] < (-0.635, -0.022] < (-0.022, 0.641] < (0.641, 3.26]]
```
```pd.value_counts(cats)
```
```(0.641, 3.26]       250
(-0.022, 0.641]     250
(-0.635, -0.022]    250
(-3.746, -0.635]    250
dtype: int64
```
```pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
```
```[(-0.022, 1.302], (-1.266, -0.022], (-0.022, 1.302], (-3.746, -1.266], (-0.022, 1.302], ..., (-0.022, 1.302], (-0.022, 1.302], (-1.266, -0.022], (-0.022, 1.302], (-1.266, -0.022]]
Length: 1000
Categories (4, interval[float64]): [(-3.746, -1.266] < (-1.266, -0.022] < (-0.022, 1.302] < (1.302, 3.26]]
```

# Detection and filtering of outliers

```# 1
np.random.seed(12345)
data = DataFrame(np.random.randn(1000, 4))
data.describe()
```
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.067684 0.067924 0.025598 -0.002298
std 0.998035 0.992106 1.006835 0.996794
min -3.428254 -3.548824 -3.184377 -3.745356
25% -0.774890 -0.591841 -0.641675 -0.644144
50% -0.116401 0.101143 0.002073 -0.013611
75% 0.616366 0.780282 0.680391 0.654328
max 3.366626 2.653656 3.260383 3.927528
```col = data[3]
col[np.abs(col) > 3]
```
```97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64
```
```data[(np.abs(data) > 3).any(1)]
```
0 1 2 3
5 -0.539741 0.476985 3.248944 -1.021228
97 -0.774363 0.552936 0.106061 3.927528
102 -0.655054 -0.565230 3.176873 0.959533
305 -2.315555 0.457246 -0.025907 -3.399312
324 0.050188 1.951312 3.260383 0.963301
400 0.146326 0.508391 -0.196713 -3.745356
499 -0.293333 -0.242459 -3.056990 1.918403
523 -3.428254 -0.296336 -0.439938 -0.867165
586 0.275144 1.179227 -3.184377 1.369891
808 -0.362528 -3.548824 1.553205 -2.186301
900 3.366626 -2.372214 0.851010 1.332846
```# 2
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()
```
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.067623 0.068473 0.025153 -0.002081
std 0.995485 0.990253 1.003977 0.989736
min -3.000000 -3.000000 -3.000000 -3.000000
25% -0.774890 -0.591841 -0.641675 -0.644144
50% -0.116401 0.101143 0.002073 -0.013611
75% 0.616366 0.780282 0.680391 0.654328
max 3.000000 2.653656 3.000000 3.000000

# Arrangement and Random Sampling

```df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
print(df)
```
```    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19
```
```sampler = np.random.permutation(5)
print(sampler)
```
```[1 0 2 3 4]
```
```df.take(sampler)
```
0 1 2 3
1 4 5 6 7
0 0 1 2 3
2 8 9 10 11
3 12 13 14 15
4 16 17 18 19
```# 2
df.take(np.random.permutation(len(df))[:3])
```
0 1 2 3
1 4 5 6 7
3 12 13 14 15
4 16 17 18 19
```# 3
bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(0, len(bag), size=10)
print(sampler)
```
```[4 4 2 2 2 0 3 0 4 1]
```
```draws = bag.take(sampler)
print(draws)
```
```[ 4  4 -1 -1 -1  5  6  5  4  7]
```

# Computational Indicators and Dumb Variables

```# 1
df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
'data1': range(6)})
print(df)
```
```   data1 key
0      0   b
1      1   b
2      2   a
3      3   c
4      4   a
5      5   b
```
```pd.get_dummies(df['key'])
```
a b c
0 0 1 0
1 0 1 0
2 1 0 0
3 0 0 1
4 1 0 0
5 0 1 0
```dummies = pd.get_dummies(df['key'], prefix='key')
print(dummies)
```
```   key_a  key_b  key_c
0      0      1      0
1      0      1      0
2      1      0      0
3      0      0      1
4      1      0      0
5      0      1      0
```
```df_with_dummy = df[['data1']].join(dummies)
print(df_with_dummy)
```
```   data1  key_a  key_b  key_c
0      0      0      1      0
1      1      0      1      0
2      2      1      0      0
3      3      0      0      1
4      4      1      0      0
5      5      0      1      0
```
```# 2
mnames = ['movie_id', 'title', 'genres']
names=mnames)
movies[:10]
```
```C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
after removing the cwd from sys.path.
```
movie_id title genres
0 1 Toy Story (1995) Animation|Children's|Comedy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama
4 5 Father of the Bride Part II (1995) Comedy
5 6 Heat (1995) Action|Crime|Thriller
6 7 Sabrina (1995) Comedy|Romance
7 8 Tom and Huck (1995) Adventure|Children's
8 9 Sudden Death (1995) Action
```genre_iter = (set(x.split('|')) for x in movies.genres)
genres = sorted(set.union(*genre_iter))
print(genres)
```
```['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
```
```dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
print(dummies)
```
```      Action  Adventure  Animation  Children's  Comedy  Crime  Documentary  \
0        0.0        0.0        0.0         0.0     0.0    0.0          0.0
1        0.0        0.0        0.0         0.0     0.0    0.0          0.0
2        0.0        0.0        0.0         0.0     0.0    0.0          0.0
3        0.0        0.0        0.0         0.0     0.0    0.0          0.0
4        0.0        0.0        0.0         0.0     0.0    0.0          0.0
5        0.0        0.0        0.0         0.0     0.0    0.0          0.0
6        0.0        0.0        0.0         0.0     0.0    0.0          0.0
7        0.0        0.0        0.0         0.0     0.0    0.0          0.0
8        0.0        0.0        0.0         0.0     0.0    0.0          0.0
9        0.0        0.0        0.0         0.0     0.0    0.0          0.0
10       0.0        0.0        0.0         0.0     0.0    0.0          0.0
11       0.0        0.0        0.0         0.0     0.0    0.0          0.0
12       0.0        0.0        0.0         0.0     0.0    0.0          0.0
13       0.0        0.0        0.0         0.0     0.0    0.0          0.0
14       0.0        0.0        0.0         0.0     0.0    0.0          0.0
15       0.0        0.0        0.0         0.0     0.0    0.0          0.0
16       0.0        0.0        0.0         0.0     0.0    0.0          0.0
17       0.0        0.0        0.0         0.0     0.0    0.0          0.0
18       0.0        0.0        0.0         0.0     0.0    0.0          0.0
19       0.0        0.0        0.0         0.0     0.0    0.0          0.0
20       0.0        0.0        0.0         0.0     0.0    0.0          0.0
21       0.0        0.0        0.0         0.0     0.0    0.0          0.0
22       0.0        0.0        0.0         0.0     0.0    0.0          0.0
23       0.0        0.0        0.0         0.0     0.0    0.0          0.0
24       0.0        0.0        0.0         0.0     0.0    0.0          0.0
25       0.0        0.0        0.0         0.0     0.0    0.0          0.0
26       0.0        0.0        0.0         0.0     0.0    0.0          0.0
27       0.0        0.0        0.0         0.0     0.0    0.0          0.0
28       0.0        0.0        0.0         0.0     0.0    0.0          0.0
29       0.0        0.0        0.0         0.0     0.0    0.0          0.0
...      ...        ...        ...         ...     ...    ...          ...
3853     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3854     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3855     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3856     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3857     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3858     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3859     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3860     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3861     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3862     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3863     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3864     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3865     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3866     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3867     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3868     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3869     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3870     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3871     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3872     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3873     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3874     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3875     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3876     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3877     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3878     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3879     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3880     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3881     0.0        0.0        0.0         0.0     0.0    0.0          0.0
3882     0.0        0.0        0.0         0.0     0.0    0.0          0.0

Drama  Fantasy  Film-Noir  Horror  Musical  Mystery  Romance  Sci-Fi  \
0       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
1       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
2       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
4       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
5       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
6       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
7       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
8       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
9       0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
10      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
11      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
12      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
13      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
14      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
15      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
16      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
17      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
18      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
19      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
20      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
21      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
22      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
23      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
24      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
25      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
26      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
27      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
28      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
29      0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
...     ...      ...        ...     ...      ...      ...      ...     ...
3853    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3854    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3855    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3856    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3857    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3858    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3859    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3860    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3861    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3862    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3863    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3864    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3865    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3866    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3867    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3868    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3869    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3870    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3871    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3872    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3873    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3874    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3875    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3876    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3877    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3878    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3879    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3880    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3881    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0
3882    0.0      0.0        0.0     0.0      0.0      0.0      0.0     0.0

Thriller  War  Western
0          0.0  0.0      0.0
1          0.0  0.0      0.0
2          0.0  0.0      0.0
3          0.0  0.0      0.0
4          0.0  0.0      0.0
5          0.0  0.0      0.0
6          0.0  0.0      0.0
7          0.0  0.0      0.0
8          0.0  0.0      0.0
9          0.0  0.0      0.0
10         0.0  0.0      0.0
11         0.0  0.0      0.0
12         0.0  0.0      0.0
13         0.0  0.0      0.0
14         0.0  0.0      0.0
15         0.0  0.0      0.0
16         0.0  0.0      0.0
17         0.0  0.0      0.0
18         0.0  0.0      0.0
19         0.0  0.0      0.0
20         0.0  0.0      0.0
21         0.0  0.0      0.0
22         0.0  0.0      0.0
23         0.0  0.0      0.0
24         0.0  0.0      0.0
25         0.0  0.0      0.0
26         0.0  0.0      0.0
27         0.0  0.0      0.0
28         0.0  0.0      0.0
29         0.0  0.0      0.0
...        ...  ...      ...
3853       0.0  0.0      0.0
3854       0.0  0.0      0.0
3855       0.0  0.0      0.0
3856       0.0  0.0      0.0
3857       0.0  0.0      0.0
3858       0.0  0.0      0.0
3859       0.0  0.0      0.0
3860       0.0  0.0      0.0
3861       0.0  0.0      0.0
3862       0.0  0.0      0.0
3863       0.0  0.0      0.0
3864       0.0  0.0      0.0
3865       0.0  0.0      0.0
3866       0.0  0.0      0.0
3867       0.0  0.0      0.0
3868       0.0  0.0      0.0
3869       0.0  0.0      0.0
3870       0.0  0.0      0.0
3871       0.0  0.0      0.0
3872       0.0  0.0      0.0
3873       0.0  0.0      0.0
3874       0.0  0.0      0.0
3875       0.0  0.0      0.0
3876       0.0  0.0      0.0
3877       0.0  0.0      0.0
3878       0.0  0.0      0.0
3879       0.0  0.0      0.0
3880       0.0  0.0      0.0
3881       0.0  0.0      0.0
3882       0.0  0.0      0.0

[3883 rows x 18 columns]
```
```for i, gen in enumerate(movies.genres):
dummies.loc[i, gen.split('|')] = 1

print(movies_windic.loc[0])
```
```movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Action                                   0
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Crime                                    0
Genre_Documentary                              0
Genre_Drama                                    0
Genre_Fantasy                                  0
Genre_Film-Noir                                0
Genre_Horror                                   0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Romance                                  0
Genre_Sci-Fi                                   0
Genre_Thriller                                 0
Genre_War                                      0
Genre_Western                                  0
Name: 0, dtype: object
```
```# 3
np.random.seed(12345)
values = np.random.rand(10)
print(values)
```
```[ 0.92961609  0.31637555  0.18391881  0.20456028  0.56772503  0.5955447
0.96451452  0.6531771   0.74890664  0.65356987]
```
```bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))
```
(0.0, 0.2] (0.2, 0.4] (0.4, 0.6] (0.6, 0.8] (0.8, 1.0]
0 0 0 0 0 1
1 0 1 0 0 0
2 1 0 0 0 0
3 0 1 0 0 0
4 0 0 1 0 0
5 0 0 1 0 0
6 0 0 0 0 1
7 0 0 0 1 0
8 0 0 0 1 0
9 0 0 0 1 0

# Attribute Construction

```# Parameter initialization
inputfile = 'data/electricity_data.xls'  # Power supply and output data
outputfile = 'data/electricity_data.xls'  # Data file after attribute construction

data[u'Line loss rate'] = (data[u'Electricity supply and input'] - data[u'Power supply and output']) / data[u'Electricity supply and input']

data.to_excel(outputfile, index=False)  # Preserve the results

```

# String Object Method

```val = 'a,b,  guido'
val.split(',')
```
```['a', 'b', '  guido']
```
```pieces = [x.strip() for x in val.split(',')]
print(pieces)
```
```['a', 'b', 'guido']
```
```first, second, third = pieces
first + '::' + second + '::' + third
```
```'a::b::guido'
```
```'::'.join(pieces)
```
```'a::b::guido'
```
```'guido' in val
```
```True
```
```val.index(',')
```
```1
```
```val.find(':')
```
```-1
```
```val.index(':')
```
```---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-138-280f8b2856ce> in <module>()
----> 1 val.index(':')

```
```val.count('a')
```
```1
```
```val.replace(',', '::')
```
```'a::b::  guido'
```
```val.replace(',', '')
```
```'ab  guido'
```

# regular expression

```# 1
import re

text = "foo    bar\t baz  \tqux"
re.split('\s+', text)
```
```['foo', 'bar', 'baz', 'qux']
```
```regex = re.compile('\s+')
regex.split(text)
```
```['foo', 'bar', 'baz', 'qux']
```
```regex.findall(text)
```
```['    ', '\t ', '  \t']
```
```# 2
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# The function of re.IGNORECASE is to make regular expressions insensitive to case
regex = re.compile(pattern, flags=re.IGNORECASE)

regex.findall(text)
```
```['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']
```
```m = regex.search(text)
print(m)
```
```<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>
```
```text[m.start():m.end()]
```
```'dave@google.com'
```
```print(regex.match(text))
```
```None
```
```print(regex.sub('REDACTED', text))
```
```Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED
```
```# 3
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)

m = regex.match('wesm@bright.net')
m.groups()
```
```('wesm', 'bright', 'net')
```
```regex.findall(text)
```
```[('dave', 'google', 'com'),
('steve', 'gmail', 'com'),
('rob', 'gmail', 'com'),
('ryan', 'yahoo', 'com')]
```
```print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))
```
```Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com
```
```# 4
regex = re.compile(r"""
@
(?P<domain>[A-Z0-9.-]+)
\.
(?P<suffix>[A-Z]{2,4})""", flags=re.IGNORECASE | re.VERBOSE)

m = regex.match('wesm@bright.net')
m.groupdict()
```
```{'domain': 'bright', 'suffix': 'net', 'username': 'wesm'}
```

# Vectorized string functions in pandas

```data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = Series(data)
print(data)
```
```Dave     dave@google.com
Rob        rob@gmail.com
Steve    steve@gmail.com
Wes                  NaN
dtype: object
```
```data.isnull()
```
```Dave     False
Rob      False
Steve    False
Wes       True
dtype: bool
```
```data.str.contains('gmail')
```
```Dave     False
Rob       True
Steve     True
Wes        NaN
dtype: object
```
```data.str.findall(pattern, flags=re.IGNORECASE)
```
```Dave     [(dave, google, com)]
Rob        [(rob, gmail, com)]
Steve    [(steve, gmail, com)]
Wes                        NaN
dtype: object
```
```matches = data.str.match(pattern, flags=re.IGNORECASE)
print(matches)
```
```Dave     True
Rob      True
Steve    True
Wes       NaN
dtype: object
```
```matches.str.get(1)
```
```Dave    NaN
Rob     NaN
Steve   NaN
Wes     NaN
dtype: float64
```
```matches.str[0]
```
```Dave    NaN
Rob     NaN
Steve   NaN
Wes     NaN
dtype: float64
```
```data.str[:5]
```
```Dave     dave@
Rob      rob@g
Steve    steve
Wes        NaN
dtype: object
```

# Example: USDA Food Database

```'''
{
"id": 21441,
"description": "KENTUCKY FRIED CHICKEN, Fried Chicken, EXTRA CRISPY,
Wing, meat and skin with breading",
"tags": ["KFC"],
"manufacturer": "Kentucky Fried Chicken",
"group": "Fast Foods",
"portions": [
{
"amount": 1,
"unit": "wing, with skin",
"grams": 68.0
},

...
],
"nutrients": [
{
"value": 20.8,
"units": "g",
"description": "Protein",
"group": "Composition"
},

...
]
}
'''

import json

len(db)
```
```6636
```
```db[0].keys()
```
```dict_keys(['id', 'description', 'tags', 'manufacturer', 'group', 'portions', 'nutrients'])
```
```db[0]['nutrients'][0]
```
```{'description': 'Protein',
'group': 'Composition',
'units': 'g',
'value': 25.18}
```
```nutrients = DataFrame(db[0]['nutrients'])
nutrients[:7]
```
description group units value
0 Protein Composition g 25.18
1 Total lipid (fat) Composition g 29.20
2 Carbohydrate, by difference Composition g 3.06
3 Ash Other g 3.28
4 Energy Energy kcal 376.00
5 Water Composition g 39.28
6 Energy Energy kJ 1573.00
```info_keys = ['description', 'group', 'id', 'manufacturer']
info = DataFrame(db, columns=info_keys)

info[:5]
```
description group id manufacturer
0 Cheese, caraway Dairy and Egg Products 1008
1 Cheese, cheddar Dairy and Egg Products 1009
2 Cheese, edam Dairy and Egg Products 1018
3 Cheese, feta Dairy and Egg Products 1019
4 Cheese, mozzarella, part skim milk Dairy and Egg Products 1028
```pd.value_counts(info.group)[:10]
```
```Vegetables and Vegetable Products    812
Beef Products                        618
Baked Products                       496
Breakfast Cereals                    403
Fast Foods                           365
Legumes and Legume Products          365
Lamb, Veal, and Game Products        345
Sweets                               341
Fruits and Fruit Juices              328
Pork Products                        328
Name: group, dtype: int64
```
```nutrients = []

for rec in db:
fnuts = DataFrame(rec['nutrients'])
fnuts['id'] = rec['id']
nutrients.append(fnuts)

nutrients = pd.concat(nutrients, ignore_index=True)

print(nutrients)
```
```                               description        group    units     value  \
0                                  Protein  Composition        g    25.180
1                        Total lipid (fat)  Composition        g    29.200
2              Carbohydrate, by difference  Composition        g     3.060
3                                      Ash        Other        g     3.280
4                                   Energy       Energy     kcal   376.000
5                                    Water  Composition        g    39.280
6                                   Energy       Energy       kJ  1573.000
7                     Fiber, total dietary  Composition        g     0.000
8                              Calcium, Ca     Elements       mg   673.000
9                                 Iron, Fe     Elements       mg     0.640
10                           Magnesium, Mg     Elements       mg    22.000
11                           Phosphorus, P     Elements       mg   490.000
12                            Potassium, K     Elements       mg    93.000
13                              Sodium, Na     Elements       mg   690.000
14                                Zinc, Zn     Elements       mg     2.940
15                              Copper, Cu     Elements       mg     0.024
16                           Manganese, Mn     Elements       mg     0.021
17                            Selenium, Se     Elements      mcg    14.500
18                           Vitamin A, IU     Vitamins       IU  1054.000
19                                 Retinol     Vitamins      mcg   262.000
20                          Vitamin A, RAE     Vitamins  mcg_RAE   271.000
21          Vitamin C, total ascorbic acid     Vitamins       mg     0.000
22                                 Thiamin     Vitamins       mg     0.031
23                              Riboflavin     Vitamins       mg     0.450
24                                  Niacin     Vitamins       mg     0.180
25                        Pantothenic acid     Vitamins       mg     0.190
26                             Vitamin B-6     Vitamins       mg     0.074
27                           Folate, total     Vitamins      mcg    18.000
28                            Vitamin B-12     Vitamins      mcg     0.270
29                              Folic acid     Vitamins      mcg     0.000
...                                    ...          ...      ...       ...
389325                        Selenium, Se     Elements      mcg     1.100
389326                       Vitamin A, IU     Vitamins       IU     5.000
389327                             Retinol     Vitamins      mcg     0.000
389328                      Vitamin A, RAE     Vitamins  mcg_RAE     0.000
389329                      Carotene, beta     Vitamins      mcg     2.000
389330                     Carotene, alpha     Vitamins      mcg     2.000
389331        Vitamin E (alpha-tocopherol)     Vitamins       mg     0.250
389332                           Vitamin D     Vitamins       IU     0.000
389333                 Vitamin D (D2 + D3)     Vitamins      mcg     0.000
389334                 Cryptoxanthin, beta     Vitamins      mcg     0.000
389335                            Lycopene     Vitamins      mcg     0.000
389336                 Lutein + zeaxanthin     Vitamins      mcg    20.000
389337      Vitamin C, total ascorbic acid     Vitamins       mg    21.900
389338                             Thiamin     Vitamins       mg     0.020
389339                          Riboflavin     Vitamins       mg     0.060
389340                              Niacin     Vitamins       mg     0.540
389341                         Vitamin B-6     Vitamins       mg     0.260
389342                       Folate, total     Vitamins      mcg    17.000
389343                        Vitamin B-12     Vitamins      mcg     0.000
389344                      Choline, total     Vitamins       mg     4.100
389345           Vitamin K (phylloquinone)     Vitamins      mcg     0.500
389346                          Folic acid     Vitamins      mcg     0.000
389347                        Folate, food     Vitamins      mcg    17.000
389348                         Folate, DFE     Vitamins  mcg_DFE    17.000
389349                    Vitamin E, added     Vitamins       mg     0.000
389350                 Vitamin B-12, added     Vitamins      mcg     0.000
389351                         Cholesterol        Other       mg     0.000
389352        Fatty acids, total saturated        Other        g     0.072
389353  Fatty acids, total monounsaturated        Other        g     0.028
389354  Fatty acids, total polyunsaturated        Other        g     0.041

id
0        1008
1        1008
2        1008
3        1008
4        1008
5        1008
6        1008
7        1008
8        1008
9        1008
10       1008
11       1008
12       1008
13       1008
14       1008
15       1008
16       1008
17       1008
18       1008
19       1008
20       1008
21       1008
22       1008
23       1008
24       1008
25       1008
26       1008
27       1008
28       1008
29       1008
...       ...
389325  43546
389326  43546
389327  43546
389328  43546
389329  43546
389330  43546
389331  43546
389332  43546
389333  43546
389334  43546
389335  43546
389336  43546
389337  43546
389338  43546
389339  43546
389340  43546
389341  43546
389342  43546
389343  43546
389344  43546
389345  43546
389346  43546
389347  43546
389348  43546
389349  43546
389350  43546
389351  43546
389352  43546
389353  43546
389354  43546

[389355 rows x 5 columns]
```
```# There is duplicate data in the data.
nutrients.duplicated().sum()
```
```14179
```
```# Duplicate removal
nutrients = nutrients.drop_duplicates()
```
```col_mapping = {'description': 'food',
'group': 'fgroup'}
info = info.rename(columns=col_mapping, copy=False)
print(info)
```
```                                                   food  \
0                                       Cheese, caraway
1                                       Cheese, cheddar
2                                          Cheese, edam
3                                          Cheese, feta
4                    Cheese, mozzarella, part skim milk
5      Cheese, mozzarella, part skim milk, low moisture
6                                        Cheese, romano
7                                     Cheese, roquefort
8     Cheese spread, pasteurized process, american, ...
9                           Cream, fluid, half and half
10    Sour dressing, non-butterfat, cultured, filled...
11    Milk, filled, fluid, with blend of hydrogenate...
12    Cream substitute, liquid, with lauric acid oil...
13                           Cream substitute, powdered
14                  Milk, producer, fluid, 3.7% milkfat
15    Milk, reduced fat, fluid, 2% milkfat, with add...
16    Milk, reduced fat, fluid, 2% milkfat, with add...
17    Milk, reduced fat, fluid, 2% milkfat, protein ...
18    Milk, lowfat, fluid, 1% milkfat, with added vi...
19    Milk, lowfat, fluid, 1% milkfat, with added no...
20    Milk, lowfat, fluid, 1% milkfat, protein forti...
21    Milk, nonfat, fluid, with added vitamin A and ...
22    Milk, nonfat, fluid, with added nonfat milk so...
23    Milk, nonfat, fluid, protein fortified, with a...
24            Milk, buttermilk, fluid, cultured, lowfat
25                              Milk, low sodium, fluid
26               Milk, dry, whole, with added vitamin D
27    Milk, dry, nonfat, regular, without added vita...
28    Milk, dry, nonfat, instant, with added vitamin...
29                   Milk, dry, nonfat, calcium reduced
...                                                 ...
6606  Beef, tenderloin, steak, separable lean only, ...
6607  Beef, top sirloin, steak, separable lean only,...
6608  Beef, short loin, top loin, steak, separable l...
6609  Beef, chuck, arm pot roast, separable lean onl...
6610  Beef, brisket, flat half, separable lean only,...
6611  Beef, chuck, arm pot roast, separable lean onl...
6612  Beef, brisket, flat half, separable lean only,...
6613  Beef, round, eye of round, roast, separable le...
6614  Beef, round, top round, steak, separable lean ...
6615  Beef, round, bottom round, roast, separable le...
6616  Beef, rib, small end (ribs 10-12), separable l...
6617  CAMPBELL Soup Company, CAMPBELL'S Red and Whit...
6618  CAMPBELL Soup Company, CAMPBELL's Red and Whit...
6619  CAMPBELL Soup Company, CAMPBELL'S SELECT Soups...
6620  CAMPBELL Soup Company, CAMPBELL'S SOUP AT HAND...
6621  CAMPBELL Soup Company, CAMPBELL'S SOUP AT HAND...
6622  CAMPBELL Soup Company, CAMPBELL'S SELECT Gold ...
6623  CAMPBELL Soup Company, CAMPBELL'S SELECT Gold ...
6624  CAMPBELL Soup Company, CAMPBELL'S SELECT Gold ...
6625  CAMPBELL Soup Company, CAMPBELL'S Red and Whit...
6626  CAMPBELL Soup Company, V8 Vegetable Juice, Ess...
6627  CAMPBELL Soup Company, V8 Vegetable Juice, Spi...
6628  CAMPBELL Soup Company, PACE, Jalapenos Nacho S...
6629  CAMPBELL Soup Company, V8 60% Vegetable Juice,...
6630  CAMPBELL Soup Company, V8 Vegetable Juice, Low...
6631                             Bologna, beef, low fat
6632  Turkey and pork sausage, fresh, bulk, patty or...
6633                              Babyfood, juice, pear
6634         Babyfood, dessert, banana yogurt, strained
6635              Babyfood, banana no tapioca, strained

fgroup     id       manufacturer
0                Dairy and Egg Products   1008
1                Dairy and Egg Products   1009
2                Dairy and Egg Products   1018
3                Dairy and Egg Products   1019
4                Dairy and Egg Products   1028
5                Dairy and Egg Products   1029
6                Dairy and Egg Products   1038
7                Dairy and Egg Products   1039
8                Dairy and Egg Products   1048
9                Dairy and Egg Products   1049
10               Dairy and Egg Products   1058
11               Dairy and Egg Products   1059
12               Dairy and Egg Products   1068
13               Dairy and Egg Products   1069
14               Dairy and Egg Products   1078
15               Dairy and Egg Products   1079               None
16               Dairy and Egg Products   1080
17               Dairy and Egg Products   1081
18               Dairy and Egg Products   1082
19               Dairy and Egg Products   1083
20               Dairy and Egg Products   1084
21               Dairy and Egg Products   1085
22               Dairy and Egg Products   1086
23               Dairy and Egg Products   1087
24               Dairy and Egg Products   1088
25               Dairy and Egg Products   1089
26               Dairy and Egg Products   1090
27               Dairy and Egg Products   1091
28               Dairy and Egg Products   1092
29               Dairy and Egg Products   1093
...                                 ...    ...                ...
6606                      Beef Products  23628
6607                      Beef Products  23629
6608                      Beef Products  23630
6609                      Beef Products  23631
6610                      Beef Products  23632
6611                      Beef Products  23633
6612                      Beef Products  23634
6613                      Beef Products  23635
6614                      Beef Products  23636
6615                      Beef Products  23637
6616                      Beef Products  23638
6617         Soups, Sauces, and Gravies  27015  Campbell Soup Co.
6618         Soups, Sauces, and Gravies  27016  Campbell Soup Co.
6619         Soups, Sauces, and Gravies  27021  Campbell Soup Co.
6620         Soups, Sauces, and Gravies  27022  Campbell Soup Co.
6621         Soups, Sauces, and Gravies  27023  Campbell Soup Co.
6622         Soups, Sauces, and Gravies  27024  Campbell Soup Co.
6623         Soups, Sauces, and Gravies  27025  Campbell Soup Co.
6624         Soups, Sauces, and Gravies  27026  Campbell Soup Co.
6625         Soups, Sauces, and Gravies  27032  Campbell Soup Co.
6626  Vegetables and Vegetable Products  31010  Campbell Soup Co.
6627  Vegetables and Vegetable Products  31013  Campbell Soup Co.
6628  Vegetables and Vegetable Products  31014  Campbell Soup Co.
6629  Vegetables and Vegetable Products  31016  Campbell Soup Co.
6630  Vegetables and Vegetable Products  31017  Campbell Soup Co.
6631        Sausages and Luncheon Meats  42161
6632        Sausages and Luncheon Meats  42173
6633                         Baby Foods  43408               None
6634                         Baby Foods  43539               None
6635                         Baby Foods  43546               None

[6636 rows x 4 columns]
```
```col_mapping = {'description': 'nutrient',
'group': 'nutgroup'}
nutrients = nutrients.rename(columns=col_mapping, copy=False)
print(nutrients)
```
```                                  nutrient     nutgroup    units     value  \
0                                  Protein  Composition        g    25.180
1                        Total lipid (fat)  Composition        g    29.200
2              Carbohydrate, by difference  Composition        g     3.060
3                                      Ash        Other        g     3.280
4                                   Energy       Energy     kcal   376.000
5                                    Water  Composition        g    39.280
6                                   Energy       Energy       kJ  1573.000
7                     Fiber, total dietary  Composition        g     0.000
8                              Calcium, Ca     Elements       mg   673.000
9                                 Iron, Fe     Elements       mg     0.640
10                           Magnesium, Mg     Elements       mg    22.000
11                           Phosphorus, P     Elements       mg   490.000
12                            Potassium, K     Elements       mg    93.000
13                              Sodium, Na     Elements       mg   690.000
14                                Zinc, Zn     Elements       mg     2.940
15                              Copper, Cu     Elements       mg     0.024
16                           Manganese, Mn     Elements       mg     0.021
17                            Selenium, Se     Elements      mcg    14.500
18                           Vitamin A, IU     Vitamins       IU  1054.000
19                                 Retinol     Vitamins      mcg   262.000
20                          Vitamin A, RAE     Vitamins  mcg_RAE   271.000
21          Vitamin C, total ascorbic acid     Vitamins       mg     0.000
22                                 Thiamin     Vitamins       mg     0.031
23                              Riboflavin     Vitamins       mg     0.450
24                                  Niacin     Vitamins       mg     0.180
25                        Pantothenic acid     Vitamins       mg     0.190
26                             Vitamin B-6     Vitamins       mg     0.074
27                           Folate, total     Vitamins      mcg    18.000
28                            Vitamin B-12     Vitamins      mcg     0.270
29                              Folic acid     Vitamins      mcg     0.000
...                                    ...          ...      ...       ...
389325                        Selenium, Se     Elements      mcg     1.100
389326                       Vitamin A, IU     Vitamins       IU     5.000
389327                             Retinol     Vitamins      mcg     0.000
389328                      Vitamin A, RAE     Vitamins  mcg_RAE     0.000
389329                      Carotene, beta     Vitamins      mcg     2.000
389330                     Carotene, alpha     Vitamins      mcg     2.000
389331        Vitamin E (alpha-tocopherol)     Vitamins       mg     0.250
389332                           Vitamin D     Vitamins       IU     0.000
389333                 Vitamin D (D2 + D3)     Vitamins      mcg     0.000
389334                 Cryptoxanthin, beta     Vitamins      mcg     0.000
389335                            Lycopene     Vitamins      mcg     0.000
389336                 Lutein + zeaxanthin     Vitamins      mcg    20.000
389337      Vitamin C, total ascorbic acid     Vitamins       mg    21.900
389338                             Thiamin     Vitamins       mg     0.020
389339                          Riboflavin     Vitamins       mg     0.060
389340                              Niacin     Vitamins       mg     0.540
389341                         Vitamin B-6     Vitamins       mg     0.260
389342                       Folate, total     Vitamins      mcg    17.000
389343                        Vitamin B-12     Vitamins      mcg     0.000
389344                      Choline, total     Vitamins       mg     4.100
389345           Vitamin K (phylloquinone)     Vitamins      mcg     0.500
389346                          Folic acid     Vitamins      mcg     0.000
389347                        Folate, food     Vitamins      mcg    17.000
389348                         Folate, DFE     Vitamins  mcg_DFE    17.000
389349                    Vitamin E, added     Vitamins       mg     0.000
389350                 Vitamin B-12, added     Vitamins      mcg     0.000
389351                         Cholesterol        Other       mg     0.000
389352        Fatty acids, total saturated        Other        g     0.072
389353  Fatty acids, total monounsaturated        Other        g     0.028
389354  Fatty acids, total polyunsaturated        Other        g     0.041

id
0        1008
1        1008
2        1008
3        1008
4        1008
5        1008
6        1008
7        1008
8        1008
9        1008
10       1008
11       1008
12       1008
13       1008
14       1008
15       1008
16       1008
17       1008
18       1008
19       1008
20       1008
21       1008
22       1008
23       1008
24       1008
25       1008
26       1008
27       1008
28       1008
29       1008
...       ...
389325  43546
389326  43546
389327  43546
389328  43546
389329  43546
389330  43546
389331  43546
389332  43546
389333  43546
389334  43546
389335  43546
389336  43546
389337  43546
389338  43546
389339  43546
389340  43546
389341  43546
389342  43546
389343  43546
389344  43546
389345  43546
389346  43546
389347  43546
389348  43546
389349  43546
389350  43546
389351  43546
389352  43546
389353  43546
389354  43546

[375176 rows x 5 columns]
```
```ndata = pd.merge(nutrients, info, on='id', how='outer')
print(ndata)
```
```                                  nutrient     nutgroup    units     value  \
0                                  Protein  Composition        g    25.180
1                        Total lipid (fat)  Composition        g    29.200
2              Carbohydrate, by difference  Composition        g     3.060
3                                      Ash        Other        g     3.280
4                                   Energy       Energy     kcal   376.000
5                                    Water  Composition        g    39.280
6                                   Energy       Energy       kJ  1573.000
7                     Fiber, total dietary  Composition        g     0.000
8                              Calcium, Ca     Elements       mg   673.000
9                                 Iron, Fe     Elements       mg     0.640
10                           Magnesium, Mg     Elements       mg    22.000
11                           Phosphorus, P     Elements       mg   490.000
12                            Potassium, K     Elements       mg    93.000
13                              Sodium, Na     Elements       mg   690.000
14                                Zinc, Zn     Elements       mg     2.940
15                              Copper, Cu     Elements       mg     0.024
16                           Manganese, Mn     Elements       mg     0.021
17                            Selenium, Se     Elements      mcg    14.500
18                           Vitamin A, IU     Vitamins       IU  1054.000
19                                 Retinol     Vitamins      mcg   262.000
20                          Vitamin A, RAE     Vitamins  mcg_RAE   271.000
21          Vitamin C, total ascorbic acid     Vitamins       mg     0.000
22                                 Thiamin     Vitamins       mg     0.031
23                              Riboflavin     Vitamins       mg     0.450
24                                  Niacin     Vitamins       mg     0.180
25                        Pantothenic acid     Vitamins       mg     0.190
26                             Vitamin B-6     Vitamins       mg     0.074
27                           Folate, total     Vitamins      mcg    18.000
28                            Vitamin B-12     Vitamins      mcg     0.270
29                              Folic acid     Vitamins      mcg     0.000
...                                    ...          ...      ...       ...
375146                        Selenium, Se     Elements      mcg     1.100
375147                       Vitamin A, IU     Vitamins       IU     5.000
375148                             Retinol     Vitamins      mcg     0.000
375149                      Vitamin A, RAE     Vitamins  mcg_RAE     0.000
375150                      Carotene, beta     Vitamins      mcg     2.000
375151                     Carotene, alpha     Vitamins      mcg     2.000
375152        Vitamin E (alpha-tocopherol)     Vitamins       mg     0.250
375153                           Vitamin D     Vitamins       IU     0.000
375154                 Vitamin D (D2 + D3)     Vitamins      mcg     0.000
375155                 Cryptoxanthin, beta     Vitamins      mcg     0.000
375156                            Lycopene     Vitamins      mcg     0.000
375157                 Lutein + zeaxanthin     Vitamins      mcg    20.000
375158      Vitamin C, total ascorbic acid     Vitamins       mg    21.900
375159                             Thiamin     Vitamins       mg     0.020
375160                          Riboflavin     Vitamins       mg     0.060
375161                              Niacin     Vitamins       mg     0.540
375162                         Vitamin B-6     Vitamins       mg     0.260
375163                       Folate, total     Vitamins      mcg    17.000
375164                        Vitamin B-12     Vitamins      mcg     0.000
375165                      Choline, total     Vitamins       mg     4.100
375166           Vitamin K (phylloquinone)     Vitamins      mcg     0.500
375167                          Folic acid     Vitamins      mcg     0.000
375168                        Folate, food     Vitamins      mcg    17.000
375169                         Folate, DFE     Vitamins  mcg_DFE    17.000
375170                    Vitamin E, added     Vitamins       mg     0.000
375171                 Vitamin B-12, added     Vitamins      mcg     0.000
375172                         Cholesterol        Other       mg     0.000
375173        Fatty acids, total saturated        Other        g     0.072
375174  Fatty acids, total monounsaturated        Other        g     0.028
375175  Fatty acids, total polyunsaturated        Other        g     0.041

id                                   food                  fgroup  \
0        1008                        Cheese, caraway  Dairy and Egg Products
1        1008                        Cheese, caraway  Dairy and Egg Products
2        1008                        Cheese, caraway  Dairy and Egg Products
3        1008                        Cheese, caraway  Dairy and Egg Products
4        1008                        Cheese, caraway  Dairy and Egg Products
5        1008                        Cheese, caraway  Dairy and Egg Products
6        1008                        Cheese, caraway  Dairy and Egg Products
7        1008                        Cheese, caraway  Dairy and Egg Products
8        1008                        Cheese, caraway  Dairy and Egg Products
9        1008                        Cheese, caraway  Dairy and Egg Products
10       1008                        Cheese, caraway  Dairy and Egg Products
11       1008                        Cheese, caraway  Dairy and Egg Products
12       1008                        Cheese, caraway  Dairy and Egg Products
13       1008                        Cheese, caraway  Dairy and Egg Products
14       1008                        Cheese, caraway  Dairy and Egg Products
15       1008                        Cheese, caraway  Dairy and Egg Products
16       1008                        Cheese, caraway  Dairy and Egg Products
17       1008                        Cheese, caraway  Dairy and Egg Products
18       1008                        Cheese, caraway  Dairy and Egg Products
19       1008                        Cheese, caraway  Dairy and Egg Products
20       1008                        Cheese, caraway  Dairy and Egg Products
21       1008                        Cheese, caraway  Dairy and Egg Products
22       1008                        Cheese, caraway  Dairy and Egg Products
23       1008                        Cheese, caraway  Dairy and Egg Products
24       1008                        Cheese, caraway  Dairy and Egg Products
25       1008                        Cheese, caraway  Dairy and Egg Products
26       1008                        Cheese, caraway  Dairy and Egg Products
27       1008                        Cheese, caraway  Dairy and Egg Products
28       1008                        Cheese, caraway  Dairy and Egg Products
29       1008                        Cheese, caraway  Dairy and Egg Products
...       ...                                    ...                     ...
375146  43546  Babyfood, banana no tapioca, strained              Baby Foods
375147  43546  Babyfood, banana no tapioca, strained              Baby Foods
375148  43546  Babyfood, banana no tapioca, strained              Baby Foods
375149  43546  Babyfood, banana no tapioca, strained              Baby Foods
375150  43546  Babyfood, banana no tapioca, strained              Baby Foods
375151  43546  Babyfood, banana no tapioca, strained              Baby Foods
375152  43546  Babyfood, banana no tapioca, strained              Baby Foods
375153  43546  Babyfood, banana no tapioca, strained              Baby Foods
375154  43546  Babyfood, banana no tapioca, strained              Baby Foods
375155  43546  Babyfood, banana no tapioca, strained              Baby Foods
375156  43546  Babyfood, banana no tapioca, strained              Baby Foods
375157  43546  Babyfood, banana no tapioca, strained              Baby Foods
375158  43546  Babyfood, banana no tapioca, strained              Baby Foods
375159  43546  Babyfood, banana no tapioca, strained              Baby Foods
375160  43546  Babyfood, banana no tapioca, strained              Baby Foods
375161  43546  Babyfood, banana no tapioca, strained              Baby Foods
375162  43546  Babyfood, banana no tapioca, strained              Baby Foods
375163  43546  Babyfood, banana no tapioca, strained              Baby Foods
375164  43546  Babyfood, banana no tapioca, strained              Baby Foods
375165  43546  Babyfood, banana no tapioca, strained              Baby Foods
375166  43546  Babyfood, banana no tapioca, strained              Baby Foods
375167  43546  Babyfood, banana no tapioca, strained              Baby Foods
375168  43546  Babyfood, banana no tapioca, strained              Baby Foods
375169  43546  Babyfood, banana no tapioca, strained              Baby Foods
375170  43546  Babyfood, banana no tapioca, strained              Baby Foods
375171  43546  Babyfood, banana no tapioca, strained              Baby Foods
375172  43546  Babyfood, banana no tapioca, strained              Baby Foods
375173  43546  Babyfood, banana no tapioca, strained              Baby Foods
375174  43546  Babyfood, banana no tapioca, strained              Baby Foods
375175  43546  Babyfood, banana no tapioca, strained              Baby Foods

manufacturer
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
...             ...
375146         None
375147         None
375148         None
375149         None
375150         None
375151         None
375152         None
375153         None
375154         None
375155         None
375156         None
375157         None
375158         None
375159         None
375160         None
375161         None
375162         None
375163         None
375164         None
375165         None
375166         None
375167         None
375168         None
375169         None
375170         None
375171         None
375172         None
375173         None
375174         None
375175         None

[375176 rows x 8 columns]
```
```ndata.loc[30000]
```
```nutrient                                       Glycine
nutgroup                                   Amino Acids
units                                                g
value                                             0.04
id                                                6158
food            Soup, tomato bisque, canned, condensed
fgroup                      Soups, Sauces, and Gravies
manufacturer
Name: 30000, dtype: object
```
```result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)
print(result)
```
```nutrient          fgroup
Vegetables and Vegetable Products     2.1800
Alanine           Baby Foods                            0.0850
Baked Products                        0.2480
Beef Products                         1.5500
Beverages                             0.0030
Breakfast Cereals                     0.3110
Cereal Grains and Pasta               0.3730
Dairy and Egg Products                0.2710
Ethnic Foods                          1.2900
Fast Foods                            0.5140
Fats and Oils                         0.0000
Finfish and Shellfish Products        1.2180
Fruits and Fruit Juices               0.0270
Lamb, Veal, and Game Products         1.4080
Legumes and Legume Products           0.4100
Meals, Entrees, and Sidedishes        0.3270
Nut and Seed Products                 0.7345
Pork Products                         1.3070
Poultry Products                      1.3940
Restaurant Foods                      0.4650
Sausages and Luncheon Meats           0.9420
Snacks                                0.4335
Soups, Sauces, and Gravies            0.0650
Spices and Herbs                      0.5550
Sweets                                0.1020
Vegetables and Vegetable Products     0.0840
Alcohol, ethyl    Baby Foods                            0.0000
Baked Products                        0.0000
Beef Products                         0.0000
...
Water             Snacks                                3.5200
Soups, Sauces, and Gravies           85.9000
Spices and Herbs                     43.6700
Sweets                                9.0500
Vegetables and Vegetable Products    89.1950
Zinc, Zn          Baby Foods                            0.5900
Baked Products                        0.6600
Beef Products                         5.3900
Beverages                             0.0400
Breakfast Cereals                     2.8850
Cereal Grains and Pasta               1.0900
Dairy and Egg Products                1.3900
Ethnic Foods                          1.0450
Fast Foods                            1.2500
Fats and Oils                         0.0200
Finfish and Shellfish Products        0.6700
Fruits and Fruit Juices               0.1000
Lamb, Veal, and Game Products         3.9400
Legumes and Legume Products           1.1400
Meals, Entrees, and Sidedishes        0.6300
Nut and Seed Products                 3.2900
Pork Products                         2.3200
Poultry Products                      2.5000
Restaurant Foods                      0.8000
Sausages and Luncheon Meats           2.1300
Snacks                                1.4700
Soups, Sauces, and Gravies            0.2000
Spices and Herbs                      2.7500
Sweets                                0.3600
Vegetables and Vegetable Products     0.3300
Name: value, Length: 2246, dtype: float64
```
```# result['Zinc, Zn'].order().plot(kind='barh') #AttributeError: 'Series' object has no attribute 'order'
result['Zinc, Zn'].sort_values().plot(kind='barh')
```
```<matplotlib.axes._subplots.AxesSubplot at 0x2156ce3c748>
```
```by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])

get_maximum = lambda x: x.xs(x.value.idxmax())
get_minimum = lambda x: x.xs(x.value.idxmin())

max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]
print(max_foods)
```
```                                                 value  \
nutgroup    nutrient
Amino Acids Alanine                              8.009
Arginine                             7.436
Aspartic acid                       10.203
Cystine                              1.307
Glutamic acid                       17.452
Glycine                             19.049
Histidine                            2.999
Hydroxyproline                       0.803
Isoleucine                           4.300
Leucine                              7.200
Lysine                               6.690
Methionine                           1.859
Phenylalanine                        4.600
Proline                             12.295
Serine                               4.600
Threonine                            3.300
Tryptophan                           1.600
Tyrosine                             3.300
Valine                               4.500
Carbohydrate, by difference        100.000
Fiber, total dietary                79.000
Protein                             88.320
Sugars, total                       99.800
Total lipid (fat)                  100.000
Water                              100.000
Elements    Calcium, Ca                       7364.000
Copper, Cu                          15.050
Fluoride, F                        584.000
Iron, Fe                            87.470
...                                                ...
Vitamins    Cryptoxanthin, beta               6186.000
Dihydrophylloquinone               103.800
Folate, DFE                       2630.000
Folate, food                      2340.000
Folate, total                     2340.000
Folic acid                        1538.000
Lutein + zeaxanthin              39550.000
Lycopene                         46260.000
Menaquinone-4                       33.200
Niacin                              97.000
Pantothenic acid                    35.000
Retinol                          30000.000
Riboflavin                          14.300
Thiamin                             20.000
Tocopherol, beta                     6.490
Tocopherol, delta                   30.880
Tocopherol, gamma                  100.880
Vitamin A, IU                   100000.000
Vitamin A, RAE                   30000.000
Vitamin B-12                        98.890
Vitamin B-6                         12.000
Vitamin C, total ascorbic acid    2400.000
Vitamin D                        10000.000
Vitamin D (D2 + D3)                250.000
Vitamin D2 (ergocalciferol)         28.100
Vitamin D3 (cholecalciferol)        27.400
Vitamin E (alpha-tocopherol)       149.400
Vitamin K (phylloquinone)         1714.500

food
nutgroup    nutrient
Amino Acids Alanine                                         Gelatins, dry powder, unsweetened
Arginine                                             Seeds, sesame flour, low-fat
Aspartic acid                                                 Soy protein isolate
Cystine                              Seeds, cottonseed flour, low fat (glandless)
Glutamic acid                                                 Soy protein isolate
Glycine                                         Gelatins, dry powder, unsweetened
Histidine                              Whale, beluga, meat, dried (Alaska Native)
Hydroxyproline                  KENTUCKY FRIED CHICKEN, Fried Chicken, ORIGINA...
Isoleucine                      Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Leucine                         Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Lysine                          Seal, bearded (Oogruk), meat, dried (Alaska Na...
Methionine                                  Fish, cod, Atlantic, dried and salted
Phenylalanine                   Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Proline                                         Gelatins, dry powder, unsweetened
Serine                          Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Threonine                       Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Tryptophan                       Sea lion, Steller, meat with fat (Alaska Native)
Tyrosine                        Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Valine                          Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Composition Adjusted Protein                           Baking chocolate, unsweetened, squares
Carbohydrate, by difference           Sweeteners, tabletop, fructose, dry, powder
Fiber, total dietary                                             Corn bran, crude
Protein                         Soy protein isolate, potassium type, crude pro...
Sugars, total                                                  Sugars, granulated
Total lipid (fat)                                                 Oil, wheat germ
Water                                               Water, bottled, POLAND SPRING
Elements    Calcium, Ca                     Leavening agents, baking powder, double-acting...
Copper, Cu                      Veal, variety meats and by-products, liver, co...
Fluoride, F                     Tea, instant, sweetened with sugar, lemon-flav...
Iron, Fe                            Salad dressing, russian dressing, low calorie
...                                                                                       ...
Vitamins    Cryptoxanthin, beta                                               Spices, paprika
Dihydrophylloquinone            Margarine, 80% fat, stick, includes regular an...
Folate, DFE                            Cereals ready-to-eat, QUAKER, CAP'N CRUNCH
Folate, food                         Leavening agents, yeast, baker's, active dry
Folate, total                        Leavening agents, yeast, baker's, active dry
Folic acid                             Cereals ready-to-eat, QUAKER, CAP'N CRUNCH
Lutein + zeaxanthin                                                     Kale, raw
Lycopene                                                            Tomato powder
Menaquinone-4                   Chicken, broilers or fryers, drumstick, meat a...
Pantothenic acid                Cereals ready-to-eat, KELLOGG, KELLOGG'S Compl...
Retinol                                                       Fish oil, cod liver
Thiamin                         MORNINGSTAR FARMS Hot and Spicy Veggie Sausage...
Tocopherol, beta                Yellow pond lily, Wocas, dried seeds (Pacific ...
Tocopherol, delta                 Oil, cooking and salad, ENOVA, 80% diglycerides
Tocopherol, gamma                 Oil, cooking and salad, ENOVA, 80% diglycerides
Vitamin A, IU                                                 Fish oil, cod liver
Vitamin A, RAE                                                Fish oil, cod liver
Vitamin B-12                    Mollusks, clam, mixed species, cooked, moist heat
Vitamin B-6                     Cereals ready-to-eat, KELLOGG, KELLOGG'S ALL-B...
Vitamin C, total ascorbic acid  Orange-flavor drink, breakfast type, low calor...
Vitamin D                                                     Fish oil, cod liver
Vitamin D (D2 + D3)                                           Fish oil, cod liver
Vitamin D2 (ergocalciferol)                               Mushrooms, maitake, raw
Vitamin D3 (cholecalciferol)                        Fish, halibut, Greenland, raw
Vitamin E (alpha-tocopherol)                                      Oil, wheat germ
Vitamin K (phylloquinone)                                    Spices, sage, ground

[94 rows x 2 columns]
```
```max_foods.loc['Amino Acids']['food']
```
```nutrient
Alanine                           Gelatins, dry powder, unsweetened
Arginine                               Seeds, sesame flour, low-fat
Aspartic acid                                   Soy protein isolate
Cystine                Seeds, cottonseed flour, low fat (glandless)
Glutamic acid                                   Soy protein isolate
Glycine                           Gelatins, dry powder, unsweetened
Histidine                Whale, beluga, meat, dried (Alaska Native)
Hydroxyproline    KENTUCKY FRIED CHICKEN, Fried Chicken, ORIGINA...
Isoleucine        Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Leucine           Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Lysine            Seal, bearded (Oogruk), meat, dried (Alaska Na...
Methionine                    Fish, cod, Atlantic, dried and salted
Phenylalanine     Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Proline                           Gelatins, dry powder, unsweetened
Serine            Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Threonine         Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Tryptophan         Sea lion, Steller, meat with fat (Alaska Native)
Tyrosine          Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Valine            Soy protein isolate, PROTEIN TECHNOLOGIES INTE...
Name: food, dtype: object
```

Reference: Python Data Analysis Course