Data Analysis Using Python

Chapter 7: Data Cleaning and Preparation

7.1 Handling Missing Values

  • All descriptive statistics on pandas objects exclude missing values by default.
  • For numeric data, pandas uses the floating-point value NaN (Not a Number) to represent missing values.
  • Following a convention borrowed from the R language, pandas also refers to missing values as NA, which stands for "not available".
string_data = pd.Series(['aardvark','artichoke',np.nan,'avocado'])
string_data
0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object
string_data.isnull()
0    False
1    False
2     True
3    False
dtype: bool
#When cleaning data for analysis, it is often important to analyze the missing data itself to identify bias introduced by data collection problems or by data loss.
#Python's built-in None value is also treated as NA in object arrays
string_data[0] = None
string_data.isnull()
0     True
1    False
2     True
3    False
dtype: bool
  • NA handling methods

Method      Description
dropna      Filters axis labels based on whether values for each label have missing data, with varying thresholds for how much missing data to tolerate
fillna      Fills in missing data with some value or using an interpolation method such as 'ffill' or 'bfill'
isnull      Returns boolean values indicating which values are missing
notnull     The negation of isnull

7.1.1 Filtering Missing Values

#Calling dropna on a Series returns the Series with only the non-null data and index values
from numpy import nan as  NA
data = pd.Series([1,NA,3.5,NA,7])
data
0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64
#dropna deletes rows with missing values by default
data.dropna()
0    1.0
2    3.5
4    7.0
dtype: float64
data[data.notnull()]
0    1.0
2    3.5
4    7.0
dtype: float64
data = pd.DataFrame([[1,6.5,3],[1,NA,NA],[NA,NA,NA],[NA,6.5,3]])
data
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
#Passing how='all' drops only the rows that are all NA
data.dropna(how='all')
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0
#If you want to delete columns in the same way, pass in the parameter axis=1
data[4] = NA
data
     0    1    2    4
0  1.0  6.5  3.0  NaN
1  1.0  NaN  NaN  NaN
2  NaN  NaN  NaN  NaN
3  NaN  6.5  3.0  NaN
data.dropna(axis=1,how = 'all')
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
df = pd.DataFrame(np.random.randn(7,3))
df
          0         1         2
0 -0.100288  0.117081  0.629897
1  0.145224  0.827820 -0.197561
2 -1.372610 -0.521075  0.783224
3 -0.679339  0.355698 -1.283404
4 -1.587708  0.254616  0.149215
5 -0.323276 -0.393636 -1.828212
6 -0.639610 -1.677821  1.618943
df.iloc[:4,1] = NA
df.iloc[:2,2] = NA
df
          0         1         2
0 -0.100288       NaN       NaN
1  0.145224       NaN       NaN
2 -1.372610       NaN  0.783224
3 -0.679339       NaN -1.283404
4 -1.587708  0.254616  0.149215
5 -0.323276 -0.393636 -1.828212
6 -0.639610 -1.677821  1.618943
df.dropna()
          0         1         2
4 -1.587708  0.254616  0.149215
5 -0.323276 -0.393636 -1.828212
6 -0.639610 -1.677821  1.618943
#Suppose you only want to keep rows containing at least a certain number of observations. You can indicate this with the thresh argument
#thresh=n keeps only the rows that have at least n non-NA values
df.dropna(thresh = 2)
          0         1         2
2 -1.372610       NaN  0.783224
3 -0.679339       NaN -1.283404
4 -1.587708  0.254616  0.149215
5 -0.323276 -0.393636 -1.828212
6 -0.639610 -1.677821  1.618943

7.1.2 Filling In Missing Values

  • The fillna method is the main tool for filling in missing values. When calling fillna, you can pass a constant to substitute for the missing values.
  • fillna function arguments

Parameter   Description
value       Scalar value or dictionary-like object to use to fill missing values
method      Interpolation method; defaults to 'ffill' if no other arguments are given
axis        Axis to fill on; default axis=0
inplace     Modify the calling object without producing a copy
limit       For forward and backward filling, the maximum number of consecutive periods to fill
df.fillna(0)
          0         1         2
0 -0.100288  0.000000  0.000000
1  0.145224  0.000000  0.000000
2 -1.372610  0.000000  0.783224
3 -0.679339  0.000000 -1.283404
4 -1.587708  0.254616  0.149215
5 -0.323276 -0.393636 -1.828212
6 -0.639610 -1.677821  1.618943
#Using the dictionary when calling fillna, you can set different fill values for different columns
df.fillna({1:0.5,2:0})
          0         1         2
0 -0.100288  0.500000  0.000000
1  0.145224  0.500000  0.000000
2 -1.372610  0.500000  0.783224
3 -0.679339  0.500000 -1.283404
4 -1.587708  0.254616  0.149215
5 -0.323276 -0.393636 -1.828212
6 -0.639610 -1.677821  1.618943
#fillna returns a new object, but you can also modify an existing object
_ = df.fillna(0,inplace = True)
df
          0         1         2
0 -0.100288  0.000000  0.000000
1  0.145224  0.000000  0.000000
2 -1.372610  0.000000  0.783224
3 -0.679339  0.000000 -1.283404
4 -1.587708  0.254616  0.149215
5 -0.323276 -0.393636 -1.828212
6 -0.639610 -1.677821  1.618943
#The same interpolation method used to rebuild the index can also be used for fillna
df = pd.DataFrame(np.random.randn(6,3))
df
          0         1         2
0 -0.428405  0.199383  0.354342
1  0.019782  0.921389  0.534736
2 -0.583158  0.390681 -2.386976
3 -0.076475 -0.034995  1.635065
4  0.528814  0.711717  0.696243
5 -0.193577  0.162206 -0.520191
df.iloc[2:,1] = NA
df
          0         1         2
0 -0.428405  0.199383  0.354342
1  0.019782  0.921389  0.534736
2 -0.583158       NaN -2.386976
3 -0.076475       NaN  1.635065
4  0.528814       NaN  0.696243
5 -0.193577       NaN -0.520191
df.iloc[4:,2] = NA
df
          0         1         2
0 -0.428405  0.199383  0.354342
1  0.019782  0.921389  0.534736
2 -0.583158       NaN -2.386976
3 -0.076475       NaN  1.635065
4  0.528814       NaN       NaN
5 -0.193577       NaN       NaN
df.fillna(method='ffill')
          0         1         2
0 -0.428405  0.199383  0.354342
1  0.019782  0.921389  0.534736
2 -0.583158  0.921389 -2.386976
3 -0.076475  0.921389  1.635065
4  0.528814  0.921389  1.635065
5 -0.193577  0.921389  1.635065
df.fillna(method='backfill')
          0         1         2
0 -0.428405  0.199383  0.354342
1  0.019782  0.921389  0.534736
2 -0.583158       NaN -2.386976
3 -0.076475       NaN  1.635065
4  0.528814       NaN       NaN
5 -0.193577       NaN       NaN
df.fillna(method='ffill',limit=2)
          0         1         2
0 -0.428405  0.199383  0.354342
1  0.019782  0.921389  0.534736
2 -0.583158  0.921389 -2.386976
3 -0.076475  0.921389  1.635065
4  0.528814       NaN  1.635065
5 -0.193577       NaN  1.635065
data = pd.Series([5,NA,3,NA,7])
data.fillna(data.mean())
0    5.0
1    5.0
2    3.0
3    5.0
4    7.0
dtype: float64
data.mean()
5.0

7.2 Data Transformation

7.2.1 Removing Duplicates

data = pd.DataFrame({'k1':['one','two']*3+['two'],
                    'k2':[1,1,2,3,3,4,4,]})
data
k1k2
0one1
1two1
2one2
3two3
4one3
5two4
6two4
#The duplicated method of DataFrame returns a Boolean Series,
#This Series reflects whether there are duplicates in each row (the same as the rows that have occurred before)
data.duplicated()
0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool
#drop_duplicates returns a DataFrame containing only the rows where the duplicated array is False
#i.e., duplicate rows are dropped and only unique rows are kept
#These methods consider all of the columns by default
data.drop_duplicates()
k1k2
0one1
1two1
2one2
3two3
4one3
5two4
data['v1'] = range(7)
data
k1k2v1
0one10
1two11
2one22
3two33
4one34
5two45
6two46
data.drop_duplicates(['k1'])
k1k2v1
0one10
1two11
#duplicated and drop_duplicates keep the first observed value combination by default. Passing keep='last' returns the last one
data.drop_duplicates(['k1','k2'],keep = 'last')
k1k2v1
0one10
1two11
2one22
3two33
4one34
6two46

7.2.2 Transforming Data Using a Function or Mapping

data = pd.DataFrame({'food':['bacon','pulled pork','bacon','pastrami','corned beef',
                             'bacon','pastrami','honey ham','nova lox'],
                     'ounces':[4.0,3.0,12.0,6.0,7.5,8.0,3.0,5.0,6.0]})

data
          food  ounces
0        bacon     4.0
1  pulled pork     3.0
2        bacon    12.0
3     pastrami     6.0
4  corned beef     7.5
5        bacon     8.0
6     pastrami     3.0
7    honey ham     5.0
8     nova lox     6.0
meat_to_animal = {
'bacon':'pig',
'pulled pork':'pig',  
'pastrami':'cow',
'corned beef':'cow',  
'honey ham':'pig', 
'nova lox':'salmon',
}
#The map method of Series receives a function or a dictionary object containing a mapping relationship
lowercased = data['food'].str.lower()
lowercased
0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object
data['animal'] = lowercased.map(meat_to_animal)
data
          food  ounces  animal
0        bacon     4.0     pig
1  pulled pork     3.0     pig
2        bacon    12.0     pig
3     pastrami     6.0     cow
4  corned beef     7.5     cow
5        bacon     8.0     pig
6     pastrami     3.0     cow
7    honey ham     5.0     pig
8     nova lox     6.0  salmon
#Using map is a convenient way to perform element by element conversion and other cleaning related operations
data['food'].map(lambda x :meat_to_animal[x.lower()])
0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

7.2.3 Replacing Values

  • The data.replace method is distinct from data.str.replace, which performs element-wise string substitution.
data = pd.Series([1,-999,2,-999,-1000,3])
data
0       1
1    -999
2       2
3    -999
4   -1000
5       3
dtype: int64
data.replace(-999,np.nan)
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
#If you want to replace multiple values at once, you can pass a list of values followed by the substitute value
data.replace([-999,-1000],np.nan)
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
#To use a different replacement for each value, pass a list of substitute values
data.replace([-999,-1000],[np.nan,0])
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
#Parameters can also be passed through a dictionary
data.replace({-999:np.nan,-1000:0})
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
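As a side note (a small sketch, not from the original notes), replace can also treat string patterns as regular expressions when you pass regex=True:
s = pd.Series(['foo 1', 'bar 2', 'baz 3'])
s.replace(r'\s+\d', '', regex=True)
#the trailing " <digit>" is stripped from each element, leaving 'foo', 'bar', 'baz'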

7.2.4 Renaming Axis Indexes

data = pd.DataFrame(np.arange(12).reshape(3,4),
                   index = ['Ohio','Colorado','New York'],
                   columns = ['one','two','three','four'])
data
onetwothreefour
Ohio0123
Colorado4567
New York891011
transform = lambda x :x[:4].upper()
data.index.map(transform)
Index(['OHIO', 'COLO', 'NEW '], dtype='object')
data.index = data.index.map(transform)
data
onetwothreefour
OHIO0123
COLO4567
NEW891011
data.rename(index = str.title,columns = str.upper)
ONETWOTHREEFOUR
Ohio0123
Colo4567
New891011
#rename can be used in conjunction with dictionary objects to provide new values for a subset of axis labels
data.rename(index = {'OHIO':'INDIANA'},
           columns = {'three':'peekaboo'})
onetwopeekaboofour
INDIANA0123
COLO4567
NEW891011

7.2.5 Discretization and Binning

ages = [20,22,24,27,21,23,37,31,61,45,41,32]
bins = [18,25,35,60,100]
cats = pd.cut(ages,bins)
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
cats.codes
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
cats.categories
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')
#pd.value_counts(cats) counts the number of values in each bin of the pandas.cut result
pd.value_counts(cats)
(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64
#Consistent with the mathematical symbol of the interval, parentheses indicate that the edge is open and square brackets indicate that it is closed (including edges).
#You can change which side is closed by passing right=False
pd.cut(ages,[18,26,36,61,100],right = False)
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
#You can also pass in a custom box name by passing a list or array to the labels option
group_names = ['youth','youngadult','middleaged','senior']
a = pd.cut(ages,bins,labels = group_names)
pd.value_counts(a)
youth         5
youngadult    3
middleaged    3
senior        1
dtype: int64
#If you pass an integer number of bins to cut instead of explicit bin edges, pandas computes equal-length bins based on the minimum and maximum values in the data
#The option precision=2 limits the decimal precision to two digits
data = np.random.rand(20)
pd.cut(data,4,precision=2)
[(0.51, 0.74], (0.29, 0.51], (0.74, 0.97], (0.29, 0.51], (0.06, 0.29], ..., (0.06, 0.29], (0.29, 0.51], (0.74, 0.97], (0.51, 0.74], (0.74, 0.97]]
Length: 20
Categories (4, interval[float64]): [(0.06, 0.29] < (0.29, 0.51] < (0.51, 0.74] < (0.74, 0.97]]
#qcut is a closely related function that bins the data based on sample quantiles. Depending on the distribution of the data, cut usually does not give each bin the same number of data points
#Because qcut uses sample quantiles, the bins it produces each contain roughly the same number of points
data = np.random.randn(1000)
cats = pd.qcut(data,4)#Cut into 4 parts
cats
[(-0.00707, 0.65], (-0.00707, 0.65], (-2.936, -0.626], (-0.626, -0.00707], (-2.936, -0.626], ..., (-0.626, -0.00707], (-0.626, -0.00707], (-0.626, -0.00707], (-0.626, -0.00707], (-0.00707, 0.65]]
Length: 1000
Categories (4, interval[float64]): [(-2.936, -0.626] < (-0.626, -0.00707] < (-0.00707, 0.65] < (0.65, 3.139]]
pd.value_counts(cats)
(-2.936, -0.626]      250
(-0.626, -0.00707]    250
(-0.00707, 0.65]      250
(0.65, 3.139]         250
dtype: int64
#Similar to cut, you can pass your own quantiles to qcut (numbers between 0 and 1, inclusive); with cut, as below, the numbers are treated as absolute bin edges, so values outside [0, 1] become NaN
pd.cut(data,[0,0.1,0.5,0.9,1])
[(0.5, 0.9], (0.1, 0.5], NaN, NaN, NaN, ..., NaN, NaN, NaN, NaN, (0.1, 0.5]]
Length: 1000
Categories (4, interval[float64]): [(0.0, 0.1] < (0.1, 0.5] < (0.5, 0.9] < (0.9, 1.0]]
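As the comment above suggests, qcut (rather than cut) accepts your own quantile edges between 0 and 1; a minimal sketch on the same data array:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.0])
#bins bounded by the 10th, 50th and 90th percentiles of data, so every value falls into a bin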

7.2.6 Detecting and Filtering Outliers

data = pd.DataFrame(np.random.randn(1000,4))
data.describe()
                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean      0.013343     0.030142     0.020312     0.042330
std       1.012528     0.984443     0.999869     0.982124
min      -2.942920    -3.799121    -3.412855    -2.632107
25%      -0.668303    -0.629645    -0.654843    -0.643005
50%       0.010349     0.040064     0.026197     0.028003
75%       0.701525     0.679371     0.706170     0.714993
max       3.274496     3.998493     3.264216     2.907744
#Suppose you want to find the values in one of the columns whose absolute value exceeds 3
col = data[2]
col[np.abs(col) > 3]
91    -3.044972
711    3.264216
858   -3.412855
Name: 2, dtype: float64
data[(np.abs(data)>3).any(1)]
            0         1         2         3
91  -0.341046 -0.555910 -3.044972  0.474512
325  2.233400 -3.027404  0.845704  1.441757
332 -0.460361 -3.799121 -0.312931  0.478548
457  0.011004  3.998493  0.977419  0.577620
711 -0.603762 -1.650901  3.264216 -0.803395
746  1.455624 -3.178085 -0.387140  0.859193
858 -2.127923  0.163924 -3.412855 -0.073186
946  3.274496 -0.699596 -1.016879  0.358252
data[np.abs(data)>3] = np.sign(data)*3
data.describe()
                 0            1            2            3
count  1000.000000  1000.000000  1000.000000  1000.000000
mean      0.013069     0.030148     0.020506     0.042330
std       1.011680     0.977459     0.997573     0.982124
min      -2.942920    -3.000000    -3.000000    -2.632107
25%      -0.668303    -0.629645    -0.654843    -0.643005
50%       0.010349     0.040064     0.026197     0.028003
75%       0.701525     0.679371     0.706170     0.714993
max       3.000000     3.000000     3.000000     2.907744
data
0123
00.9972850.352539-0.158277-0.069519
1-1.144523-0.173312-0.6512270.686972
20.6501310.271325-0.304344-0.281217
30.527442-2.0237650.827982-1.855424
4-0.578451-0.949705-0.582701-1.725697
...............
9950.4943110.528862-0.1910970.118121
996-0.5821541.251247-1.622055-0.436563
9970.687732-1.670059-0.272708-0.369290
998-0.4432300.984728-0.283506-1.473420
999-0.276277-0.5972561.269391-0.704337

1000 rows × 4 columns

#The statement np.sign(data) produces 1 and -1 values depending on whether the values in data are positive or negative
np.sign(data).head()
     0    1    2    3
0  1.0  1.0 -1.0 -1.0
1 -1.0 -1.0 -1.0  1.0
2  1.0  1.0 -1.0 -1.0
3  1.0 -1.0  1.0 -1.0
4 -1.0 -1.0 -1.0 -1.0
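A related sketch (not from the original notes): the same capping of values to the interval [-3, 3] can be done in one step with the clip method:
capped = data.clip(lower=-3, upper=3)
capped.describe()
#max and min are now bounded by 3 and -3, matching the np.sign-based assignment above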

7.2.7 Permutation and Random Sampling

  • Using numpy.random.permutation makes it easy to permute (randomly reorder) a Series or the rows of a DataFrame.
sampler = np.random.permutation(5)
sampler
array([3, 2, 0, 4, 1])
df = pd.DataFrame(np.arange(5*4).reshape(5,4))
df
0123
00123
14567
2891011
312131415
416171819
#Integer arrays can be used in iloc based indexes or equivalent take functions
df.take(sampler)
0123
312131415
2891011
00123
416171819
14567
#To select a random subset without replacement, you can use the sample method of Series and DataFrame
df.sample(n=3)
0123
00123
416171819
312131415
#To generate a sample with replacement (allowing repeated choices), pass replace=True to sample
choices = pd.Series([5,6,-1,6,4])
draws = choices.sample(n=10,replace = True)
draws
2   -1
0    5
2   -1
3    6
0    5
1    6
1    6
4    4
3    6
1    6
dtype: int64
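A small supplementary sketch: take also accepts axis=1 for permuting columns, and sample(frac=1) draws a full-length sample without replacement, i.e. shuffles the rows:
col_sampler = np.random.permutation(4)   #df from above has 4 columns
df.take(col_sampler, axis=1)             #columns of df in randomly permuted order
df.sample(frac=1)                        #rows of df in randomly shuffled order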

7.2.8 Computing Indicator/Dummy Variables

  • Converting a categorical variable into a "dummy" or "indicator" matrix is another transformation used for statistical modeling or machine learning.
  • pandas has a get_dummies function for doing this.
df = pd.DataFrame({'key':['b','b','a','c','a','b'],
                  'data1':range(6)})
df
keydata1
0b0
1b1
2a2
3c3
4a4
5b5
pd.get_dummies(df['key'])
abc
0010
1010
2100
3001
4100
5010
#In some cases, you may want to add a prefix to the columns of the indicator DataFrame, which can then be merged with the other data.
#The get_dummies method has a prefix argument for doing this
dummies = pd.get_dummies(df['key'],prefix='key')
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy
data1key_akey_bkey_c
00010
11010
22100
33001
44100
55010
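As a brief aside (a sketch, not in the original notes), get_dummies can also be applied to a whole DataFrame; the columns argument selects which columns to encode and prefixes them with the column name automatically:
pd.get_dummies(df, columns=['key'])
#keeps data1 and adds key_a, key_b, key_c indicator columns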
mnames = ['movie_id','title','genres']
movies = pd.read_table(r'D:\PythonFlie\python\utilize python Conduct data analysis(Book notes)\pydata-book-2nd-edition\datasets\movielens\movies.dat'
                       ,sep='::',header=None,names = mnames)
<ipython-input-188-960ac40c2eea>:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  movies = pd.read_table(r'D:\PythonFlie\python\utilize python Conduct data analysis(Book notes)\pydata-book-2nd-edition\datasets\movielens\movies.dat'
movies[::10]
movie_idtitlegenres
01Toy Story (1995)Animation|Children's|Comedy
1011American President, The (1995)Comedy|Drama|Romance
2021Get Shorty (1995)Action|Comedy|Drama
3031Dangerous Minds (1995)Drama
4041Richard III (1995)Drama|War
............
38403910Dancer in the Dark (2000)Drama|Musical
38503920Faraway, So Close (In Weiter Ferne, So Nah!) (...Drama|Fantasy
38603930Creature From the Black Lagoon, The (1954)Horror
38703940Slumber Party Massacre III, The (1990)Horror
38803950Tigerland (2000)Drama

389 rows × 3 columns

#Adding index variables for each movie genre requires some data processing. First, we extract a list of all the different genres from the dataset
all_genres = []
for x in movies.genres:
    all_genres.extend(x.split('|'))
genres = pd.unique(all_genres)
genres
array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)
zero_matrix = np.zeros((len(movies),len(genres)))
dummies = pd.DataFrame(zero_matrix,columns=genres)
zero_matrix
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])
dummies
AnimationChildren'sComedyAdventureFantasyRomanceDramaActionCrimeThrillerHorrorSci-FiDocumentaryWarMusicalMysteryFilm-NoirWestern
00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
10.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
30.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
40.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
.........................................................
38780.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
38790.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
38800.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
38810.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
38820.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0

3883 rows × 18 columns

gen = movies.genres[0]
gen.split('|')
['Animation', "Children's", 'Comedy']
dummies.columns.get_indexer(gen.split("|"))
array([0, 1, 2], dtype=int64)
for i,gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split("|"))
    dummies.iloc[i,indices] = 1
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]
movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                              1.0
Genre_Children's                             1.0
Genre_Comedy                                 1.0
Genre_Adventure                              0.0
Genre_Fantasy                                0.0
Genre_Romance                                0.0
Genre_Drama                                  0.0
Genre_Action                                 0.0
Genre_Crime                                  0.0
Genre_Thriller                               0.0
Genre_Horror                                 0.0
Genre_Sci-Fi                                 0.0
Genre_Documentary                            0.0
Genre_War                                    0.0
Genre_Musical                                0.0
Genre_Mystery                                0.0
Genre_Film-Noir                              0.0
Genre_Western                                0.0
Name: 0, dtype: object
#Combining get_dummies with a discretization function like cut is a useful technique for statistical applications
np.random.seed(12345)
values = np.random.rand(10)
values
array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])
bins = [0,0.2,0.4,0.6,0.8,1]
pd.get_dummies(pd.cut(values,bins))
(0.0, 0.2](0.2, 0.4](0.4, 0.6](0.6, 0.8](0.8, 1.0]
000001
101000
210000
301000
400100
500100
600001
700010
800010
900010

7.3 String Manipulation

7.3.1 String Object Methods

#A comma separated string can be split into multiple blocks using the split method:
val = 'a,b, guido'
val.split(',')
['a', 'b', ' guido']
#split is often used with strip to clear spaces (including line breaks)
pieces = [x.strip() for x in val.split(',')]
pieces
['a', 'b', 'guido']
#These substrings can be concatenated with two colon separators using addition
first,second,third = pieces
first+"::"+second+"::"+third
'a::b::guido'
#A faster and more Pythonic way is to pass a list or tuple to the join method of the string '::'
"::".join(pieces)
'a::b::guido'
#Using Python's in keyword is the best way to detect substrings, although index and find can do the same
'guido' in val
True
#Note that the difference between find and index is that index throws an exception when the string is not found (while find returns - 1)
val.index('guido')
5
val.find('guido')
5
#count returns the number of occurrences of a specific substring in the string

val.count('guido')
1
#replace replaces one pattern with another. It is also commonly used to pass in an empty string to delete a pattern
val.replace(',','::')
'a::b:: guido'
val.replace(',','')
'ab guido'
  • Python built-in string methods

Method                  Description
count                   Returns the number of non-overlapping occurrences of the substring in the string
endswith                Returns True if the string ends with the suffix
startswith              Returns True if the string starts with the prefix
join                    Uses the string as a delimiter for concatenating a sequence of other strings
index                   Returns the position of the first character of the first occurrence of the substring; raises ValueError if not found
find                    Returns the position of the first character of the first occurrence of the substring; like index, but returns -1 if not found
rfind                   Returns the position of the first character of the last occurrence of the substring; returns -1 if not found
replace                 Replaces occurrences of one string with another
strip, rstrip, lstrip   Trims whitespace, including newlines; equivalent to x.strip() (and rstrip, lstrip) for each element
split                   Breaks the string into a list of substrings using the passed delimiter
lower                   Converts alphabetic characters to lowercase
upper                   Converts alphabetic characters to uppercase
casefold                Converts characters to lowercase, converting any region-specific variable character combinations to a common comparable form
ljust, rjust            Left- or right-justifies the string; pads the opposite side with spaces (or some other fill character) to return a string with a minimum width

7.3.2 Regular Expressions

  • The re module functions fall into three categories: pattern matching, substitution, and splitting
  • The regular expression describing one or more whitespace characters is \s+
  • If you need to apply the same expression to many strings, it is recommended to create a reusable regular expression object with re.compile, which saves CPU cycles
  • To avoid unwanted escaping with \ in a regular expression, use raw string literals like r'C:\x' instead of the equivalent 'C:\\x'
  • Regular expression methods

Method      Description
findall     Returns all non-overlapping matches of the pattern in the string as a list
finditer    Like findall, but returns an iterator
match       Matches the pattern at the start of the string, optionally segmenting the match into groups; returns a match object if the pattern matches, otherwise None
search      Scans the string for a match of the pattern and returns the corresponding match object; unlike match, the match can be anywhere in the string, not just at the beginning
split       Breaks the string into pieces at each occurrence of the pattern
sub, subn   Replaces all (sub) or the first n (subn) occurrences of the pattern in the string with a replacement expression; use the symbols \1, \2, ... to refer to match group elements in the replacement string
import re
text = 'foo bar\t baz \tqux'
re.split('\s+',text)
['foo', 'bar', 'baz', 'qux']
#You can compile the regex yourself with re.compile, forming a reusable regular expression object
regex = re.compile('\s+')
regex.split(text)
['foo', 'bar', 'baz', 'qux']
#If you want to get a list of all the patterns that match the regular expression, you can use the findall method
regex.findall(text)
[' ', '\t ', ' \t']
text = """
Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9.%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
regex = re.compile(pattern,flags = re.IGNORECASE)
regex.findall(text)
['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']
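The method table above also lists finditer, which is not demonstrated in these notes; a minimal sketch using the same compiled regex:
for m in regex.finditer(text):
    print(m.group())
#prints each matched email address in turn, one per line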
#search returns the first matching email address in the text
m = regex.search(text)
m
<re.Match object; span=(6, 21), match='dave@google.com'>
text[m.start():m.end()]
'dave@google.com'
#regex.match matches only when the pattern appears at the beginning of the string. If it does not match, it returns None
print(regex.match(text))
None
#sub will return a new string, and the pattern in the original string will be replaced by a new string
print(regex.sub('ABC',text))
Dave ABC
Steve ABC
Rob ABC
Ryan ABC

#Suppose you want to find an e-mail address and divide each address into three parts: user name, domain name and domain name suffix.
#To do this, you can wrap patterns in parentheses
pattern = r'([A-Z0-9.%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern,flags = re.IGNORECASE)
m = regex.match('wesm@bright.net')
m.groups()
('wesm', 'bright', 'net')
#When patterns can be grouped, findall returns a list containing tuples
regex.findall(text)
[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]
print(regex.sub(r'Username:\1,Domain:\2,Suffix:\3',text))
Dave Username:dave,Domain:google,Suffix:com
Steve Username:steve,Domain:gmail,Suffix:com
Rob Username:rob,Domain:gmail,Suffix:com
Ryan Username:ryan,Domain:yahoo,Suffix:com
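A related sketch (not from the original notes): groups can also be named with the (?P<name>...) syntax, which makes the parts of a match easier to access:
pattern_named = r'(?P<username>[A-Z0-9.%+-]+)@(?P<domain>[A-Z0-9.-]+)\.(?P<suffix>[A-Z]{2,4})'
m = re.match(pattern_named, 'wesm@bright.net', flags=re.IGNORECASE)
m.groupdict()
#{'username': 'wesm', 'domain': 'bright', 'suffix': 'net'}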

7.3.3 Vectorized String Functions in pandas

  • Partial listing of vectorized string methods

Method      Description
cat         Concatenates strings element-wise with an optional delimiter
contains    Returns a boolean array indicating whether each string contains a pattern/regex
count       Counts occurrences of the pattern
extract     Uses a regular expression with groups to extract one or more strings from a Series of strings; the result is a DataFrame with one column per group
endswith    Equivalent to x.endswith(pattern) for each element
data = {'Dave':'dave@google.com','Steve':'steve@gmail.com','Rob':'rob@gmail.com','Ryan':np.nan}
data = pd.Series(data)
data
Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Ryan                 NaN
dtype: object
data.isnull()
Dave     False
Steve    False
Rob      False
Ryan      True
dtype: bool
#You can apply string and regular expression methods to each value with data.map (passing a lambda or other function),
#but it will fail on NA (null) values. Series has array-oriented methods for string operations that skip NA values; these are accessed through Series's str attribute
data.str.contains('gmail')
Dave     False
Steve     True
Rob       True
Ryan       NaN
dtype: object
#Regular expressions can also be used with any re module option
data.str.findall(pattern,flags=re.IGNORECASE)
Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Ryan                       NaN
dtype: object
#You can use str.get or index inside the str attribute
matches = data.str.match(pattern,flags=re.IGNORECASE)
matches
Dave     True
Steve    True
Rob      True
Ryan      NaN
dtype: object
#To access the elements in the embedded list, we can pass the index to any of these functions
matches.str.get(l)
---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

<ipython-input-245-8d76f9329d2a> in <module>
      1 #To access the elements in the embedded list, we can pass the index to any of these functions
----> 2 matches.str.get(l)


D:\Anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
   5459             or name in self._accessors
   5460         ):
-> 5461             return object.__getattribute__(self, name)
   5462         else:
   5463             if self._info_axis._can_hold_identifiers_and_holds_name(name):


D:\Anaconda3\lib\site-packages\pandas\core\accessor.py in __get__(self, obj, cls)
    178             # we're accessing the attribute of the class, i.e., Dataset.geo
    179             return self._accessor
--> 180         accessor_obj = self._accessor(obj)
    181         # Replace the property with the accessor object. Inspired by:
    182         # https://www.pydanny.com/cached-property.html


D:\Anaconda3\lib\site-packages\pandas\core\strings\accessor.py in __init__(self, data)
    152         from pandas.core.arrays.string_ import StringDtype
    153 
--> 154         self._inferred_dtype = self._validate(data)
    155         self._is_categorical = is_categorical_dtype(data.dtype)
    156         self._is_string = isinstance(data.dtype, StringDtype)


D:\Anaconda3\lib\site-packages\pandas\core\strings\accessor.py in _validate(data)
    215 
    216         if inferred_dtype not in allowed_types:
--> 217             raise AttributeError("Can only use .str accessor with string values!")
    218         return inferred_dtype
    219 


AttributeError: Can only use .str accessor with string values!
matches.str[0]
---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

<ipython-input-246-10bdd22fd8b2> in <module>
----> 1 matches.str[0]


D:\Anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
   5459             or name in self._accessors
   5460         ):
-> 5461             return object.__getattribute__(self, name)
   5462         else:
   5463             if self._info_axis._can_hold_identifiers_and_holds_name(name):


D:\Anaconda3\lib\site-packages\pandas\core\accessor.py in __get__(self, obj, cls)
    178             # we're accessing the attribute of the class, i.e., Dataset.geo
    179             return self._accessor
--> 180         accessor_obj = self._accessor(obj)
    181         # Replace the property with the accessor object. Inspired by:
    182         # https://www.pydanny.com/cached-property.html


D:\Anaconda3\lib\site-packages\pandas\core\strings\accessor.py in __init__(self, data)
    152         from pandas.core.arrays.string_ import StringDtype
    153 
--> 154         self._inferred_dtype = self._validate(data)
    155         self._is_categorical = is_categorical_dtype(data.dtype)
    156         self._is_string = isinstance(data.dtype, StringDtype)


D:\Anaconda3\lib\site-packages\pandas\core\strings\accessor.py in _validate(data)
    215 
    216         if inferred_dtype not in allowed_types:
--> 217             raise AttributeError("Can only use .str accessor with string values!")
    218         return inferred_dtype
    219 


AttributeError: Can only use .str accessor with string values!
#You can use the syntax similar to string slicing for vectorization slicing
data.str[:5]
Dave     dave@
Steve    steve
Rob      rob@g
Ryan       NaN
dtype: object
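The vectorized-method table above also mentions extract, which is not demonstrated here; a short sketch using the grouped email pattern defined earlier returns a DataFrame with one column per capture group:
data.str.extract(pattern, flags=re.IGNORECASE)
#one row per element of data, with columns 0, 1, 2 holding username, domain and suffix (all NaN for Ryan)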

Chapter 8: Data Wrangling: Join, Combine, and Reshape

8.1 Hierarchical Indexing

#What you see is a prettified view of a Series with a MultiIndex. The "gaps" in the index display mean "use the label directly above"
data = pd.Series(np.random.randn(9),
                index = [['a','a','a','b','b','c','c','d','d'],
                        [1,2,3,1,3,1,2,2,3]])
data
a  1    1.007189
   2   -1.296221
   3    0.274992
b  1    0.228913
   3    1.352917
c  1    0.886429
   2   -2.001637
d  2   -0.371843
   3    1.669025
dtype: float64
data.index
MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )
data['b']
1    0.228913
3    1.352917
dtype: float64
data['b':'c']
b  1    0.228913
   3    1.352917
c  1    0.886429
   2   -2.001637
dtype: float64
data.loc[['b','c']]
b  1    0.228913
   3    1.352917
c  1    0.886429
   2   -2.001637
dtype: float64
#It is also possible to select at the internal level
data.loc[:,3]
a    0.274992
b    1.352917
d    1.669025
dtype: float64
#Hierarchical indexing plays an important role in reshaping data and in group-based operations like forming a pivot table
#You can rearrange the data into a DataFrame using the unstack method
data.unstack()
          1         2         3
a  1.007189 -1.296221  0.274992
b  0.228913       NaN  1.352917
c  0.886429 -2.001637       NaN
d       NaN -0.371843  1.669025
#The reverse operation of unstack is stack
data.unstack().stack()
a  1    1.007189
   2   -1.296221
   3    0.274992
b  1    0.228913
   3    1.352917
c  1    0.886429
   2   -2.001637
d  2   -0.371843
   3    1.669025
dtype: float64
#In the DataFrame, each axis can have a hierarchical index
frame = pd.DataFrame(np.arange(12).reshape(4,3),
                    index = [['a','a','b','b'],[1,2,1,2]],
                    columns = [['ohio','ohio','colorado'],['green','red','green']])
frame
ohiocolorado
greenredgreen
a1012
2345
b1678
291011
frame.index.names = ['key1','key2']
frame
ohiocolorado
greenredgreen
key1key2
a1012
2345
b1678
291011
frame.columns.names = ['state','color']
frame
stateohiocolorado
colorgreenredgreen
key1key2
a1012
2345
b1678
291011
frame['ohio']
colorgreenred
key1key2
a101
234
b167
2910
#A MultiIndex object can be created and reused using its own constructor
pd.MultiIndex.from_arrays([['ohio','ohio','colorado'],['green','red','green']],
                      names = ['state','color'])
MultiIndex([(    'ohio', 'green'),
            (    'ohio',   'red'),
            ('colorado', 'green')],
           names=['state', 'color'])
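A small related sketch: MultiIndex.from_product builds the same kind of index from the Cartesian product of several iterables:
pd.MultiIndex.from_product([['ohio', 'colorado'], ['green', 'red']],
                           names=['state', 'color'])
#four tuples: ('ohio', 'green'), ('ohio', 'red'), ('colorado', 'green'), ('colorado', 'red')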

8.1.1 Reordering and Sorting Levels

#swaplevel receives two level numbers or names and returns a new object with the levels interchanged (but the data is otherwise unaltered)
frame.swaplevel('key1','key2')
stateohiocolorado
colorgreenredgreen
key2key1
1a012
2a345
1b678
2b91011
#sort_index sorts the data using only the values in a single level
#When swapping levels, it is common to also use sort_index so that the result is lexicographically sorted by the indicated level
frame.sort_index(level=1)
stateohiocolorado
colorgreenredgreen
key1key2
a1012
b1678
a2345
b291011
frame.sort_index(level=0)
stateohiocolorado
colorgreenredgreen
key1key2
a1012
2345
b1678
291011
frame.swaplevel(0,1).sort_index(level=0)
stateohiocolorado
colorgreenredgreen
key2key1
1a012
b678
2a345
b91011

8.1.2 Summary Statistics by Level

  • Many descriptive and summary statistics on DataFrame and Series have a level option, with which you can specify the level you want to aggregate by on a particular axis
frame.sum(level='key2')
stateohiocolorado
colorgreenredgreen
key2
16810
2121416
frame.sum(level='color',axis=1)
colorgreenred
key1key2
a121
284
b1147
22010
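Under the hood this uses pandas's groupby machinery; in recent pandas versions the level keyword of sum is deprecated, so an equivalent spelling (a sketch, version permitting) is:
frame.groupby(level='key2').sum()            #same result as frame.sum(level='key2')
frame.groupby(level='color', axis=1).sum()   #same result as frame.sum(level='color', axis=1)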

8.1.3 Indexing with a DataFrame's Columns

  • It is common to want to use one or more columns from a DataFrame as the row index; alternatively, you may wish to move the row index into the DataFrame's columns.
  • Different join types with the how argument

Option     Behavior
'inner'    Use only the key combinations observed in both tables
'left'     Use all key combinations found in the left table
'right'    Use all key combinations found in the right table
'outer'    Use the union of the key combinations observed in both tables
frame = pd.DataFrame({'a':range(7),'b':range(7,0,-1),
                     'c':['one','one','one','two','two','two','two'],
                     'd':[0,1,2,0,1,2,3]})
frame
abcd
007one0
116one1
225one2
334two0
443two1
552two2
661two3
#DataFrame's set_index function creates a new DataFrame using one or more of its columns as the index
frame2 = frame.set_index(['c','d'])
frame2
ab
cd
one007
116
225
two034
143
252
361
#By default, these columns are removed from the DataFrame, or you can leave them in the DataFrame
frame.set_index(['c','d'],drop = False)
abcd
cd
one007one0
116one1
225one2
two034two0
143two1
252two2
361two3
#reset_index is the inverse of set_index; the hierarchical index levels are moved back into columns
frame2.reset_index()
cdab
0one007
1one116
2one225
3two034
4two143
5two252
6two361

8.2 Combining and Merging Datasets

  • pandas.merge connects rows in DataFrames based on one or more keys. This will be familiar to users of SQL or other relational databases, as it implements database join operations.
  • pandas.concat concatenates or "stacks" objects together along an axis.
  • The combine_first instance method enables splicing together overlapping data to fill in missing values in one object with values from another.

8.2.1 Database-Style DataFrame Joins

  • A merge or join operation combines datasets by linking rows using one or more keys
  • merge function arguments

Parameter     Description
left          DataFrame to be merged on the left side
right         DataFrame to be merged on the right side
how           One of 'inner', 'outer', 'left', or 'right'; defaults to 'inner'
on            Column names to join on; must be found in both DataFrame objects. If not specified and no other join keys are given, the intersection of the column names in left and right is used as the join keys
left_on       Columns in the left DataFrame to use as join keys
right_on      Columns in the right DataFrame to use as join keys
left_index    Use the row index of left as its join key (or keys, in the case of a MultiIndex)
right_index   Use the row index of right as its join key (or keys, in the case of a MultiIndex)
sort          Sort merged data lexicographically by join keys; True by default (disable for better performance on large datasets in some cases)
suffixes      Tuple of strings to append to overlapping column names; defaults to ('_x', '_y') (e.g., if both DataFrames contain a 'data' column, 'data_x' and 'data_y' appear in the result)
copy          If False, avoid copying data into the resulting data structure in some exceptional cases; always copies by default
indicator     Adds a special column _merge indicating the source of each row; its value will be 'left_only', 'right_only', or 'both' based on the origin of the joined data in each row
df1 = pd.DataFrame({'key':['b','b','a','c','a','a','b'],
                   'data1':range(7)})
df1
keydata1
0b0
1b1
2a2
3c3
4a4
5a5
6b6
df2 = pd.DataFrame({'key':['a','b','d'],
                   'data2':range(3)})
df2
keydata2
0a0
1b1
2d2
pd.merge(df1,df2)
keydata1data2
0b01
1b11
2b61
3a20
4a40
5a50
#Note that I did not specify which column to connect on. If the key information of the connection is not specified, merge will automatically use the overlapping column name as the key of the connection.
#However, specifying the join key explicitly is a good implementation
pd.merge(df1,df2,on='key')
keydata1data2
0b01
1b11
2b61
3a20
4a40
5a50
#If the column names of each object are different, you can specify column names for them separately
df3 = pd.DataFrame({'Lkey':['b','b','a','c','a','a','b'],
                   'data1':range(7)})
df3
Lkeydata1
0b0
1b1
2a2
3c3
4a4
5a5
6b6
df4 = pd.DataFrame({'Rkey':['a','b','d'],
                   'data2':range(3)})
df4
Rkeydata2
0a0
1b1
2d2
pd.merge(df3,df4,left_on='Lkey',right_on='Rkey')
Lkeydata1Rkeydata2
0b0b1
1b1b1
2b6b1
3a2a0
4a4a0
5a5a0
#Other options are 'left', 'right' and 'outer'.
#outer join is the union of keys, which combines the effects of left join and right join
pd.merge(df1,df2,how='outer')
keydata1data2
0b0.01.0
1b1.01.0
2b6.01.0
3a2.00.0
4a4.00.0
5a5.00.0
6c3.0NaN
7dNaN2.0
df1 = pd.DataFrame({'key':['b','b','a','c','a','b'],
                   'data1':range(6)})
df1
keydata1
0b0
1b1
2a2
3c3
4a4
5b5
df2 = pd.DataFrame({'key':['a','b','a','b','d'],
                   'data2':range(5)})
df2
keydata2
0a0
1b1
2a2
3b3
4d4
pd.merge(df1,df2,on='key',how='left')
keydata1data2
0b01.0
1b03.0
2b11.0
3b13.0
4a20.0
5a22.0
6c3NaN
7a40.0
8a42.0
9b51.0
10b53.0
pd.merge(df1,df2,on='key',how='inner')
keydata1data2
0b01
1b03
2b11
3b13
4b51
5b53
6a20
7a22
8a40
9a42
#When merging with multiple keys, a list of column names is passed in
left = pd.DataFrame({'key1':['foo','foo','bar'],
                    'key2':['one','two','one'],
                    'lval':[1,2,3]})
left
key1key2lval
0fooone1
1footwo2
2barone3
right = pd.DataFrame({'key1':['foo','foo','bar','bar'],
                    'key2':['one','one','one','two'],
                    'rval':[4,5,6,7]})
right
key1key2rval
0fooone4
1fooone5
2barone6
3bartwo7
pd.merge(left,right,on=['key1','key2'],how='outer')
key1key2lvalrval
0fooone1.04.0
1fooone1.05.0
2footwo2.0NaN
3barone3.06.0
4bartwoNaN7.0
#merge has a suffixes option,
#Used to specify the string to be added after the overlapping column names of the left and right DataFrame objects
pd.merge(left,right,on=['key1'])
key1key2_xlvalkey2_yrval
0fooone1one4
1fooone1one5
2footwo2one4
3footwo2one5
4barone3one6
5barone3two7
pd.merge(left,right,on=['key1'],suffixes=('_left','_right'))
key1key2_leftlvalkey2_rightrval
0fooone1one4
1fooone1one5
2footwo2one4
3footwo2one5
4barone3one6
5barone3two7
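One merge argument from the table above that is not otherwise demonstrated is indicator; a short sketch with the df1 and df2 from earlier in this section:
pd.merge(df1, df2, on='key', how='outer', indicator=True)
#adds a _merge column whose value is 'left_only', 'right_only' or 'both' for each row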

8.2.2 Merging on Index

  • In some cases, the merge key(s) in a DataFrame will be found in its index. In that case, you can pass left_index=True or right_index=True (or both) to indicate that the index should be used as the merge key
left1 = pd.DataFrame({'key':['a','b','a','a','b','c'],
                     'value':range(6)})
left1
keyvalue
0a0
1b1
2a2
3a3
4b4
5c5
right1 = pd.DataFrame({'group_val':[3.5,7]},index = ['a','b'])
right1
group_val
a3.5
b7.0
pd.merge(left1,right1,left_on = 'key',right_index=True)
keyvaluegroup_val
0a03.5
2a23.5
3a33.5
1b17.0
4b47.0
pd.merge(left1,right1,left_on = 'key',right_index=True,how='outer')
keyvaluegroup_val
0a03.5
2a23.5
3a33.5
1b17.0
4b47.0
5c5NaN
#In the case of multi-layer index data, things will be more complex. Joining on the index is an implicit multi key merge
lefth = pd.DataFrame({'key1':['ohio','ohio','ohio','Nevada','Nevada'],
                     'key2':[2000,2001,2002,2001,2002],
                     'data':np.arange(5.)})
lefth
key1key2data
0ohio20000.0
1ohio20011.0
2ohio20022.0
3Nevada20013.0
4Nevada20024.0
righth = pd.DataFrame(np.arange(12).reshape(6,2),
                     index=[['nevada','nevada','ohio','ohio','ohio','ohio'],[2001,2000,2000,2000,2001,2002]],
                     columns = ['event1','event2'])
righth
event1event2
nevada200101
200023
ohio200045
200067
200189
20021011
pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True,how='outer')
key1key2dataevent1event2
0ohio20000.04.05.0
0ohio20000.06.07.0
1ohio20011.08.09.0
2ohio20022.010.011.0
3Nevada20013.0NaNNaN
4Nevada20024.0NaNNaN
4nevada2001NaN0.01.0
4nevada2000NaN2.03.0
left2 = pd.DataFrame([[1,2],[3,4],[5,6]],
                    index = ['a','c','e'],
                    columns = ['ohio','nevada'])
left2
ohionevada
a12
c34
e56
right2 = pd.DataFrame([[7,8],[9,10],[11,12],[13,14]],
                    index = ['b','c','d','e'],
                    columns = ['missouri','alabama'])
right2
missourialabama
b78
c910
d1112
e1314
pd.merge(left2,right2,how='outer',left_index=True,right_index=True)
ohionevadamissourialabama
a1.02.0NaNNaN
bNaNNaN7.08.0
c3.04.09.010.0
dNaNNaN11.012.0
e5.06.013.014.0
#join instance method for merging by index
left2.join(right2,how='outer')
ohionevadamissourialabama
a1.02.0NaNNaN
bNaNNaN7.08.0
c3.04.09.010.0
dNaNNaN11.012.0
e5.06.013.014.0
left1.join(right1,on='key')
keyvaluegroup_val
0a03.5
1b17.0
2a23.5
3a33.5
4b47.0
5c5NaN
another = pd.DataFrame([[7,8],[9,10],[11,12],[16,17]],
                      index = ['a','c','e','f'],
                      columns = ['new york','oregon'])
another
new yorkoregon
a78
c910
e1112
f1617
left2.join([right2,another])
ohionevadamissourialabamanew yorkoregon
a1.02.0NaNNaN7.08.0
c3.04.09.010.09.010.0
e5.06.013.014.011.012.0
left2.join([right2,another],how='outer')
ohionevadamissourialabamanew yorkoregon
a1.02.0NaNNaN7.08.0
c3.04.09.010.09.010.0
e5.06.013.014.011.012.0
bNaNNaN7.08.0NaNNaN
dNaNNaN11.012.0NaNNaN
fNaNNaNNaNNaN16.017.0

8.2.3 Concatenating Along an Axis

  • Another kind of data combination operation is referred to interchangeably as concatenation, binding, or stacking.
  • NumPy's concatenate function can do this with NumPy arrays
  • concat function arguments

Parameter   Description
objs        List or dict of pandas objects to be concatenated; this is the only required argument
axis        Axis to concatenate along; defaults to 0 (along rows)
join        Either 'inner' or 'outer' ('outer' by default); whether to intersect (inner) or union (outer) the indexes along the other axes
join_axes   Specific indexes to use for the other n-1 axes instead of performing inner/outer set logic
keys        Values to associate with the objects being concatenated, forming a hierarchical index along the concatenation axis; can be a list or array of arbitrary values, an array of tuples, or a list of arrays (if multi-level arrays are passed to the levels argument)
levels      Specific indexes to use as the hierarchical index level or levels if keys are passed
arr = np.arange(12).reshape(3,4)
arr
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
np.concatenate([arr,arr],axis=1)
array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])
s1 = pd.Series([0,1],index=['a','b'])
s1
a    0
b    1
dtype: int64
s2 = pd.Series([2,3,4],index=['c','d','e'])
s2
c    2
d    3
e    4
dtype: int64
s3 = pd.Series([5,6],index=['f','g'])
s3
f    5
g    6
dtype: int64
pd.concat([s1,s2,s3])
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
#The concat method takes effect along the axis of axis=0 to generate another Series.
#If you pass axis=1, the returned result is a DataFrame (column when axis=1)
pd.concat([s1,s2,s3],axis=1)
012
a0.0NaNNaN
b1.0NaNNaN
cNaN2.0NaN
dNaN3.0NaN
eNaN4.0NaN
fNaNNaN5.0
gNaNNaN6.0
s4 = pd.concat([s1,s3])
s4
a    0
b    1
f    5
g    6
dtype: int64
pd.concat([s1,s4],axis=1)
01
a0.00
b1.01
fNaN5
gNaN6
pd.concat([s1,s4],axis=1,join='inner')
01
a00
b11
#You can use join_axes to specify the axis used to connect other axes
pd.concat([s1,s4],axis=1,join_axes=[['a','c','b','e']])
01
a0.00
b1.01
fNaN5
gNaN6
result = pd.concat([s1,s2,s3],keys=['one','two','three'])
result
one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: int64
result.unstack()
abcdefg
one0.01.0NaNNaNNaNNaNNaN
twoNaNNaN2.03.04.0NaNNaN
threeNaNNaNNaNNaNNaN5.06.0
pd.concat([s1,s2,s3],axis = 1,keys=['one','two','three'])
onetwothree
a0.0NaNNaN
b1.0NaNNaN
cNaN2.0NaN
dNaN3.0NaN
eNaN4.0NaN
fNaNNaN5.0
gNaNNaN6.0
df1 = pd.DataFrame(np.arange(6).reshape(3,2),
                  index = ['a','b','c'],
                  columns = ['one','two'])
df1
onetwo
a01
b23
c45
df2 = pd.DataFrame(np.arange(4).reshape(2,2)+5,
                  index = ['a','c'],
                  columns = ['three','four'])
df2
threefour
a56
c78
pd.concat([df1,df2],axis=1,keys=['lever1','level2'])
lever1level2
onetwothreefour
a015.06.0
b23NaNNaN
c457.08.0
#If you pass the object's dictionary instead of the list, the dictionary key will be used for the keys option
pd.concat({'level1':df1,'level2':df2},axis=1)
level1level2
onetwothreefour
a015.06.0
b23NaNNaN
c457.08.0
pd.concat([df1,df2],axis=1,keys=['lever1','level2'],names=['upper','lower'])
upperlever1level2
loweronetwothreefour
a015.06.0
b23NaNNaN
c457.08.0
df1 = pd.DataFrame(np.random.randn(3,4),columns = ['a','b','c','d'])
df1
abcd
0-1.1195931.953114-1.514807-1.054782
10.5433931.1729030.9458290.656643
21.0126951.481920-0.413033-1.280521
df2 = pd.DataFrame(np.random.randn(2,3),columns = ['b','d','a'])
df2
bda
01.638046-0.8501121.895532
1-1.1759521.370474-0.992356
pd.concat([df1,df2],ignore_index=True)
abcd
0-1.1195931.953114-1.514807-1.054782
10.5433931.1729030.9458290.656643
21.0126951.481920-0.413033-1.280521
31.8955321.638046NaN-0.850112
4-0.992356-1.175952NaN1.370474

8.2.4 Combining Data with Overlap

a = pd.Series([np.nan,2.5,0,3.5,4.5,np.nan],
             index=['f','e','d','c','b','a'])
a
f    NaN
e    2.5
d    0.0
c    3.5
b    4.5
a    NaN
dtype: float64
b = pd.Series([0,np.nan,2,np.nan,np.nan,5],
             index=['a','b','c','d','e','f'])
b
a    0.0
b    NaN
c    2.0
d    NaN
e    NaN
f    5.0
dtype: float64
#Consider NumPy's where function, which can perform array oriented if else equivalent operations
np.where(pd.isnull(a),b,a)
array([0. , 2.5, 0. , 3.5, 4.5, 5. ])
#Series has a combine_first method, which performs the equivalent of this operation along with pandas's usual data alignment logic
b.combine_first(a)
a    0.0
b    4.5
c    2.0
d    0.0
e    2.5
f    5.0
dtype: float64
df1 = pd.DataFrame({'a':[1,np.nan,5,np.nan],
                   'b':[np.nan,2,np.nan,6],
                   'c':range(2,18,4)})
df1
abc
01.0NaN2
1NaN2.06
25.0NaN10
3NaN6.014
df2 = pd.DataFrame({'a':[5,4,np.nan,3,7],
                   'b':[np.nan,3,4,6,8]})
df2
ab
05.0NaN
14.03.0
2NaN4.0
33.06.0
47.08.0
df1.combine_first(df2)
abc
01.0NaN2.0
14.02.06.0
25.04.010.0
33.06.014.0
47.08.0NaN

8.3 Reshaping and Pivoting

  • There are a number of basic operations for rearranging tabular data, referred to as reshape or pivot operations.

8.3.1 Reshaping with Hierarchical Indexing

  • stack: this "rotates" or pivots from the columns in the data to the rows.
  • unstack: this pivots from the rows into the columns.
data = pd.DataFrame(np.arange(6).reshape(2,3),
                    index = pd.Index(['ohio','colorado'],name='state'),
                    columns = pd.Index(['one','two','three'],name='number'))
data
numberonetwothree
state
ohio012
colorado345
result = data.stack()
result
state     number
ohio      one       0
          two       1
          three     2
colorado  one       3
          two       4
          three     5
dtype: int32
result.unstack()
numberonetwothree
state
ohio012
colorado345
result.unstack(0)
stateohiocolorado
number
one03
two14
three25
result.unstack('state')
stateohiocolorado
number
one03
two14
three25
#Splitting may introduce missing values if all values in the hierarchy are not included in each subgroup
s1 = pd.Series([0,1,2,3],index=['a','b','c','d'])
s1
a    0
b    1
c    2
d    3
dtype: int64
s2 = pd.Series([4,5,6],index=['c','d','e'])
s2
c    4
d    5
e    6
dtype: int64
data = pd.concat([s1,s2],keys=['one','two'])
data
one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64
data.unstack()
abcde
one0.01.02.03.0NaN
twoNaNNaN4.05.06.0
#By default, the stack will filter out missing values, so the operation of stacking and destacking is reversible
data.unstack().stack()
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64
data.unstack().stack(dropna = False)
one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64
#When you unstack in a DataFrame, the level unstacked becomes the lowest level in the resulting columns
df = pd.DataFrame({'left':result,'right':result+5},
                 columns=pd.Index(['left','right'],name='side'))
df
sideleftright
statenumber
ohioone05
two16
three27
coloradoone38
two49
three510
df.unstack('state')
sideleftright
stateohiocoloradoohiocolorado
number
one0358
two1469
three25710
#When calling the stack method, we can indicate the name of the axis to be stacked
df.unstack('state').stack('side')
statecoloradoohio
numberside
oneleft30
right85
twoleft41
right96
threeleft52
right107
df.unstack('state').stack()
sideleftright
numberstate
oneohio05
colorado38
twoohio16
colorado49
threeohio27
colorado510

8.3.2 Pivoting "Long" to "Wide" Format

data = pd.read_csv('examples/macrodata.csv')
data.head()
yearquarterrealgdprealconsrealinvrealgovtrealdpicpim1tbilrateunemppopinflrealint
01959.01.02710.3491707.4286.898470.0451886.928.98139.72.825.8177.1460.000.00
11959.02.02778.8011733.7310.859481.3011919.729.15141.73.085.1177.8302.340.74
21959.03.02775.4881751.8289.226491.2601916.429.35140.53.825.3178.6572.741.09
31959.04.02785.2041753.7299.356484.0521931.329.37140.04.335.6179.3860.274.06
41960.01.02847.6991770.5331.722462.1991955.529.54139.63.505.2180.0072.311.19
periods = pd.PeriodIndex(year=data.year,quarter=data.quarter,name='date')
periods
PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', name='date', length=203, freq='Q-DEC')
columns = pd.Index(['realgdp','infl','unemp'],name='item')
columns
Index(['realgdp', 'infl', 'unemp'], dtype='object', name='item')
data = data.reindex(columns=columns)
data.head()
itemrealgdpinflunemp
02710.349NaN5.8
12778.801NaN5.1
22775.488NaN5.3
32785.204NaN5.6
42847.699NaN5.2
data.index = periods.to_timestamp('D','end')
data.head()
itemrealgdpinflunemp
date
1959-03-31 23:59:59.9999999992710.349NaN5.8
1959-06-30 23:59:59.9999999992778.801NaN5.1
1959-09-30 23:59:59.9999999992775.488NaN5.3
1959-12-31 23:59:59.9999999992785.204NaN5.6
1960-03-31 23:59:59.9999999992847.699NaN5.2
ldata = data.stack().reset_index().rename(columns={0:'value'})
ldata[:10]
dateitemvalue
01959-03-31 23:59:59.999999999realgdp2710.349
11959-03-31 23:59:59.999999999unemp5.800
21959-06-30 23:59:59.999999999realgdp2778.801
31959-06-30 23:59:59.999999999unemp5.100
41959-09-30 23:59:59.999999999realgdp2775.488
51959-09-30 23:59:59.999999999unemp5.300
61959-12-31 23:59:59.999999999realgdp2785.204
71959-12-31 23:59:59.999999999unemp5.600
81960-03-31 23:59:59.999999999realgdp2847.699
91960-03-31 23:59:59.999999999unemp5.200
pivoted = ldata.pivot('date','item','value')
pivoted
itemrealgdpunemp
date
1959-03-31 23:59:59.9999999992710.3495.8
1959-06-30 23:59:59.9999999992778.8015.1
1959-09-30 23:59:59.9999999992775.4885.3
1959-12-31 23:59:59.9999999992785.2045.6
1960-03-31 23:59:59.9999999992847.6995.2
.........
2008-09-30 23:59:59.99999999913324.6006.0
2008-12-31 23:59:59.99999999913141.9206.9
2009-03-31 23:59:59.99999999912925.4108.1
2009-06-30 23:59:59.99999999912901.5049.2
2009-09-30 23:59:59.99999999912990.3419.6

203 rows × 2 columns
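pivot here is equivalent to creating a hierarchical index with set_index and then reshaping with unstack; a sketch of the equivalent spelling:
unstacked = ldata.set_index(['date', 'item']).unstack('item')
unstacked.head()
#same values as pivoted, with an extra 'value' level on the columns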

8.3.3 Pivoting "Wide" to "Long" Format

  • In a DataFrame, the inverse operation of pivot is pandas.melt
df = pd.DataFrame({'key':['foo','bar','baz'],
                  'A':[1,2,3],
                  'B':[4,5,6],
                  'C':[7,8,9]})
df
keyABC
0foo147
1bar258
2baz369
#When using pandas.melt, we must indicate which columns (if any) are group indicators
melted = pd.melt(df)
melted
variablevalue
0keyfoo
1keybar
2keybaz
3A1
4A2
5A3
6B4
7B5
8B6
9C7
10C8
11C9
melted = pd.melt(df,['key'])
melted
keyvariablevalue
0fooA1
1barA2
2bazA3
3fooB4
4barB5
5bazB6
6fooC7
7barC8
8bazC9
reshaped = melted.pivot('key','variable','value')
reshaped
variableABC
key
bar258
baz369
foo147
#Using reset_index moves the data back into columns
reshaped.reset_index()
variablekeyABC
0bar258
1baz369
2foo147
pd.melt(df,id_vars=['key'],value_vars=['A','B'])
keyvariablevalue
0fooA1
1barA2
2bazA3
3fooB4
4barB5
5bazB6
pd.melt(df,value_vars=['A','B','C'])
variablevalue
0A1
1A2
2A3
3B4
4B5
5B6
6C7
7C8
8C9
pd.melt(df,value_vars=['A','B','key'])
variablevalue
0A1
1A2
2A3
3B4
4B5
5B6
6keyfoo
7keybar
8keybaz

Chapter 9: Plotting and Visualization

%matplotlib notebook

9.1 A Brief matplotlib API Primer

  • One detail to note when using Jupyter notebook is that the chart is reset after each cell is run, so for more complex charts you must put all of the plotting commands in a single notebook cell
import matplotlib.pyplot as plt
import numpy as np
data = np.arange(10)
data
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
plt.plot(data)

9.1.1 Figures and Subplots

  • Plots in matplotlib reside within a Figure object
  • pyplot.subplots options

Parameter    Description
nrows        Number of rows of subplots
ncols        Number of columns of subplots
sharex       All subplots should use the same x-axis ticks (adjusting xlim will affect all subplots)
sharey       All subplots should use the same y-axis ticks (adjusting ylim will affect all subplots)
subplot_kw   Dict of keyword arguments passed to add_subplot, used to create the subplots
**fig_kw     Additional keyword arguments used when creating the figure, such as plt.subplots(2, 2, figsize=(8, 6))
#You can create a new figure with plt.figure
fig = plt.figure()
ax1 = fig.add_subplot(2,2,1)
ax2 = fig.add_subplot(2,2,2)
ax3 = fig.add_subplot(2,2,3)
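A convenience sketch using the subplots options listed above: plt.subplots creates the figure and a NumPy array of subplot axes in a single call:
fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)
axes[0, 0].hist(np.random.randn(100), bins=20, color='k', alpha=0.3)   #draw into the top-left subplot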