1. Feature digitization
1.1 replace() function in pandas
```python
import pandas as pd

df = pd.DataFrame({
    "gene_segA": [1, 0, 0, 1, 1, 1, 0, 0, 1, 0],
    "gene_segB": [1, 0, 1, 0, 1, 1, 0, 0, 1, 0],
    "hypertension": ["Y", "N", "N", "N", "N", "N", "Y", "N", "Y", "N"],
    "Gallstones": ["Y", "N", "N", "N", "Y", "Y", "Y", "N", "N", "Y"],
})
df
df.replace({"N": 0, "Y": 1})
```
1.2 LabelEncoder in sklearn package
```python
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()                                   # ① create the encoder
le.fit(['white', 'green', 'red', 'green', 'white'])   # ② learn the distinct classes
le.classes_                                           # ③ inspect the learned classes
le.transform(["green", "green", "green", "white"])    # ④ encode values as integers
```
1.3 category_encoders package
The category_encoders package bundles a variety of categorical encoding methods (ordinal, one-hot, binary, target, hashing, and more) behind a scikit-learn-style fit/transform interface.
https://mattzheng.blog.csdn.net/article/details/107851162
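A minimal sketch of how the package could be used (assumes category_encoders is installed, e.g. `pip install category_encoders`; the sample column is made up for illustration):

```python
import pandas as pd
import category_encoders as ce

df_sizes = pd.DataFrame({"size": ["M", "L", "XL", "L"]})
encoder = ce.OrdinalEncoder(cols=["size"])   # encode the 'size' column as integers
encoder.fit_transform(df_sizes)
```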
2. Feature binarization
2.1 manual setting
```python
import numpy as np

# pm25 is a DataFrame with an "Exposed days" column, loaded earlier in the tutorial
pm25['bdays'] = np.where(pm25["Exposed days"] > pm25["Exposed days"].mean(), 1, 0)
pm25.sample(10)
```
2.2 Binarizer in sklearn package
Binarizer converts numerical values to binary (0/1) according to a configurable threshold. It only handles numerical data, and the input must be a 2D array of shape (m, n); a Series of shape (n,) will not work. Let's look at the example below.
```python
df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=['A', 'B', 'C'])
df
```

The first column is the index:

```
   A   B   C
0  0   1   2
1  3   4   5
2  6   7   8
3  9  10  11
```

Turn values less than or equal to 5 into 0 and values greater than 5 into 1:

```python
binarize = Binarizer(threshold=5)
binarize.fit_transform(df)
```

```
array([[0, 0, 0],
       [0, 0, 0],
       [1, 1, 1],
       [1, 1, 1]])
```

You can also pass in df[['A', 'B']] to convert two columns. Note that it cannot be df['A'] or df.A, because df.A is a Series, not two-dimensional.
```python
from sklearn.preprocessing import Binarizer

bn = Binarizer(threshold=pm25["Exposed days"].mean())   # ① threshold set to the column mean
result = bn.fit_transform(pm25[["Exposed days"]])       # ② input must be 2D: a one-column DataFrame
pm25['sk-bdays'] = result
pm25.sample(10)
```
Supplementary knowledge points:
The reshape function changes the shape of an array without changing its data. reshape(m, -1) reshapes to m rows, with the number of columns inferred automatically; reshape(-1, m) reshapes to m columns, with the number of rows inferred automatically.
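For example (a small illustration, not from the original text):

```python
import numpy as np

a = np.arange(12)
a.reshape(3, -1)   # 3 rows, number of columns inferred -> shape (3, 4)
a.reshape(-1, 4)   # 4 columns, number of rows inferred -> shape (3, 4)
```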
3. One-hot encoding
```python
df = pd.DataFrame({
    "color": ['green', 'red', 'blue', 'red'],
    "size": ['M', 'L', 'XL', 'L'],
    "price": [29.9, 69.9, 99.9, 59.9],
    "classlabel": ['class1', 'class2', 'class1', 'class1'],
})
df
```
3.1 manual conversion
```python
size_mapping = {'XL': 3, 'L': 2, 'M': 1}
df['size'] = df['size'].map(size_mapping)   # ② map the ordinal sizes to integers
df
```
3.2 call OneHotEncoder of sklearn package
```python
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
fs = ohe.fit_transform(df[['color']])
# drop the first dummy column ('blue') and keep the 'green' and 'red' columns
fs_ohe = pd.DataFrame(fs.toarray()[:, 1:], columns=["color_green", "color_red"])
df = pd.concat([df, fs_ohe], axis=1)
df
```
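Note that recent versions of scikit-learn also accept a drop='first' argument, which removes the first dummy column directly instead of slicing the array by hand; a minimal sketch (check your installed version's documentation for the exact options):

```python
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(drop='first')            # drop the first category to avoid redundancy
ohe.fit_transform(df[['color']]).toarray()
```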
4. Data transformation
4.1 take the logarithm of a variable
```python
%matplotlib inline
import seaborn as sns

# 'data' is a DataFrame with 'time' and 'location' columns, loaded earlier in the tutorial
ax = sns.scatterplot(x='time', y='location', data=data)

import numpy as np

data.drop([0], inplace=True)                  # remove the 0 row so that log(0) is never computed
data['logtime'] = np.log10(data['time'])      # ①
data['logloc'] = np.log10(data['location'])   # ②
data.head()

ax2 = sns.scatterplot(x='logtime', y='logloc', data=data)

from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(data['logtime'].values.reshape(-1, 1), data['logloc'].values.reshape(-1, 1))
(reg.coef_, reg.intercept_)
```
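The reason a straight-line fit is done in log-log space: if location = c · time^a, then log10(location) = log10(c) + a · log10(time), so the slope and intercept of the fitted line recover the power law. A minimal self-contained sketch with synthetic data (not the tutorial's dataset):

```python
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
time = np.linspace(1, 100, 50)
location = 3.0 * time ** 1.5 * rng.lognormal(0, 0.05, size=time.size)  # noisy power law

reg = LinearRegression().fit(np.log10(time).reshape(-1, 1), np.log10(location))
a, c = reg.coef_[0], 10 ** reg.intercept_
print(f"estimated exponent a = {a:.2f}, constant c = {c:.2f}")  # close to 1.5 and 3.0
```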
4.2 calling PolynomialFeatures of sklearn package
PolynomialFeatures in the sklearn package generates polynomial and interaction features of the input features up to a given degree.
```python
import numpy as np

X = np.arange(6).reshape(3, 2)
X

from sklearn.preprocessing import PolynomialFeatures   # ③
poly = PolynomialFeatures(2)   # ④ degree-2 features: for a row [a, b] this yields [1, a, b, a^2, ab, b^2]
poly.fit_transform(X)
```
Comprehensive case
```python
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

df = pd.read_csv("/home/aistudio/data/data20514/xsin.csv")
colors = ['teal', 'yellowgreen', 'gold']
plt.scatter(df['x'], df['y'], color='navy', s=30, marker='o', label="training points")
for count, degree in enumerate([3, 4, 5]):
    model = make_pipeline(PolynomialFeatures(degree), Ridge())   # ③
    model.fit(df[['x']], df[['y']])
    y_pre = model.predict(df[['x']])
    plt.plot(df['x'], y_pre, color=colors[count], linewidth=2,
             label="degree %d" % degree)
plt.legend()
```
5. Feature discretization
5.1 unsupervised discretization
Use the cut() function of pandas to bin the attribute values
```python
ages2 = pd.DataFrame({'years': [10, 14, 30, 53, 300, 32, 45],
                      'name': ['A', 'B', 'C', 'D', 'E', 'F', 'G']})
klass2 = pd.cut(ages2['years'], 3, labels=['Young', 'Middle', 'Senior'])   # ② three equal-width bins
ages2['label'] = klass2
ages2
```
```python
ages2 = pd.DataFrame({'years': [10, 14, 30, 53, 300, 32, 45],
                      'name': ['A', 'B', 'C', 'D', 'E', 'F', 'G']})
klass2 = pd.cut(ages2['years'], bins=[9, 30, 50, 300],
                labels=['Young', 'Middle', 'Senior'])                      # ③ explicit bin edges
ages2['label'] = klass2
ages2
```
Call the KBinsDiscretizer implementation in sklearn
```python
from sklearn.preprocessing import KBinsDiscretizer

# 'ages' is the DataFrame with a 'years' column used earlier in the tutorial
kbd = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')   # ④ three equal-width bins, ordinal output
trans = kbd.fit_transform(ages[['years']])                               # ⑤ input must be 2D
ages['kbd'] = trans[:, 0]                                                # ⑥ take the single output column
ages
```
Introduction to KBinsDiscretizer
https://scikit-learn.org.cn/view/722.html
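Beyond the 'uniform' strategy used above, KBinsDiscretizer also supports 'quantile' and 'kmeans' binning; a small sketch comparing them (the sample values are made up for illustration):

```python
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

X = np.array([[1.0], [2.0], [3.0], [10.0], [20.0], [100.0]])
for strategy in ['uniform', 'quantile', 'kmeans']:
    kbd = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy=strategy)
    print(strategy, kbd.fit_transform(X).ravel())
```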
Sklearn official example
This example compares the predictions of linear regression (a linear model) and a decision tree (a tree-based model), with and without discretization of the real-valued feature.

As the results before discretization show, the linear model is fast to build and relatively easy to interpret, but it can only model linear relationships, whereas the decision tree can capture much more complex structure in the data. One way to make linear models more powerful on continuous data is to discretize the features (also known as binning). In the example, we discretize the feature and one-hot encode the transformed data. Note that if the bin widths are not chosen sensibly, the risk of overfitting increases greatly, so the discretizer parameters should usually be tuned under cross-validation.

After discretization, linear regression and the decision tree make exactly the same predictions. Since the features are constant within each bin, any model must predict the same value for all points in a bin. Compared with the results before discretization, the linear model has become much more flexible, while the flexibility of the decision tree is greatly reduced. Note that binning usually has no beneficial effect on tree-based models, because they can already learn to split the data anywhere.
```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.tree import DecisionTreeRegressor

print(__doc__)

# Build the data set
rnd = np.random.RandomState(42)
X = rnd.uniform(-3, 3, size=100)
y = np.sin(X) + rnd.normal(size=len(X)) / 3
X = X.reshape(-1, 1)

# Transform the data set with KBinsDiscretizer
enc = KBinsDiscretizer(n_bins=10, encode='onehot')
X_binned = enc.fit_transform(X)

# Predict with the original data set
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=(10, 4))
line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)
reg = LinearRegression().fit(X, y)
ax1.plot(line, reg.predict(line), linewidth=2, color='green',
         label="linear regression")
reg = DecisionTreeRegressor(min_samples_split=3, random_state=0).fit(X, y)
ax1.plot(line, reg.predict(line), linewidth=2, color='red',
         label="decision tree")
ax1.plot(X[:, 0], y, 'o', c='k')
ax1.legend(loc="best")
ax1.set_ylabel("Regression output")
ax1.set_xlabel("Input feature")
ax1.set_title("Result before discretization")

# Predict with the transformed data
line_binned = enc.transform(line)
reg = LinearRegression().fit(X_binned, y)
ax2.plot(line, reg.predict(line_binned), linewidth=2, color='green',
         linestyle='-', label='linear regression')
reg = DecisionTreeRegressor(min_samples_split=3, random_state=0).fit(X_binned, y)
ax2.plot(line, reg.predict(line_binned), linewidth=2, color='red',
         linestyle=':', label='decision tree')
ax2.plot(X[:, 0], y, 'o', c='k')
ax2.vlines(enc.bin_edges_[0], *plt.gca().get_ylim(), linewidth=1, alpha=.2)
ax2.legend(loc="best")
ax2.set_xlabel("Input feature")
ax2.set_title("Result after discretization")
plt.tight_layout()
plt.show()
```
![[KBinsDiscretizer.png]]
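As noted above, the discretizer parameters should be tuned under cross-validation. A minimal sketch of what such a search could look like (the parameter grid and data are illustrative, not part of the official example):

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer

rnd = np.random.RandomState(42)
X = rnd.uniform(-3, 3, size=100).reshape(-1, 1)
y = np.sin(X).ravel() + rnd.normal(size=100) / 3

# search over the number of bins with 5-fold cross-validation
pipe = make_pipeline(KBinsDiscretizer(encode='onehot'), LinearRegression())
grid = GridSearchCV(pipe, {'kbinsdiscretizer__n_bins': [3, 5, 10, 20]}, cv=5)
grid.fit(X, y)
grid.best_params_
```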
5.2 supervised discretization
Call the entropy_based_binning package
```python
import numpy as np
import entropy_based_binning as ebb

A = np.array([[1, 1, 2, 3, 3],
              [1, 1, 0, 1, 0]])
ebb.bin_array(A, nbins=2, axis=1)
```
Usage introduction
```
Docstring:
Find and apply the maximum entropy binning to an integer array,
given the number of target bins.

Convenience wrapper around bin_sequence().

Arguments:
----------
A: (N, M) ndarray
    input array; must be integer

nbins: int
    number of bins

axis: None or int (default None)
    axis along which to bin; if None, the optimal binning is
    chosen based on all values in the array

Returns:
--------
B: (N, M) ndarray
    binned array
```
Introduction to MDLP
https://github.com/hlin117/mdlp-discretization
This is an implementation of Usama Fayyad's entropy-based discretization method (the MDLP criterion)
Examples
```python
from mdlp.discretization import MDLP
from sklearn.datasets import load_iris

transformer = MDLP()
iris = load_iris()
X, y = iris.data, iris.target
X_disc = transformer.fit_transform(X, y)
X_disc
```
6. Data standardization
6.1 StandardScaler of sklearn package
```python
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

iris = datasets.load_iris()
iris_std = StandardScaler().fit_transform(iris.data)   # ① standardize each feature to zero mean and unit variance
```
6.2 MinMaxScaler of sklearn package
```python
from sklearn.preprocessing import MinMaxScaler

iris_mm = MinMaxScaler().fit_transform(iris.data)   # scale each feature to the [0, 1] range
```
6.3 RobustScaler of sklearn package
```python
from sklearn.preprocessing import RobustScaler

iris_rb = RobustScaler().fit_transform(iris.data)   # scale with the median and IQR, which is robust to outliers
```
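For reference, what each scaler computes per column can be checked by hand (standard definitions, written here as a sketch; exact agreement may depend on scikit-learn defaults):

```python
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

X = datasets.load_iris().data

# StandardScaler: (x - mean) / std
print(np.allclose(StandardScaler().fit_transform(X),
                  (X - X.mean(axis=0)) / X.std(axis=0)))

# MinMaxScaler: (x - min) / (max - min)
print(np.allclose(MinMaxScaler().fit_transform(X),
                  (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))))

# RobustScaler: (x - median) / IQR
q1, q3 = np.percentile(X, [25, 75], axis=0)
print(np.allclose(RobustScaler().fit_transform(X),
                  (X - np.median(X, axis=0)) / (q3 - q1)))
```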
6.4 examples
Example 1
```python
'''
np.random.normal(): the first parameter is the mean, the second is the
standard deviation, and the third is the number of samples.
np.concatenate(): concatenates arrays.
'''
# Build the data
import numpy as np
import pandas as pd

X = pd.DataFrame({
    'x1': np.concatenate([np.random.normal(20, 1, 1000), np.random.normal(1, 1, 25)]),
    'x2': np.concatenate([np.random.normal(30, 1, 1000), np.random.normal(50, 1, 25)]),
})
X.sample(10)

# Create the RobustScaler and MinMaxScaler scaling models
from sklearn.preprocessing import RobustScaler, MinMaxScaler

robust = RobustScaler()
robust_scaled = robust.fit_transform(X)
robust_scaled = pd.DataFrame(robust_scaled, columns=['x1', 'x2'])

minmax = MinMaxScaler()
minmax_scaled = minmax.fit_transform(X)
minmax_scaled = pd.DataFrame(minmax_scaled, columns=['x1', 'x2'])

# Plot the distributions
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(9, 5))
ax1.set_title('Before Scaling')
sns.kdeplot(X['x1'], ax=ax1)
sns.kdeplot(X['x2'], ax=ax1)
ax2.set_title('After Robust Scaling')
sns.kdeplot(robust_scaled['x1'], ax=ax2)
sns.kdeplot(robust_scaled['x2'], ax=ax2)
ax3.set_title('After Min-Max Scaling')
sns.kdeplot(minmax_scaled['x1'], ax=ax3)
sns.kdeplot(minmax_scaled['x2'], ax=ax3)
```
Example 2
```python
from sklearn.preprocessing import Normalizer
from mpl_toolkits.mplot3d import Axes3D

df = pd.DataFrame({
    'x1': np.random.randint(-100, 100, 1000).astype(float),
    'y1': np.random.randint(-80, 80, 1000).astype(float),
    'z1': np.random.randint(-150, 150, 1000).astype(float),
})

# Effect of Normalizer(): scales each sample (row) to unit norm
scaler = Normalizer()
scaled_df = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_df, columns=df.columns)

fig = plt.figure(figsize=(9, 5))
ax1 = fig.add_subplot(121, projection='3d')
ax2 = fig.add_subplot(122, projection='3d')
ax1.scatter(df['x1'], df['y1'], df['z1'])
ax2.scatter(scaled_df['x1'], scaled_df['y1'], scaled_df['z1'])

# Effect of MinMaxScaler(): scales each feature (column) to [0, 1]
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_df, columns=df.columns)

fig = plt.figure(figsize=(9, 5))
ax1 = fig.add_subplot(121, projection='3d')
ax2 = fig.add_subplot(122, projection='3d')
ax1.scatter(df['x1'], df['y1'], df['z1'])
ax2.scatter(scaled_df['x1'], scaled_df['y1'], scaled_df['z1'])
```