#coding:utf-8 #Import warnings package and use filter to ignore warning statements. import warnings warnings.filterwarnings('ignore') import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import missingno as msno
## 1) Load training set and test set; path = './' Train_data = pd.read_csv(path+'car_train_0110.csv', sep=' ') Test_data = pd.read_csv(path+'car_testA_0110.csv', sep=' ')
Train_data.head()
SaleID | name | regDate | model | brand | bodyType | fuelType | gearbox | power | kilometer | ... | v_14 | v_15 | v_16 | v_17 | v_18 | v_19 | v_20 | v_21 | v_22 | v_23 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 134890 | 734 | 20160002 | 13.0 | 9 | NaN | 0.0 | 1.0 | 0 | 15.0 | ... | 0.092139 | 0.000000 | 18.763832 | -1.512063 | -1.008718 | -12.100623 | -0.947052 | 9.077297 | 0.581214 | 3.945923 |
1 | 306648 | 196973 | 20080307 | 72.0 | 9 | 7.0 | 5.0 | 1.0 | 173 | 15.0 | ... | 0.001070 | 0.122335 | -5.685612 | -0.489963 | -2.223693 | -0.226865 | -0.658246 | -3.949621 | 4.593618 | -1.145653 |
2 | 340675 | 25347 | 20020312 | 18.0 | 12 | 3.0 | 0.0 | 1.0 | 50 | 12.5 | ... | 0.064410 | 0.003345 | -3.295700 | 1.816499 | 3.554439 | -0.683675 | 0.971495 | 2.625318 | -0.851922 | -1.246135 |
3 | 57332 | 5382 | 20000611 | 38.0 | 8 | 7.0 | 0.0 | 1.0 | 54 | 15.0 | ... | 0.069231 | 0.000000 | -3.405521 | 1.497826 | 4.782636 | 0.039101 | 1.227646 | 3.040629 | -0.801854 | -1.251894 |
4 | 265235 | 173174 | 20030109 | 87.0 | 0 | 5.0 | 5.0 | 1.0 | 131 | 3.0 | ... | 0.000099 | 0.001655 | -4.475429 | 0.124138 | 1.364567 | -0.319848 | -1.131568 | -3.303424 | -1.998466 | -1.279368 |
5 rows × 40 columns
All feature sets are desensitized (convenient for everyone to watch) ¶
name - car code
regDate - vehicle registration time
model - model Code
brand - brand
bodyType - body type
fuelType - fuel type
gearbox - Transmission
Power - vehicle power
kilometer - vehicle kilometers
notRepairedDamage - the car has unrepaired damage
regionCode - car viewing area code
Seller - seller
offerType - Quotation type
CreateDate - advertisement release time
Price - car price
v_0’, ‘v_1’, ‘v_2’, ‘v_3’, ‘v_4’, ‘v_5’, ‘v_6’, ‘v_7’, ‘v_8’, ‘v_9’, ‘v_10’, ‘v_11’, ‘v_12’, ‘v_13’,‘v_14 '[anonymous features, including 15 anonymous features including v0-14]
Train_data.head().append(Test_data.tail())
SaleID | name | regDate | model | brand | bodyType | fuelType | gearbox | power | kilometer | ... | v_14 | v_15 | v_16 | v_17 | v_18 | v_19 | v_20 | v_21 | v_22 | v_23 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 134890 | 734 | 20160002 | 13.0 | 9 | NaN | 0.0 | 1.0 | 0 | 15.0 | ... | 0.092139 | 0.000000 | 18.763832 | -1.512063 | -1.008718 | -12.100623 | -0.947052 | 9.077297 | 0.581214 | 3.945923 |
1 | 306648 | 196973 | 20080307 | 72.0 | 9 | 7.0 | 5.0 | 1.0 | 173 | 15.0 | ... | 0.001070 | 0.122335 | -5.685612 | -0.489963 | -2.223693 | -0.226865 | -0.658246 | -3.949621 | 4.593618 | -1.145653 |
2 | 340675 | 25347 | 20020312 | 18.0 | 12 | 3.0 | 0.0 | 1.0 | 50 | 12.5 | ... | 0.064410 | 0.003345 | -3.295700 | 1.816499 | 3.554439 | -0.683675 | 0.971495 | 2.625318 | -0.851922 | -1.246135 |
3 | 57332 | 5382 | 20000611 | 38.0 | 8 | 7.0 | 0.0 | 1.0 | 54 | 15.0 | ... | 0.069231 | 0.000000 | -3.405521 | 1.497826 | 4.782636 | 0.039101 | 1.227646 | 3.040629 | -0.801854 | -1.251894 |
4 | 265235 | 173174 | 20030109 | 87.0 | 0 | 5.0 | 5.0 | 1.0 | 131 | 3.0 | ... | 0.000099 | 0.001655 | -4.475429 | 0.124138 | 1.364567 | -0.319848 | -1.131568 | -3.303424 | -1.998466 | -1.279368 |
49995 | 375033 | 3803 | 20010407 | 6.0 | 29 | 5.0 | 0.0 | 0.0 | 186 | 10.0 | ... | 0.000000 | 0.000372 | -3.397636 | 0.940183 | 4.115667 | 0.146320 | -2.348749 | -2.636560 | -0.965214 | -1.097192 |
49996 | 406556 | 28500 | 20071001 | 130.0 | 10 | 2.0 | 0.0 | 0.0 | 272 | 7.0 | ... | 0.003208 | 0.116459 | -7.055336 | -1.260228 | -4.937979 | 0.881517 | -1.590285 | -3.495608 | 3.301887 | 3.947193 |
49997 | 511668 | 98383 | 19980102 | 23.0 | 10 | 4.0 | 0.0 | 1.0 | 190 | 0.5 | ... | 0.049580 | 0.067015 | -4.916501 | 0.507919 | -0.035475 | 0.256285 | 0.734084 | 0.779931 | 1.822416 | 5.012697 |
49998 | 533139 | 1489 | 20031001 | 70.0 | 1 | 7.0 | 4.0 | NaN | 101 | 15.0 | ... | 0.084591 | 0.000000 | -0.424439 | 3.893203 | -0.146884 | 1.830694 | 18.008141 | -2.513048 | -3.310876 | -1.589404 |
49999 | 592803 | 994 | 20070407 | 76.0 | 0 | 4.0 | 5.0 | NaN | 0 | 15.0 | ... | 0.055724 | 0.110924 | -1.422750 | 2.749703 | -2.160718 | 0.838089 | 17.664283 | -5.802325 | 3.063008 | -1.308131 |
10 rows × 40 columns
Train_data.shape
(250000, 40)
Test_data.head().append(Test_data.tail())
SaleID | name | regDate | model | brand | bodyType | fuelType | gearbox | power | kilometer | ... | v_14 | v_15 | v_16 | v_17 | v_18 | v_19 | v_20 | v_21 | v_22 | v_23 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 720326 | 505 | 20060505 | 19.0 | 13 | 7.0 | 0.0 | 1.0 | 90 | 8.0 | ... | 0.083340 | 0.105382 | -5.998993 | 0.147048 | -1.902847 | 0.348990 | 2.324961 | 3.343910 | 4.048742 | -1.431822 |
1 | 714316 | 1836 | 20010301 | 5.0 | 5 | 3.0 | 4.0 | 1.0 | 75 | 15.0 | ... | 0.074478 | 0.000000 | -3.287221 | 2.081317 | 2.937052 | -0.123018 | 1.202395 | 3.570743 | -1.180587 | -1.348598 |
2 | 704693 | 212291 | 20170610 | 6.0 | 18 | NaN | 5.0 | 0.0 | 150 | 15.0 | ... | 0.002032 | 0.000000 | 4.368218 | 8.252188 | -4.136109 | -13.334970 | -4.444620 | -0.706978 | -1.720218 | 3.569112 |
3 | 624972 | 1345 | 19820005 | 215.0 | 32 | 7.0 | 0.0 | 1.0 | 0 | 6.0 | ... | 0.098806 | 0.100883 | -2.537486 | 0.513955 | 4.414962 | 0.357685 | 2.700732 | 5.323602 | 6.085956 | -0.900585 |
4 | 669753 | 1428 | 20060205 | 30.0 | 4 | 7.0 | 5.0 | 1.0 | 122 | 15.0 | ... | 0.088397 | 0.002509 | -6.197633 | -0.191814 | -1.224360 | -0.326985 | 2.254931 | 4.183037 | -2.574004 | 0.014203 |
49995 | 375033 | 3803 | 20010407 | 6.0 | 29 | 5.0 | 0.0 | 0.0 | 186 | 10.0 | ... | 0.000000 | 0.000372 | -3.397636 | 0.940183 | 4.115667 | 0.146320 | -2.348749 | -2.636560 | -0.965214 | -1.097192 |
49996 | 406556 | 28500 | 20071001 | 130.0 | 10 | 2.0 | 0.0 | 0.0 | 272 | 7.0 | ... | 0.003208 | 0.116459 | -7.055336 | -1.260228 | -4.937979 | 0.881517 | -1.590285 | -3.495608 | 3.301887 | 3.947193 |
49997 | 511668 | 98383 | 19980102 | 23.0 | 10 | 4.0 | 0.0 | 1.0 | 190 | 0.5 | ... | 0.049580 | 0.067015 | -4.916501 | 0.507919 | -0.035475 | 0.256285 | 0.734084 | 0.779931 | 1.822416 | 5.012697 |
49998 | 533139 | 1489 | 20031001 | 70.0 | 1 | 7.0 | 4.0 | NaN | 101 | 15.0 | ... | 0.084591 | 0.000000 | -0.424439 | 3.893203 | -0.146884 | 1.830694 | 18.008141 | -2.513048 | -3.310876 | -1.589404 |
49999 | 592803 | 994 | 20070407 | 76.0 | 0 | 4.0 | 5.0 | NaN | 0 | 15.0 | ... | 0.055724 | 0.110924 | -1.422750 | 2.749703 | -2.160718 | 0.838089 | 17.664283 | -5.802325 | 3.063008 | -1.308131 |
10 rows × 39 columns
Test_data.shape
(50000, 39)
Train_data.describe()
SaleID | name | regDate | model | brand | bodyType | fuelType | gearbox | power | kilometer | ... | v_14 | v_15 | v_16 | v_17 | v_18 | v_19 | v_20 | v_21 | v_22 | v_23 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 250000.000000 | 250000.000000 | 2.500000e+05 | 250000.000000 | 250000.000000 | 224620.000000 | 227510.000000 | 236487.000000 | 250000.000000 | 250000.000000 | ... | 250000.000000 | 250000.000000 | 250000.000000 | 250000.000000 | 250000.000000 | 250000.000000 | 250000.000000 | 250000.000000 | 250000.000000 | 250000.000000 |
mean | 185351.790768 | 83153.362172 | 2.003401e+07 | 44.911480 | 7.785236 | 4.563271 | 1.665008 | 0.780783 | 115.528412 | 12.577418 | ... | 0.032489 | 0.030408 | 0.014725 | 0.000915 | 0.006273 | 0.006604 | -0.001374 | 0.000609 | -0.004025 | 0.001834 |
std | 107121.188763 | 72540.799964 | 7.770250e+04 | 50.640081 | 7.694010 | 1.912515 | 2.339646 | 0.413717 | 196.141828 | 3.990632 | ... | 0.038792 | 0.049333 | 8.779163 | 5.771081 | 4.880981 | 4.124722 | 3.803626 | 3.555353 | 2.864713 | 2.323680 |
min | 1.000000 | 0.000000 | 1.910000e+07 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.500000 | ... | 0.000000 | 0.000000 | -10.412444 | -15.538236 | -21.009214 | -13.989955 | -9.599285 | -11.181255 | -7.671327 | -2.350888 |
25% | 92501.750000 | 14500.000000 | 1.999061e+07 | 6.000000 | 1.000000 | 3.000000 | 0.000000 | 1.000000 | 70.000000 | 12.500000 | ... | 0.000129 | 0.000000 | -5.552269 | -0.901181 | -3.150385 | -0.478173 | -1.727237 | -3.067073 | -2.092178 | -1.402804 |
50% | 185264.500000 | 65314.500000 | 2.003111e+07 | 27.000000 | 6.000000 | 4.000000 | 0.000000 | 1.000000 | 105.000000 | 15.000000 | ... | 0.001961 | 0.002567 | -3.821770 | 0.223181 | -0.058502 | 0.038427 | -0.995044 | -0.880587 | -1.199807 | -1.145588 |
75% | 278128.500000 | 143761.250000 | 2.008081e+07 | 70.000000 | 11.000000 | 7.000000 | 5.000000 | 1.000000 | 150.000000 | 15.000000 | ... | 0.075672 | 0.056568 | 3.599747 | 1.263737 | 2.800475 | 0.569198 | 1.563382 | 3.269987 | 2.737614 | 0.044865 |
max | 370946.000000 | 233044.000000 | 2.019121e+07 | 250.000000 | 39.000000 | 7.000000 | 6.000000 | 1.000000 | 20000.000000 | 15.000000 | ... | 0.130785 | 0.184340 | 36.756878 | 26.134561 | 23.055660 | 16.576027 | 20.324572 | 14.039422 | 8.764597 | 8.574730 |
8 rows × 40 columns
Test_data.describe()
SaleID | name | regDate | model | brand | bodyType | fuelType | gearbox | power | kilometer | ... | v_14 | v_15 | v_16 | v_17 | v_18 | v_19 | v_20 | v_21 | v_22 | v_23 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 50000.000000 | 50000.000000 | 5.000000e+04 | 50000.000000 | 50000.000000 | 44890.000000 | 45598.000000 | 47287.000000 | 50000.000000 | 50000.000000 | ... | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 |
mean | 556029.053380 | 82878.251420 | 2.003441e+07 | 44.922840 | 7.779420 | 4.556226 | 1.681192 | 0.781081 | 114.116060 | 12.555210 | ... | 0.032570 | 0.030773 | -0.024819 | 0.007051 | -0.008488 | -0.030104 | 0.014609 | -0.003353 | 0.013125 | -0.011936 |
std | 106952.402565 | 72292.076936 | 7.788055e+04 | 50.576255 | 7.661667 | 1.908291 | 2.344829 | 0.413518 | 177.274154 | 4.034901 | ... | 0.038779 | 0.049521 | 8.759663 | 5.784299 | 4.825261 | 4.100561 | 3.812667 | 3.548944 | 2.866774 | 2.316144 |
min | 370951.000000 | 0.000000 | 1.910000e+07 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.500000 | ... | 0.000000 | 0.000000 | -10.196998 | -15.167961 | -21.925773 | -13.682825 | -9.282567 | -11.117367 | -6.365723 | -2.394516 |
25% | 463258.500000 | 14121.250000 | 1.999061e+07 | 6.000000 | 1.000000 | 3.000000 | 0.000000 | 1.000000 | 69.000000 | 12.500000 | ... | 0.000135 | 0.000000 | -5.575131 | -0.891030 | -3.105073 | -0.481952 | -1.697763 | -3.069575 | -2.089326 | -1.402958 |
50% | 556296.000000 | 65359.000000 | 2.003111e+07 | 27.000000 | 6.000000 | 4.000000 | 0.000000 | 1.000000 | 105.000000 | 15.000000 | ... | 0.001949 | 0.002593 | -3.837572 | 0.221379 | -0.081836 | 0.039376 | -0.971210 | -0.877377 | -1.192502 | -1.146398 |
75% | 648862.250000 | 143083.750000 | 2.008091e+07 | 70.000000 | 11.000000 | 7.000000 | 5.000000 | 1.000000 | 150.000000 | 15.000000 | ... | 0.075826 | 0.062063 | 3.531269 | 1.257687 | 2.784538 | 0.560046 | 1.572508 | 3.276918 | 2.772742 | -0.010769 |
max | 741887.000000 | 233028.000000 | 2.019040e+07 | 248.000000 | 39.000000 | 7.000000 | 6.000000 | 1.000000 | 17700.000000 | 15.000000 | ... | 0.135900 | 0.180091 | 36.364986 | 26.043572 | 22.598441 | 16.333051 | 20.273633 | 11.691851 | 7.970303 | 8.749647 |
8 rows × 39 columns
## 2) Get familiar with data types through info() Train_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 250000 entries, 0 to 249999 Data columns (total 40 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SaleID 250000 non-null int64 1 name 250000 non-null int64 2 regDate 250000 non-null int64 3 model 250000 non-null float64 4 brand 250000 non-null int64 5 bodyType 224620 non-null float64 6 fuelType 227510 non-null float64 7 gearbox 236487 non-null float64 8 power 250000 non-null int64 9 kilometer 250000 non-null float64 10 notRepairedDamage 201464 non-null float64 11 regionCode 250000 non-null int64 12 seller 250000 non-null int64 13 offerType 250000 non-null int64 14 creatDate 250000 non-null int64 15 price 250000 non-null int64 16 v_0 250000 non-null float64 17 v_1 250000 non-null float64 18 v_2 250000 non-null float64 19 v_3 250000 non-null float64 20 v_4 250000 non-null float64 21 v_5 250000 non-null float64 22 v_6 250000 non-null float64 23 v_7 250000 non-null float64 24 v_8 250000 non-null float64 25 v_9 250000 non-null float64 26 v_10 250000 non-null float64 27 v_11 250000 non-null float64 28 v_12 250000 non-null float64 29 v_13 250000 non-null float64 30 v_14 250000 non-null float64 31 v_15 250000 non-null float64 32 v_16 250000 non-null float64 33 v_17 250000 non-null float64 34 v_18 250000 non-null float64 35 v_19 250000 non-null float64 36 v_20 250000 non-null float64 37 v_21 250000 non-null float64 38 v_22 250000 non-null float64 39 v_23 250000 non-null float64 dtypes: float64(30), int64(10) memory usage: 76.3 MB
Test_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 50000 entries, 0 to 49999 Data columns (total 39 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SaleID 50000 non-null int64 1 name 50000 non-null int64 2 regDate 50000 non-null int64 3 model 50000 non-null float64 4 brand 50000 non-null int64 5 bodyType 44890 non-null float64 6 fuelType 45598 non-null float64 7 gearbox 47287 non-null float64 8 power 50000 non-null int64 9 kilometer 50000 non-null float64 10 notRepairedDamage 40372 non-null float64 11 regionCode 50000 non-null int64 12 seller 50000 non-null int64 13 offerType 50000 non-null int64 14 creatDate 50000 non-null int64 15 v_0 50000 non-null float64 16 v_1 50000 non-null float64 17 v_2 50000 non-null float64 18 v_3 50000 non-null float64 19 v_4 50000 non-null float64 20 v_5 50000 non-null float64 21 v_6 50000 non-null float64 22 v_7 50000 non-null float64 23 v_8 50000 non-null float64 24 v_9 50000 non-null float64 25 v_10 50000 non-null float64 26 v_11 50000 non-null float64 27 v_12 50000 non-null float64 28 v_13 50000 non-null float64 29 v_14 50000 non-null float64 30 v_15 50000 non-null float64 31 v_16 50000 non-null float64 32 v_17 50000 non-null float64 33 v_18 50000 non-null float64 34 v_19 50000 non-null float64 35 v_20 50000 non-null float64 36 v_21 50000 non-null float64 37 v_22 50000 non-null float64 38 v_23 50000 non-null float64 dtypes: float64(30), int64(9) memory usage: 14.9 MB
Train_data.isnull().sum()
SaleID 0 name 0 regDate 0 model 0 brand 0 bodyType 25380 fuelType 22490 gearbox 13513 power 0 kilometer 0 notRepairedDamage 48536 regionCode 0 seller 0 offerType 0 creatDate 0 price 0 v_0 0 v_1 0 v_2 0 v_3 0 v_4 0 v_5 0 v_6 0 v_7 0 v_8 0 v_9 0 v_10 0 v_11 0 v_12 0 v_13 0 v_14 0 v_15 0 v_16 0 v_17 0 v_18 0 v_19 0 v_20 0 v_21 0 v_22 0 v_23 0 dtype: int64
Test_data.isnull().sum()
SaleID 0 name 0 regDate 0 model 0 brand 0 bodyType 5110 fuelType 4402 gearbox 2713 power 0 kilometer 0 notRepairedDamage 9628 regionCode 0 seller 0 offerType 0 creatDate 0 v_0 0 v_1 0 v_2 0 v_3 0 v_4 0 v_5 0 v_6 0 v_7 0 v_8 0 v_9 0 v_10 0 v_11 0 v_12 0 v_13 0 v_14 0 v_15 0 v_16 0 v_17 0 v_18 0 v_19 0 v_20 0 v_21 0 v_22 0 v_23 0 dtype: int64
# nan visualization missing = Train_data.isnull().sum() missing = missing[missing > 0] missing.sort_values(inplace=True) missing.plot.bar()
<AxesSubplot:>
[the external chain picture transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the picture and upload it directly (img-IF3bYWr1-1618460688068)(output_14_1.png)]
# Visualize the default values msno.matrix(Train_data.sample(250))
<AxesSubplot:>
[the external chain picture transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the picture and upload it directly (img-Hmg3T2pE-1618460688071)(output_15_1.png)]
msno.bar(Train_data.sample(1000))
<AxesSubplot:>
[the external chain image transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the image and upload it directly (IMG gihsop0o-1618460688080) (output_16_1. PNG)]
msno.bar(Train_data.sample(1000))
<AxesSubplot:>
[the external chain picture transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the picture and upload it directly (img-O6oFq5c8-1618460688082)(output_17_1.png)]
Train_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 250000 entries, 0 to 249999 Data columns (total 40 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 SaleID 250000 non-null int64 1 name 250000 non-null int64 2 regDate 250000 non-null int64 3 model 250000 non-null float64 4 brand 250000 non-null int64 5 bodyType 224620 non-null float64 6 fuelType 227510 non-null float64 7 gearbox 236487 non-null float64 8 power 250000 non-null int64 9 kilometer 250000 non-null float64 10 notRepairedDamage 201464 non-null float64 11 regionCode 250000 non-null int64 12 seller 250000 non-null int64 13 offerType 250000 non-null int64 14 creatDate 250000 non-null int64 15 price 250000 non-null int64 16 v_0 250000 non-null float64 17 v_1 250000 non-null float64 18 v_2 250000 non-null float64 19 v_3 250000 non-null float64 20 v_4 250000 non-null float64 21 v_5 250000 non-null float64 22 v_6 250000 non-null float64 23 v_7 250000 non-null float64 24 v_8 250000 non-null float64 25 v_9 250000 non-null float64 26 v_10 250000 non-null float64 27 v_11 250000 non-null float64 28 v_12 250000 non-null float64 29 v_13 250000 non-null float64 30 v_14 250000 non-null float64 31 v_15 250000 non-null float64 32 v_16 250000 non-null float64 33 v_17 250000 non-null float64 34 v_18 250000 non-null float64 35 v_19 250000 non-null float64 36 v_20 250000 non-null float64 37 v_21 250000 non-null float64 38 v_22 250000 non-null float64 39 v_23 250000 non-null float64 dtypes: float64(30), int64(10) memory usage: 76.3 MB
Test_data['notRepairedDamage'].value_counts()
1.0 35555 0.0 4817 Name: notRepairedDamage, dtype: int64
Test_data['notRepairedDamage'].replace('-', np.nan, inplace=True)
Train_data["seller"].value_counts()
1 249999 0 1 Name: seller, dtype: int64
Train_data["offerType"].value_counts()
0 249991 1 9 Name: offerType, dtype: int64
del Train_data["seller"] del Train_data["offerType"] del Test_data["seller"] del Test_data["offerType"] #The drop function is used for pandas
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance) 3079 try: -> 3080 return self._engine.get_loc(casted_key) 3081 except KeyError as err: pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 'seller' The above exception was the direct cause of the following exception: KeyError Traceback (most recent call last) <ipython-input-25-ac78d43311cb> in <module> ----> 1 del Train_data["seller"] 2 del Train_data["offerType"] 3 del Test_data["seller"] 4 del Test_data["offerType"] 5 #The drop function is used for pandas C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in __delitem__(self, key) 3964 # there was no match, this call should raise the appropriate 3965 # exception: -> 3966 loc = self.axes[-1].get_loc(key) 3967 self._mgr.idelete(loc) 3968 C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance) 3080 return self._engine.get_loc(casted_key) 3081 except KeyError as err: -> 3082 raise KeyError(key) from err 3083 3084 if tolerance is not None: KeyError: 'seller'
Train_data['price']
0 520 1 5500 2 1100 3 1200 4 3300 ... 249995 1200 249996 1200 249997 16500 249998 31950 249999 1990 Name: price, Length: 250000, dtype: int64
Train_data['price'].value_counts()
0 7312 500 3815 1500 3587 1000 3149 1200 3071 ... 11320 1 7230 1 11448 1 9529 1 8188 1 Name: price, Length: 4585, dtype: int64
%matplotlib inline #The magic method in IPython notebook, so that you can get the image directly after each run, and you don't need to use PLT show() import numpy as np #Import numpy package for generating arrays import seaborn as sns #It is conventionally abbreviated as sns sns.set()#Switch to the default running configuration of seaborn
UsageError: unrecognized arguments: #The magic method in IPython notebook, so that you can get the image directly after each run, and you don't need to use PLT show()
x=np.random.randn(100) sns.kdeplot(x,cut=0)
<AxesSubplot:ylabel='Density'>
[the external chain picture transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the picture and upload it directly (img-BZyyCorw-1618460688084)(output_27_1.png)]
y=np.random.randn(100) sns.kdeplot(x,y,shade=True) sns.kdeplot(x,y,shade=True,cbar=True)
<AxesSubplot:>
[the external link image transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the image and upload it directly (img-UA8LOj4n-1618460688086)(output_28_1.png)]
displot() integrates the functions of hist() of matplotlib and kernel function to estimate kdeplot, and adds a novel use of rugplot distribution observation bar display and fitting parameter distribution with scipy library fit. The specific usage is as follows:
seaborn.distplot(a,bins=None,hist=True,kde=True, rug=False, fit=None, hist_kws=None, kde_kws=None, rug_kws=None, fit_kws=None, color=None, vertical=False, norm_hist=False, axlabel=None, label=None, ax=None)
Let's first introduce histograms:
Histogram, also known as quality distribution map, is a main tool to represent the changes of data. The regularity of data can be analyzed by histogram, and the distribution state of product quality characteristics can be seen intuitively. The distribution state of data is clear at a glance, which is convenient to judge the overall quality distribution. The histogram represents the data distribution by forming sub boxes along the data range, and then drawing bars to show the number of observations falling into each sub box.
Next, let's experience the usage of distplot through specific examples:
sns.distplot(x,color="g")
<AxesSubplot:ylabel='Density'>
[the external chain picture transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the picture and upload it directly (img-uPlnSZg2-1618460688087)(output_31_1.png)]
import matplotlib.pyplot as plt fig,axes=plt.subplots(1,3) #Create a canvas with one row and three columns sns.distplot(x,ax=axes[0]) #Left picture sns.distplot(x,hist=False,ax=axes[1]) #Middle picture sns.distplot(x,kde=False,ax=axes[2]) #Right picture
<AxesSubplot:>
[the external link image transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the image and upload it directly (img-QB16zc3q-1618460688088)(output_32_1.png)]
## 1) General distribution (unbounded Johnson distribution, etc.) #Johnson distribution for short. The probability distribution of random variables subject to normal distribution after John transformation import scipy.stats as st y = Train_data['price'] plt.figure(1); plt.title('Johnson SU') sns.distplot(y, kde=True, fit=st.johnsonsu) plt.figure(2); plt.title('Normal') sns.distplot(y, kde=False, fit=st.norm) plt.figure(3); plt.title('Log Normal') sns.distplot(y, kde=False, fit=st.lognorm) '''Let's see what general distribution he fits Unbounded Johnson distribution johnsonsu? Normal norm? Lognormal (slightly higher than normal) lognorm? '''
'Let's see what general distribution he fits\n Unbounded Johnson distribution johnsonsu?\n Normal norm?\n Lognormal (slightly higher than normal) lognorm?\n\n'
[the external chain picture transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the picture and upload it directly (img-BIRc6ZH0-1618460688091)(output_33_1.png)]
[the external chain picture transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the picture and upload it directly (img-UXCH7aXj-1618460688093)(output_33_2.png)]
[the external chain picture transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the picture and upload it directly (img-dVSpJeqf-1618460688093)(output_33_3.png)]
2) Kurtosis and kurtosis
Skewness: it is a statistical quantity describing the distribution form of data. It describes the symmetry of a certain overall value distribution. In short, it is the degree of data asymmetry. The greater the absolute value, the more asymmetric the data distribution and the greater the skewness
Kurtosis: a statistic that describes the steepness and slowness of the distribution form of all values of a variable. In short, it is the sharpness of the top of the data distribution (> 0 sharp peak, < 0 flat peak, = 0, which is consistent with the steepness of the normal distribution)
## 2) View sketchness and kurtosis sns.distplot(Train_data['price']); print("Skewness: %f" % Train_data['price'].skew()) print("Kurtosis: %f" % Train_data['price'].kurt())
Skewness: 3.535346 Kurtosis: 21.230678
[the external chain picture transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the picture and upload it directly (img-sTZN54X1-1618460688095)(output_35_1.png)]
Train_data.skew(), Train_data.kurt()
(SaleID 0.001712 name 0.513079 regDate -1.540844 model 1.499765 brand 1.314846 bodyType -0.070459 fuelType 0.701802 gearbox -1.357379 power 58.590829 kilometer -1.557472 notRepairedDamage -2.312519 regionCode 0.690405 creatDate -95.428563 price 3.535346 v_0 -1.504738 v_1 1.582428 v_2 1.198679 v_3 1.352193 v_4 0.217941 v_5 2.052749 v_6 0.090718 v_7 0.823610 v_8 -1.532964 v_9 1.529931 v_10 -2.584452 v_11 -0.906428 v_12 -2.842834 v_13 -3.869655 v_14 0.491706 v_15 1.308716 v_16 1.662893 v_17 0.233318 v_18 0.814453 v_19 0.100073 v_20 2.001253 v_21 0.180020 v_22 0.819133 v_23 1.357847 dtype: float64, SaleID -1.201476 name -1.084474 regDate 11.041006 model 1.741896 brand 1.814245 bodyType -1.070358 fuelType -1.495782 gearbox -0.157525 power 4473.885260 kilometer 1.250933 notRepairedDamage 3.347777 regionCode -0.352973 creatDate 11376.694263 price 21.230678 v_0 2.901641 v_1 1.098703 v_2 3.749872 v_3 4.294578 v_4 6.953348 v_5 6.489791 v_6 -0.564878 v_7 -0.729838 v_8 0.370812 v_9 0.377943 v_10 4.796855 v_11 1.547812 v_12 6.136342 v_13 13.199575 v_14 -1.597532 v_15 -0.029594 v_16 2.240928 v_17 2.569341 v_18 2.967738 v_19 6.923953 v_20 6.852809 v_21 -0.759948 v_22 -0.741708 v_23 0.143713 dtype: float64)
sns.distplot(Train_data.skew(),color='blue',axlabel ='Skewness')
<AxesSubplot:xlabel='Skewness', ylabel='Density'>
[the external link image transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the image and upload it directly (img-imnSDRwd-1618460688096)(output_37_1.png)]
sns.distplot(Train_data.kurt(),color='orange',axlabel ='Kurtness')
<AxesSubplot:xlabel='Kurtness', ylabel='Density'>
[the external link image transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the image and upload it directly (IMG engtrb0y-1618460688099) (output_38_1. PNG)]
## 3) Check the specific frequency of the predicted value plt.hist(Train_data['price'], orientation = 'vertical',histtype = 'bar', color ='red') plt.show()
[the external chain picture transfer fails, and the source station may have an anti-theft chain mechanism. It is recommended to save the picture and upload it directly (img-rmh7p764-1618460688100)(output_39_0.png)]
plt.hist
x: Data set, the final histogram will make statistics on the data set
bins: interval distribution of Statistics
range: tuple, the displayed range. Range takes effect when bins is not given
density: bool, which is false by default. It displays the frequency statistics result. If it is True, it displays the frequency statistics result. Here, it should be noted that the frequency statistics result = number of intervals / (total number * interval width), which is consistent with the normalized effect. density is officially recommended
histtype: one of {'bar', 'barstacked', 'step', 'stepfilled'} can be selected. The default is bar. It is recommended to use the default configuration. Step uses a ladder shape, and stepfilled will fill the interior of the ladder shape. The effect is similar to that of bar
align: one of {'left', 'mid' and 'right'} can be selected. The default is' mid ', which controls the horizontal distribution of the histogram. Left or right, there will be some blank areas. It is recommended to use the default
log: bool, the default is False, that is, whether the index scale is selected for the y coordinate axis
stacked: bool, which is False by default. Is it a stacked graph
plt. The parameter density of hist histogram is True and False, which respectively represents whether to normalize. The parameter orientation determines whether to use the presentation form of vertical axis representing frequency or horizontal axis representing frequency
# The distribution z after log transformation is relatively uniform, so log transformation can be used for prediction, which is also a commonly used trick in prediction problems plt.hist(np.log(Train_data['price']), orientation = 'vertical',histtype = 'bar', color ='red') plt.show()
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-41-0f7fcb2a3190> in <module> 1 # The distribution z after log transformation is relatively uniform, so log transformation can be used for prediction, which is also a commonly used trick in prediction problems ----> 2 plt.hist(np.log(Train_data['price']), orientation = 'vertical',histtype = 'bar', color ='red') 3 plt.show() C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py in hist(x, bins, range, density, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, data, **kwargs) 2683 orientation='vertical', rwidth=None, log=False, color=None, 2684 label=None, stacked=False, *, data=None, **kwargs): -> 2685 return gca().hist( 2686 x, bins=bins, range=range, density=density, weights=weights, 2687 cumulative=cumulative, bottom=bottom, histtype=histtype, C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py in inner(ax, data, *args, **kwargs) 1445 def inner(ax, *args, data=None, **kwargs): 1446 if data is None: -> 1447 return func(ax, *map(sanitize_sequence, args), **kwargs) 1448 1449 bound = new_sig.bind(ax, *args, **kwargs) C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py in hist(self, x, bins, range, density, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, **kwargs) 6649 # this will automatically overwrite bins, 6650 # so that each histogram uses the same bins -> 6651 m, bins = np.histogram(x[i], bins, weights=w[i], **hist_kwargs) 6652 tops.append(m) 6653 tops = np.array(tops, float) # causes problems later if it's an int <__array_function__ internals> in histogram(*args, **kwargs) C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\histograms.py in histogram(a, bins, range, normed, weights, density) 790 a, weights = _ravel_and_check_weights(a, weights) 791 --> 792 bin_edges, uniform_bins = _get_bin_edges(a, bins, range, weights) 793 794 # Histogram is an integer or a float array depending on the weights. C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\histograms.py in _get_bin_edges(a, bins, range, weights) 424 raise ValueError('`bins` must be positive, when an integer') 425 --> 426 first_edge, last_edge = _get_outer_edges(a, range) 427 428 elif np.ndim(bins) == 1: C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\histograms.py in _get_outer_edges(a, range) 313 'max must be larger than min in range parameter.') 314 if not (np.isfinite(first_edge) and np.isfinite(last_edge)): --> 315 raise ValueError( 316 "supplied range of [{}, {}] is not finite".format(first_edge, last_edge)) 317 elif a.size == 0: ValueError: supplied range of [-inf, 11.512925464970229] is not finite
[the external chain picture transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the picture and upload it directly (img-DSC7R3b8-1618460688103)(output_41_1.png)]
# Separate label, i.e. predicted value Y_train = Train_data['price']
# This distinction applies to data without direct label coding # This is not applicable. It needs to be distinguished artificially according to the actual meaning # Digital features # numeric_features = Train_data.select_dtypes(include=[np.number]) # numeric_features.columns # # Type characteristics # categorical_features = Train_data.select_dtypes(include=[np.object]) # categorical_features.columns
numeric_features = ['power', 'kilometer', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13','v_14' ] categorical_features = ['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage', 'regionCode',]
unique() returns all unique values of a column (all unique values of a feature) in the form of an array (numpy.ndarray)
nunique() Return number of unique elements in the object. That is, the number of unique values is returned
# Characteristic nunique distribution for cat_fea in categorical_features: print(cat_fea + "The characteristic distribution of is as follows:") print("{}The feature has a{}Different values".format(cat_fea, Train_data[cat_fea].nunique())) print(Train_data[cat_fea].value_counts())
name The characteristic distribution of is as follows: name The feature has a 164312 different value 451 452 73 429 1791 428 821 391 243 346 ... 92419 1 88325 1 82182 1 84231 1 157427 1 Name: name, Length: 164312, dtype: int64 model The characteristic distribution of is as follows: model The feature has 251 different values 0.0 20344 6.0 17741 4.0 13837 1.0 13634 12.0 8841 ... 226.0 5 245.0 5 243.0 4 249.0 4 250.0 1 Name: model, Length: 251, dtype: int64 brand The characteristic distribution of is as follows: brand The feature has 40 different values 0 53699 4 27109 11 26944 10 23762 1 22144 6 17202 9 12210 5 7343 15 6500 12 4704 7 3839 3 3831 17 3543 13 3502 8 3374 28 3161 19 2561 18 2451 16 2274 22 2264 23 2088 14 1892 24 1678 25 1611 20 1610 27 1392 29 1259 34 963 30 604 2 570 31 540 21 522 38 516 35 415 32 406 36 377 33 368 37 324 26 307 39 141 Name: brand, dtype: int64 bodyType The characteristic distribution of is as follows: bodyType The feature has 8 different values 7.0 64571 3.0 53858 4.0 45646 5.0 20343 6.0 15290 2.0 12755 1.0 9882 0.0 2275 Name: bodyType, dtype: int64 fuelType The characteristic distribution of is as follows: fuelType The feature has 7 different values 0.0 150664 5.0 72494 4.0 3577 3.0 385 2.0 183 1.0 147 6.0 60 Name: fuelType, dtype: int64 gearbox The characteristic distribution of is as follows: gearbox The feature has 2 different values 1.0 184645 0.0 51842 Name: gearbox, dtype: int64 notRepairedDamage The characteristic distribution of is as follows: notRepairedDamage The feature has 2 different values 1.0 176922 0.0 24542 Name: notRepairedDamage, dtype: int64 regionCode The characteristic distribution of is as follows: regionCode The feature has 8081 different values 487 550 868 424 149 236 539 227 32 216 ... 7959 1 8002 1 6715 1 7117 1 4144 1 Name: regionCode, Length: 8081, dtype: int64
# Characteristic nunique distribution for cat_fea in categorical_features: print(cat_fea + "The characteristic distribution of is as follows:") print("{}The feature has a{}Different values".format(cat_fea, Test_data[cat_fea].nunique())) print(Test_data[cat_fea].value_counts())
name The characteristic distribution of is as follows: name The feature has a 38668 different value 73 98 821 89 243 77 451 74 826 73 .. 106879 1 108926 1 176509 1 178556 1 67583 1 Name: name, Length: 38668, dtype: int64 model The characteristic distribution of is as follows: model The feature has 249 different values 0.0 3916 6.0 3496 1.0 2806 4.0 2802 12.0 1745 ... 247.0 2 246.0 2 214.0 1 243.0 1 232.0 1 Name: model, Length: 249, dtype: int64 brand The characteristic distribution of is as follows: brand The feature has 40 different values 0 10697 4 5464 11 5374 10 4747 1 4390 6 3496 9 2408 5 1534 15 1325 12 929 7 782 3 736 17 732 13 679 8 666 28 645 19 534 18 487 16 458 22 430 14 416 23 397 24 390 25 297 20 293 27 265 29 236 34 206 30 133 21 121 2 101 38 92 31 87 35 76 36 73 26 72 32 70 37 61 33 61 39 40 Name: brand, dtype: int64 bodyType The characteristic distribution of is as follows: bodyType The feature has 8 different values 7.0 12748 3.0 10808 4.0 9143 5.0 4175 6.0 3079 2.0 2484 1.0 1980 0.0 473 Name: bodyType, dtype: int64 fuelType The characteristic distribution of is as follows: fuelType The feature has 7 different values 0.0 30045 5.0 14645 4.0 754 3.0 73 2.0 43 1.0 23 6.0 15 Name: fuelType, dtype: int64 gearbox The characteristic distribution of is as follows: gearbox The feature has 2 different values 1.0 36935 0.0 10352 Name: gearbox, dtype: int64 notRepairedDamage The characteristic distribution of is as follows: notRepairedDamage The feature has 2 different values 1.0 35555 0.0 4817 Name: notRepairedDamage, dtype: int64 regionCode The characteristic distribution of is as follows: regionCode The feature has a 7078 different value 487 122 868 93 539 46 32 46 222 46 ... 3761 1 6232 1 7891 1 2106 1 2246 1 Name: regionCode, Length: 7078, dtype: int64
numeric_features.append('price') numeric_features
['power', 'kilometer', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14', 'price']
## 1) Correlation analysis price_numeric = Train_data[numeric_features] correlation = price_numeric.corr() print(correlation['price'].sort_values(ascending = False),'\n')
price 1.000000 v_0 0.514477 v_11 0.481618 power 0.189456 v_8 0.183505 v_10 0.163891 v_12 0.129570 v_13 0.114883 v_7 0.090440 v_14 0.075673 v_4 0.004413 v_2 -0.018823 v_6 -0.036826 v_5 -0.039637 v_9 -0.165831 v_1 -0.207255 kilometer -0.404961 v_3 -0.595468 Name: price, dtype: float64
f , ax = plt.subplots(figsize = (7, 7)) plt.title('Correlation of Numeric Features with Price',y=1,size=16) sns.heatmap(correlation,square = True, vmax=0.8)
<AxesSubplot:title={'center':'Correlation of Numeric Features with Price'}>
[the external chain picture transfer fails, and the source station may have an anti-theft chain mechanism. It is recommended to save the picture and upload it directly (img-JwxFTkaQ-1618460688104)(output_50_1.png)]
Data: matrix data set, which can be a numpy array. If it is a dataframe of pandas, the index/column information of df will correspond to the columns and rows of heatmap respectively
linewidths, the interval between thermodynamic diagram matrices
vmax,vmin, the display values of the maximum and minimum values in the legend. Without this parameter, it is not displayed by default
cmap: colormap name or color object of matplotlib; If it is not provided, it defaults to cubehelix map (when the dataset is a continuous dataset) or RdBu_r (when the data set is a discrete data set)
Center: set the data to the mean data in the legend, that is, the data value in the legend center; By setting the center value, you can adjust the overall depth of the generated image color; When setting center data, if there is data overflow, the manually set vmax and vmin will change automatically
The abbreviation of annotate. Annotate is False by default. When annotate is True, data is written to each square in the heatmap
annot_kws, when annot is True, you can set various parameters, including size, color, bold, italic, etc
square: Boolean value, optional parameter.
If True, set the axis direction to equal so that each cell is square.
del price_numeric['price']
## 2) Look at the skewness and peaks of several features for col in numeric_features: print('{:15}'.format(col), 'Skewness: {:05.2f}'.format(Train_data[col].skew()) , ' ' , 'Kurtosis: {:06.2f}'.format(Train_data[col].kurt()) )
power Skewness: 58.59 Kurtosis: 4473.89 kilometer Skewness: -1.56 Kurtosis: 001.25 v_0 Skewness: -1.50 Kurtosis: 002.90 v_1 Skewness: 01.58 Kurtosis: 001.10 v_2 Skewness: 01.20 Kurtosis: 003.75 v_3 Skewness: 01.35 Kurtosis: 004.29 v_4 Skewness: 00.22 Kurtosis: 006.95 v_5 Skewness: 02.05 Kurtosis: 006.49 v_6 Skewness: 00.09 Kurtosis: -00.56 v_7 Skewness: 00.82 Kurtosis: -00.73 v_8 Skewness: -1.53 Kurtosis: 000.37 v_9 Skewness: 01.53 Kurtosis: 000.38 v_10 Skewness: -2.58 Kurtosis: 004.80 v_11 Skewness: -0.91 Kurtosis: 001.55 v_12 Skewness: -2.84 Kurtosis: 006.14 v_13 Skewness: -3.87 Kurtosis: 013.20 v_14 Skewness: 00.49 Kurtosis: -01.60 price Skewness: 03.54 Kurtosis: 021.23
Convert data
df.melt() is DF Pivot() reverse operation function
Convert the column name into column data (columns name → column values) and reconstruct the DataFrame
If DF Pivot() converts a long dataset into a wide dataset, DF Melt () changes a wide data set into a long data set
melt() is both a top-level class function and an instance object function. When it appears as a class function, you need to specify the name of the DataFrame
Parameter type description
frame dataframe
The name of the dataset being melt
In PD Used in melt()
id_vars
tuple
list
ndarray
Optional
Column names that do not need to be converted are treated as identifier columns (not index columns) after conversion
value_vars
tuple
list
ndarray
Optional
Existing columns that need to be converted
If not specified, except ID_ Columns other than vars are converted
var_name string
variable default
Custom column name
Set by 'value'_ New column name composed of vars'
value_name
string
value default
Custom column name
Set by 'value'_ New column name composed of data of 'vars'
col_level
int
string
Optional
This level is used if the column is a MultiIndex
seaborn.FacetGrid
data : DataFrame
The processed ("long format") dataframe data, in which each column is a variable (feature) and each row is a sample
row, col, hue : strings
Variables that define a subset of data that will be drawn in different aspects of the grid. See below*_ The order parameter controls the level order of the variable
For example: col = "sex", hue = "smoker", that is, the column indicates gender, and the color semantics indicates whether to smoke. The following example will give a detailed description
col_wrap : int, optional
This means that the grid dimension is limited, such as col_wrap = 3, then you can only draw up to 3 columns in this canvas. Rows are not limited, which limits the number of columns.
share{x,y} : bool, 'col', or 'row' optional
Whether to share the x-axis or y-axis, that is, if it is True, it will share the same axis, otherwise it will not be shared. It is shared by default, that is, it is True
g = sns.FacetGrid(tips, col = "sex", hue = "smoker", sharex=True, sharey=True) # are shared
g.map(plt.scatter, "total_bill", "tip", alpha=0.8)
g.add_legend();
map is a python built-in function that maps the specified sequence according to the provided function.
The format of the map() function is:
map(function,iterable,...)
The first parameter accepts a function name, and the following parameters accept one or more iteratable sequences and return a collection.
Apply the function to each element in the list in turn to get a new list and return it. Note that the map does not change the original list, but returns a new list.
## 3) Visualization of the distribution of each digital feature f = pd.melt(Train_data, value_vars=numeric_features) g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False) g = g.map(sns.distplot, "value")
[the external chain picture transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the picture and upload it directly (img-y5qOGg1w-1618460688110)(output_57_0.png)]
## 4) Visualization of the relationship between digital features sns.set() columns = ['price', 'v_12', 'v_8' , 'v_0', 'power', 'v_5', 'v_2', 'v_6', 'v_1', 'v_14'] sns.pairplot(Train_data[columns],size = 2 ,kind ='scatter',diag_kind='kde') plt.show()
[the external chain image transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the image and upload it directly (img-Oa5cvdtd-1618460688111)(output_58_0.png)]
Train_data.columns
Index(['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode', 'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14', 'v_15', 'v_16', 'v_17', 'v_18', 'v_19', 'v_20', 'v_21', 'v_22', 'v_23'], dtype='object')
Y_train
0 520 1 5500 2 1100 3 1200 4 3300 ... 249995 1200 249996 1200 249997 16500 249998 31950 249999 1990 Name: price, Length: 250000, dtype: int64
fig, ax = plt.subplots(1,3), where parameters 1 and 3 represent the number of rows and columns of the subgraph respectively, with a total of 1x3 sub images. Function returns an array list of figure image and subgraph ax.
fig, ax = plt.subplots(1,3,1). The last parameter 1 represents the first subgraph.
If you want to set the width and height of the subgraph, you can add figsize value to the function
fig, ax = plt.subplots(1,3,figsize=(15,7)), so there will be three 15x7 subgraphs in one row.
## 5) Visualization of multivariate regression relationship fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8), (ax9, ax10)) = plt.subplots(nrows=5, ncols=2, figsize=(24, 20)) # ['v_12', 'v_8' , 'v_0', 'power', 'v_5', 'v_2', 'v_6', 'v_1', 'v_14'] v_12_scatter_plot = pd.concat([Y_train,Train_data['v_12']],axis = 1) ''' objs: series,dataframe Or panel Composition sequence lsit axis: Axes that need to merge links. 0 is row and 1 is column join: Connection mode inner,perhaps outer ''' sns.regplot(x='v_12',y = 'price', data = v_12_scatter_plot,scatter= True, fit_reg=True, ax=ax1) v_8_scatter_plot = pd.concat([Y_train,Train_data['v_8']],axis = 1) sns.regplot(x='v_8',y = 'price',data = v_8_scatter_plot,scatter= True, fit_reg=True, ax=ax2) v_0_scatter_plot = pd.concat([Y_train,Train_data['v_0']],axis = 1) sns.regplot(x='v_0',y = 'price',data = v_0_scatter_plot,scatter= True, fit_reg=True, ax=ax3) power_scatter_plot = pd.concat([Y_train,Train_data['power']],axis = 1) sns.regplot(x='power',y = 'price',data = power_scatter_plot,scatter= True, fit_reg=True, ax=ax4) v_5_scatter_plot = pd.concat([Y_train,Train_data['v_5']],axis = 1) sns.regplot(x='v_5',y = 'price',data = v_5_scatter_plot,scatter= True, fit_reg=True, ax=ax5) v_2_scatter_plot = pd.concat([Y_train,Train_data['v_2']],axis = 1) sns.regplot(x='v_2',y = 'price',data = v_2_scatter_plot,scatter= True, fit_reg=True, ax=ax6) v_6_scatter_plot = pd.concat([Y_train,Train_data['v_6']],axis = 1) sns.regplot(x='v_6',y = 'price',data = v_6_scatter_plot,scatter= True, fit_reg=True, ax=ax7) v_1_scatter_plot = pd.concat([Y_train,Train_data['v_1']],axis = 1) sns.regplot(x='v_1',y = 'price',data = v_1_scatter_plot,scatter= True, fit_reg=True, ax=ax8) v_14_scatter_plot = pd.concat([Y_train,Train_data['v_14']],axis = 1) sns.regplot(x='v_14',y = 'price',data = v_14_scatter_plot,scatter= True, fit_reg=True, ax=ax9) v_13_scatter_plot = pd.concat([Y_train,Train_data['v_13']],axis = 1) sns.regplot(x='v_13',y = 'price',data = v_13_scatter_plot,scatter= True, fit_reg=True, ax=ax10) ''' sns.regplot()Usage of Parameter description x,y: namely x,y Value of axis data: x,y Belonging to df x_estimator: Apply this function to x For each unique value and plot the resulting estimate. When x This is useful when is a discrete variable. If given x_ci,Then this estimate will bootstrap and plot the confidence interval x_bins: take x How many segments For other parameters, please refer to the official website document: https://www.cntofu.com/book/172/docs/28.md sns.regplot(): Fitting of drawing data and linear regression model '''
<AxesSubplot:xlabel='v_13', ylabel='price'>
[the external chain picture transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the picture and upload it directly (img-gY8sdLxP-1618460688115)(output_62_1.png)]
2,pd.qcut function is divided according to the percentage of data occurrence frequency. For example, if the data is divided into four parts, the four sections are 0-25%, 25% - 50%, 50% - 75% and 75% - 100% of the data respectively
pd.qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise')
1,pd. The cut function has seven parameters, which are mainly used to divide the data equidistant from the maximum value to the minimum value
pandas.cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False)
Parameters:
x: enter the one-dimensional array to be cut
bins: the number of segments of cut. It is generally an integer, but it can also be a sequence vector.
Right: Boolean value to determine whether the right section is open or closed. When True, the right section is closed
labels: array or Boolean value. The default value is None. It is used to identify the bins after division. The length must be equal to the result bins. The return value is an integer or the identification of bins
retbins: Boolean value, optional. Whether to return the group where the value is located, and turn returns
Precision: integer type, bins decimal precision, that is, the data is displayed in several decimal places
include_lowest: Boolean type, whether the left interval is included
## 1) unique distribution for fea in categorical_features: print(Train_data[fea].nunique())
164312 251 40 8 7 2 2 8081
categorical_features
['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage', 'regionCode']
## 2) Category feature box diagram visualization # Because the categories of name and regionCode are too sparse, let's draw some non sparse categories here categorical_features = ['model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage'] for c in categorical_features: Train_data[c] = Train_data[c].astype('category') if Train_data[c].isnull().any(): '''1.np.array.any()and numpy.array.all() np.array.any()Yes or operation, any element is True,Output as True. np.array.all()Yes and operation, all elements are True,Output as True. ''' Train_data[c] = Train_data[c].cat.add_categories(['MISSING']) Train_data[c] = Train_data[c].fillna('MISSING') def boxplot(x, y, **kwargs): sns.boxplot(x=x, y=y) x=plt.xticks(rotation=90) f = pd.melt(Train_data, id_vars=['price'], value_vars=categorical_features) g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, size=5) g = g.map(boxplot, "value", "price")
It is suggested to save the picture directly to the external link (qpg-84pnu-88d) (it is suggested to save the picture directly to the external link)
Train_data.columns
Index(['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode', 'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14', 'v_15', 'v_16', 'v_17', 'v_18', 'v_19', 'v_20', 'v_21', 'v_22', 'v_23'], dtype='object')
## 3) Violin graph visualization of category features catg_list = categorical_features target = 'price' for catg in catg_list : sns.violinplot(x=catg, y=target, data=Train_data) plt.show()
[the external chain image transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the image and upload it directly (img-voCMBxYk-1618460688127)(output_68_0.png)]
[the external chain picture transfer fails, and the source station may have an anti-theft chain mechanism. It is recommended to save the picture and upload it directly (img-Uo7yvKkg-1618460688129)(output_68_1.png)]
[the external link image transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the image and upload it directly (IMG kuvlfuzr-1618460688133) (output_68_2. PNG)]
[the external chain picture transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the picture and upload it directly (IMG kcueuakj-1618460688134) (output_68_3. PNG)]
[the external chain picture transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the picture and upload it directly (img-QCkMQ3bM-1618460688136)(output_68_4.png)]
[the external chain picture transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the picture and upload it directly (img-Jh7J6swE-1618460688140)(output_68_5.png)]
categorical_features = ['model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']
## 4) Column chart visualization of category features def bar_plot(x, y, **kwargs): sns.barplot(x=x, y=y) x=plt.xticks(rotation=90) f = pd.melt(Train_data, id_vars=['price'], value_vars=categorical_features) g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, size=5) g = g.map(bar_plot, "value", "price")
[the external link image transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the image and upload it directly (img-o3RO6hfI-1618460688142)(output_70_0.png)]
## 5) Visualization of each category frequency of category characteristics (count_plot) def count_plot(x, **kwargs): sns.countplot(x=x) x=plt.xticks(rotation=90) f = pd.melt(Train_data, value_vars=categorical_features) g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, size=5) g = g.map(count_plot, "value")
[the external chain picture transfer fails. The source station may have an anti-theft chain mechanism. It is recommended to save the picture and upload it directly (img-1ugoBEPk-1618460688143)(output_71_0.png)]
import pandas_profiling
pfr = pandas_profiling.ProfileReport(Train_data) pfr.to_file("./example.html")
Summarize dataset: 0%| | 0/51 [00:00<?, ?it/s]