# Multiple linear regression -- case analysis and python practice

Regression analysis -- multiple regression

Introduce the statistics in multiple regression analysis

• Total observed value • Total independent variable • Freedom : return degrees of freedom , residual degrees of freedom • Sum of total square of SST • Sum of squares of SSR regression • Sum of squares of SSE residuals • MSR mean square regression • MSE mean square residual • Determination coefficient R ﹣ square • Adjusted? R? Square • Multiple? R • Standard error of estimation • F-test statistics • Standard error of sampling distribution of regression coefficient • t-test statistics of regression coefficients • Confidence interval of each regression coefficient • log likelihood  • AIC guidelines • BIC guidelines Case analysis and python practice

```# Import related packages
import pandas as pd
import numpy as np
import math
import scipy
import matplotlib.pyplot as plt
from scipy.stats import t```
```# Building data
columns = {'A':"Branch number", 'B':"Non-performing Loan(Billion yuan)", 'C':"Loan balance(Billion yuan)", 'D':"Accumulated loans receivable(Billion yuan)", 'E':"Number of loan items", 'F':"Fixed assets investment(Billion yuan)"}
data={"A":[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25],
"B":[0.9,1.1,4.8,3.2,7.8,2.7,1.6,12.5,1.0,2.6,0.3,4.0,0.8,3.5,10.2,3.0,0.2,0.4,1.0,6.8,11.6,1.6,1.2,7.2,3.2],
"C":[67.3,111.3,173.0,80.8,199.7,16.2,107.4,185.4,96.1,72.8,64.2,132.2,58.6,174.6,263.5,79.3,14.8,73.5,24.7,139.4,368.2,95.7,109.6,196.2,102.2],
"D":[6.8,19.8,7.7,7.2,16.5,2.2,10.7,27.1,1.7,9.1,2.1,11.2,6.0,12.7,15.6,8.9,0,5.9,5.0,7.2,16.8,3.8,10.3,15.8,12.0],
"E":[5,16,17,10,19,1,17,18,10,14,11,23,14,26,34,15,2,11,4,28,32,10,14,16,10],
"F":[51.9,90.9,73.7,14.5,63.2,2.2,20.2,43.8,55.9,64.3,42.7,76.7,22.8,117.1,146.7,29.9,42.1,25.3,13.4,64.3,163.9,44.5,67.9,39.7,97.1]
}

df = pd.DataFrame(data)
X = df[["C", "D", "E", "F"]]
Y = df[["B"]]```
```# Building multiple linear regression model
from sklearn.linear_model import LinearRegression
lreg = LinearRegression()
lreg.fit(X, Y)

x = X
y_pred = lreg.predict(X)
y_true = np.array(Y).reshape(-1,1)

coef = lreg.coef_
intercept = lreg.intercept_```
```# Custom function
def log_like(y_true, y_pred):
"""
y_true: True value
y_pred: predicted value
"""
sig = np.sqrt(sum((y_true - y_pred)**2) / len(y_pred)) # Residual standard deviation δ
y_sig = np.exp(-(y_true - y_pred) ** 2 / (2 * sig ** 2)) / (math.sqrt(2 * math.pi) * sig)
loglik = sum(np.log(y_sig))
return loglik

def param_var(x):
"""
x:Only independent variable wide table
"""
n = len(x)
beta0 = np.ones((n,1))
df_to_matrix = x.as_matrix()
concat_matrix = np.hstack((beta0, df_to_matrix))         # Matrix merging
transpose_matrix = np.transpose(concat_matrix)           # Matrix transposition
dot_matrix = np.dot(transpose_matrix, concat_matrix)     # (X.T X)^(-1)
inv_matrix = np.linalg.inv(dot_matrix)                   # Find (X.T X)^(-1) inverse matrix
diag = np.diag(inv_matrix)                               # Obtain the diagonal of the matrix, i.e. the variance of each parameter
return diag

def param_test_stat(x, Se, intercept, coef, alpha=0.05):
n = len(x)
k = len(x.columns)
beta_array = param_var(x)
beta_k = beta_array.shape

coef = [intercept] + list(coef)
std_err = []
t_Stat = []
P_value = []
t_intv = []
coefLower = []
coefupper = []

for i in range(beta_k):
se_belta = np.sqrt(Se**2 * beta_array[i])            # Sampling standard error of regression coefficient
t = coef[i] / se_belta                               # T statistic used to test regression coefficient, i.e. test statistic t
p_value = scipy.stats.t.sf(np.abs(t), n-k-1)*2       # P value used to test regression coefficient
t_score = scipy.stats.t.isf(alpha/2, df = n-k-1)     # t critical value
coef_lower = coef[i] - t_score * se_belta            # Lower confidence interval limit of regression coefficient (slope)
coef_upper = coef[i] + t_score * se_belta            # Upper limit of confidence interval of regression coefficient (slope)

std_err.append(round(se_belta, 3))
t_Stat.append(round(t,3))
P_value.append(round(p_value,3))
t_intv.append(round(t_score,3))
coefLower.append(round(coef_lower,3))
coefupper.append(round(coef_upper,3))

dict_ = {"coefficients":list(map(lambda x:round(x, 4), coef)),
'std_err':std_err,
't_Stat':t_Stat,
'P_value':P_value,
't critical value':t_intv,
'Lower_95%':coefLower,
'Upper_95%':coefupper}

index = ["intercept"] + list(x.columns)
stat = pd.DataFrame(dict_, index=index)
return stat```
```# Custom function (calculate and output statistics of regression analysis)
def get_lr_stats(x, y_true, y_pred, coef, intercept, alpha=0.05):

n   = len(x)
k   = len(x.columns)
ssr = sum((y_pred - np.mean(y_true))**2)  # Regression square sum SSR
sse = sum((y_true - y_pred)**2)           # Sum of squared residuals SSE
sst = ssr + sse                              # Total square sum SST
msr = ssr / k                                # Mean square regression MSR
mse = sse / (n-k-1)                          # Mean square residual MSE

R_square = ssr / sst                                   # Determination coefficient R^2
Multiple_R = np.sqrt(R_square)                         # Complex correlation coefficient
Se = np.sqrt(sse/(n - k - 1))                          # Standard error of estimation

loglike = log_like(y_true, y_pred)
AIC = 2*(k+1) - 2 * loglike                  # (k+1) represents k regression parameters or coefficients and 1 intercept parameter
BIC = -2*loglike + (k+1)*np.log(n)

# Significance test of linear relationship
F  = (ssr / k) / (sse / ( n - k - 1 ))            # Test statistic F (test of linear relationship)
pf = scipy.stats.f.sf(F, k, n-k-1)                # Significance F for test, i.e. significance F
Fa = scipy.stats.f.isf(alpha, dfn=k, dfd=n-k-1)   # F critical value

# Significance test of regression coefficient
stat = param_test_stat(x, Se, intercept, coef, alpha=alpha)

# Output statistics of regression analysis
print('='*80)
print('df_Model:{}  df_Residuals:{}'.format(k, n-k-1), '\n')
print('loglike:{}  AIC:{}  BIC:{}'.format(round(loglike,3), round(AIC,1), round(BIC,1)), '\n')
print('SST:{}  SSR:{}  SSE:{}  MSR:{}  MSE:{}  Se:{}'.format(round(sst,4),
round(ssr,4),
round(sse,4),
round(msr,4),
round(mse,4),
round(Se,4)), '\n')

round(R_square,4),
print('F:{}  pf:{}  Fa:{}'.format(round(F,4), pf, round(Fa,4)))

print('='*80)
print(stat)
print('='*80)

return 0```

The output results are as follows: Compared with ols results under statsmodels:   Published 15 original articles, won praise 5, visited 366

Keywords: Python Lambda

Added by .-INSANE-. on Mon, 03 Feb 2020 17:07:34 +0200