Regression analysis: multiple regression
This section introduces the statistics used in multiple regression analysis; the key formulas are summarized right after the list below.
- Number of observations (n)
- Number of independent variables (k)
- Degrees of freedom: regression degrees of freedom and residual degrees of freedom
- Total sum of squares (SST)
- Regression sum of squares (SSR)
- Residual sum of squares (SSE)
- Mean square regression (MSR)
- Mean square error (MSE)
- Coefficient of determination (R-square)
- Adjusted R-square
- Multiple R
- Standard error of the estimate
- F-test statistic
- Standard error of the sampling distribution of each regression coefficient
- t-test statistic for each regression coefficient
- Confidence interval for each regression coefficient
- Log-likelihood
- Akaike information criterion (AIC)
- Bayesian information criterion (BIC)
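For reference, these statistics follow the standard definitions below (a summary added here for convenience; n is the number of observations, k the number of independent variables, L the maximized likelihood). They match how the code later in this section computes them:

```latex
\begin{aligned}
SST &= \textstyle\sum_i (y_i-\bar y)^2, \qquad
SSR = \textstyle\sum_i (\hat y_i-\bar y)^2, \qquad
SSE = \textstyle\sum_i (y_i-\hat y_i)^2, \qquad SST = SSR + SSE \\
MSR &= SSR/k, \qquad MSE = \frac{SSE}{n-k-1}, \qquad S_e = \sqrt{MSE}, \qquad
R^2 = \frac{SSR}{SST}, \qquad
R^2_{\text{adj}} = 1-(1-R^2)\,\frac{n-1}{n-k-1} \\
F &= \frac{MSR}{MSE}, \qquad
t_j = \frac{\hat\beta_j}{se(\hat\beta_j)}, \qquad
AIC = 2(k+1)-2\ln L, \qquad
BIC = (k+1)\ln n - 2\ln L
\end{aligned}
```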
Case analysis and Python practice
```python
# Import related packages
import math

import numpy as np
import pandas as pd
import scipy.stats  # scipy.stats.t and scipy.stats.f are used below
import matplotlib.pyplot as plt
```
```python
# Build the data set: 25 bank branches; `columns` maps column codes to descriptions
columns = {'A': "Branch number",
           'B': "Non-performing loans (billion yuan)",
           'C': "Loan balance (billion yuan)",
           'D': "Accumulated loans receivable (billion yuan)",
           'E': "Number of loan items",
           'F': "Fixed assets investment (billion yuan)"}
data = {"A": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25],
        "B": [0.9, 1.1, 4.8, 3.2, 7.8, 2.7, 1.6, 12.5, 1.0, 2.6, 0.3, 4.0, 0.8, 3.5, 10.2, 3.0, 0.2, 0.4, 1.0, 6.8, 11.6, 1.6, 1.2, 7.2, 3.2],
        "C": [67.3, 111.3, 173.0, 80.8, 199.7, 16.2, 107.4, 185.4, 96.1, 72.8, 64.2, 132.2, 58.6, 174.6, 263.5, 79.3, 14.8, 73.5, 24.7, 139.4, 368.2, 95.7, 109.6, 196.2, 102.2],
        "D": [6.8, 19.8, 7.7, 7.2, 16.5, 2.2, 10.7, 27.1, 1.7, 9.1, 2.1, 11.2, 6.0, 12.7, 15.6, 8.9, 0, 5.9, 5.0, 7.2, 16.8, 3.8, 10.3, 15.8, 12.0],
        "E": [5, 16, 17, 10, 19, 1, 17, 18, 10, 14, 11, 23, 14, 26, 34, 15, 2, 11, 4, 28, 32, 10, 14, 16, 10],
        "F": [51.9, 90.9, 73.7, 14.5, 63.2, 2.2, 20.2, 43.8, 55.9, 64.3, 42.7, 76.7, 22.8, 117.1, 146.7, 29.9, 42.1, 25.3, 13.4, 64.3, 163.9, 44.5, 67.9, 39.7, 97.1]}
df = pd.DataFrame(data)
X = df[["C", "D", "E", "F"]]  # independent variables
Y = df[["B"]]                 # dependent variable: non-performing loans
```
```python
# Build the multiple linear regression model
from sklearn.linear_model import LinearRegression

lreg = LinearRegression()
lreg.fit(X, Y)

x = X
y_pred = lreg.predict(X)             # fitted values
y_true = np.array(Y).reshape(-1, 1)  # observed values as an (n, 1) array
coef = lreg.coef_[0]                 # slope coefficients
intercept = lreg.intercept_[0]       # intercept
```
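As a quick sanity check (not part of the original listing), the fitted parameters can be inspected directly:

```python
# Minimal sketch: print the fitted intercept and slope coefficients
print("intercept:", round(intercept, 4))
print("coefficients:", dict(zip(X.columns, np.round(coef, 4))))
```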
```python
# Custom functions
def log_like(y_true, y_pred):
    """
    y_true: observed values, shape (n, 1)
    y_pred: predicted values, shape (n, 1)
    Returns the log-likelihood under a normal error model.
    """
    sig = np.sqrt(sum((y_true - y_pred) ** 2)[0] / len(y_pred))  # residual standard deviation sigma (MLE)
    y_sig = np.exp(-(y_true - y_pred) ** 2 / (2 * sig ** 2)) / (math.sqrt(2 * math.pi) * sig)
    loglik = sum(np.log(y_sig))
    return loglik


def param_var(x):
    """
    x: DataFrame containing only the independent variables
    Returns the diagonal of (X'X)^(-1), which scales the variance of each parameter.
    """
    n = len(x)
    beta0 = np.ones((n, 1))                               # column of ones for the intercept
    df_to_matrix = x.to_numpy()                           # as_matrix() was removed in pandas 1.0
    concat_matrix = np.hstack((beta0, df_to_matrix))      # design matrix X
    transpose_matrix = np.transpose(concat_matrix)        # X'
    dot_matrix = np.dot(transpose_matrix, concat_matrix)  # X'X
    inv_matrix = np.linalg.inv(dot_matrix)                # (X'X)^(-1)
    diag = np.diag(inv_matrix)                            # diagonal entries
    return diag


def param_test_stat(x, Se, intercept, coef, alpha=0.05):
    """Significance tests and confidence intervals for each regression coefficient."""
    n = len(x)
    k = len(x.columns)
    beta_array = param_var(x)
    beta_k = beta_array.shape[0]
    coef = [intercept] + list(coef)
    std_err, t_Stat, P_value, t_intv, coefLower, coefUpper = [], [], [], [], [], []
    for i in range(beta_k):
        se_beta = np.sqrt(Se ** 2 * beta_array[i])                 # standard error of the coefficient
        t_stat = coef[i] / se_beta                                 # t test statistic for the coefficient
        p_value = scipy.stats.t.sf(np.abs(t_stat), n - k - 1) * 2  # two-sided p value
        t_score = scipy.stats.t.isf(alpha / 2, df=n - k - 1)       # t critical value
        coef_lower = coef[i] - t_score * se_beta                   # lower confidence limit
        coef_upper = coef[i] + t_score * se_beta                   # upper confidence limit
        std_err.append(round(se_beta, 3))
        t_Stat.append(round(t_stat, 3))
        P_value.append(round(p_value, 3))
        t_intv.append(round(t_score, 3))
        coefLower.append(round(coef_lower, 3))
        coefUpper.append(round(coef_upper, 3))
    dict_ = {"coefficients": list(map(lambda v: round(v, 4), coef)),
             'std_err': std_err,
             't_Stat': t_Stat,
             'P_value': P_value,
             't critical value': t_intv,
             'Lower_95%': coefLower,
             'Upper_95%': coefUpper}
    index = ["intercept"] + list(x.columns)
    stat = pd.DataFrame(dict_, index=index)
    return stat
```
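Since `log_like` hand-codes the normal density, it is easy to cross-check it against the equivalent `scipy.stats.norm` computation. This check is an addition, not part of the original listing; both lines should print the same value:

```python
# Minimal sketch: verify log_like against scipy.stats.norm.logpdf
from scipy.stats import norm

sig = np.sqrt(((y_true - y_pred) ** 2).sum() / len(y_pred))  # same sigma as inside log_like
print(log_like(y_true, y_pred)[0])                       # custom implementation
print(norm.logpdf(y_true, loc=y_pred, scale=sig).sum())  # reference value
```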
```python
# Custom function: compute and print the statistics of the regression analysis
def get_lr_stats(x, y_true, y_pred, coef, intercept, alpha=0.05):
    n = len(x)
    k = len(x.columns)
    ssr = sum((y_pred - np.mean(y_true)) ** 2)[0]  # regression sum of squares (SSR)
    sse = sum((y_true - y_pred) ** 2)[0]           # residual sum of squares (SSE)
    sst = ssr + sse                                # total sum of squares (SST)
    msr = ssr / k                                  # mean square regression (MSR)
    mse = sse / (n - k - 1)                        # mean square error (MSE)
    R_square = ssr / sst                           # coefficient of determination R^2
    Adjusted_R_square = 1 - (1 - R_square) * ((n - 1) / (n - k - 1))  # adjusted R^2
    Multiple_R = np.sqrt(R_square)                 # multiple correlation coefficient
    Se = np.sqrt(sse / (n - k - 1))                # standard error of the estimate
    loglike = log_like(y_true, y_pred)[0]
    AIC = 2 * (k + 1) - 2 * loglike                # (k+1) counts k slopes plus 1 intercept
    BIC = -2 * loglike + (k + 1) * np.log(n)

    # Significance test of the linear relationship
    F = (ssr / k) / (sse / (n - k - 1))                  # F test statistic
    pf = scipy.stats.f.sf(F, k, n - k - 1)               # significance F (p value)
    Fa = scipy.stats.f.isf(alpha, dfn=k, dfd=n - k - 1)  # F critical value

    # Significance tests of the regression coefficients
    stat = param_test_stat(x, Se, intercept, coef, alpha=alpha)

    # Print the statistics of the regression analysis
    print('=' * 80)
    print('df_Model:{}  df_Residuals:{}'.format(k, n - k - 1), '\n')
    print('loglike:{}  AIC:{}  BIC:{}'.format(round(loglike, 3), round(AIC, 1), round(BIC, 1)), '\n')
    print('SST:{}  SSR:{}  SSE:{}  MSR:{}  MSE:{}  Se:{}'.format(
        round(sst, 4), round(ssr, 4), round(sse, 4), round(msr, 4), round(mse, 4), round(Se, 4)), '\n')
    print('Multiple_R:{}  R_square:{}  Adjusted_R_square:{}'.format(
        round(Multiple_R, 4), round(R_square, 4), round(Adjusted_R_square, 4)), '\n')
    print('F:{}  pf:{}  Fa:{}'.format(round(F, 4), pf, round(Fa, 4)))
    print('=' * 80)
    print(stat)
    print('=' * 80)
    return 0
```
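With everything defined, the full analysis is produced by a single call using the variables built above:

```python
# Run the full regression analysis on the bank-branch data
get_lr_stats(x, y_true, y_pred, coef, intercept)
```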
The output results are as follows:
For comparison, the same model can be fitted with OLS from statsmodels:
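A minimal sketch of that comparison (the statsmodels call itself was not included in the original listing); its summary table reports the same quantities computed above, so the two outputs can be checked against each other:

```python
# Fit the same model with statsmodels OLS and print its summary table
import statsmodels.api as sm

X_const = sm.add_constant(X)           # statsmodels requires an explicit intercept column
ols_model = sm.OLS(Y, X_const).fit()
print(ols_model.summary())             # reports R^2, F, AIC/BIC, coefficient t tests, CIs
```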