多元回归分析 ¶
多元线性回归模型表示为: $$ f(x_i) = \mathbf{w}^T \mathbf{x}_i + b $$ 其中,$\mathbf{w}$ 是权重向量,$b$ 是偏置项,$\mathbf{x}_i$ 是输入向量,$f(x_i)$ 是预测值。
参数估计 ¶
为了估计 $\mathbf{w}$ 和 $b$,可以使用最小二乘法。将 $\mathbf{w}$ 和 $b$ 合并为向量形式 $\hat{\mathbf{w}} = (\mathbf{w}; b)$,数据集 $D$ 表示为矩阵 $\mathbf{X}$,其中每行对应一个样本,前 $d$ 个元素对应样本的 $d$ 个属性值,最后一个元素恒定为 1。
$$ \mathbf{X} = \begin{pmatrix} x_{11} & x_{12} & \ldots & x_{1d} & 1 \\ x_{21} & x_{22} & \ldots & x_{2d} & 1 \\ \vdots & \vdots & \ddots & \vdots & \vdots \\ x_{m1} & x_{m2} & \ldots & x_{md} & 1 \\ \end{pmatrix} = \begin{pmatrix} \mathbf{x}_1^T & 1 \\ \mathbf{x}_2^T & 1 \\ \vdots & \vdots \\ \mathbf{x}_m^T & 1 \\ \end{pmatrix} $$
标记向量 $\mathbf{y} = (y_1; y_2; \ldots; y_m)$,则目标函数为:
$$ \hat{\mathbf{w}}^* = \arg \min_{\hat{\mathbf{w}}} (\mathbf{y} - \mathbf{X}\hat{\mathbf{w}})^T (\mathbf{y} - \mathbf{X}\hat{\mathbf{w}}) $$
令 $E_{\hat{\mathbf{w}}} = (\mathbf{y} - \mathbf{X}\hat{\mathbf{w}})^T (\mathbf{y} - \mathbf{X}\hat{\mathbf{w}})$,对 $\hat{\mathbf{w}}$ 求导并令其为零,得到最优解的形式:
$$ \frac{\partial E_{\hat{\mathbf{w}}}}{\partial \hat{\mathbf{w}}} = 2 \mathbf{X}^T (\mathbf{X}\hat{\mathbf{w}} - \mathbf{y}) = 0 $$
当 $\mathbf{X}^T \mathbf{X}$ 为满秩矩阵(full-rank matrix)或正定矩阵(positive definite matrix)时,可得:
$$ \hat{\mathbf{w}}^* = (\mathbf{X}^T \mathbf{X})^{-1} \mathbf{X}^T \mathbf{y} $$
参数的区间 ¶
$$ (\hat\beta_j -\beta_j)/(\hat{\sigma}\sqrt{L^{-1}_{jj}}) \sim t_{n-p-1} $$
import scipy.stats as st
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import linregress
n=2
fa=n/2/(n-1)
a=st.f(2,n-2).ppf(0.95)/fa
achi=st.chi2(2).ppf(0.95)
print(a,achi)
nan 5.991464547107979
线性回归的误差 ¶
$L=(X^TX)$,$\sigma^2 L^{-1}$ 是 $\hat{\mathbf{w}}$ 的协方差矩阵
有
- $\text{Var}(\beta_0) = \sigma^2/n$
- $\text{Var}(\beta_i) = \sigma^2 L^{-1}_{ii}$
- $\text{Var}(\beta_0,\beta_j)=0$
- $\text{Var}(\beta_i,\beta_j) = \sigma^2 L^{-1}_{ij}$
$\sigma^2$ 的估计仍然可以用残差估计
$$ \begin{aligned} \delta^2 = (Y - X\beta)^\intercal (Y-X\beta)\\ \hat \sigma^2 =\delta^2/(n-p-1)\\ \delta^2/\sigma^2 \sim \chi^2_{n-p-1} \end{aligned} $$
%matplotlib notebook
import scipy.stats as st
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
np.random.seed(304)
one=np.ones(20)
x=st.uniform(0,10).rvs(20)
y=st.uniform(0,8).rvs(20)
print(x.shape,y.shape)
Xmm=np.vstack([one,x-np.mean(x),y-np.mean(y)]).T # 先叠成3行,再转置成为3列,变成每一个样本为一行
# print(Xmm.shape)
beta=np.linalg.solve(Xmm.T@Xmm,Xmm.T@Y)
# print(beta,"\n")
L_inv=np.linalg.inv(Xmm.T@Xmm)
sigma2=(Y-Xmm@beta).T @(Y-Xmm@beta)/(20-2-1)
# print(L_inv,"\n")
err=np.sqrt(sigma2*np.diag(L_inv))
# print(err)
(20,) (20,)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[2], line 17 15 Xmm=np.vstack([one,x-np.mean(x),y-np.mean(y)]).T # 先叠成3行,再转置成为3列,变成每一个样本为一行 16 # print(Xmm.shape) ---> 17 beta=np.linalg.solve(Xmm.T@Xmm,Xmm.T@Y) 18 # print(beta,"\n") 19 L_inv=np.linalg.inv(Xmm.T@Xmm) NameError: name 'Y' is not defined
两个变量的时候,通常检验 $\beta_1$ 和 $\beta_2$ 的计算表达式是否小于 5.99
$f(x)$ 的区间比 $f(x_i)$ 更宽泛
%matplotlib notebook
import scipy.stats as st
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import linregress
from matplotlib.animation import FuncAnimation
np.random.seed(304)
a=4
b=4
c=10
x=st.uniform(0,10).rvs(20)
y=st.uniform(0,8).rvs(20)
z=[np.random.normal(a*i+b*j+c,5) for i,j in zip(x,y)]
def animate(dfn):
ax.view_init(elev=32, azim=dfn)
fig = plt.figure(figsize=(4, 4))
ax = fig.add_subplot(111, projection='3d')
ax.plot(x, y, z, color='k', zorder=15, linestyle='none', marker='o', alpha=0.5)
anim = FuncAnimation(fig, animate, frames=range(0,360,2))
# anim.save('data.gif',fps=3,dpi=200)
from IPython.display import HTML
HTML(anim.to_jshtml())
fig.show()
# 使用sklearn进行回归
from sklearn import linear_model
X=np.vstack((x, y)).T # vstack 叠叠乐函数
ols = linear_model.LinearRegression() # 创建模型
model = ols.fit(X,z) # 预测模型
x_pred = np.linspace(0, 10, 50)
y_pred = np.linspace(0, 8, 50)
xx_pred, yy_pred = np.meshgrid(x_pred, y_pred) # 拼接成一个50x50的网格
#flatten() change the 50x50 to 1x2500
model_viz = np.array([xx_pred.flatten(), yy_pred.flatten()]).T
# print(model_viz.shape)----- (2500,2)
predicted = model.predict(model_viz)
fig = plt.figure(figsize=(4, 4))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z,color='k',alpha=0.5 )
ax.plot(xx_pred.flatten(), yy_pred.flatten(), predicted, alpha=0.9)
anim = FuncAnimation(fig, animate, frames=range(0,360,3))
fig.show()
# anim.save('fit.gif',fps=3,dpi=200)
from IPython.display import HTML
HTML(anim.to_jshtml())
print(model.intercept_,model.coef_) # 打印截距和每个X的系数
7.546774866213575 [3.8898734 4.2202737]
当然,也可以使用矩阵进行计算 需要注意的是,这里第一列是全1列
- 非 0 偏置(截距)缩到 $\beta$ 矩阵当中了,所以对应的 $\beta X$ 中 $X$ 要多一列全 1 值
# 使用矩阵方法进行计算
one=np.ones(20)
x=st.uniform(0,10).rvs(20)
y=st.uniform(0,8).rvs(20)
z=[np.random.normal(a*i+b*j+c,5) for i,j in zip(x,y)]
Xm=np.vstack([one,x,y]).T
Y = np.array(z).reshape(20,1)
b=np.linalg.solve(Xm.T@Xm,Xm.T@Y)
print(b)
[[6.96602352] [4.34440642] [4.11604012]]
stats 中的 OLS model ¶
import statsmodels.api as sm
#Xp=np.vstack((np.ones(20),x, y)).T
Xp = sm.add_constant(X) # 等同于上面一行
# Ordinary least square
model = sm.OLS(z, Xp)
results = model.fit()
print(results.summary())
OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.294 Model: OLS Adj. R-squared: 0.211 Method: Least Squares F-statistic: 3.540 Date: Tue, 03 Dec 2024 Prob (F-statistic): 0.0518 Time: 01:26:15 Log-Likelihood: -78.492 No. Observations: 20 AIC: 163.0 Df Residuals: 17 BIC: 166.0 Df Model: 2 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ const 28.1183 8.023 3.505 0.003 11.192 45.045 x1 0.7903 1.102 0.717 0.483 -1.535 3.115 x2 3.5215 1.388 2.537 0.021 0.593 6.450 ============================================================================== Omnibus: 0.855 Durbin-Watson: 2.811 Prob(Omnibus): 0.652 Jarque-Bera (JB): 0.685 Skew: 0.415 Prob(JB): 0.710 Kurtosis: 2.635 Cond. No. 18.6 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
可以使用 python 集成的功能进行计算
F 值
t 值: 相当于对 x1,x2 进行了原假设为 $\mu_{x_1} = \mu_{x_2} = 0$ 的假设
最后两列还会给出参数 95% 置信区间
可以通过 result 的函数获取计算结果
pingouin 包 ¶
也可以实现相同的效果,更加精简一点
import pingouin as pg
lm = pg.linear_regression(X, z)
print(lm)
names coef se T pval r2 adj_r2 \ 0 Intercept 28.118337 8.022802 3.504803 0.002716 0.294045 0.210992 1 x1 0.790336 1.102023 0.717168 0.483009 0.294045 0.210992 2 x2 3.521503 1.387927 2.537239 0.021260 0.294045 0.210992 CI[2.5%] CI[97.5%] 0 11.191704 45.044970 1 -1.534730 3.115402 2 0.593233 6.449772
假设检验 ¶
检验每个 $\beta_i$ ¶
$$ (\hat\beta_j -\beta_j)/(\hat{\sigma}\sqrt{L^{-1}_{jj}}) \sim t_{n-p-1} $$
检验整体 ( 看是否有相关性 ) ¶
一系列 $\beta_1,\beta_2,\beta_3 \dots$ 系数是否为 0
- $H_0: \beta_1, \cdots, \beta_r = 0$
- $R_2 = \delta'^2 | \beta_1, \cdots, \beta_r = 0$
- $R_1 = \delta^2$
- $\delta$ 重新在只有 $p-r$ 个变量时通过最小二乘法计算
- $(R_2 - R_1)/\sigma^2 \sim \chi^2_r$
- $(n-p-1)\hat{\sigma}^2/\sigma^2 \sim \chi^2_{n-p-1}$
- $(R_2 - R_1)/(r\hat{\sigma}^2) \sim F_{r,n-p-1}$
- 当 $(R_2 - R_1)/(r\hat{\sigma}^2) > F_{r,n-p-1}(0.05)$ 时否定假设
## Single beta_i=0
print(results.tvalues)
## multiple beta_r=0
## 验证方法1
print(results.f_pvalue)
hypotheses = '(x1 = x2 =0)'
print(results.f_test(hypotheses))
# p值小于0.05 就要拒绝原来的假设了
## 验证方法2:使得A矩阵线性无关的两个向量是要验证的向量
A = np.identity(len(results.params))
A = A[1:,:]
print(A)
print(results.f_test(A))
[3.50480251 0.71716783 2.53723929] 0.051832539952542715 <F test: F=3.540434571475472, p=0.05183253995254327, df_denom=17, df_num=2> [[0. 1. 0.] [0. 0. 1.]] <F test: F=3.540434571475474, p=0.0518325399525432, df_denom=17, df_num=2>
%matplotlib inline
from sklearn.preprocessing import PolynomialFeatures
a=0.3
b=-4
c=10
d=8
x2=st.uniform(0,10).rvs(40)
x2=np.sort(x2)
print("x2=",x2.shape)
y2=np.random.normal(a*x2**3+b*x2**2+c*x2+d,2)
x2= (40,)
需要准备数据,把数据转换成每列代表不同的属性,每一行代表不同的样本
poly = PolynomialFeatures(degree = 5)
X_poly = poly.fit_transform(np.array(x2).reshape(40,1))
fig = plt.figure()
plt.scatter(x2,y2)
poly = PolynomialFeatures(degree = 5)
X_poly = poly.fit_transform(np.array(x2).reshape(40,1))
print(X_poly.shape)
model2 = sm.OLS(y2, X_poly)
results2 = model2.fit()
print(x2.shape,y2.shape)
print(results2.params.shape)
ypred = model2.predict(results2.params,X_poly)
plt.plot(x2,ypred,color = 'C0')
# plt.show()
print(results2.summary())
(40, 6) (40,) (40,) (6,) OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.980 Model: OLS Adj. R-squared: 0.977 Method: Least Squares F-statistic: 325.7 Date: Tue, 03 Dec 2024 Prob (F-statistic): 1.09e-27 Time: 01:26:15 Log-Likelihood: -72.219 No. Observations: 40 AIC: 156.4 Df Residuals: 34 BIC: 166.6 Df Model: 5 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ const 6.8112 1.211 5.624 0.000 4.350 9.272 x1 13.8635 3.163 4.383 0.000 7.436 20.291 x2 -6.7615 2.080 -3.251 0.003 -10.989 -2.534 x3 1.0314 0.555 1.857 0.072 -0.097 2.160 x4 -0.0797 0.065 -1.235 0.225 -0.211 0.051 x5 0.0031 0.003 1.134 0.265 -0.002 0.009 ============================================================================== Omnibus: 1.219 Durbin-Watson: 2.149 Prob(Omnibus): 0.544 Jarque-Bera (JB): 0.414 Skew: 0.047 Prob(JB): 0.813 Kurtosis: 3.489 Cond. No. 2.63e+05 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 2.63e+05. This might indicate that there are strong multicollinearity or other numerical problems.
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib_inline/backend_inline.py:99, in show(close, block) 96 # only call close('all') if any to close 97 # close triggers gc.collect, which can be slow 98 if close and Gcf.get_all_fig_managers(): ---> 99 matplotlib.pyplot.close('all') File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/pyplot.py:1179, in close(fig) 1177 _pylab_helpers.Gcf.destroy(manager) 1178 elif fig == 'all': -> 1179 _pylab_helpers.Gcf.destroy_all() 1180 elif isinstance(fig, int): 1181 _pylab_helpers.Gcf.destroy(fig) File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/_pylab_helpers.py:81, in Gcf.destroy_all(cls) 79 for manager in list(cls.figs.values()): 80 manager.canvas.mpl_disconnect(manager._cidgcf) ---> 81 manager.destroy() 82 cls.figs.clear() File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/backends/backend_nbagg.py:144, in FigureManagerNbAgg.destroy(self) 142 for comm in list(self.web_sockets): 143 comm.on_close() --> 144 self.clearup_closed() File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/backends/backend_nbagg.py:152, in FigureManagerNbAgg.clearup_closed(self) 148 self.web_sockets = {socket for socket in self.web_sockets 149 if socket.is_open()} 151 if len(self.web_sockets) == 0: --> 152 CloseEvent("close_event", self.canvas)._process() File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/backend_bases.py:1217, in Event._process(self) 1215 def _process(self): 1216 """Process this event on ``self.canvas``, then unset ``guiEvent``.""" -> 1217 self.canvas.callbacks.process(self.name, self) 1218 self._guiEvent_deleted = True File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/cbook.py:303, in CallbackRegistry.process(self, s, *args, **kwargs) 301 except Exception as exc: 302 if self.exception_handler is not None: --> 303 self.exception_handler(exc) 304 else: 305 raise File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/cbook.py:87, in _exception_printer(exc) 85 def _exception_printer(exc): 86 if _get_running_interactive_framework() in ["headless", None]: ---> 87 raise exc 88 else: 89 traceback.print_exc() File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/cbook.py:298, in CallbackRegistry.process(self, s, *args, **kwargs) 296 if func is not None: 297 try: --> 298 func(*args, **kwargs) 299 # this does not capture KeyboardInterrupt, SystemExit, 300 # and GeneratorExit 301 except Exception as exc: File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/animation.py:904, in Animation._stop(self, *args) 902 self._fig.canvas.mpl_disconnect(self._resize_id) 903 self._fig.canvas.mpl_disconnect(self._close_id) --> 904 self.event_source.remove_callback(self._step) 905 self.event_source = None AttributeError: 'NoneType' object has no attribute 'remove_callback'
# 使用f-test对poly阶数进行检验
print(results2.f_test('x4=x5=0'))
print(results2.f_test('x3=x4=x5=0'))
<F test: F=1.2003554155678247, p=0.3135262379521731, df_denom=34, df_num=2> <F test: F=115.33656688365886, p=6.8633367520199366e-18, df_denom=34, df_num=3>
for i in (2,3):
poly = PolynomialFeatures(degree = i)
X_poly = poly.fit_transform(np.array(x2).reshape(40,1))
model2 = sm.OLS(y2, X_poly)
results2 = model2.fit()
print(results2.summary())
ypred = model2.predict(results2.params,X_poly)
fig.gca().plot(x2,ypred,color = 'C{:d}'.format(i))
fig
OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.771 Model: OLS Adj. R-squared: 0.759 Method: Least Squares F-statistic: 62.42 Date: Tue, 03 Dec 2024 Prob (F-statistic): 1.39e-12 Time: 01:26:16 Log-Likelihood: -120.50 No. Observations: 40 AIC: 247.0 Df Residuals: 37 BIC: 252.1 Df Model: 2 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ const 15.6061 1.855 8.415 0.000 11.848 19.364 x1 -3.8036 1.029 -3.697 0.001 -5.888 -1.719 x2 0.0469 0.119 0.394 0.696 -0.195 0.289 ============================================================================== Omnibus: 11.658 Durbin-Watson: 0.499 Prob(Omnibus): 0.003 Jarque-Bera (JB): 13.411 Skew: 0.915 Prob(JB): 0.00122 Kurtosis: 5.168 Cond. No. 80.7 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.978 Model: OLS Adj. R-squared: 0.976 Method: Least Squares F-statistic: 536.0 Date: Tue, 03 Dec 2024 Prob (F-statistic): 6.48e-30 Time: 01:26:16 Log-Likelihood: -73.584 No. Observations: 40 AIC: 155.2 Df Residuals: 36 BIC: 161.9 Df Model: 3 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ const 8.3412 0.703 11.869 0.000 6.916 9.766 x1 9.1856 0.775 11.852 0.000 7.614 10.757 x2 -3.7585 0.210 -17.915 0.000 -4.184 -3.333 x3 0.2844 0.015 18.434 0.000 0.253 0.316 ============================================================================== Omnibus: 1.220 Durbin-Watson: 2.023 Prob(Omnibus): 0.543 Jarque-Bera (JB): 0.424 Skew: 0.102 Prob(JB): 0.809 Kurtosis: 3.461 Cond. No. 960. ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
指数拟合 ¶
$$ \begin{align*} Y = b_0 e^{b_1X}\\ \log Y = \log b_0 + b_1 X\\ \end{align*} $$
两个变量的相关性 ¶
- 相关系数 (Correlation)
$$ \rho = \frac{\text{Cov}(x, y)}{\sqrt{\text{Var}(x) \text{Var}(y)}} $$
- 用样本估计
$$ r = \frac{\sum (x - \bar{x})(y - \bar{y})}{\sqrt{\sum (x - \bar{x})^2 \sum (y - \bar{y})^2}} $$
- 假设检验:皮尔逊相关系数检验
$$ H_0: \rho = 0,H_1: \rho \ne 0 $$
检验统计量为
$$ T = \frac{r\sqrt{n - 2}}{\sqrt{1 - r^2}} \sim t_{n-2} $$
$$ C = t_{n-2}(\alpha/2) \sqrt{\frac{n - 2 + t_{n-2}^2(\alpha/2)}{n - 2}} $$
需要注意,在判断相关性的时候,要注意系统的自由度
系统自由度很低的时候(样本很少
由下图可以看出,在样本数量 20 的时候,小于 0.63 就认为不相关了
## C的取值
cl=[]
nl=[]
for n in range(10,100,5):
# print(n,cl)
tn=st.t.ppf(0.975,n-2)
c=tn/np.sqrt(n-2+tn**2)
cl.append(c)
nl.append(n)
plt.plot(nl,cl)
[<matplotlib.lines.Line2D at 0x7f8e39120ad0>]
偏相关 ¶
分析相关性,给 x1 和给 x2 差不多的话,就可以先去掉一个
取结果中方差最小的
根据全相关系数来求偏相关系数
p 值很大,支持原假设 p值很小,不支持原假设
一组观测量 $X_i$ 中,互相之间有关联
- $X_3$ 为一个人的收入,$X_1, X_2$ 为一个人在吃和穿上的花费
- 观察到 $X_1, X_2$ 的正相关,$r > 0$
- 实际 $X_1, X_2$ 均由 $X_3$ 带动,使得他们呈现正相关
- 若能去掉 $X_3$ 的影响,观测它们的实际相关,可能转为负相关
重新定义 $X_1', X_2'$
$$ X_1' = X_1 - L(X_3 \cdots) $$
$$ X_2' = X_2 - L'(X_3 \cdots) $$
使得 $E(X_1')^2$ 和 $E(X_2')^2$ 最小
可以证明
$$ \rho(X_1', X_2') = (\rho_{12} - \rho_{13} \rho_{23})/[(1 - \rho_{13}^2)(1 - \rho_{23}^2)]^{1/2} $$
将原本 $X$ 的相关矩阵写为
$$ P = \begin{pmatrix} 1 & \cdots & \rho_{1p} \\ \vdots & 1 & \vdots \\ \rho_{p1} & \cdots & 1 \end{pmatrix} $$
记 $P_{ij}$ 为 $\rho(X_i', X_j')$ 为去掉第 $i$ 行第 $j$ 列后的行列式
- $\rho(X_1', X_2') = P_{12}/\sqrt{P_{11} P_{22}}$
%matplotlib inline
import pandas as pd
x3=st.expon(0,1000).rvs(50) # income
x1=np.random.normal(x3*0.4+30,100) # spent on eating
x2= np.random.normal(x3-x1-20,10) # spent on clothes
plt.scatter(x1,x2)
plt.show()
xyz=np.vstack((x1,x2,x3)).T
df = pd.DataFrame(xyz, columns = ['x', 'y', 'z'])
print("Correlation\n",df.corr())
print("Partial Correlation\n",df.pcorr())
print(pg.partial_corr(data=df, x='x', y='y', covar='z',method='pearson'))
cor=np.corrcoef(xyz.T)
Correlation x y z x 1.000000 0.968299 0.988874 y 0.968299 1.000000 0.994647 z 0.988874 0.994647 1.000000 Partial Correlation x y z x 1.000000 -0.994087 0.997907 y -0.994087 1.000000 0.998991 z 0.997907 0.998991 1.000000 n r CI95% p-val pearson 50 -0.994087 [-1.0, -0.99] 5.593235e-47
# 自己计算也是一样的效果
print((cor[0,1]-cor[0,2]*cor[1,2])/np.sqrt((1-cor[0,2]**2)*(1-cor[1,2]**2)))
print((cor[1,2]-cor[0,1]*cor[0,2])/np.sqrt((1-cor[0,1]**2)*(1-cor[0,2]**2)))
print((cor[0,2]-cor[0,1]*cor[1,2])/np.sqrt((1-cor[0,1]**2)*(1-cor[1,2]**2)))
-0.9940869184414207 0.9989905555878712 0.9979069885854778
meana=[23,34,51]
covD=np.array((4,5,6))
corD=[[1,0.5,0.9],[0.5,1,0.2],[0.9,0.2,1]]
cov=covD*covD.reshape(3,-1)*corD
print(np.array(corD))
mn=st.multivariate_normal(meana,cov)
xyz=mn.rvs(100)
df = pd.DataFrame(xyz, columns = ['x', 'y', 'z'])
print(df.corr())
df.pcorr().round(3)
[[1. 0.5 0.9] [0.5 1. 0.2] [0.9 0.2 1. ]] x y z x 1.000000 0.503486 0.890283 y 0.503486 1.000000 0.194100 z 0.890283 0.194100 1.000000
x | y | z | |
---|---|---|---|
x | 1.000 | 0.740 | 0.935 |
y | 0.740 | 1.000 | -0.646 |
z | 0.935 | -0.646 | 1.000 |