#-*- coding: utf-8 -*-
# 代码6-1
import numpy as np
import pandas as pd
# Input data file: load the chapter 6 demo CSV into a DataFrame
data = pd.read_csv(
    filepath_or_buffer=r'D:\python学习\Python数据分析与挖掘实战(第2版)》源数据和代码-各章节\chapter6\demo\data/data.csv',
)
# Preview the first five rows (shown as the cell output in a notebook)
data.head(n=5)
Out[2]:
x1  | x2  | x3  | x4  | x5  | x6  | x7  | x8  | x9  | x10  | x11  | x12  | x13  | y  | |
0  | 3831732  | 181.54  | 448.19  | 7571.00  | 6212.70  | 6370241  | 525.71  | 985.31  | 60.62  | 65.66  | 120.0  | 1.029  | 5321  | 64.87  | 
1  | 3913824  | 214.63  | 549.97  | 9038.16  | 7601.73  | 6467115  | 618.25  | 1259.20  | 73.46  | 95.46  | 113.5  | 1.051  | 6529  | 99.75  | 
2  | 3928907  | 239.56  | 686.44  | 9905.31  | 8092.82  | 6560508  | 638.94  | 1468.06  | 81.16  | 81.16  | 108.2  | 1.064  | 7008  | 88.11  | 
3  | 4282130  | 261.58  | 802.59  | 10444.60  | 8767.98  | 6664862  | 656.58  | 1678.12  | 85.72  | 91.70  | 102.2  | 1.092  | 7694  | 106.07  | 
4  | 4453911  | 283.14  | 904.57  | 11255.70  | 9422.33  | 6741400  | 758.83  | 1893.52  | 88.88  | 114.61  | 97.7  | 1.200  | 8027  | 137.32  | 
In [3]:
# Descriptive statistics: one row per column of `data`,
# one column per statistic (minimum, maximum, mean, standard deviation)
stat_labels = ['Min', 'Max', 'Mean', 'STD']
stat_rows = [data.min(), data.max(), data.mean(), data.std()]
# Build the frame with statistics as rows, then transpose so the
# variables become the row labels
description = pd.DataFrame(stat_rows, index=stat_labels).transpose()
# Print the table rounded to two decimal places
print('描述性统计结果:\n', description.round(2))
描述性统计结果:
             Min         Max        Mean         STD
x1   3831732.00  7599295.00  5579519.95  1262194.72
x2       181.54     2110.78      765.04      595.70
x3       448.19     6882.85     2370.83     1919.17
x4      7571.00    42049.14    19644.69    10203.02
x5      6212.70    33156.83    15870.95     8199.77
x6   6370241.00  8323096.00  7350513.60   621341.85
x7       525.71     4454.55     1712.24     1184.71
x8       985.31    15420.14     5705.80     4478.40
x9        60.62      228.46      129.49       50.51
x10       65.66      852.56      340.22      251.58
x11       97.50      120.00      103.30        5.51
x12        1.03        1.91        1.42        0.25
x13     5321.00    41972.00    17273.80    11109.19
y         64.87     2088.14      618.08      609.25
In [4]:
# Code 6-2
# Correlation analysis: pairwise Pearson correlation coefficients
# between every pair of columns in `data`
corr = data.corr(method='pearson')
# Print the correlation matrix rounded to two decimal places
print('相关系数矩阵为:\n', corr.round(2))
相关系数矩阵为:
        x1    x2    x3    x4    x5    x6    x7    x8    x9   x10   x11   x12  \
x1   1.00  0.95  0.95  0.97  0.97  0.99  0.95  0.97  0.98  0.98 -0.29  0.94   
x2   0.95  1.00  1.00  0.99  0.99  0.92  0.99  0.99  0.98  0.98 -0.13  0.89   
x3   0.95  1.00  1.00  0.99  0.99  0.92  1.00  0.99  0.98  0.99 -0.15  0.89   
x4   0.97  0.99  0.99  1.00  1.00  0.95  0.99  1.00  0.99  1.00 -0.19  0.91   
x5   0.97  0.99  0.99  1.00  1.00  0.95  0.99  1.00  0.99  1.00 -0.18  0.90   
x6   0.99  0.92  0.92  0.95  0.95  1.00  0.93  0.95  0.97  0.96 -0.34  0.95   
x7   0.95  0.99  1.00  0.99  0.99  0.93  1.00  0.99  0.98  0.99 -0.15  0.89   
x8   0.97  0.99  0.99  1.00  1.00  0.95  0.99  1.00  0.99  1.00 -0.15  0.90   
x9   0.98  0.98  0.98  0.99  0.99  0.97  0.98  0.99  1.00  0.99 -0.23  0.91   
x10  0.98  0.98  0.99  1.00  1.00  0.96  0.99  1.00  0.99  1.00 -0.17  0.90   
x11 -0.29 -0.13 -0.15 -0.19 -0.18 -0.34 -0.15 -0.15 -0.23 -0.17  1.00 -0.43   
x12  0.94  0.89  0.89  0.91  0.90  0.95  0.89  0.90  0.91  0.90 -0.43  1.00   
x13  0.96  1.00  1.00  1.00  0.99  0.94  1.00  1.00  0.99  0.99 -0.16  0.90   
y    0.94  0.98  0.99  0.99  0.99  0.91  0.99  0.99  0.98  0.99 -0.12  0.87   
      x13     y  
x1   0.96  0.94  
x2   1.00  0.98  
x3   1.00  0.99  
x4   1.00  0.99  
x5   0.99  0.99  
x6   0.94  0.91  
x7   1.00  0.99  
x8   1.00  0.99  
x9   0.99  0.98  
x10  0.99  0.99  
x11 -0.16 -0.12  
x12  0.90  0.87  
x13  1.00  0.99  
y    0.99  1.00  
In [8]:
# Draw the correlation heatmap for the `corr` matrix.
# The plotting libraries are imported first: the rcParams settings below
# need `plt` to be in scope, otherwise a fresh run raises NameError.
import matplotlib.pyplot as plt
import seaborn as sns
# Handle garbled Chinese text and the minus sign on the axes
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False
plt.subplots(figsize=(10, 10))  # set the figure size
sns.heatmap(corr, annot=True, vmax=1, square=True, cmap="Blues")
plt.title('相关性热力图')
plt.show()
# Call close() so the figure is actually released; the bare name
# `plt.close` only evaluates to the function object and does nothing.
plt.close()
Out[8]:
<function matplotlib.pyplot.close(fig=None)>
In [ ]:
 
    
