#-*- coding: utf-8 -*-
# 代码6-1
import numpy as np
import pandas as pd
# 输入的数据文件
data = pd.read_csv(r'D:\python学习\Python数据分析与挖掘实战(第2版)》源数据和代码-各章节\chapter6\demo\data/data.csv') # 读取数据
data.head()
Out[2]:
x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | y | |
0 | 3831732 | 181.54 | 448.19 | 7571.00 | 6212.70 | 6370241 | 525.71 | 985.31 | 60.62 | 65.66 | 120.0 | 1.029 | 5321 | 64.87 |
1 | 3913824 | 214.63 | 549.97 | 9038.16 | 7601.73 | 6467115 | 618.25 | 1259.20 | 73.46 | 95.46 | 113.5 | 1.051 | 6529 | 99.75 |
2 | 3928907 | 239.56 | 686.44 | 9905.31 | 8092.82 | 6560508 | 638.94 | 1468.06 | 81.16 | 81.16 | 108.2 | 1.064 | 7008 | 88.11 |
3 | 4282130 | 261.58 | 802.59 | 10444.60 | 8767.98 | 6664862 | 656.58 | 1678.12 | 85.72 | 91.70 | 102.2 | 1.092 | 7694 | 106.07 |
4 | 4453911 | 283.14 | 904.57 | 11255.70 | 9422.33 | 6741400 | 758.83 | 1893.52 | 88.88 | 114.61 | 97.7 | 1.200 | 8027 | 137.32 |
In [3]:
# 描述性统计分析
description = [data.min(), data.max(), data.mean(), data.std()] # 依次计算最小值、最大值、均值、标准差
description = pd.DataFrame(description, index = ['Min', 'Max', 'Mean', 'STD']).T # 将结果存入数据框
print('描述性统计结果:\n',np.round(description, 2)) # 保留两位小数
描述性统计结果:
Min Max Mean STD
x1 3831732.00 7599295.00 5579519.95 1262194.72
x2 181.54 2110.78 765.04 595.70
x3 448.19 6882.85 2370.83 1919.17
x4 7571.00 42049.14 19644.69 10203.02
x5 6212.70 33156.83 15870.95 8199.77
x6 6370241.00 8323096.00 7350513.60 621341.85
x7 525.71 4454.55 1712.24 1184.71
x8 985.31 15420.14 5705.80 4478.40
x9 60.62 228.46 129.49 50.51
x10 65.66 852.56 340.22 251.58
x11 97.50 120.00 103.30 5.51
x12 1.03 1.91 1.42 0.25
x13 5321.00 41972.00 17273.80 11109.19
y 64.87 2088.14 618.08 609.25
In [4]:
# 代码6-2
# 相关性分析
corr = data.corr(method = 'pearson') # 计算相关系数矩阵
print('相关系数矩阵为:\n',np.round(corr, 2)) # 保留两位小数
相关系数矩阵为:
x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 x11 x12 \
x1 1.00 0.95 0.95 0.97 0.97 0.99 0.95 0.97 0.98 0.98 -0.29 0.94
x2 0.95 1.00 1.00 0.99 0.99 0.92 0.99 0.99 0.98 0.98 -0.13 0.89
x3 0.95 1.00 1.00 0.99 0.99 0.92 1.00 0.99 0.98 0.99 -0.15 0.89
x4 0.97 0.99 0.99 1.00 1.00 0.95 0.99 1.00 0.99 1.00 -0.19 0.91
x5 0.97 0.99 0.99 1.00 1.00 0.95 0.99 1.00 0.99 1.00 -0.18 0.90
x6 0.99 0.92 0.92 0.95 0.95 1.00 0.93 0.95 0.97 0.96 -0.34 0.95
x7 0.95 0.99 1.00 0.99 0.99 0.93 1.00 0.99 0.98 0.99 -0.15 0.89
x8 0.97 0.99 0.99 1.00 1.00 0.95 0.99 1.00 0.99 1.00 -0.15 0.90
x9 0.98 0.98 0.98 0.99 0.99 0.97 0.98 0.99 1.00 0.99 -0.23 0.91
x10 0.98 0.98 0.99 1.00 1.00 0.96 0.99 1.00 0.99 1.00 -0.17 0.90
x11 -0.29 -0.13 -0.15 -0.19 -0.18 -0.34 -0.15 -0.15 -0.23 -0.17 1.00 -0.43
x12 0.94 0.89 0.89 0.91 0.90 0.95 0.89 0.90 0.91 0.90 -0.43 1.00
x13 0.96 1.00 1.00 1.00 0.99 0.94 1.00 1.00 0.99 0.99 -0.16 0.90
y 0.94 0.98 0.99 0.99 0.99 0.91 0.99 0.99 0.98 0.99 -0.12 0.87
x13 y
x1 0.96 0.94
x2 1.00 0.98
x3 1.00 0.99
x4 1.00 0.99
x5 0.99 0.99
x6 0.94 0.91
x7 1.00 0.99
x8 1.00 0.99
x9 0.99 0.98
x10 0.99 0.99
x11 -0.16 -0.12
x12 0.90 0.87
x13 1.00 0.99
y 0.99 1.00
In [8]:
# 中文乱码和坐标轴负号的处理
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False
# 绘制热力图
import matplotlib.pyplot as plt
import seaborn as sns
plt.subplots(figsize=(10, 10)) # 设置画面大小
sns.heatmap(corr, annot=True, vmax=1, square=True, cmap="Blues")
plt.title('相关性热力图')
plt.show()
plt.close
Out[8]:
<function matplotlib.pyplot.close(fig=None)>
In [ ]: