import pandas as pd
import numpy as np
from scipy import stats
# Sample data used throughout the file.
# A bare expression (e.g. ``data1``) only displays its value when run as a
# notebook cell; the "Out:" comments are the outputs recorded in the original
# run and are kept for reference only.

# The same three values as a list, a NumPy array and a pandas Series.
data1 = [1, 2, 3]
data1
# Out: [1, 2, 3]
data2 = np.array([1, 2, 3])
data2
# Out: array([1, 2, 3])
data3 = pd.Series([1, 2, 3])
data3
# Out:
# 0    1
# 1    2
# 2    3
# dtype: int64

# Create numeric data drawn from a normal distribution (mean 0, std 10).
data4 = np.random.normal(0, 10, size=10)
data4
# Out (recorded run; no seed is set in the visible code, so values will vary):
# array([-12.40560225,  -8.70345216,  -4.97941255,  -1.94967252,
#          7.28696093,  -7.06381858,  -2.51574862,   5.10898802,
#         13.24926772,  -8.8636106 ])

# Create categorical data drawn uniformly from the integers 0..9
# (the digits stand for category labels).
data5 = np.random.randint(0, 10, size=10)
data5
# Out (recorded run): array([1, 9, 8, 0, 6, 6, 9, 6, 2, 9])
def do_mean(nums):
    """Return the arithmetic mean of ``nums``.

    Args:
        nums: A sized collection of numbers (anything accepted by both
            ``sum`` and ``len``).

    Returns:
        ``sum(nums) / len(nums)``.

    Raises:
        ZeroDivisionError: If ``nums`` is empty.
    """
    return sum(nums) / len(nums)
# Hand-written mean versus NumPy on the same data.
# "Out:" comments are the outputs recorded in the original run.
do_mean(data1)
# Out: 2.0
np.mean(data1)
# Out: 2.0
def do_median(nums):
    """Return the median of ``nums``.

    The input is not modified; a sorted copy is used.

    Args:
        nums: A sized collection of mutually comparable numbers.

    Returns:
        The middle value for an odd number of items, otherwise the mean of
        the two middle values.

    Raises:
        IndexError: If ``nums`` is empty.
    """
    length = len(nums)
    sorted_nums = sorted(nums)
    mid = length // 2
    if length % 2:
        # Odd count: a single middle element exists.
        return sorted_nums[mid]
    # Even count: average the two elements around the midpoint.
    return (sorted_nums[mid - 1] + sorted_nums[mid]) / 2
# Hand-written median versus NumPy on the categorical sample.
# "Out:" comments are the outputs recorded in the original run.
do_median(data5)
# Out: 6.0
np.median(data5)
# Out: 6.0
data5
# Out: array([1, 9, 8, 0, 6, 6, 9, 6, 2, 9])
np.bincount(data5)
# Out: array([1, 1, 1, 0, 0, 0, 3, 0, 1, 3], dtype=int64)

# How np.bincount works, example 1:
#   The largest value in x is 7, so there are 8 bins, indexed 0..7.
#   x = np.array([0, 1, 1, 3, 2, 1, 7])
#   Value 0 occurs once, value 1 occurs 3 times, ... value 5 occurs 0 times.
#   np.bincount(x)
#   Result: array([1, 3, 1, 1, 0, 0, 0, 1])
# Example 2:
#   The largest value in x is again 7, so there are 8 bins, indexed 0..7.
#   x = np.array([7, 6, 2, 1, 4])
#   Value 0 occurs 0 times, value 1 occurs once, ... value 5 occurs 0 times.
#   np.bincount(x)
#   Result: array([0, 1, 1, 0, 1, 0, 1, 1])

# Mode via NumPy: first count how often each value occurs in the data ...
counts = np.bincount(data5)
# ... then take the index of the highest count. np.argmax returns the first
# maximum, so ties resolve to the smallest value.
mode_val = np.argmax(counts)
# Print the mode.
print(mode_val)
# Out: 6

# Mode via SciPy.
stats.mode(data5)
# Out (recorded with an older SciPy): ModeResult(mode=array([6]), count=array([3]))
# Depending on the SciPy version, the mode of 1-D input is either a length-1
# array or a scalar; np.atleast_1d makes the indexing work for both.
np.atleast_1d(stats.mode(data5)[0])[0]
# Out: 6
def do_ptp(nums):
    """Return the range (peak-to-peak) of ``nums``: ``max(nums) - min(nums)``.

    Args:
        nums: A non-empty iterable of mutually comparable numbers. It is
            traversed twice, so a one-shot iterator is not suitable.

    Raises:
        ValueError: If ``nums`` is empty (raised by ``max``).
    """
    return max(nums) - min(nums)
# Hand-written range versus NumPy.
# "Out:" values come from the recorded run; data4 is random, so they vary.
do_ptp(data4)
# Out: 25.65486996126144
np.ptp(data4)
# Out: 25.65486996126144
def do_var(nums):
    """Return the population variance of ``nums``.

    The sum of squared deviations from the mean is divided by ``len(nums)``
    (not ``len(nums) - 1``).

    Args:
        nums: A sized collection of numbers.

    Raises:
        ZeroDivisionError: If ``nums`` is empty (raised by ``do_mean``).
    """
    nums_mean = do_mean(nums)
    return sum((num - nums_mean) ** 2 for num in nums) / len(nums)
# Hand-written variance versus NumPy.
# "Out:" values come from the recorded run; data4 is random, so they vary.
do_var(data4)
# Out: 60.43652801408882
np.var(data4)
# Out: 60.43652801408882
def do_std(nums):
    """Return the population standard deviation of ``nums``.

    Computed as the square root of ``do_var(nums)``.

    Args:
        nums: A sized collection of numbers.
    """
    return do_var(nums) ** 0.5
# Hand-written standard deviation versus NumPy.
# "Out:" values come from the recorded run; data4 is random, so they vary.
do_std(data4)
# Out: 7.774093388562349
np.std(data4)
# Out: 7.774093388562349
def do_cv(nums):
    """Return the coefficient of variation of ``nums``.

    Computed as ``do_std(nums) / do_mean(nums)``. The sign follows the sign
    of the mean, and the value is undefined when the mean is zero.

    Args:
        nums: A sized collection of numbers.
    """
    return do_std(nums) / do_mean(nums)
# Hand-written coefficient of variation versus NumPy.
# "Out:" values come from the recorded run; data4 and data5 are random, so
# they vary.
do_cv(data4)
# Out: -3.7310692308360403
np.std(data4) / np.mean(data4)
# Out: -3.7310692308360407

# Z-score of the first value in data4.
(data4[0] - np.mean(data4)) / np.std(data4)
# Out: -1.3277422418733251
# Z-scores of every value in data4 at once (vectorised).
(data4 - np.mean(data4)) / np.std(data4)
# Out:
# array([-1.32774224, -0.85152593, -0.37249392,  0.0172287 ,  1.20535869,
#        -0.64061599, -0.055587  ,  0.92520089,  1.97230429, -0.87212749])
# The same expression applied to a pandas Series.
(data3 - np.mean(data3)) / np.std(data3)
# Out:
# 0   -1.224745
# 1    0.000000
# 2    1.224745
# dtype: float64

# Stack the two samples as rows of one 2-D array (one variable per row).
data_new = np.array([data4, data5])
data_new
# Out:
# array([[-12.40560225,  -8.70345216,  -4.97941255,  -1.94967252,
#           7.28696093,  -7.06381858,  -2.51574862,   5.10898802,
#          13.24926772,  -8.8636106 ],
#        [  1.        ,   9.        ,   8.        ,   0.        ,
#           6.        ,   6.        ,   9.        ,   6.        ,
#           2.        ,   9.        ]])
def do_cov(num_matrix):
    """Return the population covariance matrix of the rows of ``num_matrix``.

    Each row is treated as one variable. Entry ``[m][n]`` is the mean product
    of deviations of row ``m`` and row ``n``, divided by the row length
    (not length - 1). The diagonal therefore holds the population variances.

    Args:
        num_matrix: A sequence of equally long numeric rows. Rows are paired
            with ``zip``, so a longer row would be silently truncated.

    Returns:
        A square ``numpy.ndarray`` with one row and column per input row.
    """
    # Compute each row mean once instead of once per element pair.
    means = [do_mean(row) for row in num_matrix]
    size = len(num_matrix)
    res = [[0 for _ in range(size)] for _ in range(size)]
    for m, row_m in enumerate(num_matrix):
        for n, row_n in enumerate(num_matrix):
            res[m][n] = sum(
                (x - means[m]) * (y - means[n]) for x, y in zip(row_m, row_n)
            ) / len(row_m)
    return np.array(res)
# "Out:" values come from the recorded run; the data is random, so they vary.
do_cov(data_new)
# Out:
# array([[60.43652801, -5.7812724 ],
#        [-5.7812724 , 10.64      ]])

# Covariance between the two variables data4 and data5 using NumPy.
# bias=1 divides by N (the default divides by N - 1).
# The result is a matrix: the entry in row i, column j is the covariance of
# variable i with variable j, and the diagonal holds the variances.
np.cov(data_new, bias=1)
# Out:
# array([[60.43652801, -5.7812724 ],
#        [-5.7812724 , 10.64      ]])
def do_corr(num_matrix):
    """Return the correlation-coefficient matrix of the rows of ``num_matrix``.

    Each covariance from ``do_cov`` is divided by the product of the two
    rows' population standard deviations.

    Args:
        num_matrix: A sequence of equally long numeric rows, one variable
            per row.

    Returns:
        The array returned by ``do_cov``, normalised in place. The result is
        undefined for a row whose standard deviation is zero.
    """
    res = do_cov(num_matrix)
    # One standard deviation per row, computed once. The divisor is the
    # product of two scalars; the earlier form multiplied a whole row by a
    # float before taking its std, which raises TypeError for list rows.
    stds = [do_std(row) for row in num_matrix]
    for m, std_m in enumerate(stds):
        for n, std_n in enumerate(stds):
            res[m][n] = res[m][n] / (std_m * std_n)
    return res
# Hand-written correlation matrix versus NumPy.
# "Out:" values come from the recorded run; the data is random, so they vary.
do_corr(data_new)
# Out:
# array([[ 1.        , -0.22798321],
#        [-0.22798321,  1.        ]])
np.corrcoef(data_new)
# Out:
# array([[ 1.        , -0.22798321],
#        [-0.22798321,  1.        ]])