当前位置 : 主页 > 网络编程 > PHP >

Numpy/Pandas均值处理数据缺失值

来源:互联网 收集:自由互联 发布时间:2023-09-06
# -*- coding: utf-8 -*- #----------------------------------------------------------------------------------------------------------------------- __Author__ = 'assasin' __DateTime__ = '2020/1/5 15:13' #---------------------------------------


# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------------------------------------------------
__Author__ = 'assasin'
__DateTime__ = '2020/1/5 15:13'
#-----------------------------------------------------------------------------------------------------------------------

'''
处理数据缺失值
Numpy均值处理数据缺失值
Pandas加载数据,重构缺失数据矩阵
Pandas 实现均值填充
Pandas 处理缺失值: 标量法,丢失法,忽略法,前后法
'''

import numpy as np
import pandas as pd
from numpy import *

def loadDataSet(filepath, delim='\t'):
    """Load a delimited text file of numbers into a NumPy matrix.

    Each non-blank line is stripped, split on ``delim`` and every field is
    converted with ``float`` (so the literal ``nan`` becomes a NaN entry).

    Args:
        filepath: Path of the text file to read.
        delim: Field separator used on each line. Defaults to a tab.

    Returns:
        A ``numpy.matrix`` with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    # The context manager closes the file even when parsing fails.
    with open(filepath) as fr:
        # Blank lines (e.g. a trailing newline) carry no data; skipping them
        # avoids float('') raising ValueError.
        stringArr = [line.strip().split(delim) for line in fr if line.strip()]
    dataArr = [list(map(float, fields)) for fields in stringArr]
    # np.asmatrix is the function the `mat` alias pointed to; the alias
    # itself was removed in NumPy 2.0.
    return np.asmatrix(dataArr)


def replaceNanwithMean(dataArr):
    """Replace NaN entries with the mean of the non-NaN values in each column.

    The data is modified in place. Every column except the last one is
    processed; the last column is left untouched.
    NOTE(review): presumably the last column holds labels -- confirm with the
    data set this is used on.

    Args:
        dataArr: 2-D ``numpy.matrix`` or ``numpy.ndarray`` of floats.

    Returns:
        The same ``dataArr`` object, after the in-place replacement.
    """
    numfeat = np.shape(dataArr)
    for i in range(numfeat[1] - 1):
        # Flatten the column so the same code handles both np.matrix
        # (column slice is n x 1) and np.ndarray (column slice is 1-D).
        column = np.asarray(dataArr[:, i]).ravel()
        missing = np.isnan(column)
        present = ~missing
        if not present.any():
            # No observed values: there is no mean to fill in, so the
            # column stays NaN (and no empty-mean warning is emitted).
            continue
        meanVal = np.mean(column[present])
        dataArr[np.nonzero(missing)[0], i] = meanVal

    return dataArr





if __name__ == '__main__':
    # Load the data set.
    dataArr = loadDataSet(r'../xxx.txt', ' ')

    # NumPy approach: fill missing values with the column mean (in place).
    replaceNanwithMean(dataArr)

    # pandas approach: reload the file to get a second matrix.
    datamat = loadDataSet(r'../xxx.txt', ' ')
    df = pd.DataFrame(datamat)
    # Rebuild the frame with 5 extra rows; reindex fills the new rows with NaN.
    df = df.reindex(range(datamat.shape[0] + 5))
    # Per-column means. DataFrame.mean skips NaN by default, so missing
    # entries are ignored here rather than counted as 0.
    loassVs = df.mean()
    # Fill each column's NaN entries with that column's mean.
    filled = df.fillna(loassVs)
    # np.asmatrix replaces the `mat` alias, which NumPy 2.0 removed.
    print(np.asmatrix(filled.to_numpy()))

 

网友评论