# Missing-value handling walkthrough: build a small random DataFrame, inject
# NaNs, locate them, then drop or fill them with scikit-learn and pandas.
#
# The "Sample output" comments were captured from a single run; the data is
# random and unseeded, so the numbers differ on every execution.

# Import libraries
import numpy as np
import pandas as pd

try:
    # `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22;
    # `SimpleImputer` is its replacement.
    from sklearn.impute import SimpleImputer
except ImportError:
    # scikit-learn missing (or too old): the pandas sections still run.
    SimpleImputer = None

# Generate data with missing values
df = pd.DataFrame(np.random.randn(6, 4),
                  columns=['col1', 'col2', 'col3', 'col4'])
df.iloc[1:2, 1] = np.nan  # inject a missing value at row 1, col2
df.iloc[4, 3] = np.nan  # inject a missing value at row 4, col4
print(df)
# Sample output:
#        col1      col2      col3      col4
# 0 -0.977511 -0.566332 -0.529934  1.489695
# 1 -0.491128       NaN -0.811174 -1.102717
# 2  0.385777 -0.638822  0.325953 -0.240780
# 3  0.938351 -0.746889  0.375200 -0.715265
# 4  1.103418  0.238959 -0.459114       NaN
# 5  1.002177  0.448844 -0.584634 -1.038151

# Locate the missing values
nan_all = df.isnull()
print(nan_all)
# Sample output:
#     col1   col2   col3   col4
# 0  False  False  False  False
# 1  False   True  False  False
# 2  False  False  False  False
# 3  False  False  False  False
# 4  False  False  False   True
# 5  False  False  False  False

nan_col1 = df.isnull().any()  # columns that contain at least one NA
print(nan_col1)
# Sample output:
# col1    False
# col2     True
# col3    False
# col4     True
# dtype: bool

nan_col2 = df.isnull().all()  # columns that are entirely NA
print(nan_col2)
# Sample output:
# col1    False
# col2    False
# col3    False
# col4    False
# dtype: bool

# Drop missing values
df2 = df.dropna()  # drop every row that contains an NA
print(df2)
# Sample output:
#        col1      col2      col3      col4
# 0 -0.977511 -0.566332 -0.529934  1.489695
# 2  0.385777 -0.638822  0.325953 -0.240780
# 3  0.938351 -0.746889  0.375200 -0.715265
# 5  1.002177  0.448844 -0.584634 -1.038151

# Handle missing values with scikit-learn's preprocessing tools
if SimpleImputer is not None:
    # Replacement rule: substitute each NaN with its column mean.
    # SimpleImputer always works column-wise (the old Imputer's axis=0).
    nan_model = SimpleImputer(missing_values=np.nan, strategy='mean')
    nan_result = nan_model.fit_transform(df)  # apply the rule; returns an ndarray
    print(nan_result)
else:
    print('scikit-learn is not installed; skipping the SimpleImputer example')
# Sample output:
# [[-0.97751051 -0.56633185 -0.52993389  1.48969465]
#  [-0.49112788 -0.25284792 -0.81117388 -1.10271738]
#  [ 0.38577678 -0.63882219  0.32595345 -0.24077995]
#  [ 0.93835121 -0.74688892  0.37519957 -0.71526484]
#  [ 1.10341788  0.23895916 -0.45911413 -0.32144373]
#  [ 1.00217657  0.4488442  -0.58463419 -1.03815116]]

# Handle missing values with pandas.
# `fillna(method=...)` is deprecated since pandas 2.1, so the dedicated
# `bfill()` / `ffill()` methods are used instead.
nan_result_pd1 = df.bfill()  # fill each NaN with the next valid value below it
print(nan_result_pd1)
# Sample output:
#        col1      col2      col3      col4
# 0 -0.977511 -0.566332 -0.529934  1.489695
# 1 -0.491128 -0.638822 -0.811174 -1.102717
# 2  0.385777 -0.638822  0.325953 -0.240780
# 3  0.938351 -0.746889  0.375200 -0.715265
# 4  1.103418  0.238959 -0.459114 -1.038151
# 5  1.002177  0.448844 -0.584634 -1.038151

# Backward fill, but fill at most one consecutive NaN per gap
nan_result_pd2 = df.bfill(limit=1)
print(nan_result_pd2)
# Sample output:
#        col1      col2      col3      col4
# 0 -0.977511 -0.566332 -0.529934  1.489695
# 1 -0.491128 -0.638822 -0.811174 -1.102717
# 2  0.385777 -0.638822  0.325953 -0.240780
# 3  0.938351 -0.746889  0.375200 -0.715265
# 4  1.103418  0.238959 -0.459114 -1.038151
# 5  1.002177  0.448844 -0.584634 -1.038151

nan_result_df3 = df.ffill()  # fill each NaN with the last valid value above it
print(nan_result_df3)
# Sample output:
#        col1      col2      col3      col4
# 0 -0.977511 -0.566332 -0.529934  1.489695
# 1 -0.491128 -0.566332 -0.811174 -1.102717
# 2  0.385777 -0.638822  0.325953 -0.240780
# 3  0.938351 -0.746889  0.375200 -0.715265
# 4  1.103418  0.238959 -0.459114 -0.715265
# 5  1.002177  0.448844 -0.584634 -1.038151

nan_result_df4 = df.fillna(0)  # replace every NaN with 0
print(nan_result_df4)
# Sample output:
#        col1      col2      col3      col4
# 0 -0.977511 -0.566332 -0.529934  1.489695
# 1 -0.491128  0.000000 -0.811174 -1.102717
# 2  0.385777 -0.638822  0.325953 -0.240780
# 3  0.938351 -0.746889  0.375200 -0.715265
# 4  1.103418  0.238959 -0.459114  0.000000
# 5  1.002177  0.448844 -0.584634 -1.038151

# Use a different replacement value for each column
nan_result_df5 = df.fillna({'col2': 1.1, 'col4': 1.2})
print(nan_result_df5)
# Sample output:
#        col1      col2      col3      col4
# 0 -0.977511 -0.566332 -0.529934  1.489695
# 1 -0.491128  1.100000 -0.811174 -1.102717
# 2  0.385777 -0.638822  0.325953 -0.240780
# 3  0.938351 -0.746889  0.375200 -0.715265
# 4  1.103418  0.238959 -0.459114  1.200000
# 5  1.002177  0.448844 -0.584634 -1.038151

# Replace NaNs in col2..col4 with the mean of their own column
# (the label slice is inclusive on both ends)
nan_result_df6 = df.fillna(df.mean().loc['col2':'col4'])
print(nan_result_df6)
# Sample output:
#        col1      col2      col3      col4
# 0 -0.977511 -0.566332 -0.529934  1.489695
# 1 -0.491128 -0.252848 -0.811174 -1.102717
# 2  0.385777 -0.638822  0.325953 -0.240780
# 3  0.938351 -0.746889  0.375200 -0.715265
# 4  1.103418  0.238959 -0.459114 -0.321444
# 5  1.002177  0.448844 -0.584634 -1.038151

nan_result_df7 = df.replace(np.nan, 0)  # same effect via DataFrame.replace
print(nan_result_df7)
# Sample output:
#        col1      col2      col3      col4
# 0 -0.977511 -0.566332 -0.529934  1.489695
# 1 -0.491128  0.000000 -0.811174 -1.102717
# 2  0.385777 -0.638822  0.325953 -0.240780
# 3  0.938351 -0.746889  0.375200 -0.715265
# 4  1.103418  0.238959 -0.459114  0.000000
# 5  1.002177  0.448844 -0.584634 -1.038151