缺失值分析处理、可视化数据分析、代码:import numpy as np; import pandas as pd; import matplotlib.pyplot as plt; import seaborn as sns; import warnings; warnings.filterwarnings('ignore'); train = pd.read_csv("train.csv"); test = pd.read_csv("test.csv") ……
data:image/s3,"s3://crabby-images/5bed2/5bed242de6ea15b4779bac05f3e0649ded6737ad" alt="python泰坦尼克号生存预测_数据"
data:image/s3,"s3://crabby-images/1eb95/1eb959f412a8447cfad2d7e662a06cdb5ce5f1ab" alt="python泰坦尼克号生存预测_填充空白_02"
data:image/s3,"s3://crabby-images/91128/91128f38b65659497ac0280305bdc598ac5dab6d" alt="python泰坦尼克号生存预测_数据分析_03"
缺失值分析处理
data:image/s3,"s3://crabby-images/938d4/938d4a7835c7e7f8e5efe9b73f19a9e88cfe7da7" alt="python泰坦尼克号生存预测_数据_04"
data:image/s3,"s3://crabby-images/60e79/60e79f623a4194d83d20f091fc756e5f7df5b19a" alt="python泰坦尼克号生存预测_数据分析_05"
data:image/s3,"s3://crabby-images/a91d0/a91d01fceae8fb53bf2f09babb480950fdacc7b4" alt="python泰坦尼克号生存预测_数据分析_06"
data:image/s3,"s3://crabby-images/bb142/bb142000c59f8e368a2f1894da92592e567ab5d4" alt="python泰坦尼克号生存预测_数据_07"
data:image/s3,"s3://crabby-images/b61dd/b61dd3af5001895ec1422595eb14c1264cfb0eea" alt="python泰坦尼克号生存预测_填充空白_08"
可视化&数据分析
data:image/s3,"s3://crabby-images/e6d33/e6d33f187e9776408e0acaa5c0120a176715040f" alt="python泰坦尼克号生存预测_填充空白_09"
data:image/s3,"s3://crabby-images/1cc1b/1cc1b8d01b072dd8515851eac36ed72c0f6ef988" alt="python泰坦尼克号生存预测_数据分析_10"
data:image/s3,"s3://crabby-images/08098/0809896acc684b968b1f3f86fe6f82afefff2bdf" alt="python泰坦尼克号生存预测_数据分析_11"
data:image/s3,"s3://crabby-images/d7c35/d7c35ad7ec9388a00388f247887d2ae5e35b3283" alt="python泰坦尼克号生存预测_数据_12"
代码
# Setup and data loading.
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress every warning emitted through the `warnings` module.
warnings.filterwarnings('ignore')

# Read the three CSV files from the working directory and preview each one.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
gender = pd.read_csv("gender.csv")
print(train.head())
print(test.head())
print(gender.head())

# Stack train and test row-wise into a single frame with a fresh 0..n-1 index,
# so the cleaning steps below are applied to both at once.
data = pd.concat([train, test], ignore_index=True)
data  # notebook-style display; has no effect when run as a plain script

# DataFrame.info() prints its summary itself and returns None, so it is
# called directly instead of being wrapped in print().
data.info()
# Missing-value analysis and handling on the combined `data` frame.

# Count missing values per column.
# Original author's note: Cabin has many missing values and can simply be dropped.
print(data.isnull().sum())

# Summary statistics; the mean age is used below to fill the missing ages.
data.describe()

# Fill missing Age values with the mean of the known ages.
data['Age'] = data['Age'].fillna(data['Age'].mean())
data  # notebook-style display

# Show the rows whose Fare is missing.
data[data['Fare'].isnull()]

# Fill missing Fare with the median fare of passengers with Embarked == 'S'
# and Pclass == 3.
# NOTE(review): presumably the row(s) with a missing Fare belong to that
# group -- confirm against the output of the line above.
is_third_class_from_s = (data['Embarked'] == 'S') & (data['Pclass'] == 3)
data['Fare'] = data['Fare'].fillna(data.loc[is_third_class_from_s, 'Fare'].median())

# Show the rows whose Embarked is missing, then the median fare and the number
# of non-null Survived values for every (Pclass, Embarked) combination.
data[data['Embarked'].isnull()]
data.groupby(by=['Pclass', 'Embarked'])['Fare'].median()
data.groupby(by=['Pclass', 'Embarked'])['Survived'].count()

# Fill missing Embarked with 'C'.
# NOTE(review): the choice of 'C' is presumably based on the two group-by
# tables above -- confirm.
data['Embarked'] = data['Embarked'].fillna('C')

# Look at the row at position 61.
# NOTE(review): presumably one of the rows that had a missing Embarked -- confirm.
data.iloc[61]

# Original author's note: close to 70% of Cabin is missing, so drop the column.
data = data.drop('Cabin', axis=1)
data  # notebook-style display

# Re-check the remaining missing values per column.
print(data.isnull().sum())
# Visualization: relation between Survived and several passenger attributes.
# Each chart gets its own figure and an explicit plt.show(), so the charts do
# not pile up on one set of axes when this runs outside a notebook.

# Mean of Survived per passenger class.
plt.figure()
sns.barplot(x='Pclass', y='Survived', data=data)
plt.show()

# Density of Pclass for Survived == 0 (red) and Survived == 1 (blue).
# NOTE(review): recent seaborn releases deprecate `shade` in favour of `fill`;
# confirm against the installed seaborn version before switching.
plt.subplots(figsize=(15, 8))
sns.kdeplot(data.loc[(data['Survived'] == 0), 'Pclass'],
            shade=True, color='red', label='Not Survived')
sns.kdeplot(data.loc[(data['Survived'] == 1), 'Pclass'],
            shade=True, color='blue', label='Survived')
labels = ['1', '2', '3']
# Put one tick on each distinct Pclass value.
# NOTE(review): assumes Pclass has exactly three distinct values, matching
# the three labels -- confirm.
plt.xticks(sorted(data.Pclass.unique()), labels)
# Render the `label=` texts passed above; without a legend they are not drawn.
plt.legend()
plt.show()

# Mean of Survived per sex.
# Original author's observation: more women survived than men.
plt.figure()
sns.barplot(x='Sex', y='Survived', data=data)
plt.show()

# Mean of Survived per Parch value.
plt.figure()
sns.barplot(x='Parch', y='Survived', data=data)
plt.show()

# Mean of Survived per SibSp value.
plt.figure()
sns.barplot(x='SibSp', y='Survived', data=data)
plt.show()

# Mean of Survived per port of embarkation.
plt.figure()
sns.barplot(x='Embarked', y='Survived', data=data)
plt.show()