import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline plt.style.use('fivethirtyeight')
# Column labels applied to the CSV when it is read. 'onset_diabetes' is the
# column this script later uses as the classification target.
pima_column_names = [
    'times_pregnant',
    'plasma_glucose_concentration',
    'diastolic_blood_pressure',
    'triceps_thickness',
    'serum_insulin',
    'bmi',
    'pedigree_function',
    'age',
    'onset_diabetes',
]

# Raw string: '\p' is not a valid escape sequence, so the non-raw spelling
# only worked by accident and triggers a warning on current Python versions.
# The resulting path value is unchanged.
path = r'D:\pima-indians-diabetes.csv'

# header=0 combined with names= tells pandas to discard the file's first row
# and use the labels above instead.
# NOTE(review): if the CSV has no header row, this silently drops the first
# record -- confirm against the actual file.
pima = pd.read_csv(path, header=0, names=pima_column_names)
pima.head()

# Number of (rows, columns) loaded.
pima.shape

# Share of each target class in the full data set.
pima['onset_diabetes'].value_counts(normalize=True)
# Overlay the glucose histograms of the two outcome classes on one set of
# axes so their distributions can be compared directly.
col = 'plasma_glucose_concentration'
for outcome, label in ((0, 'non-diabetes'), (1, 'diabetes')):
    values = pima.loc[pima['onset_diabetes'] == outcome, col]
    plt.hist(values, bins=10, alpha=0.5, label=label)

plt.legend(loc='upper right')
plt.xlabel(col)
plt.ylabel('Frequency')
plt.title('Histogram of {}'.format(col))
# Draw the same per-class overlay histogram for several features, one figure
# per feature.
cols = ['bmi', 'diastolic_blood_pressure', 'plasma_glucose_concentration']

# The class masks do not depend on the feature, so build them once.
without_diabetes = pima['onset_diabetes'] == 0
with_diabetes = pima['onset_diabetes'] == 1

for col in cols:
    plt.hist(pima.loc[without_diabetes, col], bins=10, alpha=0.5,
             label='non-diabetes')
    plt.hist(pima.loc[with_diabetes, col], bins=10, alpha=0.5,
             label='diabetes')
    plt.legend(loc='upper right')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.title('Histogram of {}'.format(col))
    # Render now so the next feature starts on a fresh figure.
    plt.show()
# Pairwise correlations between all columns, computed once and reused for
# both the heatmap and the ranking below.
correlations = pima.corr()
sns.heatmap(correlations)

# Columns ranked by their correlation with the target, strongest first.
correlations['onset_diabetes'].sort_values(ascending=False)

# Count of missing values per column.
pima.isnull().sum()

# Summary statistics per column.
pima.describe()
# Missing-value count for serum insulin before the conversion.
print(pima['serum_insulin'].isnull().sum())

# Treat a recorded 0 as a missing measurement. A vectorised replace with an
# explicit NaN keeps the column numeric, instead of calling a Python lambda
# per element and relying on None being inferred as missing.
pima['serum_insulin'] = pima['serum_insulin'].replace(0, np.nan)

# Missing-value count after the conversion.
print(pima['serum_insulin'].isnull().sum())
# Features in which a recorded 0 is treated as a missing value.
columns = [
    'serum_insulin',
    'bmi',
    'plasma_glucose_concentration',
    'diastolic_blood_pressure',
    'triceps_thickness',
]

# Assign the result back instead of calling replace(..., inplace=True) on
# pima[col]: that mutates an intermediate selection, which pandas warns about
# and which leaves `pima` unchanged once Copy-on-Write is active. Using
# np.nan rather than None keeps the columns numeric, so describe() and
# mean() continue to include them.
pima[columns] = pima[columns].replace(0, np.nan)

# Missing-value count per column after the conversion.
pima.isnull().sum()
pima.head()
pima.describe()
# Strategy 1: discard every row that has at least one missing value.
pima_dropped = pima.dropna()

# Percentage of rows removed by dropna(), rounded to a whole number.
rows_before = pima.shape[0]
rows_after = pima_dropped.shape[0]
num_rows_lost = round(100 * (rows_before - rows_after) / float(rows_before))
print("删除 {}%".format(num_rows_lost))

# Class balance before and after dropping rows.
pima['onset_diabetes'].value_counts(normalize=True)
pima_dropped['onset_diabetes'].value_counts(normalize=True)

# Column means before and after dropping rows.
mean_before = pima.mean()
mean_after = pima_dropped.mean()
mean_before
mean_after

# Relative shift of each column mean caused by dropping rows.
# NOTE: the plotted values are fractions (not multiplied by 100) even though
# the axis label says '%'.
relative_shift = (mean_after - mean_before) / mean_before
ax = relative_shift.plot(kind='bar', title='% change in average column values')
ax.set_ylabel('% change')
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import warnings

# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Split the row-dropped data into the target and the feature matrix.
y_dropped = pima_dropped['onset_diabetes']
X_dropped = pima_dropped.drop(columns='onset_diabetes')
print("leanrning from {} rows".format(X_dropped.shape[0]))

# Search over k = 1..7 neighbours using GridSearchCV's default
# cross-validation and scoring.
knn_params = {'n_neighbors': list(range(1, 8))}
knn = KNeighborsClassifier()
grid = GridSearchCV(knn, knn_params)
grid.fit(X_dropped, y_dropped)

# Best cross-validated score and the parameter setting that achieved it.
print(grid.best_score_, grid.best_params_)
# Missing-value count per column before filling.
pima.isnull().sum()

# Strategy 2 (manual): fill missing glucose readings with the column mean.
# The filled Series is assigned back rather than using
# pima[col].fillna(..., inplace=True): that mutates an intermediate
# selection, which pandas warns about and which leaves `pima` unchanged once
# Copy-on-Write is active.
glucose_mean = pima['plasma_glucose_concentration'].mean()
pima['plasma_glucose_concentration'] = (
    pima['plasma_glucose_concentration'].fillna(glucose_mean)
)

# Missing-value count per column after filling.
pima.isnull().sum()
from sklearn.impute import SimpleImputer

# Fill every remaining missing value with the median of its column.
# The imputer is fitted on the whole frame, target column included.
imputer = SimpleImputer(strategy="median")

# fit_transform returns a plain array, so wrap it back into a DataFrame with
# the original column labels.
pima_imputed = pd.DataFrame(
    imputer.fit_transform(pima),
    columns=pima_column_names,
)
pima_imputed.head()

# Missing-value count per column after imputation.
pima_imputed.isnull().sum()
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import warnings

# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Baseline: replace every missing value with 0 and keep all rows.
pima_zero = pima.fillna(0)
y_zero = pima_zero['onset_diabetes']
X_zero = pima_zero.drop(columns='onset_diabetes')
print("learning from {} rows".format(X_zero.shape[0]))

# Search over k = 1..7 neighbours using GridSearchCV's default
# cross-validation and scoring.
knn_params = {'n_neighbors': list(range(1, 8))}
knn = KNeighborsClassifier()
grid = GridSearchCV(knn, knn_params)
grid.fit(X_zero, y_zero)

# Best cross-validated score and the parameter setting that achieved it.
print(grid.best_score_, grid.best_params_)
|