반응형
Train Data
▶ Read file
# Load the Titanic training set (expects ./titanic/train.csv relative to the CWD).
df_train = pd.read_csv("./titanic/train.csv")
▶ Drop column
# Drop columns that are not used as features (PassengerId is a row id;
# Cabin is dropped here — presumably because it is sparsely populated; confirm).
df_train.drop(columns=["PassengerId", "Cabin"], inplace=True)
▶ Resolution of missing values in Embarked.
# Fill missing Embarked values.
# Fixes: chained fillna(..., inplace=True) on a column selection is deprecated
# (pandas 2.x) and unreliable under copy-on-write; and .unique()[0] filled with
# an arbitrary "first seen" value — mode() fills with the most frequent port.
df_train["Embarked"] = df_train["Embarked"].fillna(df_train["Embarked"].mode()[0])
▶ Extract the title from the Name column and flag the "Master" title.
# Extract the honorific title ("Mr", "Master", ...) from "Lastname, Title. First ..."
# and keep only a boolean flag for the "Master" title (young boys).
# Fix: the first line ended with a stray '\' line continuation, which fused it
# with the next assignment into a SyntaxError as published.
df_train['Name'] = df_train['Name'].str.split(', ').str[1].str.split('. ').str[0]
df_train['Master'] = (df_train['Name'] == 'Master')
# Name and Ticket carry no further use once the title flag is extracted.
df_train = df_train.drop(columns=['Name', 'Ticket'])
▶ Create the "age_new" column.
# Bucket Age into three coarse bands; rows whose Age is missing keep NaN here
# (Age has not been imputed yet at this point in the pipeline).
age = df_train["Age"]
df_train.loc[age < 10, "age_new"] = "baby"
df_train.loc[(age >= 10) & (age < 50), "age_new"] = "young"
df_train.loc[age >= 50, "age_new"] = "old"
▶ Encoding
# Label-encode every non-numeric column (object / category / bool dtypes).
og_columns = df_train.columns[(df_train.dtypes == 'O') | (df_train.dtypes == 'category') | (df_train.dtypes == 'bool')]
# Fix: the loop body lost its indentation in the published listing (SyntaxError).
# One fitted encoder per column is stashed in a global, preserving the original
# pattern. NOTE(review): reusing these encoders on the test set would guarantee
# consistent codes — the test section currently fits its own encoders; confirm.
for i in og_columns:
    encoder = LabelEncoder()
    globals()[f'df_train_{i}_encoder'] = encoder
    df_train[i] = encoder.fit_transform(df_train[i])
▶ Resolution of missing values in "Age".
# Impute missing ages with the median age of the passenger's (Pclass, Sex) group.
Age_md = df_train.groupby(['Pclass', 'Sex']).Age.agg(['median'])  # kept for inspection
# Fix: replaces six hand-written (Sex, Pclass) cases with one grouped transform —
# identical fill values, and it cannot drift if the class/sex levels change.
df_train['Age'] = df_train['Age'].fillna(
    df_train.groupby(['Pclass', 'Sex'])['Age'].transform('median'))
▶ Create family columns and delete columns.
# Collapse the two relative-count columns into a single family-size feature.
df_train["family"] = df_train["SibSp"] + df_train["Parch"]
df_train = df_train.drop(columns=["SibSp", "Parch"])
▶ Separate X and y.
# Separate the target from the feature matrix.
y = df_train["Survived"]
X = df_train.drop(columns="Survived")
Test Data
▶ Read data
# Load the Titanic test set (expects ./titanic/test.csv relative to the CWD).
df_test = pd.read_csv('./titanic/test.csv')
▶ Drop column
# Same column drop as the train set so the feature layouts match.
df_test.drop(columns=["PassengerId", "Cabin"], inplace=True)
▶ Extract the title from the Name column and flag the "Master" title.
# Mirror of the train-set title extraction: pull the honorific out of
# "Lastname, Title. First ..." and keep only a boolean "is Master" flag.
titles = df_test['Name'].str.split(', ').str[1].str.split('. ').str[0]
df_test['Name'] = titles
df_test['Master'] = titles == 'Master'
df_test = df_test.drop(columns='Name').drop(columns='Ticket')
▶ Resolution of missing values in "Age", "Fare".
# Impute missing test-set ages per (Pclass, Sex) group, and missing Fare with
# the overall mean fare.
# NOTE(review): the test set fills Age with the group *mean* while the train set
# used the group *median* — preserved here, but the two should probably agree;
# confirm intent.
age_md = df_test.groupby(['Pclass', 'Sex']).Age.agg(['mean'])  # kept for inspection
# Fix: replaces six hand-written (Sex, Pclass) cases with one grouped transform
# (identical fill values) and avoids chained fillna(..., inplace=True), which is
# deprecated in pandas 2.x.
df_test['Age'] = df_test['Age'].fillna(
    df_test.groupby(['Pclass', 'Sex'])['Age'].transform('mean'))
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].mean())
▶ Create the "age_new" column.
# Same coarse age banding as the train set (Age is already imputed here,
# so no NaN rows are left behind in the test set).
age = df_test["Age"]
df_test.loc[age < 10, "age_new"] = "baby"
df_test.loc[(age >= 10) & (age < 50), "age_new"] = "young"
df_test.loc[age >= 50, "age_new"] = "old"
▶ Create family columns and delete columns.
# Collapse SibSp + Parch into the same family-size feature as the train set.
df_test["family"] = df_test["SibSp"] + df_test["Parch"]
df_test = df_test.drop(columns=["SibSp", "Parch"])
▶ Encoding
# Label-encode the non-numeric test-set columns.
og_columns = df_test.columns[(df_test.dtypes == 'O') | (df_test.dtypes == 'category') | (df_test.dtypes == 'bool')]
# Fix: the loop body lost its indentation in the published listing (SyntaxError).
# NOTE(review): these encoders are fitted on the test set independently of the
# train-set encoders, so a category can map to a different integer than it did
# in training (or fail on unseen categories) — reusing the train encoders'
# transform() would be safer; confirm.
for i in og_columns:
    encoder = LabelEncoder()
    globals()[f'df_test{i}_encoder'] = encoder
    df_test[i] = encoder.fit_transform(df_test[i])
Answer Data
Read the answer data.
# Ground-truth survival labels for the test set, kept as a Series.
df_answer = pd.read_csv("./titanic/answer_tit.csv")["Survived"]
Create Model
▶ Hyperparameter
▶ install keras-tuner
!pip install keras-tuner --upgrade
▶ model setting.
import keras_tuner
from tensorflow import keras
def build_model(hp):
    """Build a tunable binary classifier for keras-tuner.

    Parameters
    ----------
    hp : keras_tuner.HyperParameters
        Hyperparameter container supplied by the tuner.

    Returns
    -------
    keras.Model
        Compiled model; input width is taken from the module-level feature
        matrix ``X``.
    """
    # The original called hp.Choice('units', ...) once per layer with the SAME
    # name, so keras-tuner gives every layer the same value anyway — hoisted to
    # one call to make that explicit. Use distinct names (units_1, ...) if the
    # layers should be tuned independently.
    units = hp.Choice('units', [8, 16, 32, 64, 128, 256])
    ip = Input(shape=(X.shape[1],))
    n = BatchNormalization()(ip)
    n = Dense(units, activation='elu')(n)
    n = Dropout(0.5)(n)
    n = BatchNormalization()(n)
    n = Dense(units, activation='elu')(n)
    n = BatchNormalization()(n)
    n = Dense(units, activation='elu')(n)
    n = Dropout(0.5)(n)
    n = BatchNormalization()(n)
    n = Dense(units, activation='elu')(n)
    n = Dense(1, activation='sigmoid')(n)
    model = Model(inputs=ip, outputs=n)
    # Fix: 'mse' -> 'binary_crossentropy', the appropriate loss for a sigmoid
    # binary classifier (the published listing also lost the body indentation).
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics='accuracy')
    return model
▶ set tuner
# Random search over the space defined in build_model, ranked by validation
# accuracy. Trial state persists under ./tuner1 (delete it to force a rerun).
tuner1 = keras_tuner.RandomSearch(
build_model, objective='val_accuracy', max_trials=5, directory='./tuner1')
▶ tuner search
# Run the search: each of the 5 trials trains for up to 1000 epochs with a 10%
# validation split. NOTE(review): no EarlyStopping callback, so every trial
# runs the full 1000 epochs — consider adding one.
tuner1.search(X, y, epochs=1000, validation_split=0.1)
▶ get best models
# Retrieve the best model found by the search (ranked by the tuner objective).
best_model1 = tuner1.get_best_models()[0]
▶ evaluate model
# Score the tuned model on the preprocessed test set against the ground truth.
best_model1.evaluate(df_test, df_answer)
# Hand-fixed architecture: a funnel of elu hidden layers with dropout and
# batch normalisation, ending in a sigmoid unit for binary survival.
inputs = Input(shape=(8,))  # 8 = feature count after preprocessing — confirm against X.shape[1]
n = Dense(128, activation="elu")(inputs)
n = Dropout(0.5)(n)
n = BatchNormalization()(n)
n = Dense(64, activation="elu")(n)
n = Dropout(0.5)(n)
n = BatchNormalization()(n)
n = Dense(32, activation="elu")(n)
n = BatchNormalization()(n)
n = Dense(16, activation="elu")(n)
n = BatchNormalization()(n)
n = Dense(1, activation="sigmoid")(n)
model = Model(inputs=inputs, outputs=n)
# Fixes: 'mse' -> 'binary_crossentropy' (the proper loss for a sigmoid binary
# classifier), and the metric is named "accuracy" so Keras logs 'val_accuracy'
# and callbacks that monitor it can actually find it ("acc" logs 'val_acc').
model.compile(loss="binary_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])
▶ Best Weight Choice
# Train while checkpointing the best weights, then restore them.
checkpoint_filepath = './tmp/checkpoint'
# Fix: ModelCheckpoint's monitor must exactly match a logged metric name — a
# model compiled with metrics=["acc"] logs 'val_acc', so monitoring
# 'val_accuracy' never matches, the checkpoint is never written, and
# load_weights fails. 'val_loss' is always present regardless of metric naming.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath, save_weights_only=True,
    monitor='val_loss', mode='min', save_best_only=True)
model.fit(X, y, epochs=300, verbose=0, validation_split=0.2, callbacks=[model_checkpoint_callback])
model.load_weights(checkpoint_filepath)
Fit & Predict Model
▶ Best Weight Choice
# NOTE(review): this repeats the checkpoint/fit/load sequence above, training
# the same model for another 300 epochs — presumably intentional; confirm.
checkpoint_filepath = './tmp/checkpoint'
# Fix: monitor 'val_loss' (always logged) instead of 'val_accuracy', which is
# absent when the compiled metric is named "acc" — in that case the checkpoint
# file is never created and load_weights raises.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath, save_weights_only=True,
    monitor='val_loss', mode='min', save_best_only=True)
model.fit(X, y, epochs=300, verbose=0, validation_split=0.2, callbacks=[model_checkpoint_callback])
model.load_weights(checkpoint_filepath)
# Final score on the held-out test set.
results = model.evaluate(df_test, df_answer, verbose=2)
test_loss, test_acc = results
print(f"test loss : {test_loss}, test acc : {test_acc}")
반응형
'Python' 카테고리의 다른 글
[Python] Tensorflow (1) | 2022.12.19 |
---|---|
[Python] K-means Algorithm (0) | 2022.11.17 |
[Python] PCA, LDA (0) | 2022.11.17 |
[Python] openCV : Matching (0) | 2022.11.17 |
[Python] openCV : Moments (0) | 2022.11.17 |