월간 데이콘 2 천체 유형 분류

  • 상금 : 총 94만원
  • 2020.02.01 ~ 2020.02.29 23:59
  • 657팀
  • D-10
참여

RF_BASE

  • day2020.02.13 20:06
  • view2 view
  • languagePython
  • writerby 런투런
댓글 0
RANDOM FOREST 기본 코드
코드
# train = pd.read_csv('train.csv' , index_col =0)
# test = pd.read_csv('test.csv' , index_col =0)
import numpy as np # linear algebra
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, roc_auc_score, f1_score, roc_curve, auc,precision_recall_curve
from sklearn import metrics
import gc
import matplotlib.pyplot as plt
import seaborn as sns

# Load the competition tables; the first CSV column (shown as "id" in the
# previews) is used as the row index for all three frames.
train = pd.read_csv("../input/train.csv", index_col=0)
test = pd.read_csv("../input/test.csv", index_col=0)
sample_submission = pd.read_csv("../input/sample_submission.csv", index_col=0)
train.head()
type fiberID psfMag_u psfMag_g psfMag_r psfMag_i psfMag_z fiberMag_u fiberMag_g fiberMag_r ... petroMag_u petroMag_g petroMag_r petroMag_i petroMag_z modelMag_u modelMag_g modelMag_r modelMag_i modelMag_z
id
0 QSO 601 23.198224 21.431953 21.314148 21.176553 21.171444 22.581309 21.644453 21.657571 ... 22.504317 21.431636 21.478312 21.145409 20.422446 22.749241 21.465534 21.364187 21.020605 21.147340
1 QSO 788 21.431355 20.708104 20.678850 20.703420 20.473229 21.868797 21.029773 20.967054 ... 21.360701 20.778968 20.889705 20.639812 20.646660 21.492955 20.758527 20.753925 20.693389 20.512314
2 QSO 427 17.851451 16.727898 16.679677 16.694640 16.641788 18.171890 17.033098 16.999682 ... 17.867253 16.738784 16.688874 16.744210 16.808006 17.818063 16.697434 16.641249 16.660177 16.688928
3 QSO 864 20.789900 20.040371 19.926909 19.843840 19.463270 21.039030 20.317165 20.217898 ... 20.433907 19.993727 19.985531 19.750917 19.455117 20.770711 20.001699 19.889798 19.758113 19.552855
4 STAR_RED_DWARF 612 26.454969 23.058767 21.471406 19.504961 18.389096 25.700632 23.629122 21.742750 ... 25.859229 22.426929 21.673551 19.610012 18.376141 24.877052 23.147993 21.475342 19.487330 18.375655

5 rows × 22 columns

test.head()
fiberID psfMag_u psfMag_g psfMag_r psfMag_i psfMag_z fiberMag_u fiberMag_g fiberMag_r fiberMag_i ... petroMag_u petroMag_g petroMag_r petroMag_i petroMag_z modelMag_u modelMag_g modelMag_r modelMag_i modelMag_z
id
199991 251 23.817399 22.508963 20.981106 18.517316 17.076079 25.053890 23.167848 21.335901 18.835858 ... 22.246697 22.796239 21.195315 18.584486 17.154284 25.391534 22.499435 21.011918 18.499341 17.091474
199992 386 22.806983 21.937111 20.335770 20.000512 19.527369 22.498565 22.186000 20.618879 20.301204 ... 21.729831 21.837511 20.196128 19.967204 19.683671 22.475338 21.853442 20.173169 19.796757 19.567372
199993 232 21.024250 19.235669 18.304061 17.808608 17.380113 21.205546 19.439533 18.344433 17.909690 ... 20.722629 18.710223 17.611851 17.158519 16.843986 20.579314 18.653338 17.562108 17.120529 16.708748
199994 557 20.503424 20.286261 20.197204 20.162419 20.059832 20.976132 20.611498 20.567262 20.479318 ... 20.329269 20.385262 20.129157 20.206574 20.212342 20.479879 20.280943 20.150499 20.206221 20.092909
199995 75 24.244851 22.668237 21.239333 19.284777 18.235939 25.681860 22.935289 21.642456 19.624926 ... 22.308298 22.957496 21.285033 19.299120 18.307526 25.489360 22.857290 21.191862 19.237964 18.280368

5 rows × 21 columns

sample_submission.head()
STAR_WHITE_DWARF STAR_CATY_VAR STAR_BROWN_DWARF SERENDIPITY_RED REDDEN_STD STAR_BHB GALAXY SERENDIPITY_DISTANT QSO SKY STAR_RED_DWARF ROSAT_D STAR_PN SERENDIPITY_FIRST STAR_CARBON SPECTROPHOTO_STD STAR_SUB_DWARF SERENDIPITY_MANUAL SERENDIPITY_BLUE
id
199991 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
199992 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
199993 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
199994 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
199995 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
# Map every class name to its column position in the sample submission so
# that integer labels follow the submission's column order.
column_number = {
    name: position for position, name in enumerate(sample_submission.columns)
}
def to_number(x, dic):
    """Return the value stored under key ``x`` in the mapping ``dic``.

    Raises:
        KeyError: if ``x`` is not a key of ``dic``.
    """
    mapped = dic[x]
    return mapped
# Encode each string class label as its integer submission-column index.
train['type_num'] = train['type'].apply(to_number, args=(column_number,))

# Features are every column except the raw label and its encoded copy.
label_columns = ['type', 'type_num']
trainval_x = train.drop(columns=label_columns)
trainval_y = train['type_num']
test_x = test  # alias of `test`; no copy is made
trainval_x
fiberID psfMag_u psfMag_g psfMag_r psfMag_i psfMag_z fiberMag_u fiberMag_g fiberMag_r fiberMag_i ... petroMag_u petroMag_g petroMag_r petroMag_i petroMag_z modelMag_u modelMag_g modelMag_r modelMag_i modelMag_z
id
0 601 23.198224 21.431953 21.314148 21.176553 21.171444 22.581309 21.644453 21.657571 21.387653 ... 22.504317 21.431636 21.478312 21.145409 20.422446 22.749241 21.465534 21.364187 21.020605 21.147340
1 788 21.431355 20.708104 20.678850 20.703420 20.473229 21.868797 21.029773 20.967054 20.937731 ... 21.360701 20.778968 20.889705 20.639812 20.646660 21.492955 20.758527 20.753925 20.693389 20.512314
2 427 17.851451 16.727898 16.679677 16.694640 16.641788 18.171890 17.033098 16.999682 17.095999 ... 17.867253 16.738784 16.688874 16.744210 16.808006 17.818063 16.697434 16.641249 16.660177 16.688928
3 864 20.789900 20.040371 19.926909 19.843840 19.463270 21.039030 20.317165 20.217898 20.073852 ... 20.433907 19.993727 19.985531 19.750917 19.455117 20.770711 20.001699 19.889798 19.758113 19.552855
4 612 26.454969 23.058767 21.471406 19.504961 18.389096 25.700632 23.629122 21.742750 19.861718 ... 25.859229 22.426929 21.673551 19.610012 18.376141 24.877052 23.147993 21.475342 19.487330 18.375655
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
199986 568 20.630780 20.437920 20.050756 20.020974 19.959646 20.948519 20.726709 20.311562 20.323418 ... 20.836400 20.536338 20.126561 20.102815 19.796722 20.783295 20.326879 20.014046 20.061940 19.862513
199987 362 25.283695 22.807243 21.069792 20.187531 19.595598 26.677671 22.818723 20.786120 19.961857 ... 28.895442 22.250489 20.091693 18.985791 18.803354 26.921903 21.952440 19.776321 18.990039 18.528306
199988 146 23.609826 21.902709 20.330747 18.457535 17.326771 24.320816 22.287946 20.711737 18.810560 ... 25.982378 22.132993 20.329624 18.507337 17.418521 24.503460 21.897599 20.420524 18.443126 17.331724
199989 85 21.172003 20.561208 20.031754 19.793644 19.540471 21.704126 20.469748 19.872089 19.603742 ... 18.880781 18.065559 17.501307 17.180639 17.175205 19.109086 18.004976 17.440331 17.193743 16.969854
199990 964 20.624498 20.122276 20.332241 20.360527 19.953043 20.931572 20.432758 20.678753 20.641050 ... 20.796250 20.129721 20.415419 20.559018 19.816157 20.748047 20.116332 20.327396 20.217198 19.995023

199991 rows × 21 columns

test_x
fiberID psfMag_u psfMag_g psfMag_r psfMag_i psfMag_z fiberMag_u fiberMag_g fiberMag_r fiberMag_i ... petroMag_u petroMag_g petroMag_r petroMag_i petroMag_z modelMag_u modelMag_g modelMag_r modelMag_i modelMag_z
id
199991 251 23.817399 22.508963 20.981106 18.517316 17.076079 25.053890 23.167848 21.335901 18.835858 ... 22.246697 22.796239 21.195315 18.584486 17.154284 25.391534 22.499435 21.011918 18.499341 17.091474
199992 386 22.806983 21.937111 20.335770 20.000512 19.527369 22.498565 22.186000 20.618879 20.301204 ... 21.729831 21.837511 20.196128 19.967204 19.683671 22.475338 21.853442 20.173169 19.796757 19.567372
199993 232 21.024250 19.235669 18.304061 17.808608 17.380113 21.205546 19.439533 18.344433 17.909690 ... 20.722629 18.710223 17.611851 17.158519 16.843986 20.579314 18.653338 17.562108 17.120529 16.708748
199994 557 20.503424 20.286261 20.197204 20.162419 20.059832 20.976132 20.611498 20.567262 20.479318 ... 20.329269 20.385262 20.129157 20.206574 20.212342 20.479879 20.280943 20.150499 20.206221 20.092909
199995 75 24.244851 22.668237 21.239333 19.284777 18.235939 25.681860 22.935289 21.642456 19.624926 ... 22.308298 22.957496 21.285033 19.299120 18.307526 25.489360 22.857290 21.191862 19.237964 18.280368
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
209995 389 19.765035 19.542406 19.439286 19.357957 19.357305 20.093308 19.827172 19.714033 19.686861 ... 19.882728 19.549257 19.457564 19.303361 19.483389 19.792181 19.536518 19.432463 19.376445 19.354976
209996 247 19.960030 19.609379 19.325949 19.391290 19.258862 20.319914 19.928523 19.696262 19.763771 ... 20.085454 19.635609 19.381924 19.460517 19.390865 20.066552 19.604800 19.334113 19.401240 19.159412
209997 941 21.221899 20.276182 20.090775 19.927208 19.766732 21.254454 20.551190 20.365800 20.200872 ... 20.998020 20.315201 20.236725 20.058801 19.988363 21.045501 20.255485 20.141549 19.906014 19.953932
209998 674 21.625469 21.624585 21.093379 21.191775 21.186596 22.042942 21.950139 21.610805 21.501469 ... 22.320173 21.516809 21.270925 21.196019 21.331529 21.700769 21.589489 21.128330 21.430526 21.220902
209999 358 17.438361 16.531405 16.186019 16.034888 15.995932 17.747088 16.836321 16.501623 16.373316 ... 17.481303 16.575405 16.221938 16.103483 16.052459 17.444318 16.530546 16.176274 16.045822 15.995814

10009 rows × 21 columns

FE

Train

# train_x, val_x, train_y, val_y = train_test_split(trainval_x, trainval_y, test_size = 0.2, stratify= trainval_y )
trainval_y.value_counts()
8     49680
6     37347
18    21760
15    14630
4     14618
10    13750
5     13500
13     7132
11     6580
1      6506
7      4654
14     3257
3      2562
0      2160
16     1154
2       500
9       127
17       61
12       13
Name: type_num, dtype: int64
# NOTE: val_x is only assigned by the train_test_split call that is commented
# out above, so this cell raises NameError as long as that line stays disabled.
len(val_x)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-14-b88e0de2ed92> in <module>
----> 1 len(val_x)

NameError: name 'val_x' is not defined
# clf = RandomForestClassifier(n_estimators=1000, random_state=42)
# clf.fit(train_x, train_y)
# clf_probs = clf.predict_proba(val_x)
# score = log_loss(val_y, clf_probs)
# print(score)
# params = {
#     'objective':'binary',
#     'boosting_type':'gbdt',
#     'metric':'auc',
#     'n_jobs':-1,
#     'learning_rate':0.01,
#     'num_leaves': 2**8,
#     'max_depth':-1,
#     'tree_learner':'serial',
#     'colsample_bytree': 0.85,
#     'subsample_freq':1,
#     'subsample':0.85,
#     'max_bin':255,
#     'verbose':-1,
#     'seed': 47,
#     'n_estimators':10000, # 800
#     'early_stopping_rounds':500, # 100 
#     'reg_alpha':0.3,
#     'reg_lamdba':0.243
# } 
# %%time

# NFOLDS = 5
# folds = KFold(n_splits=NFOLDS)

# columns = X.columns
# splits = folds.split(X, y)
# y_preds = np.zeros(X_test.shape[0])
# y_oof = np.zeros(X.shape[0])
# score = 0

# feature_importances = pd.DataFrame()
# feature_importances['feature'] = columns
  
# for fold_n, (train_index, valid_index) in enumerate(splits):
#     X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
#     y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
#     dtrain = lgb.Dataset(X_train, label=y_train)
#     dvalid = lgb.Dataset(X_valid, label=y_valid)

#     clf = lgb.train(params, dtrain, valid_sets = [dtrain, dvalid], verbose_eval=200)
    
#     feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    
#     y_pred_valid = clf.predict(X_valid)
#     y_oof[valid_index] = y_pred_valid
#     print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")
    
#     score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
#     y_preds += clf.predict(X_test) / NFOLDS
    
#     del X_train, X_valid, y_train, y_valid
#     gc.collect()

log loss 계산

# Short aliases used by the cross-validation code: features, labels, test features.
X, y, X_test = trainval_x, trainval_y, test_x
# NOTE: clf_probs is first assigned by the hold-out baseline cell further down,
# so inspecting it here (before that cell has run successfully) raises NameError.
clf_probs[0,:]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-19-834a4f1708fe> in <module>
----> 1 clf_probs[0,:]

NameError: name 'clf_probs' is not defined
# Hold-out baseline: fit a small random forest on 80% of the labelled data and
# report multiclass log loss on the remaining 20%.
#
# The split is created here because the only other train_test_split call in
# the notebook is commented out, which left train_x / val_x / train_y / val_y
# undefined and made this cell fail with NameError. The split is stratified so
# that rare classes appear in both parts, and seeded for reproducibility.
train_x, val_x, train_y, val_y = train_test_split(
    trainval_x,
    trainval_y,
    test_size=0.2,
    stratify=trainval_y,
    random_state=42,
)

clf = RandomForestClassifier(n_estimators=50, random_state=42)
clf.fit(train_x, train_y)
clf_probs = clf.predict_proba(val_x)

# predict_proba columns follow clf.classes_; pass them explicitly so log_loss
# aligns the columns even if the validation part lacks some class.
score = log_loss(val_y, clf_probs, labels=clf.classes_)
print(score)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-20-aebc3577213e> in <module>
      1 clf = RandomForestClassifier(n_estimators=50, random_state=42)
----> 2 clf.fit(train_x, train_y)
      3 clf_probs = clf.predict_proba(val_x)
      4 score = log_loss(val_y, clf_probs)
      5 print(score)

NameError: name 'train_x' is not defined
# Settings for the forest that the cross-validation loop below refits on every fold.
rf_settings = {
    'n_estimators': 1000,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': 1,
}
clf = RandomForestClassifier(**rf_settings)
# NOTE: y_valid is only bound inside the cross-validation loop below and is
# deleted at the end of each iteration, so it is not available at cell level.
max(y_valid)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-22-3ec65fe560e7> in <module>
----> 1 max(y_valid)

NameError: name 'y_valid' is not defined
# Stratified K-fold cross-validation of `clf`.
#
# For every fold the classifier is refit on the training part, its feature
# importances are stored, the validation log loss is printed and accumulated
# into `loss_average`, and the test-set class probabilities are averaged into
# `y_preds` (one column per class, in sample-submission column order).
NFOLDS = 5

# random_state only has an effect when shuffling; current scikit-learn raises
# ValueError when random_state is set together with shuffle=False, so shuffle
# is enabled to get a seeded, reproducible split.
folds = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)
loss_average = 0.0
columns = X.columns

splits = folds.split(X, y)

# One probability column per submission class instead of a hard-coded 19.
n_classes = sample_submission.shape[1]
y_preds = np.zeros((X_test.shape[0], n_classes))
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns

for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    # fit() retrains the forest from scratch on this fold's training part.
    clf.fit(X_train, y_train)
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importances_

    # Compute the fold loss once and reuse it for both the log line and the
    # running average. `labels` pins the column order of predict_proba.
    y_pred_valid_probs = clf.predict_proba(X_valid)
    fold_loss = log_loss(y_valid, y_pred_valid_probs, labels=clf.classes_)
    print(f"Fold {fold_n + 1} | logloss: {fold_loss}")
    loss_average += fold_loss / NFOLDS

    # Labels are submission column indices, so clf.classes_ selects the
    # matching y_preds columns; this stays correct even if a class is
    # missing from a training fold.
    y_preds[:, clf.classes_] += clf.predict_proba(X_test) / NFOLDS

    # Release the per-fold slices before the next (memory-heavy) fit.
    del X_train, X_valid, y_train, y_valid
    gc.collect()
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   29.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 11.3min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    3.1s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    5.6s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    7.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.8s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    5.0s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    6.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
Fold 1 | logloss: 0.39656748005486153
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    1.9s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   29.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 11.2min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.8s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    5.0s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    6.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.8s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    5.0s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    6.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
Fold 2 | logloss: 0.3889103313873209
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    1.8s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   29.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 10.9min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.8s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    5.0s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    6.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.8s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    4.9s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    6.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
Fold 3 | logloss: 0.3946901747479173
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    1.7s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   28.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 10.9min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.8s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    5.0s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    6.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    3.0s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    5.3s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    6.6s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
Fold 4 | logloss: 0.3830213045315384
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    1.8s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   28.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 11.0min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.8s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    4.9s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    6.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.7s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    4.9s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    6.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
Fold 5 | logloss: 0.3809514337492767
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    1.7s finished
loss_average
0.38882814489418294
# Wrap the averaged test probabilities in a frame that reuses the sample
# submission's row index and class columns, then write it with the index kept.
submission = pd.DataFrame(
    y_preds,
    index=sample_submission.index,
    columns=sample_submission.columns,
)
submission.to_csv('submission.csv', index=True)
submission
STAR_WHITE_DWARF STAR_CATY_VAR STAR_BROWN_DWARF SERENDIPITY_RED REDDEN_STD STAR_BHB GALAXY SERENDIPITY_DISTANT QSO SKY STAR_RED_DWARF ROSAT_D STAR_PN SERENDIPITY_FIRST STAR_CARBON SPECTROPHOTO_STD STAR_SUB_DWARF SERENDIPITY_MANUAL SERENDIPITY_BLUE
id
199991 0.0000 0.0000 0.0 0.0570 0.0 0.0000 0.0000 0.0000 0.0002 0.0 0.9424 0.0000 0.0000 0.0000 0.0000 0.0000 0.0 0.0004 0.0000
199992 0.0000 0.0000 0.0 0.0000 0.0 0.0000 0.0038 0.0004 0.5650 0.0 0.0000 0.2482 0.0000 0.1822 0.0000 0.0000 0.0 0.0000 0.0004
199993 0.0000 0.0000 0.0 0.0000 0.0 0.0000 0.9834 0.0000 0.0002 0.0 0.0000 0.0162 0.0000 0.0000 0.0002 0.0000 0.0 0.0000 0.0000
199994 0.0000 0.0000 0.0 0.0000 0.0 0.0000 0.0000 0.0082 0.1334 0.0 0.0000 0.0310 0.0000 0.0430 0.0000 0.0000 0.0 0.0000 0.7844
199995 0.0000 0.0000 0.0 0.0006 0.0 0.0000 0.0000 0.0000 0.0000 0.0 0.9992 0.0002 0.0000 0.0000 0.0000 0.0000 0.0 0.0000 0.0000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
209995 0.0304 0.0396 0.0 0.0000 0.0 0.0002 0.0002 0.0050 0.1130 0.0 0.0000 0.0826 0.0000 0.0422 0.0000 0.0000 0.0 0.0000 0.6868
209996 0.0308 0.0418 0.0 0.0000 0.0 0.0012 0.0000 0.0080 0.2338 0.0 0.0000 0.0666 0.0002 0.0460 0.0000 0.0000 0.0 0.0000 0.5716
209997 0.0012 0.0000 0.0 0.0000 0.0 0.0000 0.0000 0.0000 0.9788 0.0 0.0000 0.0058 0.0000 0.0056 0.0000 0.0000 0.0 0.0000 0.0086
209998 0.0000 0.0000 0.0 0.0000 0.0 0.0000 0.0000 0.0000 0.9996 0.0 0.0000 0.0000 0.0000 0.0002 0.0000 0.0000 0.0 0.0000 0.0002
209999 0.0000 0.0000 0.0 0.0000 0.0 0.0004 0.0000 0.0000 0.0000 0.0 0.0000 0.0004 0.0000 0.0000 0.0000 0.9992 0.0 0.0000 0.0000

10009 rows × 19 columns

# NOTE: `sub` is never defined in this notebook, so this cell raises NameError.
# If it did run, it would overwrite submission.csv without the index column
# (index=False), relying on the explicit 'id' column added below.
submission = pd.DataFrame(data=y_preds, columns=sub.columns, index=sub.index)
submission['id'] = sub['id']
submission.to_csv('submission.csv', index=False)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-27-e66eed236cec> in <module>
----> 1 submission = pd.DataFrame(data=y_preds, columns=sub.columns, index=sub.index)
      2 submission['id'] = sub['id']
      3 submission.to_csv('submission.csv', index=False)

NameError: name 'sub' is not defined
y_preds
array([[0.    , 0.    , 0.    , ..., 0.    , 0.0004, 0.    ],
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.0004],
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.    ],
       ...,
       [0.0012, 0.    , 0.    , ..., 0.    , 0.    , 0.0086],
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.0002],
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.    ]])
y_preds
array([[0.    , 0.    , 0.    , ..., 0.    , 0.0004, 0.    ],
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.0004],
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.    ],
       ...,
       [0.0012, 0.    , 0.    , ..., 0.    , 0.    , 0.0086],
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.0002],
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.    ]])
X_test.head()
fiberID psfMag_u psfMag_g psfMag_r psfMag_i psfMag_z fiberMag_u fiberMag_g fiberMag_r fiberMag_i ... petroMag_u petroMag_g petroMag_r petroMag_i petroMag_z modelMag_u modelMag_g modelMag_r modelMag_i modelMag_z
id
199991 251 23.817399 22.508963 20.981106 18.517316 17.076079 25.053890 23.167848 21.335901 18.835858 ... 22.246697 22.796239 21.195315 18.584486 17.154284 25.391534 22.499435 21.011918 18.499341 17.091474
199992 386 22.806983 21.937111 20.335770 20.000512 19.527369 22.498565 22.186000 20.618879 20.301204 ... 21.729831 21.837511 20.196128 19.967204 19.683671 22.475338 21.853442 20.173169 19.796757 19.567372
199993 232 21.024250 19.235669 18.304061 17.808608 17.380113 21.205546 19.439533 18.344433 17.909690 ... 20.722629 18.710223 17.611851 17.158519 16.843986 20.579314 18.653338 17.562108 17.120529 16.708748
199994 557 20.503424 20.286261 20.197204 20.162419 20.059832 20.976132 20.611498 20.567262 20.479318 ... 20.329269 20.385262 20.129157 20.206574 20.212342 20.479879 20.280943 20.150499 20.206221 20.092909
199995 75 24.244851 22.668237 21.239333 19.284777 18.235939 25.681860 22.935289 21.642456 19.624926 ... 22.308298 22.957496 21.285033 19.299120 18.307526 25.489360 22.857290 21.191862 19.237964 18.280368

5 rows × 21 columns

loss_average
0.38882814489418294
# sub['isFraud'] = y_preds
# sub.to_csv("submission.csv", index=False)
# submission = pd.DataFrame(data = y_pred, columns = sample_submission.columns, index = sample_submission.index)
# submission.to_csv('submission.csv', index = True)
# Average each feature's importance over the per-fold columns and persist the table.
fold_columns = [f'fold_{fold_n + 1}' for fold_n in range(folds.n_splits)]
feature_importances['average'] = feature_importances[fold_columns].mean(axis=1)
feature_importances.to_csv('feature_importances.csv')

# Bar chart of the (at most) 50 features with the highest average importance.
top_features = feature_importances.sort_values(by='average', ascending=False).head(50)
plt.figure(figsize=(16, 16))
sns.barplot(data=top_features, x='average', y='feature')
plt.title('50 TOP feature importance over {} folds average'.format(folds.n_splits))
Text(0.5, 1.0, '50 TOP feature importance over 5 folds average')
댓글 0개
    로그인이 필요합니다
    목록으로