機械学習一般の話題。特にDeep Learning専用のカレンダーは既にあるので、Deep Learning以外の話題が望ましい。 Machine Learning Advent Calendar 2016 - Qiita
docker-config-mlenv - Dockerfile for machine learning environment(scikit-learn, chainer, gensim, tensorflow, jupyter) zuqqhi2/docker-config-mlenv - GitHub |
上記のコードを実行すると以下のようになります。#!/usr/bin/env python import numpy as np import pandas as pd from sklearn.cross_validation import ShuffleSplit, train_test_split from sklearn.linear_model import LinearRegression from sklearn.metrics import r2_score, make_scorer, mean_squared_error from sklearn.grid_search import GridSearchCV # Load UCI housing data set data = pd.read_csv('./housing.data', delim_whitespace=True, header=None) prices = data[[13]] prices.columns = ['MEDV'] features = data[[5]] features.columns = ['RM'] # Cross-Validation setting X_train, X_test, y_train, y_test = train_test_split(features, prices, test_size=0.2, random_state=42) cv_sets = ShuffleSplit(X_train.shape[0], n_iter = 10, test_size = 0.20, random_state = 0) params = {'normalize': [False, True], 'fit_intercept': [False, True]} # Learning regressor = LinearRegression() scoring_fnc = make_scorer(r2_score) grid = GridSearchCV(regressor, params, cv=cv_sets, scoring=scoring_fnc) best_reg = grid.fit(X_train, y_train) # Prediction result print("Prediction sample : room number = {0}, actual price = {1}, predicted price = {2}".format(X_test['RM'][0], y_test['MEDV'][0], best_reg.predict(X_test['RM'][0])[0][0])) print("MSE : {}".format(mean_squared_error(y_test, best_reg.predict(X_test)))) # モデルの係数はインタプリタ上で以下を実行すれば確認できます。 # best_reg.best_estimator_.coef_ # best_reg.best_estimator_.intercept_
得られたモデルを散布図上にプロットすると以下のようになります。Prediction sample : room number = 6.575, actual price = 24.0, predicted price = 25.218762849584607 MSE : 46.144775347317264
このコードを実行した結果と最終的に得られた決定木は以下のようになりました。#!/usr/bin/env python import numpy as np import pandas as pd from sklearn.cross_validation import ShuffleSplit, train_test_split from sklearn.tree import DecisionTreeClassifier, export_graphviz from sklearn.metrics import f1_score, make_scorer, accuracy_score from sklearn.grid_search import GridSearchCV from sklearn import datasets from pydotplus import graph_from_dot_data # Load data iris = datasets.load_iris() features = iris.data categories = iris.target # Cross-Validation setting X_train, X_test, y_train, y_test = train_test_split(features, categories, test_size=0.2, random_state=42) cv_sets = ShuffleSplit(X_train.shape[0], n_iter = 10, test_size = 0.20, random_state = 0) params = {'max_depth': np.arange(2,11), 'min_samples_leaf': np.array([5])} # Learning def performance_metric(y_true, y_predict): score = f1_score(y_true, y_predict, average='micro') return score classifier = DecisionTreeClassifier() scoring_fnc = make_scorer(performance_metric) grid = GridSearchCV(classifier, params, cv=cv_sets, scoring=scoring_fnc) best_clf = grid.fit(X_train, y_train) # Prediction result print("Optimal models's parameter 'max_depth' : {} ".format(best_clf.best_estimator_.get_params()['max_depth'])) print("Classifiction sample : features = {0}, actual category = {1}, classification result = {2}".format(X_test[0], y_test[0], best_clf.predict(np.array([X_test[0]])[0])[0])) print("Accuracy : {}".format(accuracy_score(y_test, best_clf.predict(X_test)))) # Output decision tree dot_data = export_graphviz(best_clf.best_estimator_, out_file=None, feature_names=iris.feature_names, class_names=iris.target_names, filled=True, rounded=True, special_characters=True) graph = graph_from_dot_data(dot_data) graph.write_pdf('iris_clf_tree.pdf')
“min_samples_leaf”をデフォルトのままにしていたら、max_depthが6となって過学習している感が出ました。なので、”min_samples_leaf”を5に設定しています。その結果は以下の通りです。Optimal models's parameter 'max_depth' : 3 Classifiction sample : features = [ 6.1 2.8 4.7 1.2], actual category = 1, classification result = 1 Accuracy : 1.0