Ensemble (Voting and Bagging)
A machine learning ensemble is a method of finding an optimal answer by combining multiple machine learning models.

- Train multiple models on the data and aggregate (for example, average) the predictions of all the models
Types of ensemble techniques

- Voting: derive the final result through a vote across different models
- Bagging: derive the result by training on duplicated (bootstrap) samples
- Boosting: weight samples while correcting the errors of previous models
- Stacking: a meta model makes one further prediction from the predictions of multiple base models
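Only voting and bagging are demonstrated hands-on in this post. For orientation, here is a minimal sketch (my own addition, assuming scikit-learn 0.22 or later, where StackingRegressor was introduced) of the scikit-learn class each technique maps to:

# Illustrative mapping only; voting and bagging are explored in detail below
from sklearn.ensemble import (VotingRegressor, BaggingRegressor,
                              GradientBoostingRegressor, StackingRegressor)
from sklearn.linear_model import LinearRegression, Ridge

base_models = [('lr', LinearRegression()), ('ridge', Ridge())]

voting = VotingRegressor(base_models)                    # vote across different models
bagging = BaggingRegressor(Ridge(), n_estimators=10)     # resample data for one model
boosting = GradientBoostingRegressor(n_estimators=100)   # each tree corrects the last
stacking = StackingRegressor(base_models, final_estimator=Ridge())  # meta model on top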
import pandas as pd
import numpy as np
from IPython.display import Image

np.set_printoptions(suppress=True)  # print floats without scientific notation

# Boston housing dataset (deprecated in scikit-learn 1.0, removed in 1.2)
from sklearn.datasets import load_boston

data = load_boston()
df = pd.DataFrame(data['data'], columns=data['feature_names'])
df['MEDV'] = data['target']  # target: median home value in $1000s
df.head()
|   | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | MEDV |
|---|------|----|-------|------|-----|----|-----|-----|-----|-----|---------|---|-------|------|
| 0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1.0 | 296.0 | 15.3 | 396.90 | 4.98 | 24.0 |
| 1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2.0 | 242.0 | 17.8 | 396.90 | 9.14 | 21.6 |
| 2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2.0 | 242.0 | 17.8 | 392.83 | 4.03 | 34.7 |
| 3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3.0 | 222.0 | 18.7 | 394.63 | 2.94 | 33.4 |
| 4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3.0 | 222.0 | 18.7 | 396.90 | 5.33 | 36.2 |
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df.drop('MEDV', axis=1), df['MEDV'])
x_train.shape, x_test.shape
((379, 13), (127, 13))
x_train.head()
|     | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT |
|-----|------|----|-------|------|-----|----|-----|-----|-----|-----|---------|---|-------|
| 479 | 14.33370 | 0.0 | 18.10 | 0.0 | 0.614 | 6.229 | 88.0 | 1.9512 | 24.0 | 666.0 | 20.2 | 383.32 | 13.11 |
| 370 | 6.53876 | 0.0 | 18.10 | 1.0 | 0.631 | 7.016 | 97.5 | 1.2024 | 24.0 | 666.0 | 20.2 | 392.05 | 2.96 |
| 43 | 0.15936 | 0.0 | 6.91 | 0.0 | 0.448 | 6.211 | 6.5 | 5.7209 | 3.0 | 233.0 | 17.9 | 394.46 | 7.44 |
| 173 | 0.09178 | 0.0 | 4.05 | 0.0 | 0.510 | 6.416 | 84.1 | 2.6463 | 5.0 | 296.0 | 16.6 | 395.50 | 9.04 |
| 197 | 0.04666 | 80.0 | 1.52 | 0.0 | 0.404 | 7.107 | 36.6 | 7.3090 | 2.0 | 329.0 | 12.6 | 354.31 | 8.61 |
y_train.head()
479    21.4
370    50.0
43     24.7
173    23.6
197    30.3
Name: MEDV, dtype: float64
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
my_predictions = {}  # accumulates each model's test MSE for comparison

# color palette for the MSE comparison bar chart
colors = ['r', 'c', 'm', 'y', 'k', 'khaki', 'teal', 'orchid', 'sandybrown',
          'greenyellow', 'dodgerblue', 'deepskyblue', 'rosybrown', 'firebrick',
          'deeppink', 'crimson', 'salmon', 'darkred', 'olivedrab', 'olive',
          'forestgreen', 'royalblue', 'indigo', 'navy', 'mediumpurple', 'chocolate',
          'gold', 'darkorange', 'seagreen', 'turquoise', 'steelblue', 'slategray',
          'peru', 'midnightblue', 'slateblue', 'dimgray', 'cadetblue', 'tomato'
          ]
def plot_predictions(name_, pred, actual):
    # scatter plot of predicted vs. actual values, sorted by the actual target
    df = pd.DataFrame({'prediction': pred, 'actual': actual})
    df = df.sort_values(by='actual').reset_index(drop=True)

    plt.figure(figsize=(12, 9))
    plt.scatter(df.index, df['prediction'], marker='x', color='r')
    plt.scatter(df.index, df['actual'], alpha=0.7, marker='o', color='black')
    plt.title(name_, fontsize=15)
    plt.legend(['prediction', 'actual'], fontsize=12)
    plt.show()
def mse_eval(name_, pred, actual):
    global my_predictions
    global colors

    plot_predictions(name_, pred, actual)

    # record this model's MSE and re-plot the running comparison
    mse = mean_squared_error(pred, actual)
    my_predictions[name_] = mse

    y_value = sorted(my_predictions.items(), key=lambda x: x[1], reverse=True)

    df = pd.DataFrame(y_value, columns=['model', 'mse'])
    print(df)

    min_ = df['mse'].min() - 10
    max_ = df['mse'].max() + 10

    length = len(df)

    plt.figure(figsize=(10, length))
    ax = plt.subplot()
    ax.set_yticks(np.arange(len(df)))
    ax.set_yticklabels(df['model'], fontsize=15)
    bars = ax.barh(np.arange(len(df)), df['mse'])

    for i, v in enumerate(df['mse']):
        idx = np.random.choice(len(colors))
        bars[i].set_color(colors[idx])
        ax.text(v + 2, i, str(round(v, 3)), color='k', fontsize=15, fontweight='bold')

    plt.title('MSE Error', fontsize=18)
    plt.xlim(min_, max_)
    plt.show()
def remove_model(name_):
    # drop a model from the comparison; returns False if it was never added
    global my_predictions

    try:
        del my_predictions[name_]
    except KeyError:
        return False
    return True
def plot_coef(columns, coef):
    # horizontal bar chart of feature coefficients, sorted in descending order
    coef_df = pd.DataFrame(list(zip(columns, coef)))
    coef_df.columns = ['feature', 'coef']
    coef_df = coef_df.sort_values('coef', ascending=False).reset_index(drop=True)

    fig, ax = plt.subplots(figsize=(9, 7))
    ax.barh(np.arange(len(coef_df)), coef_df['coef'])
    idx = np.arange(len(coef_df))
    ax.set_yticks(idx)
    ax.set_yticklabels(coef_df['feature'])
    fig.tight_layout()
    plt.show()
Single regression prediction models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
linear_reg = LinearRegression(n_jobs=-1)
linear_reg.fit(x_train, y_train)
pred = linear_reg.predict(x_test)
mse_eval('LinearRegression', pred, y_test)
              model        mse
0  LinearRegression  22.641613
ridge = Ridge(alpha=1)
ridge.fit(x_train, y_train)
pred = ridge.predict(x_test)
mse_eval('Ridge(alpha=1)', pred, y_test)
              model        mse
0  LinearRegression  22.641613
1    Ridge(alpha=1)  21.869433
lasso = Lasso(alpha=0.01)
lasso.fit(x_train, y_train)
pred = lasso.predict(x_test)
mse_eval('Lasso(alpha=0.01)', pred, y_test)
               model        mse
0   LinearRegression  22.641613
1  Lasso(alpha=0.01)  22.279112
2     Ridge(alpha=1)  21.869433
elasticnet = ElasticNet(alpha=0.5, l1_ratio=0.8)
elasticnet.fit(x_train, y_train)
pred = elasticnet.predict(x_test)
mse_eval('ElasticNet(l1_ratio=0.8)', pred, y_test)
                      model        mse
0  ElasticNet(l1_ratio=0.8)  23.074116
1          LinearRegression  22.641613
2         Lasso(alpha=0.01)  22.279112
3            Ridge(alpha=1)  21.869433
elasticnet_pipeline = make_pipeline(
    StandardScaler(),
    ElasticNet(alpha=0.1, l1_ratio=0.2)
)
elasticnet_pred = elasticnet_pipeline.fit(x_train, y_train).predict(x_test)
mse_eval('Standard ElasticNet', elasticnet_pred, y_test)
                      model        mse
0  ElasticNet(l1_ratio=0.8)  23.074116
1          LinearRegression  22.641613
2         Lasso(alpha=0.01)  22.279112
3       Standard ElasticNet  22.058335
4            Ridge(alpha=1)  21.869433
poly_pipeline = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    StandardScaler(),
    ElasticNet(alpha=0.1, l1_ratio=0.2)
)
poly_pred = poly_pipeline.fit(x_train, y_train).predict(x_test)
mse_eval('Poly ElasticNet', poly_pred, y_test)
C:\Users\boyka\anaconda3\lib\site-packages\sklearn\linear_model\_coordinate_descent.py:530: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 70.54427247266358, tolerance: 3.188612944591029 model = cd_fast.enet_coordinate_descent(
                      model        mse
0  ElasticNet(l1_ratio=0.8)  23.074116
1          LinearRegression  22.641613
2         Lasso(alpha=0.01)  22.279112
3       Standard ElasticNet  22.058335
4            Ridge(alpha=1)  21.869433
5           Poly ElasticNet  17.086267
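The ConvergenceWarning above suggests increasing the number of iterations. An optional tweak (my addition, not in the original post) is to raise max_iter on the ElasticNet step:

poly_pipeline = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    StandardScaler(),
    ElasticNet(alpha=0.1, l1_ratio=0.2, max_iter=10000)  # default max_iter is 1000
)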
Ensemble
Voting - Regression
Voting literally means deciding through a vote. Voting resembles bagging in that both aggregate results by voting, but there is a major difference:

- Voting combines models built from different algorithms
- Bagging combines different sample sets within a single algorithm
from sklearn.ensemble import VotingRegressor, VotingClassifier
The models must be defined as a list of (name, model) tuples.
single_models = [
    ('linear_reg', linear_reg),
    ('ridge', ridge),
    ('lasso', lasso),
    ('elasticnet_pipeline', elasticnet_pipeline),
    ('poly_pipeline', poly_pipeline)
]
voting_regressor = VotingRegressor(single_models, n_jobs=-1)
voting_regressor.fit(x_train, y_train)
VotingRegressor(estimators=[('linear_reg', LinearRegression(n_jobs=-1)),
                            ('ridge', Ridge(alpha=1)),
                            ('lasso', Lasso(alpha=0.01)),
                            ('elasticnet_pipeline',
                             Pipeline(steps=[('standardscaler', StandardScaler()),
                                             ('elasticnet',
                                              ElasticNet(alpha=0.1, l1_ratio=0.2))])),
                            ('poly_pipeline',
                             Pipeline(steps=[('polynomialfeatures',
                                              PolynomialFeatures(include_bias=False)),
                                             ('standardscaler', StandardScaler()),
                                             ('elasticnet',
                                              ElasticNet(alpha=0.1, l1_ratio=0.2))]))],
                n_jobs=-1)
voting_pred = voting_regressor.predict(x_test)
mse_eval('Voting Ensemble', voting_pred, y_test)
                      model        mse
0  ElasticNet(l1_ratio=0.8)  23.074116
1          LinearRegression  22.641613
2         Lasso(alpha=0.01)  22.279112
3       Standard ElasticNet  22.058335
4            Ridge(alpha=1)  21.869433
5           Voting Ensemble  20.521588
6           Poly ElasticNet  17.086267
Voting - Classification
When building a classification model, the voting ensemble has one important parameter:

voting = {'hard', 'soft'}
When set to hard

Take binary classification, predicting class 0 or 1, as an example.

Hard voting takes a majority vote over the classes predicted by the individual models. If the five predicted values were 1, 0, 0, 1, 1, then 1 received three votes and 0 received two, so hard voting predicts 1 as the final value.
When set to soft

Soft voting averages the predicted probability of each class across models and then picks the class with the highest average probability.

For example, if the five models' probabilities for class 0 were (0.4, 0.9, 0.9, 0.4, 0.4) and for class 1 were (0.6, 0.1, 0.1, 0.6, 0.6), then

- the average probability of class 0 is (0.4 + 0.9 + 0.9 + 0.4 + 0.4) / 5 = 0.6, and
- the average probability of class 1 is (0.6 + 0.1 + 0.1 + 0.6 + 0.6) / 5 = 0.4,

so soft voting selects class 0, the opposite of the hard-voting result above.
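A quick numeric check of this example (a sketch I added; plain numpy, no classifiers involved):

p_class0 = np.array([0.4, 0.9, 0.9, 0.4, 0.4])
p_class1 = np.array([0.6, 0.1, 0.1, 0.6, 0.6])

# hard voting: each model predicts the class with the larger probability
hard_preds = (p_class1 > p_class0).astype(int)   # -> [1, 0, 0, 1, 1]
print(np.bincount(hard_preds).argmax())          # 1 (wins three votes to two)

# soft voting: average the probabilities and take the larger mean
print(p_class0.mean(), p_class1.mean())          # 0.6 0.4 -> class 0 wins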
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
models = [
    ('Logi', LogisticRegression()),
    ('Ridge', RidgeClassifier())
]
vc = VotingClassifier(models, voting='hard')
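Note that voting='soft' requires every estimator to implement predict_proba, which RidgeClassifier does not. The sketch below (my addition; it swaps in KNeighborsClassifier and the breast-cancer toy dataset purely for illustration) shows a working soft-voting setup:

from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import KNeighborsClassifier

X, y = load_breast_cancer(return_X_y=True)

soft_vc = VotingClassifier([
    ('logi', LogisticRegression(max_iter=5000)),  # higher max_iter avoids convergence warnings
    ('knn', KNeighborsClassifier())
], voting='soft')

soft_vc.fit(X, y)
print(soft_vc.predict_proba(X[:3]))  # averaged class probabilities per sample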
Bagging
Bagging is short for Bootstrap Aggregating.
- Bootstrap (sampling with replacement) + Aggregating (combining the results)

Bootstrapping builds multiple datasets by sampling with replacement, so the same sample can appear in more than one group.

If the dataset is [1, 2, 3, 4, 5], the bootstrapped groups might look like:

- group 1 = [1, 2, 3]
- group 2 = [1, 3, 4]
- group 3 = [2, 3, 5]
Image('https://teddylee777.github.io/images/2019-12-17/image-20191217015537872.png')
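As a tiny illustration (my addition), overlapping groups like the ones above can be drawn with replacement using numpy:

rng = np.random.default_rng(0)  # fixed seed so the draw is reproducible
data = np.array([1, 2, 3, 4, 5])

# three bootstrap groups of size 3; values may repeat within and across groups
groups = [rng.choice(data, size=3, replace=True) for _ in range(3)]
print(groups)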
Voting vs. Bagging

- Voting ensembles a combination of multiple different algorithms
- Bagging ensembles multiple sample combinations of one single algorithm
Representative Bagging ensembles

- RandomForest
- Bagging (BaggingRegressor / BaggingClassifier)

A sketch of the generic bagging estimator appears below; RandomForest follows after it.
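For completeness, a hedged sketch (my addition; the original post does not run this) of the generic BaggingRegressor, which bags any base estimator:

from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

# bagging decision trees by hand makes this a close cousin of RandomForest
bagging = BaggingRegressor(DecisionTreeRegressor(), n_estimators=100,
                           random_state=42, n_jobs=-1)
bagging_pred = bagging.fit(x_train, y_train).predict(x_test)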
RandomForest

- A DecisionTree (tree)-based Bagging ensemble
- A very popular ensemble model
- Easy to use and excellent in performance
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
rfr = RandomForestRegressor()
rfr.fit(x_train, y_train)
RandomForestRegressor()
rfr_pred = rfr.predict(x_test)
mse_eval('RandomForest Ensemble', rfr_pred, y_test)
                      model        mse
0  ElasticNet(l1_ratio=0.8)  23.074116
1          LinearRegression  22.641613
2         Lasso(alpha=0.01)  22.279112
3       Standard ElasticNet  22.058335
4            Ridge(alpha=1)  21.869433
5           Voting Ensemble  20.521588
6           Poly ElasticNet  17.086267
7     RandomForest Ensemble  14.309778
Hyperparameters

- random_state: fixes the random seed. Always set it so tuning runs are comparable
- n_jobs: number of CPU cores to use (-1 uses all cores)
- max_depth: the maximum depth a tree may grow to; limit it to prevent overfitting
- n_estimators: the number of trees to ensemble
- max_features: the maximum number (or fraction) of features each split may consider; limit it to prevent overfitting
- min_samples_split: the minimum number of samples required to split a node (default = 2); raise it to prevent overfitting
Image('https://teddylee777.github.io/images/2020-01-09/decistion-tree.png', width=600)
Be sure to fix the random_state value when tuning.
rfr = RandomForestRegressor(random_state=42, n_estimators=1000, max_depth=7, max_features=0.8)
rfr.fit(x_train, y_train)
rfr_pred = rfr.predict(x_test)
mse_eval('RandomForest Ensemble w/ Tuning', rfr_pred, y_test)
                             model        mse
0         ElasticNet(l1_ratio=0.8)  23.074116
1                 LinearRegression  22.641613
2                Lasso(alpha=0.01)  22.279112
3              Standard ElasticNet  22.058335
4                   Ridge(alpha=1)  21.869433
5                  Voting Ensemble  20.521588
6                  Poly ElasticNet  17.086267
7            RandomForest Ensemble  14.309778
8  RandomForest Ensemble w/ Tuning  14.070580
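Beyond hand tuning, these hyperparameters can be searched systematically with cross-validation. A minimal sketch (my addition; the parameter grid is an illustrative assumption, not from the original post) using RandomizedSearchCV:

from sklearn.model_selection import RandomizedSearchCV

# candidate values are rough guesses around the hand-tuned model above
param_dist = {
    'n_estimators': [200, 500, 1000],
    'max_depth': [5, 7, 9, None],
    'max_features': [0.6, 0.8, 1.0],
    'min_samples_split': [2, 4, 8],
}

search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),  # random_state fixed, as advised above
    param_dist, n_iter=10, cv=5,
    scoring='neg_mean_squared_error', random_state=42, n_jobs=-1)
search.fit(x_train, y_train)
print(search.best_params_)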