# -*- coding: utf-8 -*-
\"\"\"集成算法.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1cr8C3JW8619DoKNb0nd9u7_RqfAlruTv
**Bagged Decision Tree**
Bagging算法在数据有很大的方差时很有效,最常见的是决策树的Bagging算法。
\"\"\"
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Load the Pima Indians diabetes CSV: eight feature columns followed by the
# class label. The file has no header row, so column names are supplied here.
# NOTE(review): confirm this host still serves the file before relying on it.
filename = 'http://ftp.ics.uci.edu/pub/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
data.shape  # notebook leftover: displayed in a notebook cell, no effect in a script
# Split the table into the feature matrix (first 8 columns) and the label column.
array = data.values
X = array[:, 0:8]
y = array[:, 8]
# Build the model: bagging over decision trees, scored with 10-fold CV.
num_folds = 10
seed = 7
# random_state only takes effect when shuffling is enabled; recent
# scikit-learn versions raise ValueError if it is set while shuffle=False.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
cart = DecisionTreeClassifier()
num_tree = 100
# The base learner is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` across scikit-learn releases; the
# positional form works with both.
model = BaggingClassifier(cart, n_estimators=num_tree, random_state=seed)
result = cross_val_score(model, X, y, cv=kfold)
# Report the mean accuracy over the folds.
print(result.mean())
\"\"\"### 随机森林
用随机的方式建立一个森林,森林由多棵决策树组成,每个决策树之间没有关联。新的输入进入到随机森林,会让每个决策树分别判断,看样本属于哪一类,最后看哪类被选择的最多,就预测这个样本为那个类。
\"\"\"
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
# Load the Pima Indians diabetes CSV: eight feature columns followed by the
# class label. The file has no header row, so column names are supplied here.
# NOTE(review): confirm this host still serves the file before relying on it.
filename = 'http://ftp.ics.uci.edu/pub/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
# Split the table into the feature matrix (first 8 columns) and the label column.
array = data.values
X = array[:, 0:8]
y = array[:, 8]
# Build the model: a random forest scored with 10-fold CV.
num_folds = 10
seed = 7
# random_state only takes effect when shuffling is enabled; recent
# scikit-learn versions raise ValueError if it is set while shuffle=False.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
num_tree = 100
# Number of features considered at each split.
max_features = 3
model = RandomForestClassifier(n_estimators=num_tree, random_state=seed, max_features=max_features)
result = cross_val_score(model, X, y, cv=kfold)
# Report the mean accuracy over the folds.
print(result.mean())
\"\"\"### 极端随机树
与随机森林类似,都是由很多决策树组成,但是有两个重要区别:
- 随机森林是Bagging模型,ExtraTreesClassifier则是使用所有的训练样本得到每个决策树
- 随机森林是在一个随机子集内得到最优的分叉特征属性,ExtraTreesClassifier则是完全随机选择分叉特征属性
\"\"\"
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
# Load the Pima Indians diabetes CSV: eight feature columns followed by the
# class label. The file has no header row, so column names are supplied here.
# NOTE(review): confirm this host still serves the file before relying on it.
filename = 'http://ftp.ics.uci.edu/pub/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
# Split the table into the feature matrix (first 8 columns) and the label column.
array = data.values
X = array[:, 0:8]
y = array[:, 8]
# Build the model: extremely randomized trees scored with 10-fold CV.
num_folds = 10
seed = 7
# random_state only takes effect when shuffling is enabled; recent
# scikit-learn versions raise ValueError if it is set while shuffle=False.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
num_tree = 100
# Number of features considered at each split.
max_features = 7
model = ExtraTreesClassifier(n_estimators=num_tree, random_state=seed, max_features=max_features)
result = cross_val_score(model, X, y, cv=kfold)
# Report the mean accuracy over the folds.
print(result.mean())
\"\"\"### 提升算法
Boosting算法是用来提高弱分类算法准确度的方法。先构造一个预测函数序列,然后将他们组合成为一个预测函数。
\"\"\"
### AdaBoost
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
# Load the Pima Indians diabetes CSV: eight feature columns followed by the
# class label. The file has no header row, so column names are supplied here.
# NOTE(review): confirm this host still serves the file before relying on it.
filename = 'http://ftp.ics.uci.edu/pub/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
# Split the table into the feature matrix (first 8 columns) and the label column.
array = data.values
X = array[:, 0:8]
y = array[:, 8]
# Build the model: AdaBoost scored with 10-fold CV.
num_folds = 10
seed = 7
# random_state only takes effect when shuffling is enabled; recent
# scikit-learn versions raise ValueError if it is set while shuffle=False.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
num_tree = 100
model = AdaBoostClassifier(n_estimators=num_tree, random_state=seed)
result = cross_val_score(model, X, y, cv=kfold)
# Report the mean accuracy over the folds.
print(result.mean())
### 随机梯度提升算法
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
# Load the Pima Indians diabetes CSV: eight feature columns followed by the
# class label. The file has no header row, so column names are supplied here.
# NOTE(review): confirm this host still serves the file before relying on it.
filename = 'http://ftp.ics.uci.edu/pub/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
# Split the table into the feature matrix (first 8 columns) and the label column.
array = data.values
X = array[:, 0:8]
y = array[:, 8]
# Build the model: gradient boosting scored with 10-fold CV.
num_folds = 10
seed = 7
# random_state only takes effect when shuffling is enabled; recent
# scikit-learn versions raise ValueError if it is set while shuffle=False.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
num_tree = 100
# This section imports GradientBoostingClassifier for this experiment; the
# previous code built an AdaBoostClassifier here, repeating the AdaBoost run.
model = GradientBoostingClassifier(n_estimators=num_tree, random_state=seed)
result = cross_val_score(model, X, y, cv=kfold)
# Report the mean accuracy over the folds.
print(result.mean())
\"\"\"### 投票算法
将多个机器学习模型集成起来的算法。通过创建两个或两个以上的算法模型,用投票算法将这些算法包裹起来。
\"\"\"
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
# Load the Pima Indians diabetes CSV: eight feature columns followed by the
# class label. The file has no header row, so column names are supplied here.
# NOTE(review): confirm this host still serves the file before relying on it.
filename = 'http://ftp.ics.uci.edu/pub/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
# Split the table into the feature matrix (first 8 columns) and the label column.
array = data.values
X = array[:, 0:8]
y = array[:, 8]
# Build the model: a voting ensemble of three classifiers, scored with 10-fold CV.
num_folds = 10
seed = 7
# random_state only takes effect when shuffling is enabled; recent
# scikit-learn versions raise ValueError if it is set while shuffle=False.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
# VotingClassifier takes a list of (name, estimator) pairs, not a dict.
models = [
    ('logistic', LogisticRegression()),
    ('cart', DecisionTreeClassifier()),
    ('svm', SVC()),
]
ensemble_model = VotingClassifier(estimators=models)
# Score the ensemble itself; the previous code passed `model`, a leftover
# estimator from an earlier section, so the ensemble was never evaluated.
result = cross_val_score(ensemble_model, X, y, cv=kfold)
# Report the mean accuracy over the folds.
print(result.mean())
END.
参考:
《机器学习Python实践》
继续阅读与本文标签相同的文章
-
吉利缤越,液晶仪表盘,运动座椅,L2级别自动驾驶,8秒破百
2026-05-18栏目: 教程
-
苏泊尔破壁机:技术革新 家族合力
2026-05-18栏目: 教程
-
OpenStack Train版本今日正式发布并开放下载
2026-05-18栏目: 教程
-
文在寅:八年后将韩国打造成全球第一个自动驾驶国家
2026-05-18栏目: 教程
-
Android Studio运行Hello World程序
2026-05-18栏目: 教程
