Sklearn 中常用方法

小编 2026-06-04 阅读:665 评论:0
分割训练集与测试集 from sklearn.model_selection import train_test_split X = a.iloc[:,0:-1] Y = a["label"] X_...

分割训练集与测试集

from sklearn.model_selection import train_test_split

# Features are every column except the last; the target is the "label" column.
X = a.iloc[:, 0:-1]
Y = a["label"]
# Hold out half of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=0)
Y_train

统计Series值出现次数

# Count how many times each distinct value occurs in the "feature_1" column.
a["feature_1"].value_counts()

异常数据处理

删除
import numpy as np

# Treat the '?' placeholder as missing, then drop every row that contains a NaN.
# The result is a new frame; `a` itself is left unchanged.
a.replace('?', np.nan).dropna(how='any')

独热编码

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

a = pd.DataFrame([[1, 2, 3],
                  [4, 5, 6],
                  [1, 8, 9]], columns=["feature_1", "feature_2", "label"])
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old name
# was later removed; sparse_output=False returns a dense ndarray.
# handle_unknown="ignore" encodes categories unseen during fit as all zeros
# instead of raising.
hotCoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
hot = hotCoder.fit_transform(a)
pd.DataFrame(hot)

# feature_1 == 10 never appeared while fitting, so this row exercises the
# handle_unknown="ignore" path.
b = pd.DataFrame([[1, 2, 3],
                  [4, 5, 6],
                  [10, 8, 9]], columns=["feature_1", "feature_2", "label"])
hotCoder.transform(b)

多项式扩展

import pandas as pd
from sklearn.preprocessing import PolynomialFeatures

a = pd.DataFrame([[1, 2, 3],
                  [4, 5, 6],
                  [1, 8, 9]], columns=["feature_1", "feature_2", "label"])
# degree=2 with interaction_only=False generates squares and pairwise products;
# include_bias=True adds the constant column.
polyCoder = PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)
df = polyCoder.fit_transform(a)
# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement.
pd.DataFrame(df, columns=polyCoder.get_feature_names_out())

标准化

import pandas as pd
from sklearn.preprocessing import StandardScaler

a = pd.DataFrame([[1, 2, 3],
                  [4, 5, 6],
                  [7, 8, 9]], columns=["feature_1", "feature_2", "label"])
# Standardize each column to zero mean and unit variance.
ssCoder = StandardScaler()
df = ssCoder.fit_transform(a)
# fit_transform returns a plain ndarray, so the column names are not carried over.
pd.DataFrame(df)

规范化,归一化

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

a = pd.DataFrame([[1, 2, 3],
                  [4, 5, 6],
                  [7, 8, 9]], columns=["feature_1", "feature_2", "label"])
# Rescale each column linearly into [-1, 2]. feature_range is documented as a
# (min, max) tuple; recent scikit-learn releases validate the type and reject
# a list.
ssCoder = MinMaxScaler(feature_range=(-1, 2))
df = ssCoder.fit_transform(a)
pd.DataFrame(df)

LabelEncoder

import pandas as pd
from sklearn.preprocessing import LabelEncoder

a = pd.DataFrame([["b", 2, 3],
                  ["a", 5, 6],
                  ["a", 8, 9]], columns=["feature_1", "feature_2", "label"])
laCoder = LabelEncoder()
# Map the string categories in feature_1 to integer codes.
b = pd.DataFrame(laCoder.fit_transform(a["feature_1"]))
# Put the encoded column next to the original frame for comparison.
pd.concat([a, b], axis=1)

dataframe样本采样

# Draw a random 66% of the rows of `a` (fraction-based sampling).
df = a.sample(frac=0.66)
# Draw exactly 3 random rows; this rebinds df, replacing the sample above.
df = a.sample(n=3)
# Stack the original frame and the sampled rows on top of each other.
pd.concat([a,df])

LinearRegression

import numpy as np
from sklearn.linear_model import LinearRegression

# Plain ndarrays instead of np.mat: np.matrix is deprecated in NumPy and
# current scikit-learn refuses np.matrix input.
# The second column of X is all ones, so it acts as the intercept term.
X = np.array([[1, 1], [2, 1], [3, 1], [4, 1]])
Y = np.array([[3.2], [4.7], [7.3], [8.5]])
# fit_intercept=False because the constant column in X already supplies one.
model = LinearRegression(fit_intercept=False)
model.fit(X, Y)
model.coef_

# R^2 of the fit on the training data.
model.score(X, Y)

Ridge

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error as mse  # `mse` was never defined in the snippet

# Sweep the regularization strength and report the test-set MSE for each value.
for alpha in [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 2, 3, 5, 10]:
    clf = Ridge(alpha=alpha, max_iter=2000, solver="auto", fit_intercept=True)
    clf.fit(X_train, Y_train)
    print("Ridge:", mse(Y_test.values, clf.predict(X_test)))
    # n_iter_ is None when the chosen solver is not iterative.
    print(clf.n_iter_)

Lasso

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error as mse  # `mse` was never defined in the snippet

# Sweep the L1 penalty strength and report the test-set MSE for each value.
for alpha in [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 2, 3]:
    clf = Lasso(alpha=alpha, max_iter=100, fit_intercept=True)
    clf.fit(X_train, Y_train)
    print("Lasso:", mse(Y_test.values, clf.predict(X_test)))
    # Number of coordinate-descent iterations actually run.
    print(clf.n_iter_)

模型评估

from sklearn.metrics import mean_squared_error

# Mean squared error of the fitted estimator `clf` on the held-out split.
# NOTE(review): the printed tag says "LinearRegression", but `clf` is whatever
# estimator was bound last — confirm it is the intended model.
print("LinearRegression:", mean_squared_error(Y_test.values, clf.predict(X_test)))

混淆矩阵

# Confusion matrix as a cross-tabulation: rows are the true labels, columns are
# the predictions of the fitted classifier `knn`.
pd.crosstab(Y_test, knn.predict(X_test), rownames=["label"], colnames=["predict"])

保存模型

# sklearn.externals.joblib was removed from scikit-learn; joblib is installed
# alongside scikit-learn as a dependency and is imported directly.
import joblib

# Persist the estimator to disk and load it back.
# NOTE(review): `enc` must already be fitted for the transform below to work —
# no fit call is visible in this snippet, confirm against the caller.
joblib.dump(enc, 'rf.model')
# joblib.load unpickles the file: only load model files from trusted sources.
enc2 = joblib.load('rf.model')

b = enc2.transform(a).toarray()
pd.DataFrame(b)

绘制函数图像

import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(-5, 5, 1000)  # 1000 evenly spaced x values in [-5, 5]
y = 1 / (1 + np.exp(-x))  # sigmoid of every x at once (vectorized, no Python loop)
plt.plot(x, y)  # draw the curve through the 1000 (x, y) points
plt.show()  # display the figure

Df拷贝

import pandas as pd

a = pd.DataFrame([[1, 2, 3],
                  [4, 5, 6],
                  [7, 8, 9]], columns=["feature_1", "feature_2", "label"])

# copy() returns an independent frame, so the in-place drop below does not
# touch `a`.
df = a.copy()
df.drop(columns=["feature_1"], inplace=True)
# Different ids show that the two names refer to different objects.
print(id(a))
print(id(df))
a

Python拷贝

import copy

# A nested list: the inner list is the part a shallow copy would share.
a = [1, 2, [1, 2]]
# deepcopy clones the inner list too, so `b` is fully independent of `a`.
b = copy.deepcopy(a)
# Mutate only the original's inner list (the last element of `a`).
a[-1][0] = -1

b

CountVectorizer


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import GaussianNB

# Pre-tokenized (space-separated) Chinese sentences and their class labels.
corpus = [
    '我 爱 你',
    '我 恨 你'
]
y = [0, 1]
# The custom token_pattern keeps single-character tokens, which the default
# pattern would drop.
# NOTE: inside a character class `|` is a literal pipe, not alternation.
vectorizer = CountVectorizer(token_pattern="[a-zA-Z|\u4e00-\u9fa5]+")
count = vectorizer.fit_transform(corpus)
# get_feature_names() was removed from scikit-learn; use get_feature_names_out().
print(vectorizer.get_feature_names_out())
print(count.toarray())

# Two-step TF-IDF: reweight the raw counts computed above.
transformer = TfidfTransformer()
tfidf_matrix = transformer.fit_transform(count)
print(tfidf_matrix.toarray())

# One-step TF-IDF: TfidfVectorizer does the counting and the reweighting.
tfidf_vec = TfidfVectorizer(token_pattern="[a-zA-Z|\u4e00-\u9fa5]+")
tfidf_matrix = tfidf_vec.fit_transform(corpus)
print(tfidf_matrix.toarray())

# GaussianNB needs dense input, hence toarray().
model = GaussianNB()
model.fit(tfidf_matrix.toarray(), y)
print(model.predict(tfidf_matrix.toarray()))
# New sentences are vectorized with the already-fitted vocabulary
# (transform, not fit_transform).
corpus = [
    '仇 恨',
    '爱 你'
]
tfidf_matrix = tfidf_vec.transform(corpus)
model.predict(tfidf_matrix.toarray())

TfidfVectorizer

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Column 0 of the headerless CSV is used as the label, column 1 as the text.
df = pd.read_csv("datas/bayes.txt", header=None)
X = df[1]
Y = df[0]
# NOTE: inside a character class `|` is a literal pipe, not alternation.
tfCoder = TfidfVectorizer(token_pattern="[a-zA-Z|\u4e00-\u9fa5]+")
X = tfCoder.fit_transform(X).toarray()
# The original called train_test_split(..., test_size=0), i.e. "keep every row
# for training". Current scikit-learn rejects test_size=0 with a ValueError, so
# the full data set is used directly and the test split is left empty.
# Rows are no longer shuffled, which only affects the order of the printout.
X_train, y_train = X, Y
X_test, y_test = X[:0], Y.iloc[:0]
model = GaussianNB()
model.fit(X_train, y_train)
# Compare the predictions on the training rows with the true labels.
print(model.predict(X_train))
print(y_train.values)

apply

from sklearn import preprocessing
import pandas as pd

# The `categorical_features` argument was removed from OneHotEncoder, and
# passing it now raises a TypeError. Without it the encoder encodes every
# column it is given; to encode only some columns, wrap the encoder in
# sklearn.compose.ColumnTransformer.
enc = preprocessing.OneHotEncoder()
a = pd.DataFrame([[1, "A", "a"],
                  [0, "B", "b"],
                  [2, "C", "c"]], columns=["ebayno", "p_sku", "sale"])
def f(x):
    """Return a Series of x's values multiplied by 2, keeping x's index.

    The doubled values are also printed before being returned.
    """
    doubled = x.values * 2
    print(doubled)
    return pd.Series(doubled, x.index)
# Run f on each column of `a` (DataFrame.apply works column-wise by default).
a.apply(f)

tfidf

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["hi peter",
          "hi tom"]

# norm=None turns off row normalization, so the printed matrix shows the raw
# tf * idf weights.
tfidf2 = TfidfVectorizer(norm=None)
# Named `result` rather than `re`, which would shadow the standard-library
# regex module's usual name.
result = tfidf2.fit_transform(corpus)
print(tfidf2.vocabulary_)
# get_feature_names() was removed from scikit-learn; use get_feature_names_out().
print(tfidf2.get_feature_names_out())
print(result.todense())

版权声明

本文仅代表作者观点,不代表百度立场。
本文系作者授权百度百家发表,未经许可,不得转载。

热门文章
  • 机房智能化温湿度解决方式之POE供电以太网温湿度传感器

    机房智能化温湿度解决方式之POE供电以太网温湿度传感器
    机房智能化温湿度解决方式之POE供电以太网温湿度传感器 北京盈创力和电子科技有限公司 智能型TCP网口温湿度记录仪 北京IP网络温湿度记录仪厂家,北京盈创力和 北京智能型TCP网口温湿度记录仪IP网络温湿度记录仪是一种新型的基于TCP/IP协议双绞线以太网标准温湿度采集模块,利用它可以实现现场温度值、相对湿度值的采集,同时利用其自身的RJ45通信接口可以方便地和机房监控主机或交换机集线器进行联网。 工作于-40℃~85℃工业级带...
  • Sequential Monte Carlo Methods (SMC) 序列蒙特卡洛/粒子滤波/Bootstrap Filtering

    Sequential Monte Carlo Methods (SMC) 序列蒙特卡洛/粒子滤波/Bootstrap Filtering
    Problem Statement 我们考虑一个具有马尔可夫性质、非线性、非高斯的状态空间模型(State Space Model):对于一个时间序列上的观测结果 $\{y_t, t \in N\}$,我们认为每个观测结果 $y_t$ 的生成依赖于一个无法直接观察的隐变量 $x_t \in \{x_t, t \in N\}$,即:p(...
  • HTTP状态保持的原理

    HTTP状态保持的原理
    a)在用户登录之后,服务器返回响应的时候会在响应中添加上cookie b)浏览器接收到cookie之后会自动保存 c)当用户再次请求同一服务器中的其他网页的时候,浏览器会自动带上之前保存的cookie d)服务器接收到请求之后可以从 request 对象中取到cookie,判断当前用户是否登录  Http是无状态的,就是连接时数据互通,关闭后...
  • Hive 系统函数及示例

    Hive 系统函数及示例
    查看所有系统函数 show functions; 函数分类 内置函数【系统函数】 数学函数: floor、round、ceil、cos、log2等 字符串函数: length、reverse、trim、lower、get_json_object、repeat等 收集函数: size 转换函数: cast 日期函数: year、month、datediff、date、date_add等 条件函数: coalesce、case…w...
  • CSRF的原理和防范措施

    CSRF的原理和防范措施
    a)攻击原理:i.用户C访问正常网站A时进行登录,浏览器保存A的cookieii.用户C再访问攻击网站B,网站B上有某个隐藏的链接或者图片标签会自动请求网站A的URL地址,例如表单提交,传指定的参数iii.而攻击网站B在访问网站A的时候,浏览器会自动带上网站A的cookieiv.所以网站A在接收到请求之后可判断当前用户是登录状态,所以...
标签列表