1. Data processing (feature engineering)
For more pandas operations, see the separate article on basic pandas operations for file data.
The package used is scikit-learn:
pip3 install --index-url https://pypi.douban.com/simple scikit-learn
Missing value handling
import numpy as np
# inspect missing values
df.replace('NaN ', np.nan, inplace=True)  # replace the literal string 'NaN ' with a real NaN
df.isnull().sum()
df.fillna(method='pad', inplace=True)  # forward-fill with the previous row's value (df.ffill() in newer pandas)
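Forward fill is not the only option; for numeric columns a median fill is often safer. A minimal sketch, assuming the same df:
# fill numeric columns with each column's median instead of the previous row
num_cols = df.select_dtypes(include='number').columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())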
Type conversion
# convert a column to float ('col' is a placeholder column name)
df['col'] = df['col'].apply(float)
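If a column contains values that cannot be parsed as floats, apply(float) raises an exception. pd.to_numeric with errors='coerce' turns unparseable entries into NaN instead; a sketch, with 'col' again a placeholder column name:
import pandas as pd
# unparseable entries become NaN instead of raising
df['col'] = pd.to_numeric(df['col'], errors='coerce')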
String encoding with LabelEncoder
from sklearn.preprocessing import LabelEncoder
# encode categorical columns as integers
le = LabelEncoder()
cat_data = ['column_to_encode']  # placeholder list of categorical column names
for i in cat_data:
    df[i] = le.fit_transform(df[i])
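One caveat: refitting a single LabelEncoder in the loop keeps only the last column's mapping, so the codes cannot be reversed later. A sketch that stores one fitted encoder per column so inverse_transform stays available:
# one encoder per column so each mapping can be reversed later
encoders = {}
for i in cat_data:
    enc = LabelEncoder()
    df[i] = enc.fit_transform(df[i])
    encoders[i] = enc
# e.g. recover the original labels of a column:
# df[i] = encoders[i].inverse_transform(df[i])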
TfidfVectorizer combined with TruncatedSVD
# text-to-vector conversion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import joblib
# raw documents to tf-idf matrix:
vectorizer = TfidfVectorizer(stop_words='english',
                             use_idf=True,
                             smooth_idf=True)
# SVD to reduce dimensionality: randomized SVD, 10 iterations, down to 5 components
svd_model = TruncatedSVD(n_components=5,
                         algorithm='randomized',
                         n_iter=10)
# pipeline of tf-idf + SVD, fit to and applied to documents
svd_transformer = Pipeline([('tfidf', vectorizer),
                            ('svd', svd_model)])
# fit and transform the data ('分词描述' is the tokenized-text column):
dc_matrix = svd_transformer.fit_transform(data['分词描述'])
# save the models to disk:
joblib.dump(svd_transformer, 'svd_transformer.joblib')
# load the models from disk:
svd_transformer = joblib.load('svd_transformer.joblib')
# transform new text
dc_matrix = svd_transformer.transform(data['分词描述'])
dc_matrix.shape
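To check how much information the 5 components retain, the fitted SVD step exposes explained_variance_ratio_:
# variance captured by each of the 5 SVD components
svd_step = svd_transformer.named_steps['svd']
print(svd_step.explained_variance_ratio_)
print('total retained:', svd_step.explained_variance_ratio_.sum())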
Feature selection
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
x_data = df.iloc[:, 1:-1]  # features
y_data = df.iloc[:, -1]    # labels
# split the dataset
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=42)
# select the 10 best features using the ANOVA F-value as the scoring function
selector = SelectKBest(f_classif, k=10)
selector.fit(x_train, y_train)
# indices of the selected features
selected_features_indices = selector.get_support(indices=True)
# names of the selected features
selected_features_names = x_data.columns[selected_features_indices]
print("Selected features:", selected_features_names)
# bar chart of the per-feature scores
plt.bar(range(len(selector.scores_)), selector.scores_)
plt.xticks(range(len(selector.scores_)), x_data.columns, rotation=90)
plt.show()
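The selector above is fitted but never applied; to actually reduce the data to the 10 chosen features, call transform on both splits:
# keep only the k selected feature columns
x_train_sel = selector.transform(x_train)
x_test_sel = selector.transform(x_test)
print(x_train_sel.shape, x_test_sel.shape)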
Train/test split
from sklearn.model_selection import train_test_split
x_data = df.iloc[:, 0:-1]  # all columns except the last are features
y_data = df.iloc[:, -1]    # the last column holds the labels
# split the dataset
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=42)
To exclude a single column instead, e.g.:
x_data = df.drop(df.columns[5], axis=1)
Model training
Logistic regression (classification)
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# parameters to tune; pair each penalty (L1/L2) with a solver that supports it
# tuned_parameters = {'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
# search ranges
penaltys = ['l1', 'l2']
Cs = [0.1, 1, 10, 100, 1000]
# 2x5 grid; every intersection point is evaluated
tuned_parameters = dict(penalty=penaltys, C=Cs)
lr_penalty = LogisticRegression(solver='liblinear')  # liblinear supports both l1 and l2
grid = GridSearchCV(lr_penalty, tuned_parameters, cv=3, scoring='neg_log_loss',
                    n_jobs=4)
grid.fit(x_train, y_train)
# predict
lr_y_predict = grid.predict(x_test)
# best parameters and score
print('Best parameters:', grid.best_params_)
print('Best neg_log_loss:', grid.best_score_)
# evaluate
score_lr = metrics.accuracy_score(y_test, lr_y_predict)
print('Logistic regression accuracy_score:', score_lr)
Random forest (classification)
For regression, swap in RandomForestRegressor.
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
# tune max_depth first
k = []
v = []
for i in tqdm(range(1, 10)):
    model = RandomForestClassifier(max_depth=i, random_state=2)
    # train
    model.fit(x_train, y_train)
    # predict
    lr_y_predict = model.predict(x_test)
    # evaluate
    score_lr = metrics.accuracy_score(y_test, lr_y_predict)
    # record the result
    k.append(score_lr)
    v.append(i)
# report the best depth
print('best max_depth', v[k.index(max(k))])
val1 = v[k.index(max(k))]
# then tune n_estimators with the best depth fixed
k = []
v = []
for i in tqdm(range(1, 10)):
    model = RandomForestClassifier(max_depth=val1, n_estimators=i)
    model.fit(x_train, y_train)
    lr_y_predict = model.predict(x_test)
    score_lr = metrics.accuracy_score(y_test, lr_y_predict)
    k.append(score_lr)
    v.append(i)
print('best n_estimators', v[k.index(max(k))])
print('Random forest accuracy:', max(k))
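Tuning max_depth first and n_estimators afterwards is a greedy search and can miss interacting combinations. A sketch of searching both jointly with GridSearchCV over the same ranges:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
rf_grid = GridSearchCV(RandomForestClassifier(random_state=2),
                       param_grid={'max_depth': range(1, 10), 'n_estimators': range(1, 10)},
                       cv=3, n_jobs=-1)
rf_grid.fit(x_train, y_train)
print('best params:', rf_grid.best_params_)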
A variant of the same sweep that also scores a held-out validation set (x_dataval / y_dataval, assumed to be split off beforehand):
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
# tune max_depth, selecting by validation accuracy
k1 = []
v1 = []
kval1 = []
for i in tqdm(range(1, 10)):
    model = RandomForestClassifier(max_depth=i, random_state=2)
    model.fit(x_train, y_train)
    # test-set evaluation
    lr_y_predict = model.predict(x_test)
    score_lr = metrics.accuracy_score(y_test, lr_y_predict)
    k1.append(score_lr)
    v1.append(i)
    # validation-set evaluation
    lr_y_predict = model.predict(x_dataval)
    score_lr = metrics.accuracy_score(y_dataval, lr_y_predict)
    kval1.append(score_lr)
print('best max_depth', v1[kval1.index(max(kval1))])
print('Random forest accuracy:', max(kval1))
val1 = v1[kval1.index(max(kval1))]
# then tune n_estimators with the best depth fixed
k2 = []
v2 = []
kval2 = []
for i in tqdm(range(1, 10)):
    model = RandomForestClassifier(max_depth=val1, n_estimators=i)
    model.fit(x_train, y_train)
    lr_y_predict = model.predict(x_test)
    score_lr = metrics.accuracy_score(y_test, lr_y_predict)
    k2.append(score_lr)
    v2.append(i)
    # validation-set evaluation
    lr_y_predict = model.predict(x_dataval)
    score_lr = metrics.accuracy_score(y_dataval, lr_y_predict)
    kval2.append(score_lr)
print('best n_estimators', v2[kval2.index(max(kval2))])
print('Random forest accuracy:', max(kval2))
XGBoost (classification)
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
# create and train an XGBoost model on the GPU
# ('gpu_hist' is the older API; newer XGBoost uses tree_method='hist' with device='cuda')
xgb_gpu_model = XGBClassifier(tree_method='gpu_hist')
xgb_gpu_model.fit(x_train, y_train)
# predict on the test set and compute accuracy
y_pred_gpu = xgb_gpu_model.predict(x_test)
accuracy_gpu = accuracy_score(y_test, y_pred_gpu)
print('Accuracy (GPU):', accuracy_gpu)
from sklearn.model_selection import GridSearchCV
param_grid = {
    'learning_rate': [0.1, 0.01],
    'max_depth': [None, 10, 20, 30],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'n_estimators': range(50, 251, 100)  # 50, 150, 250
}
xgb = XGBClassifier(tree_method='gpu_hist')
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring='accuracy', cv=3, verbose=2, n_jobs=5)
grid_search.fit(x_train, y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
# predict with the best model
y_pred_best = best_model.predict(x_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print('Best Accuracy (GPU):', accuracy_best)
print('Best Parameters:', best_params)
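With this many combinations an exhaustive grid is slow; RandomizedSearchCV samples a fixed number of settings (n_iter) from the same grid instead. A minimal sketch:
from sklearn.model_selection import RandomizedSearchCV
# sample 30 parameter settings at random from param_grid
random_search = RandomizedSearchCV(XGBClassifier(tree_method='gpu_hist'),
                                   param_distributions=param_grid,
                                   n_iter=30, scoring='accuracy', cv=3, n_jobs=5, random_state=42)
random_search.fit(x_train, y_train)
print(random_search.best_params_)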
SVM (training is slow, so it is not tuned here)
from sklearn import svm
from sklearn.metrics import accuracy_score
# create an SVM classifier and fit the training data
clf = svm.SVC(kernel='linear')
clf.fit(x_train, y_train)
# predict on the test set and compute accuracy
y_pred = clf.predict(x_test)
SVMaccuracy = accuracy_score(y_test, y_pred)
print('Accuracy SVM:', SVMaccuracy)
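SVMs are sensitive to feature scale, so standardizing the inputs usually helps both speed and accuracy. A sketch wrapping the classifier in a scaling pipeline:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# standardize the features before fitting the SVM
svm_pipe = make_pipeline(StandardScaler(), svm.SVC(kernel='linear'))
svm_pipe.fit(x_train, y_train)
print('Accuracy SVM (scaled):', accuracy_score(y_test, svm_pipe.predict(x_test)))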
Clustering
The input is dc_matrix from the TF-IDF + SVD pipeline above.
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
# parameter ranges
param_grid = {
    'n_clusters': [5, 10, 15, 20],
    'init': ['k-means++', 'random'],
    'max_iter': [300, 500, 1000],
    'tol': [1e-4, 1e-5, 1e-6],
    'random_state': [42]
}
# create the KMeans estimator
kmeans = KMeans()
# create the GridSearchCV object
grid_search = GridSearchCV(kmeans, param_grid, n_jobs=-1)
# cluster the data while searching the hyperparameters
grid_search.fit(dc_matrix)
# best hyperparameters
best_params = grid_search.best_params_
print(best_params)
# refit on the data with the best hyperparameters
best_kmeans = KMeans(**best_params)
kmeans_results = best_kmeans.fit_predict(dc_matrix)
plt.rcParams["figure.figsize"] = (12, 10)
plt.scatter(dc_matrix[:, 0], dc_matrix[:, 1], c=kmeans_results)
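One caveat: GridSearchCV scores KMeans with its default score (negative inertia), which keeps improving as n_clusters grows, so the search is biased toward larger cluster counts. A silhouette sweep is a common sanity check; a sketch over the same candidate counts:
from sklearn.metrics import silhouette_score
# higher silhouette = better-separated clusters
for k in [5, 10, 15, 20]:
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(dc_matrix)
    print(k, silhouette_score(dc_matrix, labels))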
Plotting distances to the cluster centers
import seaborn as sns
# distances from each sample to every cluster center
distances = best_kmeans.transform(dc_matrix)
# boxplot of the distances
sns.boxplot(data=distances)
plt.show()
# append the results to the table
dfco['分类结果'] = kmeans_results                 # assigned cluster
dfco['到分类中心的距离'] = distances.min(axis=1)  # distance to the assigned (nearest) center
dfco
Saving and using models
Demonstration: save the text pipeline, use it to transform new text, then call the clustering model to predict.
# save the clustering pipeline
import joblib
# save the text pipeline to disk:
joblib.dump(svd_transformer, 'svd_transformer.joblib')
# save the kmeans model to disk
filename = 'kmeans_model.sav'
joblib.dump(best_kmeans, filename)
# vector conversion
import joblib
# load the text pipeline from disk:
svd_transformer = joblib.load('svd_transformer.joblib')
# transform the text into vectors
test_dc_matrix_pf = svd_transformer.transform(dfco['分词描述2'])
# load the kmeans model from disk
filename = 'kmeans_model.sav'
loaded_model = joblib.load(filename)
# use the loaded model to make predictions
new_results_pf = loaded_model.predict(test_dc_matrix_pf)
K-nearest neighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
# parameter ranges
n_neighbors = list(range(1, 11))
weights = ['uniform', 'distance']
algorithms = ['ball_tree', 'kd_tree', 'brute']
# collect the results
results = []
# loop over every parameter combination
for n in n_neighbors:
    for w in weights:
        for a in algorithms:
            # instantiate the model
            knn = KNeighborsClassifier(n_neighbors=n, weights=w, algorithm=a)
            # fit the training data
            knn.fit(x_train, y_train)
            # predict on the test set
            res_y = knn.predict(x_test)
            # accuracy (the appropriate metric for a classifier)
            score_knn = accuracy_score(y_test, res_y)
            # record the combination and its score
            results.append((n, w, a, score_knn))
# sort by score, best first (dtype=object keeps the mixed types intact)
results = np.array(results, dtype=object)
results = results[np.argsort(results[:, 3].astype(float))[::-1]]
# best parameters and score
best_params = {'n_neighbors': results[0][0], 'weights': results[0][1], 'algorithm': results[0][2]}
best_score = results[0][3]
print('Best parameters: ', best_params)
print('Best score: ', best_score)
# line plot of the scores for the different combinations
fig, ax = plt.subplots(figsize=(12, 6))
for i, weight in enumerate(weights):
    mask = results[:, 1] == weight
    x = results[mask][:, 0].astype(int)
    y = results[mask][:, 3].astype(float)
    ax.plot(x, y, marker='o', label=weight)
ax.set_xlabel('n_neighbors')
ax.set_ylabel('accuracy')
ax.set_title('KNeighborsClassifier parameter tuning')
ax.legend()
plt.show()
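The triple loop above is exactly a grid search; GridSearchCV does the same with cross-validation in a few lines. A sketch over the same ranges:
from sklearn.model_selection import GridSearchCV
knn_grid = GridSearchCV(KNeighborsClassifier(),
                        param_grid={'n_neighbors': list(range(1, 11)),
                                    'weights': ['uniform', 'distance'],
                                    'algorithm': ['ball_tree', 'kd_tree', 'brute']},
                        cv=5, n_jobs=-1)
knn_grid.fit(x_train, y_train)
print(knn_grid.best_params_, knn_grid.best_score_)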
Lasso regression
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
reg = Lasso()
# search the regularization strength (start above 0: Lasso with alpha=0 is plain least squares and sklearn warns against it)
param_grid = {'alpha': np.linspace(0.001, 1, 100)}
# grid search over the model and parameter grid
grid = GridSearchCV(reg, param_grid, cv=5)
grid.fit(x_train, y_train)
print(grid.best_params_)
print(grid.score(x_test, y_test))
# predict with the best Lasso model (grid.best_estimator_)
lasso_model = grid.best_estimator_
y_hat = lasso_model.predict(x_test)
plt.scatter(y_test, y_hat)
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Predicted Vs Actual Prices", fontsize=15)
plt.show()
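A selling point of Lasso is that it zeroes out weak features; counting the non-zero coefficients of the tuned model shows how aggressively it pruned:
import numpy as np
# how many features the tuned Lasso actually kept
kept = np.sum(lasso_model.coef_ != 0)
print(kept, 'of', len(lasso_model.coef_), 'features have non-zero coefficients')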
Linear regression
from sklearn.linear_model import LinearRegression
from sklearn import metrics
lr = LinearRegression()
lr.fit(x_train, y_train)
lr_y_predict = lr.predict(x_test)
score_lr = metrics.r2_score(y_test,lr_y_predict)
plt.scatter(y_test, lr_y_predict)
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Predicted Vs Actual Prices", fontsize=15)
plt.show()
print('Linear regression R^2 score:', score_lr)
Ridge regression
from sklearn.linear_model import RidgeCV
from sklearn import metrics
import numpy as np
# ridge regression with built-in cross-validation over the candidate alphas
rr = RidgeCV(alphas=np.array([.1, .2, .3, .4]))
rr.fit(x_train, y_train)
rr_y_predict = rr.predict(x_test)
score_rr = metrics.r2_score(y_test, rr_y_predict)
print('Ridge regression R^2 score:', score_rr)
plt.scatter(y_test, rr_y_predict)
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Predicted Vs Actual Prices", fontsize=15)
plt.show()
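RidgeCV picks the regularization strength by internal cross-validation; the chosen value is exposed as alpha_ after fitting:
# alpha selected by RidgeCV's internal cross-validation
print('selected alpha:', rr.alpha_)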
Dimensionality reduction
import pandas as pd
from sklearn.decomposition import PCA
# assume the feature data lives in a DataFrame named df with multiple columns
# create a PCA model that reduces the features to a single component
pca = PCA(n_components=1)
# reduce the data (PCA expects a 2-D array, so pass the feature DataFrame, not a single label column)
df_1d = pca.fit_transform(df)
# wrap the reduced data in a DataFrame
df_1d = pd.DataFrame(df_1d, columns=['PC1'])
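As with TruncatedSVD earlier, explained_variance_ratio_ shows how much of the original variance the single component keeps:
# share of the total variance kept by the one component
print('variance retained:', pca.explained_variance_ratio_.sum())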
Evaluation
Classification report
report = metrics.classification_report(y_test, lr_y_predict, labels=(0, 1))
print(report)
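A confusion matrix makes the per-class errors explicit alongside the report. A minimal sketch with the same predictions:
import seaborn as sns
# rows = actual class, columns = predicted class
cm = metrics.confusion_matrix(y_test, lr_y_predict)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()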
"""模型得到的指标"""
importances = lr_penalty.coef_[0].tolist()
z_importances = pd.DataFrame([importances],columns=x_test.columns).T.sort_values(by=0)
z_importances = z_importances.rename(columns={ 0: 'importances'})
z_importances.plot(kind='bar')
ROC
from sklearn import metrics
# ROC needs scores, not hard labels; use the positive-class probability
lr_y_score = grid.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, lr_y_score)
plt.plot(fpr, tpr, '*-')
plt.ylabel('TPR')
plt.xlabel('FPR')
plt.title('ROC curve')
import numpy as np
import pandas as pd
import seaborn as sns
# feature importances from the tree model
coefficients = model.feature_importances_
# build a DataFrame of per-feature importances
feature_correlations = pd.DataFrame({'Feature': x_train.columns, 'Correlation': np.abs(coefficients)})
# sort by importance
feature_correlations = feature_correlations.sort_values(by='Correlation', ascending=False)
# bar chart
plt.figure(figsize=(10, 12))
sns.barplot(data=feature_correlations, x='Correlation', y='Feature')
# title and labels
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
# show the figure
plt.show()
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
# assumes y_test (true labels) and predictions for a 3-class problem with labels 1..3
y_test = np.array(y_test)
y_pred = np.array(lr_y_predict)
# one-vs-rest (OvR) ROC curve per class
fpr = dict()
tpr = dict()
roc_auc = dict()
n_classes = 3  # number of classes
for i in range(1, n_classes + 1):
    # ROC curve and AUC for class i (built from hard predictions; class probabilities give a smoother curve)
    fpr[i], tpr[i], _ = metrics.roc_curve(y_test == i, y_pred == i)
    roc_auc[i] = metrics.auc(fpr[i], tpr[i])
# plot the ROC curve for every class
plt.figure()
for i in range(1, n_classes + 1):
    plt.plot(fpr[i], tpr[i], label=f'class {i} (AUC = {roc_auc[i]:.2f})')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('RandomForestClassifier ROC')
plt.legend(loc='best')
plt.show()