nature medicine二分类结局随机森林模型构建与评估复现 - 文章 - 开发者社区

picture.image

背景

picture.image

通过机器学习构建的ECG-SMART数据模型，实现对冠状动脉闭塞型心肌梗死（OMI）的诊断，本次复现聚焦于ECG-SMART数据模型在二分类任务中的随机森林算法构建与评估，基于公开数据及算法流程，模拟原始研究的核心方法。需要特别指出的是，此次复现仅作为学习参考，旨在探索其实现细节与应用潜力

代码实现

数据读取


          
import pandas as pd
          
import numpy as np
          
import matplotlib.pyplot as plt 
          
import warnings
          
warnings.filterwarnings("ignore")
          

          
plt.rcParams['font.family'] = 'Times New Roman'
          
plt.rcParams['axes.unicode_minus'] = False
          

          
# 读取衍生数据集
          
df_derivation_cohort = pd.read_csv("Derivation_dataset.csv")
          
# 读取外部验证数据集
          
df_external_validation_cohort = pd.read_csv("External_validation_dataset.csv")
          
df_derivation_cohort

picture.image

读取两个CSV文件（一个是衍生数据集，另一个是外部验证数据集）

衍生与验证数据集的关键统计指标计算


          
def calculate_metrics(df):
          
    metrics = {
          
        # 计算 Age（年龄）的统计指标，格式为 "平均值 ± 标准差 (最小值–最大值)"
          
        "Age (years)": f"{df['Age'].mean():.0f} ± {df['Age'].std():.0f} ({df['Age'].min()}–{df['Age'].max()})",
          
        
          
        # 计算 Heart Rate (HR, 心率) 的平均值和标准差
          
        "Heart Rate (HR)": f"{df['HR'].mean():.1f} ± {df['HR'].std():.1f}",
          
        
          
        # 计算 QRS duration (QRSd, QRS波时限) 的平均值和标准差
          
        "QRS duration (QRSd)": f"{df['QRSd'].mean():.1f} ± {df['QRSd'].std():.1f}",
          
        
          
        # 计算 TpTe Interval (TpTe间期) 的平均值和标准差
          
        "TpTe Interval": f"{df['TpTe'].mean():.1f} ± {df['TpTe'].std():.1f}",
          
        
          
        # 计算 STT_PCAratio (STT PCA 比率) 的平均值和标准差
          
        "STT_PCAratio": f"{df['STT_PCAratio'].mean():.2f} ± {df['STT_PCAratio'].std():.2f}",
          
        
          
        # 计算 Outcome_Occlusion_MI（结果：闭塞性心肌梗死）的正负比例
          
        "Outcome (Occlusion MI)": {
          
            # 正比例 = Positive cases 数量占总行数的百分比
          
            "Positive": f"{(df['Outcome_Occlusion_MI'].sum() / len(df)) * 100:.1f}%",
          
            # 负比例 = 1 - 平均值，表示没有闭塞心梗的百分比
          
            "Negative": f"{(1 - df['Outcome_Occlusion_MI'].mean()) * 100:.1f}%"
          
        },
          
        
          
        # 计算 Tamp（振幅）相关指标
          
        "Tamp Features": {
          
            # 振幅的平均值
          
            "Tamp Mean": f"{df['Tamp'].mean():.2f}",
          
            # 振幅的标准差
          
            "Tamp Std Dev": f"{df['Tamp'].std():.2f}",
          
            # tamp_V4 列的最大值
          
            "Max Tamp_V4": f"{df['tamp_V4'].max():.2f}"
          
        },
          
        
          
        # 计算空间轴 (Spatial Axes) 的相关指标
          
        "Spatial Axes": {
          
            # fpTaxis 的平均值和标准差
          
            "fpTaxis": f"{df['fpTaxis'].mean():.1f} ± {df['fpTaxis'].std():.1f}",
          
            # QRSTangle 的平均值和标准差
          
            "QRSTangle": f"{df['QRSTangle'].mean():.1f} ± {df['QRSTangle'].std():.1f}"
          
        }
          
    }
          
    return metrics
          
derivation_metrics = calculate_metrics(df_derivation_cohort)
          
validation_metrics = calculate_metrics(df_external_validation_cohort)

定义一个函数，用于计算数据集中多个关键变量的统计指标，并分别应用于衍生数据集和验证数据集以生成统计结果


        
            

          print(derivation\_metrics)

picture.image


        
            

          print(validation\_metrics)

picture.image

数据集分割


          
from sklearn.model_selection import train_test_split
          
# 提取目标变量 y，即“Outcome_Occlusion_MI”列
          
y = df_derivation_cohort['Outcome_Occlusion_MI']
          
# 提取特征变量 X，去掉目标变量列
          
X = df_derivation_cohort.drop('Outcome_Occlusion_MI', axis=1)
          

          
# 划分训练集和测试集，20% 为测试集，按 y 的分层抽样
          
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=99)
          

          
# 提取外部验证集的目标变量 y_val 2024-12-11公众号Python机器学习AI
          
y_val = df_external_validation_cohort['Outcome_Occlusion_MI']
          
# 提取外部验证集的特征变量 X_val
          
X_val = df_external_validation_cohort.drop('Outcome_Occlusion_MI', axis=1)

将衍生数据集按目标变量分层抽样划分为训练集和测试集，并分别提取外部验证集的特征变量和目标变量，为模型训练和验证做好准备

数据预处理

缺失值检验


          
print("训练集总缺失值数量:", X_train.isnull().sum().sum())
          
print("测试集总缺失值数量:", X_test.isnull().sum().sum())
          
print("验证集总缺失值数量:", X_val.isnull().sum().sum())

picture.image

统计打印训练集、测试集和验证集中的总缺失值数量

缺失值填补


          
# 对训练集、测试集和验证集的缺失值进行填充
          
for el in X_train.columns:
          
    # 使用训练集每列的众数填充训练集的缺失值
          
    X_train[el].fillna(X_train[el].mode()[0], inplace=True)
          
    # 使用训练集每列的众数填充测试集的缺失值
          
    X_test[el].fillna(X_train[el].mode()[0], inplace=True)
          
    # 使用训练集每列的众数填充验证集的缺失值
          
    X_val[el].fillna(X_train[el].mode()[0], inplace=True)
          
# 2024-12-11公众号Python机器学习AI
          
# 检查填充后是否还存在缺失值
          
print("训练集是否存在缺失值:", X_train.isnull().any().any())
          
print("测试集是否存在缺失值:", X_test.isnull().any().any())
          
print("验证集是否存在缺失值:", X_val.isnull().any().any())

picture.image

使用训练集中每列的众数填充训练集、测试集和验证集的缺失值，并检查填充后是否还有缺失值

模型构建


          
from sklearn.ensemble import RandomForestClassifier
          
from sklearn.calibration import CalibratedClassifierCV
          

          
# 创建随机森林分类器
          
clf = RandomForestClassifier(
          
    class_weight='balanced_subsample',  # 每个决策树的子样本内，自动调整每个类别的权重，平衡不平衡数据集
          
    criterion='entropy',               # 使用信息增益（熵）作为分裂标准
          
    n_jobs=-1,                         # 使用所有可用的CPU核心加速训练
          
    random_state=42,                   # 固定随机种子，确保结果可复现
          
    max_features='log2',               # 每次分裂时考虑的最大特征数为 log2(总特征数)
          
    n_estimators=20,                   # 随机森林中决策树的数量
          
    min_samples_split=0.01,            # 内部节点分裂所需的最小样本比例（即至少占总样本1%）
          
    min_samples_leaf=0.005,            # 叶子节点所需的最小样本比例（即至少占总样本0.5%）
          
    min_impurity_decrease=1e-2,        # 分裂时要求的最小纯度增益，防止过拟合
          
    bootstrap=True,                    # 使用自助法采样（带放回抽样）构建每棵决策树
          
    ccp_alpha=1e-2,                    # 剪枝时的复杂度参数，增加模型泛化能力
          
    max_samples=0.75,                  # 每棵树只使用75%的样本子集（相对总样本数量）
          
    oob_score=True                     # 使用袋外样本估计模型泛化性能
          
)
          

          
# 对随机森林分类器进行校准
          
clf = CalibratedClassifierCV(
          
    clf,               # 要校准的基础分类器（随机森林）
          
    cv=5,              # 使用5折交叉验证来校准分类器
          
    method="isotonic"  # 使用保序回归法进行概率校准，适合数据较多的情况
          
)
          

          
clf.fit(X_train, y_train)

picture.image

创建一个基于随机森林的分类器，并通过保序回归法进行概率校准，然后使用训练集数据对其进行训练

ROC曲线绘制

训练集ROC曲线


          
from sklearn.metrics import roc_curve, auc, roc_auc_score
          
# 2024-12-11公众号Python机器学习AI
          
y_score = clf.predict_proba(X_train)[:, 1]
          
fpr_train, tpr_train, _ = roc_curve(y_train, y_score)
          
roc_auc_train = auc(fpr_train, tpr_train)
          
n_samples_train = X_train.shape[0]
          
plt.figure(dpi=1200)
          
plt.plot(fpr_train, tpr_train, color='red', lw=2, label='ECG-SMART = %0.3f' % roc_auc_train, alpha=0.6)
          
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
          
plt.xlim([-0.02, 1.02])  
          
plt.ylim([-0.02, 1.02]) 
          
plt.xlabel('1 - Specificity')
          
plt.ylabel('Sensitivity')
          
plt.title(f'Training set (n = {n_samples_train})')
          
plt.legend(loc="lower right")
          
plt.savefig('train_roc.pdf', format='pdf', bbox_inches='tight')
          
plt.show()

picture.image

测试 集ROC曲线


          
y_test_score = clf.predict_proba(X_test)[:, 1]
          
fpr_test, tpr_test, _ = roc_curve(y_test, y_test_score)
          
roc_auc_test = auc(fpr_test, tpr_test)
          
n_samples_test = X_test.shape[0]
          
plt.figure(dpi=1200)
          
# 2024-12-11公众号Python机器学习AI
          
plt.plot(fpr_test, tpr_test, color='red', lw=2, label='ECG-SMART = %0.2f' % roc_auc_test, alpha=0.6)
          
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
          
plt.xlim([-0.02, 1.02])  
          
plt.ylim([-0.02, 1.02]) 
          
plt.xlabel('1 - Specificity')
          
plt.ylabel('Sensitivity')
          
plt.title(f'Internal testing set (n = {n_samples_test})')
          
plt.legend(loc="lower right")
          
plt.savefig('test_roc.pdf', format='pdf', bbox_inches='tight')
          
plt.show()

picture.image

验证 集ROC曲线


          
# 预测外部验证集的概率
          
y_val_score = clf.predict_proba(X_val)[:, 1]
          
fpr_val, tpr_val, _ = roc_curve(y_val, y_val_score)
          
# 2024-12-11公众号Python机器学习AI
          
roc_auc_val = auc(fpr_val, tpr_val)
          
n_samples_val = X_val.shape[0]
          
plt.figure(dpi=1200)
          
plt.plot(fpr_val, tpr_val, color='red', lw=2, label='ECG-SMART = %0.2f' % roc_auc_val, alpha=0.6)
          
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
          
plt.xlim([-0.02, 1.02])
          
plt.ylim([-0.02, 1.02])
          
plt.xlabel('1 - Specificity')
          
plt.ylabel('Sensitivity')
          
plt.title('OMl classification performance')
          
plt.legend(loc="lower right")
          
plt.savefig('validation_roc.pdf', format='pdf', bbox_inches='tight')
          
plt.show()

picture.image

绘制训练集、测试集和验证集的ROC曲线及AUC值，直观展示模型区分正负样本的能力；ROC曲线通过灵敏度（Sensitivity）与1-特异度（1-Specificity）的关系评估模型性能，AUC值越接近1，模型区分能力越强

模型预测概率与实际标签整理


          
# 创建新的 DataFrame，分别存储 Normal 和 OMI 的概率，并添加实际标签
          
probability = clf.predict_proba(X)
          
data = pd.DataFrame({
          
    'Normal': probability[:, 0],  # 负类的概率 (Normal)
          
    'OMI': probability[:, 1],        # 正类的概率 (OMI)
          
    'Actual Label': ['Normal' if label == 0 else 'OMI' for label in y]  # 实际标签
          
})
          
data['Normal (%)'] = data['Normal'] * 100
          
data['OMI (%)'] = data['OMI'] * 100
          
# 2024-12-11公众号Python机器学习AI
          
data = data.drop(columns=['Normal', 'OMI'])
          
data

picture.image

基于完整的衍生数据集计算每个样本属于正常（Normal）和闭塞性心梗（OMI）的预测概率，并与实际标签一起整理为一个新的数据框，便于分析和可视化

OMI概率密度分布的面积图展示


          
normal_data = data[data['Actual Label'] == 'Normal']['OMI (%)']
          
omi_data = data[data['Actual Label'] == 'OMI']['OMI (%)']
          
normal_counts, normal_bin_edges = np.histogram(normal_data, bins=100, range=(0, 100), density=True)
          
omi_counts, omi_bin_edges = np.histogram(omi_data, bins=100, range=(0, 100), density=True)
          
normal_bin_centers = (normal_bin_edges[:-1] + normal_bin_edges[1:]) / 2
          
omi_bin_centers = (omi_bin_edges[:-1] + omi_bin_edges[1:]) / 2
          
plt.figure(figsize=(10, 6), dpi=1200)
          
plt.fill_between(normal_bin_centers, normal_counts, color='blue', alpha=0.6, label='Normal')
          
plt.fill_between(omi_bin_centers, omi_counts, color='red', alpha=0.6, label='OMI')
          
# 2024-12-11公众号Python机器学习AI
          
plt.title('Density Distribution of OMI (%)', fontsize=14)
          
plt.xlabel('Probability', fontsize=12)
          
plt.ylabel('Density', fontsize=12)
          
plt.xlim(0, 100)
          
plt.legend(title='OMI Classes', fontsize=10, title_fontsize=11)  # 设置图例标题和字体
          
plt.tight_layout()
          
plt.savefig('Algorithm derivation and testing_1.pdf', format='pdf', bbox_inches='tight')
          
plt.show()

picture.image

根据模型预测的OMI概率，将完整衍生数据集中属于正常（Normal）和闭塞性心梗（OMI）的样本分别绘制成概率密度的面积图。它首先计算两类样本的概率分布直方图，然后通过填充面积图展示每类样本在不同概率范围内的密度分布。图中蓝色表示正常样本，红色表示OMI样本，可直观比较两类样本在预测概率上的分布差异

基于OMI概率的风险分层与分类结果统计


          
def results(pred_probas, y_test):   
          
    if len(pred_probas.shape) == 1:
          
        OMI_score = np.round_(pred_probas * 100, decimals=2)
          
    else: 
          
        OMI_score = np.round_(pred_probas[:, 1] * 100, decimals=2)
          
    # 根据 OMI 分数划分风险等级
          
    y_pred = np.where(OMI_score < 5, 'Low risk', 'Intermediate risk')  # 风险分数小于 5 为低风险
          
    y_pred = np.where(OMI_score >= 20, 'High risk', y_pred)  # 风险分数大于等于 20 为高风险
          
    results = {
          
        'Low risk': [np.count_nonzero(y_pred == 'Low risk')],
          
        'Intermediate risk': [np.count_nonzero(y_pred == 'Intermediate risk')],
          
        'High risk': [np.count_nonzero(y_pred == 'High risk')],
          
    }
          
    true_positives = {
          
        'Low risk': [np.count_nonzero((y_pred == 'Low risk') & (y_test == 1))],
          
        'Intermediate risk': [np.count_nonzero((y_pred == 'Intermediate risk') & (y_test == 1))],
          
        'High risk': [np.count_nonzero((y_pred == 'High risk') & (y_test == 1))],
          
    }
          
    results_df = pd.DataFrame(data=results, index=['Count (N)'])
          
    true_positives_df = pd.DataFrame(data=true_positives, index=['True Positives (N)'])
          
    return pd.concat([results_df, true_positives_df])
          
# 2024-12-11公众号Python机器学习AI
          
y_test_score = clf.predict_proba(X_test)  
          
test_results = results(y_test_score, y_test)
          
test_results

picture.image

结合文献中对OMI概率的风险分层标准，基于模型预测的概率将测试集样本分为低风险（<5%）、中风险（5%-20%）和高风险（≥20%）三类，并统计每个风险等级的样本数量及真实阳性（OMI）的数量，评估分类器在不同风险水平下的区分能力

不同风险分层的OMI阳性率柱状图可视化


          
true_positive_percentages = test_results.loc['True Positives (N)'] / test_results.loc['Count (N)']
          
categories = ['+', '++', '+++']
          
colors = ['white', 'red']
          
blank_heights = [1 - p for p in true_positive_percentages]
          
filled_heights = true_positive_percentages
          
plt.figure(figsize=(6, 8), dpi=1200)
          
bars_blank = plt.bar(categories, [1, 1, 1], color=colors[0], edgecolor='black', label='Empty Bar')
          
bars_filled = plt.bar(categories, filled_heights, bottom=blank_heights, color=colors[1], alpha=0.8, label='True Positive %')
          
for bar, percentage in zip(bars_filled, filled_heights):
          
    plt.text(bar.get_x() + bar.get_width() / 2, 1,  
          
             f'{percentage * 100:.1f}%', ha='center', va='bottom', fontsize=13, fontweight='bold')
          
for bar, count in zip(bars_blank, test_results.loc['Count (N)']):
          
    plt.text(bar.get_x() + bar.get_width() / 2, -0.06,  
          
             f'{count}', ha='center', va='top', fontsize=10, fontweight='bold')
          

          
plt.text(-0.4, -0.07, 'n =', fontsize=10, ha='left', va='center')
          
plt.xlabel('ECG-SMART', fontsize=12, labelpad=30)  
          
plt.ylim(0, 1.2)  
          
plt.xticks(fontsize=12)  
          
plt.yticks([0, 0.2, 0.4, 0.6, 0.8, 1.0], [f'{int(i * 100)}%' for i in [0, 0.2, 0.4, 0.6, 0.8, 1.0]], fontsize=10)
          
plt.ylabel('% OMI', fontsize=12)
          
plt.gca().spines['top'].set_visible(False)
          
# 2024-12-11公众号Python机器学习AI
          
plt.gca().spines['right'].set_visible(False)
          
plt.gca().spines['left'].set_linewidth(0.8)
          
plt.gca().spines['bottom'].set_linewidth(0.8)
          
plt.grid(axis='y', linestyle='--', alpha=0.6)
          
plt.tight_layout()
          
plt.savefig('Algorithm derivation and testing_2.pdf', format='pdf', bbox_inches='tight')
          
plt.show()

picture.image

通过叠加柱状图展示不同风险分层（低、中、高风险）中OMI阳性率的百分比及样本总数，以直观反映各分层的诊断效果

基于SHAP的模型特征重要性分析与可视化


          
import shap
          
def calculate_mean_shap(clf, X, feature_names):
          
    # 2024-12-11公众号Python机器学习AI
          
    shap_values_list = []  
          
    for calibrated_classifier in clf.calibrated_classifiers_:
          
        base_model = calibrated_classifier.estimator
          
        explainer = shap.TreeExplainer(base_model)
          
        shap_values = explainer.shap_values(X)
          
        shap_values = shap_values[:, :, 1]
          
        shap_values_list.append(shap_values)
          
    mean_shap_values = np.mean(shap_values_list, axis=0)
          
    print(f"SHAP values shape: {mean_shap_values.shape}")
          
    print(f"Feature names: {len(feature_names)}")
          
    if mean_shap_values.shape[1] != len(feature_names):
          
        raise ValueError("SHAP values do not match the number of features.")
          

          
    return mean_shap_values
          

          
# 计算均值 SHAP 值
          
columns = X_test.columns
          
mean_shap_values = calculate_mean_shap(clf, X_test, feature_names=columns)
          
mean_shap_values

picture.image

计算校准分类模型RF中所有基分类器的测试集SHAP值均值，用于评估特征对模型预测的整体重要性


          
plt.figure()
          
shap.summary_plot(
          
    mean_shap_values, 
          
    X_test, 
          
    feature_names=X_test.columns, 
          
    plot_type="dot",  # 使用 dot 类型的图
          
    show=False,       # 不自动显示图形，便于进一步保存或修改
          
    max_display=25    # 显示前 25 个特征
          
)
          
plt.savefig("SHAP_numpy summary_plot_test.pdf", format='pdf',bbox_inches='tight', dpi=1200)