# Fit a global linear baseline on the differenced/cleaned frame.
# fit_intercept=False forces the fit through the origin — presumably because
# the differenced series is expected to be zero-mean; TODO confirm with the
# upstream differencing step.
X_global = df_diff_cleaned[feature_cols].values
y_global = df_diff_cleaned[target_col].values
model_global = LinearRegression(fit_intercept=False).fit(X_global, y_global)
# Improved base model: MLP regressor, tuned via randomized search below.
# Search space for RandomizedSearchCV (lists are sampled uniformly,
# loguniform entries are sampled log-uniformly).
param_dist = {
'hidden_layer_sizes': [
(8, 4), (12, 8), (16, 5),
(16,), (8,) # two-layer configs with different width ratios, plus single-layer options
],
'activation': ['tanh'],
'learning_rate_init': loguniform(5e-4, 5e-2), # adjusted learning-rate range
'alpha': loguniform(1e-4, 5e-1), # stronger L2 regularization range
'max_iter': [300, 500, 800], # reduced max iteration counts
'validation_fraction': [0.2], # larger validation split — NOTE(review): sklearn only uses validation_fraction when early_stopping=True, which is not set here; confirm intent
}
model_nn = MLPRegressor(
verbose=False,
random_state=42, # reproducible weight initialization
# On small data, keep the (default, non-adaptive) learning-rate schedule to avoid instability.
)
# Leave-one-out CV: one fold per sample — presumably the dataset is small; verify.
loo = LeaveOneOut()
# Single randomized-search pass over param_dist.
random_search = RandomizedSearchCV(
estimator=model_nn,
param_distributions=param_dist,
n_iter=2, # only 2 sampled configurations (original note: could be reduced further); with LOO each config costs one fit per sample
cv=loo,
scoring='neg_mean_squared_error', # MSE-based scoring works with single-sample folds (R^2 would not)
n_jobs=-1, # parallelize folds across all cores
verbose=1,
random_state=42 # reproducible parameter sampling
)
# Run the hyper-parameter search on the full dataset (no held-out test split —
# NOTE(review): best_score_ is then the only generalization estimate; confirm
# this is acceptable downstream).
random_search.fit(X_global, y_global)
best_model = random_search.best_estimator_
# Export coefficients plus ratio columns via the project helper; semantics of
# model_scaler are not visible from here — presumably a fitted feature scaler,
# verify against its definition.
coef_df = create_coefficient_csv_with_ratios(
base_models1, feature_cols, output_path, df,
base_recipe_to_group, base_high_overlap,model_global,model_scaler # newly added overlap-data parameters
)