Coverage for backend/idaes_service/solver/ml_wizard.py: 100%
54 statements
coverage.py v7.10.7, created at 2025-11-06 23:27 +0000
1from pydantic import BaseModel
2from common.models.idaes.payloads.ml_request_schema import MLTrainRequestPayload, MLTrainingCompletionPayload
3import pandas as pd
4import json
5import numpy as np
6from io import StringIO
7import contextlib
8from sklearn.model_selection import train_test_split
9from sklearn.metrics import mean_squared_error, r2_score
10from idaes.core.surrogate.pysmo_surrogate import PysmoRBFTrainer, PysmoSurrogate
class MLResult(BaseModel):
    """Result payload produced by ml_generate after surrogate training."""

    surrogate_model: dict  # serialized PysmoSurrogate model (parsed JSON)
    charts: list[dict]  # one chart payload per output label (see compute_chart)
    metrics: list[dict]  # one {"mean_squared_error", "r2_score"} entry per output label
    test_inputs: dict  # held-out test rows, from DataFrame.to_dict(orient='index')
    test_outputs: dict  # surrogate evaluations of the test rows, same orientation
    task_id: int  # echoed back from the incoming request payload
def ml_generate(schema: MLTrainRequestPayload) -> MLResult:
    """Train a PySMO RBF surrogate on the request's data and score it.

    Builds a DataFrame from the request, holds out 20% of the rows as a test
    set, trains a Gaussian-RBF PySMO surrogate via IDAES, then evaluates the
    surrogate on the held-out rows and computes per-output charts and metrics.

    Args:
        schema: Training request carrying datapoints, column names,
            input/output labels and the task id.

    Returns:
        MLResult with the serialized surrogate, per-output QQ charts and
        MSE/R2 metrics, the held-out inputs/outputs, and the task id.
    """
    df = pd.DataFrame(schema.datapoints, columns=schema.columns)
    # Fixed random_state keeps the train/test split reproducible across runs.
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    input_labels = schema.input_labels
    output_labels = schema.output_labels

    trainer = PysmoRBFTrainer(
        input_labels=input_labels,
        output_labels=output_labels,
        training_dataframe=train_df,
    )
    trainer.config.basis_function = 'gaussian'

    # PySMO writes training progress to stdout; capture it so service logs
    # stay clean. Train surrogate (calls PySMO through IDAES Python wrapper).
    stream = StringIO()
    with contextlib.redirect_stdout(stream):
        rbf_train = trainer.train_surrogate()

    # Create a callable surrogate model and serialize it to a JSON dict.
    rbf_surr = PysmoSurrogate(rbf_train, input_labels,
                              output_labels, input_bounds=None)
    f = StringIO()
    rbf_surr.save(f)
    json_data = json.loads(f.getvalue())

    df_evaluate = rbf_surr.evaluate_surrogate(test_df)

    metrics = []
    charts = []
    for output_label in output_labels:
        charts.append(compute_chart(
            test_df[output_label], df_evaluate[output_label], output_label))
        # sklearn's metric signature is (y_true, y_pred): the held-out test
        # data is ground truth, the surrogate evaluation is the prediction.
        # BUG FIX: the arguments were previously swapped — MSE is symmetric
        # so it was unaffected, but r2_score is not, so R² was computed
        # against the wrong reference.
        metrics.append({
            "mean_squared_error": round(mean_squared_error(
                test_df[output_label], df_evaluate[output_label]), 4),
            "r2_score": round(r2_score(
                test_df[output_label], df_evaluate[output_label]), 4),
        })

    return MLResult(
        surrogate_model=json_data,
        charts=charts,
        metrics=metrics,
        test_inputs=test_df.to_dict(orient='index'),
        test_outputs=df_evaluate.to_dict(orient='index'),
        task_id=schema.task_id
    )
def compute_chart(test_data_df, eval_data_df, output_label):
    """Assemble the chart payload for a single output label.

    Returns a dict carrying the shared axis bounds of the two series
    (rounded to 4 decimal places), the QQ-plot coordinates as a JSON
    string, and the output label itself.
    """
    series_mins = [np.min(test_data_df), np.min(eval_data_df)]
    series_maxs = [np.max(test_data_df), np.max(eval_data_df)]
    lower_bound = round(np.min(series_mins), 4)
    upper_bound = round(np.max(series_maxs), 4)

    return {
        "min": lower_bound,
        "max": upper_bound,
        "qq_plot_data": compute_qq_coordinates(test_data_df, eval_data_df),
        "output_label": output_label,
    }
def compute_qq_coordinates(test_data, eval_data):
    """Compute QQ plot coordinates for two datasets.

    Pairs the quantiles of *test_data* (x) with the quantiles of
    *eval_data* (y) at evenly spaced probability levels — one per test
    sample — and returns them as a JSON string of {"x", "y"} points
    rounded to 4 decimal places.
    """
    sorted_test = np.sort(test_data.to_numpy().flatten())
    sorted_eval = np.sort(eval_data.to_numpy().flatten())

    # One probability level per test sample, evenly spaced on [0, 1].
    levels = np.linspace(0, 1, len(sorted_test))
    x_coords = np.quantile(sorted_test, levels)
    y_coords = np.quantile(sorted_eval, levels)

    # Serialize for the frontend.
    points = []
    for x_val, y_val in zip(x_coords, y_coords):
        points.append({"x": round(float(x_val), 4),
                       "y": round(float(y_val), 4)})
    return json.dumps(points)