Coverage for backend/idaes_service/solver/ml_wizard.py: 100%

54 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-11-06 23:27 +0000

1from pydantic import BaseModel 

2from common.models.idaes.payloads.ml_request_schema import MLTrainRequestPayload, MLTrainingCompletionPayload 

3import pandas as pd 

4import json 

5import numpy as np 

6from io import StringIO 

7import contextlib 

8from sklearn.model_selection import train_test_split 

9from sklearn.metrics import mean_squared_error, r2_score 

10from idaes.core.surrogate.pysmo_surrogate import PysmoRBFTrainer, PysmoSurrogate 

11 

12 

class MLResult(BaseModel):
    """Result payload produced by ml_generate for one training task."""
    # JSON-decoded serialized surrogate (output of PysmoSurrogate.save).
    surrogate_model: dict
    # One chart payload per output label (min/max bounds + QQ plot data).
    charts: list[dict]
    # One metrics dict per output label: mean_squared_error and r2_score.
    metrics: list[dict]
    # Hold-out rows as produced by DataFrame.to_dict(orient='index').
    test_inputs: dict
    # Surrogate predictions on the hold-out rows, same orient='index' shape.
    test_outputs: dict
    # Identifier copied through from the request payload.
    task_id: int

20 

21 

def ml_generate(schema: MLTrainRequestPayload) -> MLResult:
    """Train a PySMO RBF surrogate on the request data and evaluate it on a hold-out set.

    Args:
        schema: Request payload carrying the datapoints, column names,
            input/output labels, and the task id.

    Returns:
        MLResult with the JSON-serialized surrogate, per-output chart data,
        per-output error metrics, and the hold-out inputs/predictions.
    """
    df = pd.DataFrame(schema.datapoints, columns=schema.columns)
    # Fixed seed keeps the 80/20 split reproducible across calls.
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    input_labels = schema.input_labels
    output_labels = schema.output_labels

    trainer = PysmoRBFTrainer(
        input_labels=input_labels, output_labels=output_labels, training_dataframe=train_df)
    trainer.config.basis_function = 'gaussian'

    # Train surrogate (calls PySMO through IDAES Python wrapper).
    # PySMO prints progress to stdout; capture and discard it so the
    # service logs stay clean.
    stream = StringIO()
    with contextlib.redirect_stdout(stream):
        rbf_train = trainer.train_surrogate()

    # Create callable surrogate model and round-trip it through its JSON
    # serialization so the result is plain dict data.
    rbf_surr = PysmoSurrogate(rbf_train, input_labels,
                              output_labels, input_bounds=None)
    f = StringIO()
    rbf_surr.save(f)
    json_data = json.loads(f.getvalue())

    df_evaluate = rbf_surr.evaluate_surrogate(test_df)

    metrics = []
    charts = []

    for output_label in output_labels:
        charts.append(compute_chart(
            test_df[output_label], df_evaluate[output_label], output_label))
        # BUG FIX: sklearn metrics take (y_true, y_pred). The original code
        # passed the predictions as y_true; mean_squared_error is symmetric,
        # but r2_score is not, so R^2 was normalized by the variance of the
        # predictions instead of the variance of the actual test values.
        metrics.append({
            "mean_squared_error": round(mean_squared_error(test_df[output_label], df_evaluate[output_label]), 4),
            "r2_score": round(r2_score(test_df[output_label], df_evaluate[output_label]), 4),
        })

    return MLResult(
        surrogate_model=json_data,
        charts=charts,
        metrics=metrics,
        test_inputs=test_df.to_dict(orient='index'),
        test_outputs=df_evaluate.to_dict(orient='index'),
        task_id=schema.task_id
    )

69 

70 

def compute_chart(test_data_df, eval_data_df, output_label):
    """Assemble the chart payload for one output label.

    Combines the overall value range of the actual and predicted series
    (rounded to 4 decimal places) with the QQ plot coordinates.
    """
    lower_bound = min(np.min(test_data_df), np.min(eval_data_df))
    upper_bound = max(np.max(test_data_df), np.max(eval_data_df))

    return {
        "min": round(lower_bound, 4),
        "max": round(upper_bound, 4),
        "qq_plot_data": compute_qq_coordinates(test_data_df, eval_data_df),
        "output_label": output_label,
    }

82 

83 

def compute_qq_coordinates(test_data, eval_data):
    """Compute QQ plot coordinates for two datasets.

    Pairs equal-probability quantiles of *test_data* (x axis) with those of
    *eval_data* (y axis) and returns them as a JSON array of {"x", "y"}
    points rounded to 4 decimal places.
    """
    actual = np.sort(test_data.to_numpy().flatten())
    predicted = np.sort(eval_data.to_numpy().flatten())

    # One probability per actual observation, evenly spaced over [0, 1].
    probs = np.linspace(0, 1, len(actual))
    x_coords = np.quantile(actual, probs)
    y_coords = np.quantile(predicted, probs)

    # Serialize for the frontend as plain floats.
    points = [
        {"x": round(float(x), 4), "y": round(float(y), 4)}
        for x, y in zip(x_coords, y_coords)
    ]
    return json.dumps(points)