Source code for domainlab.utils.generate_benchmark_plots

"""
generate the benchmark plots by calling the gen_benchmark_plots(...) function
"""
import os
from ast import literal_eval  # literal_eval can safe evaluate python expression

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from domainlab.utils.logger import Logger

matplotlib.use("Agg")

# header of the csv file:
# 0,              1,       2,      3,    4,    5,    6,    7,      8,   9,        10,     11,          12, 13,    14,         15,      16,                    17,                  18
# param_index, method, mname, commit, algo, epos, te_d, seed, params, acc, precision, recall, specificity, f1, auroc, acc_oracle, acc_val, model_selection_epoch, experiment_duration


# names of the csv columns which the plotting functions address by name
COLNAME_METHOD = "method"
COLNAME_IDX_PARAM = "param_index"
COLNAME_PARAM = "params"
G_DF_TASK_COL = 1  # column in which the method name is saved
# the metric columns are selected by position as columns[START:END]
G_DF_PLOT_COL_METRIC_START = 9  # columns 0-8 are not metrics
G_DF_PLOT_COL_METRIC_END = 17  # exclusive end of the metric slice, clamped to the number of columns where it is used


def gen_benchmark_plots(
    agg_results: str, output_dir: str, use_param_index: bool = True
):
    """
    read the aggregated results from a csv file and generate all benchmark
    plots from them.

    The csv file needs the columns param_index, method and params; the
    columns selected as objectives are the ones between
    G_DF_PLOT_COL_METRIC_START and G_DF_PLOT_COL_METRIC_END
    (e.g. acc, precision, recall, specificity, f1, auroc).

    agg_results: path to the csv file
    output_dir: path to a folder which shall contain the results
    use_param_index: if True the hyperparameter setups are labelled by their
        parameter index, otherwise by the rounded hyperparameter values
    """
    read_options = {
        "index_col": False,
        # the params column holds a python literal, literal_eval parses it
        # without executing arbitrary code
        "converters": {COLNAME_PARAM: literal_eval},
        "skipinitialspace": True,
    }
    results = pd.read_csv(agg_results, **read_options)

    # replace the hyperparameter dictionaries by short, printable labels
    param_columns = results[[COLNAME_IDX_PARAM, COLNAME_PARAM]]
    results[COLNAME_PARAM] = round_vals_in_dict(param_columns, use_param_index)

    gen_plots(results, output_dir, use_param_index)
def round_vals_in_dict(df_column_in, use_param_index):
    """
    replaces the dictionary by a string containing only the significant digits
    of the hyperparams or (if use_param_index = True) by the parameter index

    df_column_in: columns of the dataframe containing the param index and the
        dictionary of hyperparams in the form [param_index, params]
    use_param_index: usage of param_index instead of exact values

    returns a series of strings which shares the index of df_column_in:
    "idx: <param_index>" if use_param_index is True, otherwise
    "<key>: <value>, ..." with every value in scientific notation with one
    digit of precision.
    """
    # Rows are walked by position, so the function also works for dataframes
    # whose index is not 0..n-1 (e.g. a filtered dataframe).
    if use_param_index:
        labels = ["idx: " + str(idx) for idx in df_column_in[COLNAME_IDX_PARAM]]
    else:
        labels = [
            ", ".join(
                str(key)
                + ": "
                + np.format_float_scientific(
                    val, precision=1, unique=False, trim="0"
                )
                for key, val in params.items()
            )
            for params in df_column_in[COLNAME_PARAM]
        ]
    return pd.Series(
        labels, index=df_column_in.index, name=COLNAME_IDX_PARAM, dtype=object
    )
def _plot_and_close(plot_func, *args, **kwargs):
    """
    call plot_func(*args, **kwargs) and afterwards close every figure which
    was opened during the call, figures which were open before stay untouched
    """
    open_before = set(plt.get_fignums())
    try:
        plot_func(*args, **kwargs)
    finally:
        for fig_num in set(plt.get_fignums()) - open_before:
            plt.close(fig_num)


def _gen_plot_set(dataframe, obj, out_dir, use_param_index, distinguish_hyperparam):
    """
    write boxplots, scatterplot matrices, radar plots and pairwise scatter
    plots of the objectives obj for the rows of dataframe into out_dir

    distinguish_hyperparam: forwarded to scatterplot(...) for the pairwise
        scatter plots
    """
    # boxplots
    for objective in obj:
        _plot_and_close(
            boxplot,
            dataframe,
            objective,
            file=f"{out_dir}/variational_plots/{objective}",
        )

    # scatterplot matrices
    for file_name, kind, distinguish_param_setups in (
        ("sp_matrix_reg.png", "reg", False),
        ("sp_matrix.png", "scatter", False),
        ("sp_matrix_dist_reg.png", "reg", True),
        ("sp_matrix_dist.png", "scatter", True),
    ):
        _plot_and_close(
            scatterplot_matrix,
            dataframe,
            use_param_index,
            file=f"{out_dir}/{file_name}",
            kind=kind,
            distinguish_param_setups=distinguish_param_setups,
        )

    # radar plots
    _plot_and_close(
        radar_plot,
        dataframe,
        file=f"{out_dir}/radar_dist.png",
        distinguish_hyperparam=True,
    )
    _plot_and_close(
        radar_plot,
        dataframe,
        file=f"{out_dir}/radar.png",
        distinguish_hyperparam=False,
    )

    # scatter plots for pairs of objectives
    scatter_dir = f"{out_dir}/scatterpl"
    os.makedirs(scatter_dir, exist_ok=True)
    for i, obj_i in enumerate(obj):
        for obj_j in obj[i + 1 :]:
            target = f"{scatter_dir}/{obj_i}_{obj_j}.png"
            try:
                _plot_and_close(
                    scatterplot,
                    dataframe,
                    [obj_i, obj_j],
                    file=target,
                    distinguish_hyperparam=distinguish_hyperparam,
                )
            except IndexError:
                # retry the same plot without the kde estimate
                logger = Logger.get_logger()
                logger.warning(
                    f"disabling kde because cov matrix is singular for objectives "
                    f"{obj_i} & {obj_j}"
                )
                _plot_and_close(
                    scatterplot,
                    dataframe,
                    [obj_i, obj_j],
                    file=target,
                    kde=False,
                    distinguish_hyperparam=distinguish_hyperparam,
                )


def gen_plots(dataframe: pd.DataFrame, output_dir: str, use_param_index: bool):
    """
    generate all plots, once for the whole dataframe (saved directly in
    output_dir) and once per algorithm (saved in output_dir/<algorithm>).
    Every figure is closed again after it has been saved.

    dataframe: dataframe containing the column 'method' and the objectives in
        the columns G_DF_PLOT_COL_METRIC_START to G_DF_PLOT_COL_METRIC_END
    output_dir: path to a folder which shall contain the results
    use_param_index: forwarded to scatterplot_matrix(...)
    """
    os.makedirs(output_dir, exist_ok=True)
    pos_numeric_end = min(G_DF_PLOT_COL_METRIC_END, dataframe.shape[1])
    obj = dataframe.columns[G_DF_PLOT_COL_METRIC_START:pos_numeric_end]

    # plots over all algorithms
    _gen_plot_set(
        dataframe, obj, output_dir, use_param_index, distinguish_hyperparam=False
    )

    # create plots for the different algorithms
    for algorithm in dataframe[COLNAME_METHOD].unique():
        algo_dir = f"{output_dir}/{algorithm}"
        os.makedirs(algo_dir, exist_ok=True)
        dataframe_algo = dataframe[dataframe[COLNAME_METHOD] == algorithm]
        _gen_plot_set(
            dataframe_algo, obj, algo_dir, use_param_index, distinguish_hyperparam=True
        )
def scatterplot_matrix(
    dataframe_in, use_param_index, file=None, kind="reg", distinguish_param_setups=True
):
    """
    plot a corner pairplot of all objectives

    dataframe_in: dataframe containing the columns 'method' and 'params' and
        the objectives in the columns G_DF_PLOT_COL_METRIC_START to
        G_DF_PLOT_COL_METRIC_END
    use_param_index: if True (and distinguish_param_setups is True) the legend
        is arranged in 3 columns, otherwise in 1 column
    file: filename to save the plots (if None, the plot will not be saved)
    kind: passed to seaborn.pairplot, gen_plots uses "reg" and "scatter"
    distinguish_param_setups: if True the plot will not only distinguish
        between models, but also between the parameter setups
    """
    dataframe = dataframe_in.copy()
    pos_numeric_end = min(G_DF_PLOT_COL_METRIC_END, dataframe.shape[1])
    index = list(range(G_DF_PLOT_COL_METRIC_START, pos_numeric_end))
    if distinguish_param_setups:
        dataframe_ = dataframe.iloc[:, index]
        dataframe_.insert(
            0,
            "label",
            dataframe[COLNAME_METHOD].astype(str)
            + ", "
            + dataframe[COLNAME_PARAM].astype(str),
        )
        g_p = sns.pairplot(data=dataframe_, hue="label", corner=True, kind=kind)
    else:
        # the method column is selected by position and used as hue by name
        dataframe_ = dataframe.iloc[:, [G_DF_TASK_COL] + index]
        g_p = sns.pairplot(data=dataframe_, hue=COLNAME_METHOD, corner=True, kind=kind)

    # corner=True only draws the lower triangle: fix the x-range of every
    # drawn axis and the y-range of every axis below the diagonal
    for row in range(len(index)):
        for col in range(row + 1):
            g_p.axes[row, col].set_xlim((-0.1, 1.1))
            if col < row:
                g_p.axes[row, col].set_ylim((-0.1, 1.1))

    g_p.fig.set_size_inches(12.5, 12)
    legend_cols = 3 if use_param_index and distinguish_param_setups else 1
    sns.move_legend(
        g_p, loc="upper right", bbox_to_anchor=(1.0, 1.0), ncol=legend_cols
    )
    plt.tight_layout()

    if file is not None:
        plt.savefig(file, dpi=300)
def scatterplot(dataframe_in, obj, file=None, kde=True, distinguish_hyperparam=False):
    """
    plot two objectives against each other as a seaborn jointplot

    dataframe_in: dataframe containing the columns 'method', 'params' and the
        two objectives
    obj: pair [obj1, obj2] with the names of the objectives which shall be
        plotted against each other (obj1 on the x-axis, obj2 on the y-axis)
    file: filename to save the plots (if None, the plot will not be saved)
    kde: if True the jointplot is drawn as kde plot and the points are drawn
        on top of it
    distinguish_hyperparam: if True the colors distinguish the parameter
        setups, otherwise they distinguish the methods
    """
    x_name, y_name = obj
    frame = dataframe_in.copy()
    frame[COLNAME_PARAM] = frame[COLNAME_PARAM].astype(str)

    # arguments shared by all four variants of the plot
    points = {"data": frame, "x": x_name, "y": y_name}
    joint_options = {
        "hue": COLNAME_PARAM if distinguish_hyperparam else COLNAME_METHOD,
        "xlim": (-0.1, 1.1),
        "ylim": (-0.1, 1.1),
    }
    if kde:
        joint_options.update(
            kind="kde", zorder=0, levels=8, alpha=0.35, warn_singular=False
        )
    g_p = sns.jointplot(**points, **joint_options)

    # overlay the individual points on the joint axis
    if distinguish_hyperparam and not kde:
        # the plain jointplot already shows the points
        joint_axis = g_p.ax_joint
    elif distinguish_hyperparam:
        joint_axis = sns.scatterplot(**points, hue=COLNAME_PARAM, ax=g_p.ax_joint)
    elif kde:
        joint_axis = sns.scatterplot(
            **points, hue=COLNAME_METHOD, style=COLNAME_PARAM, ax=g_p.ax_joint
        )
    else:
        joint_axis = sns.scatterplot(**points, style=COLNAME_PARAM, ax=g_p.ax_joint)

    joint_axis.set_aspect("equal")
    joint_axis.legend(fontsize=6, loc="best")

    if file is not None:
        plt.savefig(file, dpi=300)
def max_0_x(x_arg):
    """
    clip x_arg from below at 0: returns x_arg if it is greater than 0, else 0
    """
    if x_arg > 0:
        return x_arg
    return 0
def radar_plot(dataframe_in, file=None, distinguish_hyperparam=True):
    """
    plot mean (line) and mean +- standard deviation (shaded area, clipped
    below at 0) of every objective in a radar plot, one line per label

    dataframe_in: dataframe containing the columns 'method' and 'params' and
        the objectives in the columns G_DF_PLOT_COL_METRIC_START to
        G_DF_PLOT_COL_METRIC_END
    file: filename to save the plots (if None, the plot will not be saved)
    distinguish_hyperparam: if True the plot will not only distinguish
        between models, but also between the parameter setups
    """
    if distinguish_hyperparam:
        labels = (
            dataframe_in[COLNAME_METHOD].astype(str)
            + ", "
            + dataframe_in[COLNAME_PARAM].astype(str)
        )
    else:
        labels = dataframe_in[COLNAME_METHOD]

    # The objectives are taken from the unmodified dataframe, so the same
    # columns[START:END] slice as in the other plots is used and the last
    # objective is not cut off by a shifted column position.
    pos_numeric_end = min(G_DF_PLOT_COL_METRIC_END, dataframe_in.shape[1])
    metrics = dataframe_in.iloc[:, G_DF_PLOT_COL_METRIC_START:pos_numeric_end]

    label_names = labels.unique()
    _, axis = plt.subplots(
        figsize=(9, 9 + (0.28 * len(label_names))), subplot_kw=dict(polar=True)
    )

    # Split the circle into even parts and save the angles
    # so we know where to put each axis.
    angles = list(np.linspace(0, 2 * np.pi, metrics.shape[1], endpoint=False))
    # The plot is a circle, so we need to "complete the loop"
    # and append the start value to the end.
    closed_angles = np.array(angles + angles[:1])
    colors = [style["color"] for style in plt.rcParams["axes.prop_cycle"]]

    for num, algo_name in enumerate(label_names):
        rows = metrics[(labels == algo_name).to_numpy()]
        mean = rows.mean().to_list()
        std = rows.std().to_list()
        mean = np.array(mean + mean[:1])
        std = np.array(std + std[:1])
        # start again with the first color if there are more labels than colors
        color = colors[num % len(colors)]

        # Draw the outline of the data.
        axis.plot(closed_angles, mean, color=color, linewidth=2, label=algo_name)
        # Fill it in.
        axis.fill_between(
            closed_angles,
            list(map(max_0_x, mean - std)),
            y2=mean + std,
            color=color,
            alpha=0.1,
        )

    # Fix axis to go in the right order and start at 12 o'clock.
    axis.set_theta_offset(np.pi / 2)
    axis.set_theta_direction(-1)

    # Draw axis lines for each angle and label.
    axis.set_thetagrids(np.degrees(angles), metrics.columns)

    axis.set_ylim((0, 1))

    plt.legend(loc="lower right", bbox_to_anchor=(1.0, 1.035), ncol=1, fontsize=10)

    if file is not None:
        plt.savefig(file, dpi=300)
def boxplot(dataframe_in, obj, file=None):
    """
    generate the boxplots for the stochastic and for the systematic variation
    by calling boxplot_stochastic(...) and boxplot_systematic(...)

    dataframe_in: dataframe containing the data
    obj: objective to be considered in the plot (needs to be contained in dataframe_in)
    file: foldername for the plots, passed on unchanged to both functions
    """
    for plot_variation in (boxplot_stochastic, boxplot_systematic):
        plot_variation(dataframe_in, obj, file=file)
def boxplot_stochastic(dataframe_in, obj, file=None):
    """
    generate boxplot for stochastic variation: one subplot per algorithm with
    one box per parameter index

    dataframe_in: dataframe containing the columns 'method', 'param_index'
        and the objective obj
    obj: objective to be considered in the plot (needs to be contained in dataframe_in)
    file: foldername to save the plots (if None, the plot will not be saved),
        the plot is saved as stochastic_variation.png/.pdf/.svg
    """
    dataframe = dataframe_in.copy()
    # only create the folder if there is one, os.makedirs(None) would raise
    if file is not None:
        os.makedirs(file, exist_ok=True)

    ### stochastic variation
    methods = list(dataframe[COLNAME_METHOD].unique())
    _, axes = plt.subplots(
        1,
        len(methods),
        sharey=True,
        figsize=(3 * len(methods), 6),
    )
    # iterate over all algorithms
    for num, algo in enumerate(methods):
        data_algo = dataframe[dataframe[COLNAME_METHOD] == algo]
        # plt.subplots returns an array of axes for several algorithms and a
        # single axis for one algorithm, the two cases are also styled
        # differently
        if len(methods) > 1:
            # generate boxplot and swarmplot
            sns.boxplot(
                data=data_algo,
                x=COLNAME_IDX_PARAM,
                y=obj,
                ax=axes[num],
                showfliers=False,
                boxprops={"facecolor": "none", "edgecolor": "black"},
            )
            sns.swarmplot(
                data=data_algo,
                x=COLNAME_IDX_PARAM,
                y=obj,
                legend=False,
                ax=axes[num],
            )
            # remove legend, set ylim, set x-label and remove y-label
            axes[num].legend([], [], frameon=False)
            axes[num].set_ylim([-0.01, 1.01])
            axes[num].set_xlabel(algo, fontsize=20)
            if num != 0:
                axes[num].set_ylabel("")
        else:
            sns.boxplot(
                data=data_algo,
                x=COLNAME_IDX_PARAM,
                y=obj,
                ax=axes,
                showfliers=False,
                boxprops={"facecolor": (0.4, 0.6, 0.8, 0.5)},
            )
            sns.swarmplot(
                data=data_algo,
                x=COLNAME_IDX_PARAM,
                y=obj,
                hue=COLNAME_IDX_PARAM,
                legend=False,
                ax=axes,
                # one color per parameter index
                palette=sns.cubehelix_palette(
                    n_colors=len(data_algo[COLNAME_IDX_PARAM].unique())
                ),
            )
            axes.legend([], [], frameon=False)
            axes.set_ylim([-0.01, 1.01])
            axes.set_xlabel(algo)
    plt.tight_layout()

    if file is not None:
        plt.savefig(file + "/stochastic_variation.png", dpi=600)
        plt.savefig(file + "/stochastic_variation.pdf", format="pdf")
        plt.savefig(file + "/stochastic_variation.svg", format="svg")
def boxplot_systematic(dataframe_in, obj, file=None):
    """
    generate boxplot for systematic variation: one subplot per algorithm with
    a single box over all parameter setups, the points are colored by their
    parameter index

    dataframe_in: dataframe containing the columns 'method', 'param_index'
        and the objective obj
    obj: objective to be considered in the plot (needs to be contained in dataframe_in)
    file: foldername to save the plots (if None, the plot will not be saved),
        the plot is saved as systematic_variation.png
    """
    dataframe = dataframe_in.copy()
    # only create the folder if there is one, os.makedirs(None) would raise
    if file is not None:
        os.makedirs(file, exist_ok=True)

    ### systematic variation
    methods = list(dataframe[COLNAME_METHOD].unique())
    # squeeze=False always returns a 2d array of axes, so one algorithm and
    # several algorithms are handled by the same code
    _, axes_grid = plt.subplots(
        1,
        len(methods),
        sharey=True,
        figsize=(3 * len(methods), 6),
        squeeze=False,
    )
    axes = axes_grid[0]

    for num, algo in enumerate(methods):
        data_algo = dataframe[dataframe[COLNAME_METHOD] == algo]
        # generate boxplot and swarmplot
        sns.boxplot(
            data=data_algo,
            x=COLNAME_METHOD,
            y=obj,
            ax=axes[num],
            showfliers=False,
            boxprops={"facecolor": (0.4, 0.6, 0.8, 0.5)},
        )
        sns.swarmplot(
            data=data_algo,
            x=COLNAME_METHOD,
            y=obj,
            hue=COLNAME_IDX_PARAM,
            legend=False,
            ax=axes[num],
            # one color per parameter index
            palette=sns.cubehelix_palette(
                n_colors=len(data_algo[COLNAME_IDX_PARAM].unique())
            ),
        )
        # remove legend, set ylim, set x-label and remove y-label
        axes[num].legend([], [], frameon=False)
        axes[num].set_ylim([-0.1, 1.1])
        axes[num].set_xlabel(" ")
        if num != 0:
            axes[num].set_ylabel("")
    plt.tight_layout()

    if file is not None:
        plt.savefig(file + "/systematic_variation.png", dpi=300)