Source code for now_2023.utils._statistics

import pandas as pd


def _mean_std_label(mean: float, std: float) -> str:
    """Format a mean and a standard deviation as ``"<mean> ± <std>"`` (one decimal each)."""
    return f"{mean:.1f} ± {std:.1f}"


def characteristics_table(df: pd.DataFrame, merged_df: pd.DataFrame) -> pd.DataFrame:
    """Create a DataFrame that summarizes the characteristics of ``df`` per diagnosis.

    Parameters
    ----------
    df : pd.DataFrame
        The input dataframe on which to compute statistics. It must contain
        the columns ``participant_id``, ``session_id`` and ``diagnosis``.
        Rows whose diagnosis is missing (NaN/None) are left out of the table.
    merged_df : pd.DataFrame
        The dataframe holding the per-session values that are summarized. It
        must contain the columns ``participant_id``, ``session_id``,
        ``age_bl``, ``sex``, ``education_level``, ``MMS`` and ``cdr_global``.
        Every (participant_id, session_id) pair of ``df`` is expected to be
        present in it.

    Returns
    -------
    population_df : pd.DataFrame
        The dataframe representing the characteristic table: one row per
        distinct diagnosis (sorted), with the columns ``N``, ``age``,
        ``%sexF``, ``education``, ``MMS``, ``CDR=0``, ``CDR=0.5``, ``CDR=1``
        and ``CDR=2``. ``age``, ``education`` and ``MMS`` are strings of the
        form ``"mean ± std"``; ``%sexF`` is a percentage rounded to one
        decimal; ``N`` and the ``CDR=*`` columns are counts.
    """
    import numpy as np

    # Missing diagnoses are dropped up front: they cannot be sorted together
    # with string labels by np.unique, and an NaN label never matches any row,
    # which would otherwise lead to a division by zero for "%sexF".
    diagnoses = np.unique(df.diagnosis.dropna().values)

    # Column name -> global CDR score counted in that column.
    cdr_levels = {"CDR=0": 0.0, "CDR=0.5": 0.5, "CDR=1": 1.0, "CDR=2": 2.0}

    population_df = pd.DataFrame(
        index=diagnoses,
        columns=["N", "age", "%sexF", "education", "MMS", *cdr_levels],
    )

    # Index both frames by (participant, session) so that the rows of
    # merged_df can be restricted to, and aligned with, the sessions of df.
    # set_index returns new frames: the caller's dataframes are not modified.
    session_keys = ["participant_id", "session_id"]
    merged_df = merged_df.set_index(session_keys, drop=True)
    df = df.set_index(session_keys, drop=True)
    sub_merged_df = merged_df.loc[df.index]

    for diagnosis in population_df.index.values:
        diagnosis_df = sub_merged_df[df.diagnosis == diagnosis]
        n_subjects = len(diagnosis_df)
        population_df.loc[diagnosis, "N"] = n_subjects

        # Age
        population_df.loc[diagnosis, "age"] = _mean_std_label(
            np.mean(diagnosis_df.age_bl), np.std(diagnosis_df.age_bl)
        )

        # Sex: percentage of rows whose sex is "F"
        n_female = int((diagnosis_df.sex == "F").sum())
        population_df.loc[diagnosis, "%sexF"] = round(
            (n_female / n_subjects) * 100, 1
        )

        # Education level (NaN values are explicitly ignored)
        population_df.loc[diagnosis, "education"] = _mean_std_label(
            np.nanmean(diagnosis_df.education_level),
            np.nanstd(diagnosis_df.education_level),
        )

        # MMS
        population_df.loc[diagnosis, "MMS"] = _mean_std_label(
            np.mean(diagnosis_df.MMS), np.std(diagnosis_df.MMS)
        )

        # CDR: number of rows at each global CDR score
        for column, level in cdr_levels.items():
            population_df.loc[diagnosis, column] = int(
                (diagnosis_df.cdr_global == level).sum()
            )

    return population_df