Source code for now_2023.data_generation._toy_dataset

def _dynamic_padding(number: int, width: int) -> str:
    return "sub-{number:0{width}d}".format(width=width, number=number)


def generate_toy_dataset(n_samples: int, output_filename: str) -> None:
    """Generate a toy dataset for classification.

    A two-feature, two-class dataset is drawn with scikit-learn's
    ``make_classification`` using a fixed ``random_state``, so repeated
    calls with the same ``n_samples`` use the same generator settings.
    The features are stored as ``HC_left_volume`` and ``HC_right_volume``,
    the class label is stored in ``group`` as ``"AD"`` (label 1) or
    ``"CN"`` (any other label), and rows are indexed by a zero-padded
    ``subject_id`` such as ``sub-001``. The table is written to disk as a
    tab-separated file.

    Parameters
    ----------
    n_samples : int
        The number of samples that the generated dataset should have.
    output_filename : str
        The path to the file in which the dataset should be written.
    """
    # Heavy third-party imports stay local to the function so that merely
    # importing this module does not require pandas / scikit-learn.
    import pandas as pd
    from sklearn.datasets import make_classification

    X, y = make_classification(
        n_samples=n_samples,
        n_features=2,
        n_informative=2,
        n_redundant=0,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=1,
        class_sep=2,
        flip_y=0.2,
        weights=[0.5, 0.5],
        random_state=17,
    )

    df = pd.DataFrame(X, columns=["HC_left_volume", "HC_right_volume"])
    df["group"] = ["AD" if label == 1 else "CN" for label in y]

    # Pad every identifier to the width of the largest one so that the
    # subject ids sort lexicographically in numeric order.
    n_digits = len(str(n_samples))
    df["subject_id"] = [
        _dynamic_padding(position, n_digits)
        for position in range(1, len(df) + 1)
    ]

    df = df.set_index("subject_id")
    df.to_csv(output_filename, sep="\t")