Source code for now_2023.data_generation._toy_dataset

def _dynamic_padding(number: int, width: int) -> str:
    return "sub-{number:0{width}d}".format(width=width, number=number)



[docs]
def generate_toy_dataset(n_samples: int, output_filename: str) -> None:
    """Create a small two-class dataset and save it as a TSV file.

    The samples are drawn with scikit-learn's ``make_classification``
    using two informative features and a fixed ``random_state`` of 17.
    The features are stored in the columns ``HC_left_volume`` and
    ``HC_right_volume``; the label is stored in the ``group`` column as
    ``"AD"`` (label 1) or ``"CN"`` (any other label). Rows are indexed by
    a ``subject_id`` of the form ``sub-XXX``, zero-padded to the number of
    digits of ``n_samples``.

    Parameters
    ----------
    n_samples : int
        The number of samples that the generated dataset should have.

    output_filename : str
        The path to the file in which the dataset should be written
        (tab-separated values).
    """
    import pandas as pd
    from sklearn.datasets import make_classification

    features, labels = make_classification(
        n_samples=n_samples,
        n_features=2,
        n_informative=2,
        n_redundant=0,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=1,
        class_sep=2,
        flip_y=0.2,
        weights=[0.5, 0.5],
        random_state=17,
    )

    table = pd.DataFrame(features, columns=["HC_left_volume", "HC_right_volume"])
    # Translate the numeric class labels into their group names.
    table["group"] = ["AD" if label == 1 else "CN" for label in labels]

    # Pad every identifier to the width of the largest subject number.
    pad_width = len(str(n_samples))
    table["subject_id"] = [
        _dynamic_padding(subject_number, pad_width)
        for subject_number in range(1, len(table) + 1)
    ]

    indexed = table.set_index("subject_id")
    indexed.to_csv(output_filename, sep="\t")