Source code for coxkan.datasets.rdatasets

import pandas as pd
from ._base import _BaseDataset
from sklearn.model_selection import train_test_split


[docs]
def download_from_rdatasets(package, name):
    """Download one dataset from the Rdatasets collection.

    The Rdatasets catalogue (``datasets.csv``) is fetched first, restricted
    to the rows whose ``Package`` column equals ``package`` and indexed by
    the ``Item`` column. The CSV URL stored in the matching row's ``CSV``
    field is then downloaded.

    Args:
        package: Value to match against the catalogue's ``Package`` column
            (e.g. ``'survival'``).
        name: Value to look up in the catalogue's ``Item`` column
            (e.g. ``'flchain'``).

    Returns:
        A tuple ``(df, info)`` where ``df`` is the downloaded table as a
        ``pandas.DataFrame`` and ``info`` is the catalogue entry selected
        for ``name``.

    Raises:
        ValueError: If ``name`` is not an item of ``package`` in the
            catalogue.
    """
    catalogue_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv"
    catalogue = pd.read_csv(catalogue_url)

    # Keep only this package's entries and make them addressable by item name.
    package_items = catalogue[catalogue['Package'] == package].set_index('Item')

    if name not in package_items.index:
        raise ValueError(f"Dataset {name} not found.")

    info = package_items.loc[name]
    table = pd.read_csv(info.CSV)
    return table, info



[docs]
class FLCHAIN(_BaseDataset):
    """
    Assay of serum free light chain (FLCHAIN).
    Obtained from Rdatasets (https://github.com/vincentarelbundock/Rdatasets).

    A study of the relationship between serum free light chain (FLC) and mortality.
    The original sample contains samples on approximately 2/3 of the residents of Olmsted
    County aged 50 or greater.

    For details see http://vincentarelbundock.github.io/Rdatasets/doc/survival/flchain.html

    Variables:
        age:
            age in years.
        sex:
            F=female, M=male.
        sample.yr:
            the calendar year in which a blood sample was obtained.
        kappa:
            serum free light chain, kappa portion.
        lambda:
            serum free light chain, lambda portion.
        flc.grp:
            the FLC group for the subject, as used in the original analysis.
        creatinine:
            serum creatinine.
        mgus:
            1 if the subject had been diagnosed with monoclonal gammopathy (MGUS).
        futime: (duration)
            days from enrollment until death. Note that there are 3 subjects whose sample
            was obtained on their death date.
        death: (event)
            0=alive at last contact date, 1=dead.
        chapter:
            for those who died, a grouping of their primary cause of death by chapter headings
            of the International Code of Diseases ICD-9.
            This column is dropped by :meth:`load`.

    """
    name = 'flchain'
    duration_col = 'futime'
    event_col = 'death'
    covariates = ['age', 'sex', 'sample.yr', 'kappa', 'lambda', 'flc.grp', 'creatinine', 'mgus']
    categorical_covariates = ['sex', 'sample.yr', 'flc.grp', 'mgus']

    def load(self, split=False):
        """Return the dataset, downloading and caching it on first use.

        If either cached feather file (``self.path_train`` or
        ``self.path_test``) is missing, the raw table is downloaded from
        Rdatasets, processed, split into train/test partitions and both
        partitions are written to disk. Otherwise the cached partitions are
        read back.

        Processing applied to the downloaded table:
            - ``self._label_cols_at_end`` is applied (presumably moves the
              duration/event columns last; see the base class).
            - the ``chapter`` and ``rownames`` columns are dropped.
            - rows with a missing ``creatinine`` value are removed.
            - ``categorical_covariates`` are cast to ``category``; every
              other column is cast to ``float32``.
            - an 80/20 train/test split with ``random_state=2024``.

        ``self.info`` (the Rdatasets catalogue entry) is only set when a
        download actually takes place.

        Args:
            split: If True, return the train and test partitions separately.

        Returns:
            ``(df_train, df_test)`` when ``split`` is True; otherwise a single
            DataFrame with the train rows followed by the test rows and a
            fresh integer index.
        """

        # Rebuild the cache unless BOTH partitions are present. Checking
        # "neither exists" would fall through to read_feather and fail on the
        # missing file when only one of the two is on disk.
        cache_complete = self.path_train.exists() and self.path_test.exists()

        if not cache_complete:
            print(f"Downloading dataset '{self.name}' ...")
            df, info = download_from_rdatasets('survival', self.name)
            self.info = info
            df = self._label_cols_at_end(df)

            df = (df
                  .drop(['chapter', 'rownames'], axis=1)
                  .loc[lambda x: x['creatinine'].notna()]
                  .reset_index(drop=True))

            for col in self.categorical_covariates:
                df[col] = df[col].astype('category')
            for col in df.columns.drop(self.categorical_covariates):
                df[col] = df[col].astype('float32')

            df_train, df_test = train_test_split(df, test_size=0.2, random_state=2024)
            # Feather needs a default index, so discard the shuffled one.
            df_train = df_train.reset_index(drop=True)
            df_test = df_test.reset_index(drop=True)
            df_train.to_feather(self.path_train)
            df_test.to_feather(self.path_test)
            print('Done.')
        else:
            df_train = pd.read_feather(self.path_train)
            df_test = pd.read_feather(self.path_test)

        if split:
            return df_train, df_test

        return pd.concat([df_train, df_test], ignore_index=True)





[docs]
class NWTCO(_BaseDataset):
    """
    Data from the 3rd and 4th clinical trials of the National Wilms' Tumor Study Group.
    Obtained from Rdatasets (https://github.com/vincentarelbundock/Rdatasets).

    Measurement error example. Tumor histology predicts survival, but prediction is stronger
    with central lab histology than with the local institution determination.

    For details see http://vincentarelbundock.github.io/Rdatasets/doc/survival/nwtco.html

    Variables:
        instit:
            histology reading from local institution:
                - 1: favorable
                - 2: unfavorable
        histol:
            histology reading from central lab:
                - 1: favorable
                - 2: unfavorable
        stage:
            disease stage:
                - 1: localized to the kidney and completely resected
                - 2: spread beyond the kidney but completely resected
                - 3: residual tumour in the abdomen or tumour in the lymph nodes
                - 4: metastatic to the lung or liver.
        study:
            clinical trial number (3 or 4)
        age:
            age in months
        in.subcohort:
            included in the subcohort for the example in the paper
        rel: (event)
            indicator for relapse
        edrel: (duration)
            time to relapse

    References
        NE Breslow and N Chatterjee (1999), Design and analysis of two-phase studies with binary
        outcome applied to Wilms tumor prognosis. Applied Statistics 48, 457–68.
    """
    name = 'nwtco'
    duration_col = 'edrel'
    event_col = 'rel'
    covariates = ['instit', 'histol', 'stage', 'study', 'age', 'in.subcohort']
    categorical_covariates = ['instit', 'histol', 'stage', 'study', 'in.subcohort']

    def load(self, split=False):
        """Return the dataset, downloading and caching it on first use.

        If either cached feather file (``self.path_train`` or
        ``self.path_test``) is missing, the raw table is downloaded from
        Rdatasets, processed, split into train/test partitions and both
        partitions are written to disk. Otherwise the cached partitions are
        read back.

        Processing applied to the downloaded table:
            - ``self._label_cols_at_end`` is applied (presumably moves the
              duration/event columns last; see the base class).
            - the ``rownames`` and ``seqno`` columns are dropped.
            - ``categorical_covariates`` are cast to ``category``; every
              other column is cast to ``float32``.
            - an 80/20 train/test split with ``random_state=2024``.

        ``self.info`` (the Rdatasets catalogue entry) is only set when a
        download actually takes place.

        Args:
            split: If True, return the train and test partitions separately.

        Returns:
            ``(df_train, df_test)`` when ``split`` is True; otherwise a single
            DataFrame with the train rows followed by the test rows and a
            fresh integer index.
        """

        # Rebuild the cache unless BOTH partitions are present. Checking
        # "neither exists" would fall through to read_feather and fail on the
        # missing file when only one of the two is on disk.
        cache_complete = self.path_train.exists() and self.path_test.exists()

        if not cache_complete:
            print(f"Downloading dataset '{self.name}' ...")
            df, info = download_from_rdatasets('survival', self.name)
            self.info = info
            df = self._label_cols_at_end(df)

            df = df.drop(['rownames', 'seqno'], axis=1)

            for col in self.categorical_covariates:
                df[col] = df[col].astype('category')
            for col in df.columns.drop(self.categorical_covariates):
                df[col] = df[col].astype('float32')

            df_train, df_test = train_test_split(df, test_size=0.2, random_state=2024)
            # Feather needs a default index, so discard the shuffled one.
            df_train = df_train.reset_index(drop=True)
            df_test = df_test.reset_index(drop=True)
            df_train.to_feather(self.path_train)
            df_test.to_feather(self.path_test)
            print('Done.')
        else:
            df_train = pd.read_feather(self.path_train)
            df_test = pd.read_feather(self.path_test)

        if split:
            return df_train, df_test

        return pd.concat([df_train, df_test], ignore_index=True)