Source code for syndi.task

import json
import os
import pickle
import shutil

import pandas as pd
from pycaret import classification, regression

ORIGINAL_STEPS = 3
SAMPLING_METHODS = ["all", "original", "uniform", "baseline"]


def get_sample_method_ids(sample_method):
    """Yield the sampling method ids to benchmark for the given method."""
    if sample_method != "baseline":
        yield "baseline"
    yield from _get_sample_method_ids_no_baseline(sample_method)


def _get_sample_method_ids_no_baseline(sample_method):
    if sample_method == "all":
        yield from _get_sample_method_ids_no_baseline("uniform")
        yield from _get_sample_method_ids_no_baseline("original")
    elif sample_method == "original":
        for i in range(ORIGINAL_STEPS):
            yield "original_{}".format(i)
    elif sample_method == "uniform":
        yield "uniform"
    elif sample_method == "baseline":
        yield "baseline"
    else:
        raise ValueError(
            "invalid task sampling_method: {}".format(sample_method))
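

# A quick sketch of what the generators above yield (not part of the original
# module); with ORIGINAL_STEPS = 3, "original" expands into three step ids and
# every non-baseline method is preceded by the "baseline" id:
#
#     >>> list(get_sample_method_ids("baseline"))
#     ['baseline']
#     >>> list(get_sample_method_ids("uniform"))
#     ['baseline', 'uniform']
#     >>> list(get_sample_method_ids("all"))
#     ['baseline', 'uniform', 'original_0', 'original_1', 'original_2']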


class Task:
    """A class that stores the configuration of a prediction task."""

    def __init__(self, task_id=None, train_dataset=None, test_dataset=None,
                 target=None, path_to_generator=None, sampling_method_id=None,
                 pycaret_model=None, run_num=None, output_dir=None,
                 is_regression=False, regression_bins=5):
        """Create a task configuration object from a list of settings.

        Args:
            task_id (str): an identifier for the task.
            train_dataset (str): the path where the train dataset csv is stored.
            test_dataset (str): the path where the test dataset csv is stored.
            target (str): the name of the target column in train_dataset and
                test_dataset.
            path_to_generator (str): the path where the generator is stored.
            sampling_method_id (str): the sampling method id: "uniform",
                "original_<step>", or "baseline".
            pycaret_model (str): the pycaret model ID; this model will be
                trained and tested.
            run_num (int): the index of this run of the model and synthetic
                data generator.
            output_dir (str): the directory where the task configuration is
                stored, or None to keep it in memory only.
            is_regression (bool): perform regression to predict the target
                (default is classification).
            regression_bins (int): the number of bins for the regression
                target when is_regression is True.
        """
        self._task_id = task_id
        self._train_dataset = train_dataset
        self._test_dataset = test_dataset
        self._target = target
        self._path_to_generator = path_to_generator
        self._sampling_method_id = sampling_method_id
        self._pycaret_model = pycaret_model
        self._run_num = run_num
        self._output_dir = output_dir
        self._is_regression = is_regression
        self._regression_bins = regression_bins

    def __str__(self):
        description_str = ""
        ordered_items = sorted(self.__dict__.items(), key=lambda x: x[0])
        for k, v in ordered_items:
            description_str += "{:<20} {}\n".format(k, v)
        return description_str

    def __repr__(self):
        return self.__str__()

    def save_as(self, file_path):
        """Save the task configuration to the given path.

        Args:
            file_path (str): the path to store the configuration,
                ending in ".pkl" or ".json".
        """
        _, file_type = os.path.splitext(file_path)
        if file_type == '.pkl':
            with open(file_path, 'wb') as f:
                pickle.dump(self, f)
        elif file_type == '.json':
            with open(file_path, 'w') as f:
                json.dump(self.__dict__, f)
        else:
            raise ValueError('file_path should end in ".pkl" or ".json"')

    @staticmethod
    def load(file_path):
        """Load a task configuration from a JSON file written by save_as."""
        with open(file_path, 'r') as f:
            attr_dict = json.load(f)
        # save_as stores the attributes with a leading underscore; strip it so
        # the values can be passed back to the constructor as keyword arguments.
        attr_dict = {key.lstrip('_'): value for key, value in attr_dict.items()}
        return Task(**attr_dict)

    @property
    def task_id(self):
        return self._task_id

    @property
    def train_dataset(self):
        return self._train_dataset

    @train_dataset.setter
    def train_dataset(self, train_dataset):
        self._train_dataset = train_dataset

    @property
    def test_dataset(self):
        return self._test_dataset

    @test_dataset.setter
    def test_dataset(self, test_dataset):
        self._test_dataset = test_dataset

    @property
    def target(self):
        return self._target

    @property
    def path_to_generator(self):
        return self._path_to_generator

    @path_to_generator.setter
    def path_to_generator(self, path_to_generator):
        self._path_to_generator = path_to_generator

    @property
    def sampling_method_id(self):
        return self._sampling_method_id

    @property
    def pycaret_model(self):
        return self._pycaret_model

    @property
    def run_num(self):
        return self._run_num

    @property
    def output_dir(self):
        return self._output_dir

    @property
    def is_regression(self):
        return self._is_regression

    @property
    def regression_bins(self):
        return self._regression_bins
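

# Illustrative save/load round trip (a sketch, not part of the original
# module; the file name "task_meta.json" is hypothetical):
#
#     >>> task = Task(task_id="0_gen_uniform_lr_0", target="TARGET",
#     ...             sampling_method_id="uniform", pycaret_model="lr")
#     >>> task.save_as("task_meta.json")
#     >>> restored = Task.load("task_meta.json")
#     >>> restored.pycaret_model
#     'lr'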


def create_tasks(train_dataset="data/train.csv", test_dataset="data/test.csv",
                 target="TARGET", path_to_generators="generators/",
                 pycaret_models=None, task_sampling_method="all", run_num=1,
                 output_dir=None, is_regression=False, regression_bins=5,
                 preprocess_fn=None):
    """Create a list of benchmark task objects.

    Args:
        train_dataset (str): the path of the training dataset csv file.
        test_dataset (str): the path of the test dataset csv file.
        target (str): the name of the target column in the train and test
            datasets (must be the same for both).
        path_to_generators (str): the directory that contains the pickled
            generators.
        pycaret_models (list): list of pycaret model ID strings to use;
            if None, all available models are used.
        task_sampling_method (str): "uniform", "original", "baseline"
            (no sampling), or "all" (for both uniform and original).
        run_num (int): the number of times to generate a sample and test a
            model on it.
        output_dir (str): the directory to store the task configurations;
            if None, the configurations are not written to disk.
        is_regression (bool): perform regression to predict the target
            (default is classification).
        regression_bins (int): the number of bins for the regression target
            when is_regression is True.
        preprocess_fn (callable): an optional function applied to the train
            and test dataframes before the pycaret setup.

    Returns:
        list: a list of Task objects that store the benchmarking task
            configurations.
    """
    task_num = 0
    tasks = []

    if pycaret_models is None:
        train_data = pd.read_csv(train_dataset)
        test_data = pd.read_csv(test_dataset)
        pycaret_functions = regression if is_regression else classification
        preprocessed_train = preprocess_fn(train_data) if preprocess_fn else train_data
        preprocessed_test = preprocess_fn(test_data) if preprocess_fn else test_data
        pycaret_functions.setup(preprocessed_train, target=target,
                                test_data=preprocessed_test, silent=True,
                                verbose=False)
        pycaret_models = pycaret_functions.models().index.to_list()

    generator_paths = []
    generator_name = {}
    for f in os.listdir(path_to_generators):
        file_name, file_type = os.path.splitext(f)
        if file_type == '.pkl':
            generator_path = os.path.join(path_to_generators, f)
            generator_name[generator_path] = file_name
            generator_paths.append(generator_path)

    if output_dir is not None:
        if os.path.exists(output_dir):
            # automatically clears the output directory
            shutil.rmtree(output_dir)
        os.mkdir(output_dir)

    def create_task(gen_name, task_num, classifier, generator_path,
                    sampling_method_id, run, output_dir):
        task_id = "{}_{}_{}_{}_{}".format(task_num, gen_name,
                                          sampling_method_id, classifier, run)
        task_output_dir = None
        if output_dir is not None:
            task_output_dir = os.path.join(output_dir, task_id)
            os.mkdir(task_output_dir)
        task_instance = Task(task_id=task_id,
                             train_dataset=train_dataset,
                             test_dataset=test_dataset,
                             target=target,
                             path_to_generator=generator_path,
                             sampling_method_id=sampling_method_id,
                             pycaret_model=classifier,
                             run_num=run,
                             output_dir=task_output_dir,
                             is_regression=is_regression,
                             regression_bins=regression_bins)
        if output_dir is not None:
            task_instance.save_as(os.path.join(task_output_dir, 'meta.json'))
        return task_instance

    for classifier in pycaret_models:
        for sampling_method_id in get_sample_method_ids(task_sampling_method):
            for run in range(run_num):
                if sampling_method_id == "baseline":
                    # baseline tasks do not use a synthetic data generator
                    task_instance = create_task("none", task_num, classifier,
                                                None, sampling_method_id, run,
                                                output_dir)
                    tasks.append(task_instance)
                    task_num += 1
                else:
                    for generator_path in generator_paths:
                        gen_name = generator_name[generator_path]
                        task_instance = create_task(gen_name, task_num,
                                                    classifier, generator_path,
                                                    sampling_method_id, run,
                                                    output_dir)
                        tasks.append(task_instance)
                        task_num += 1

    return tasks
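

# Illustrative call (a sketch, not part of the original module; the csv paths,
# generator directory, and model ids are hypothetical):
#
#     >>> tasks = create_tasks(train_dataset="data/train.csv",
#     ...                      test_dataset="data/test.csv",
#     ...                      target="TARGET",
#     ...                      path_to_generators="generators/",
#     ...                      pycaret_models=["lr", "rf"],
#     ...                      task_sampling_method="uniform",
#     ...                      run_num=2,
#     ...                      output_dir="tasks/")
#
# Each returned Task pairs one pycaret model with one sampling method id and
# one run index; with output_dir set, every task also gets a meta.json under
# its own subdirectory.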