# Source code for skpipes.pipeline

from sklearn.pipeline import Pipeline
from .config import get_config


class SkPipeline(Pipeline):
    """
    Main object of scikit-pipes to select the desired Pipeline.

    Sequentially apply a list of transforms.
    Intermediate steps of the pipeline must be 'transforms', that is, they
    must implement `fit` and `transform` methods.
    The transformers in the pipeline can be cached using ``memory`` argument.
    The purpose of the pipeline is to assemble several steps that can be
    cross-validated together while setting different parameters. For this, it
    enables setting parameters of the various steps using their names and the
    parameter name separated by a `'__'`. A step's estimator may be replaced
    entirely by setting the parameter with its name to another estimator, or
    a transformer removed by setting it to `'passthrough'` or `None`.

    Parameters
    ----------
    name : str
        The name of one of the pre-defined pipelines from skpipes.config
    data_type : {'numerical', 'categorical'}
        It indicates the data type of the incoming features.
    memory : str or object with the joblib.Memory interface, default=None
        Used to cache the fitted transformers of the pipeline. By default,
        no caching is performed. If a string is given, it is the path to
        the caching directory. Enabling caching triggers a clone of
        the transformers before fitting. Therefore, the transformer
        instance given to the pipeline cannot be inspected
        directly. Use the attribute ``named_steps`` or ``steps`` to
        inspect estimators within the pipeline. Caching the
        transformers is advantageous when fitting is time consuming.
    verbose : bool, default=False
        If True, the time elapsed while fitting each step will be printed as it
        is completed.

    Attributes
    ----------
    description : object
        The ``"description"`` entry of the selected pipeline's configuration.
        It is what ``str(pipeline)`` returns (presumably a ``str`` -- confirm
        against skpipes.config).
    named_steps : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.
        Read-only attribute to access any step parameter by user given name.
        Keys are step names and values are steps parameters.
    classes_ : ndarray of shape (n_classes,)
        The classes labels. Only exist if the last step of the pipeline is a
        classifier.
    n_features_in_ : int
        Number of features seen during :term:`fit`. Only defined if the
        underlying first estimator in `steps` exposes such an attribute
        when fit.
    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Only defined if the
        underlying estimator exposes such an attribute when fit.

    Raises
    ------
    ValueError
        If no (non-empty) entry called ``name`` exists in the configuration
        returned by ``get_config(data_type)``.
    """

    def __init__(self, name, data_type, memory=None, verbose=False):
        self.name = name
        self.data_type = data_type
        self.memory = memory
        self.verbose = verbose

        # Fetch the configuration for this data type and pick the entry
        # registered under ``name``. A missing *or* empty entry is rejected.
        config = get_config(self.data_type)
        pipe = config.get(self.name)
        if not pipe:
            raise ValueError(
                f"The pipeline {name} wasn't found for this data_type, "
                "make sure it's spelled correctly"
            )

        # NOTE(review): ``steps`` is handed to Pipeline as-is; if get_config
        # returns shared objects, several SkPipeline instances would share
        # the same estimators -- confirm against skpipes.config.
        steps = pipe["steps"]
        self.description = pipe["description"]

        super().__init__(
            steps=steps, memory=self.memory, verbose=self.verbose
        )

    def __str__(self):
        """Return the description stored for the selected pipeline."""
        return self.description