Source code for src.models._al_zscf

"""
This is still provisional code. Just testing an idea.
"""

import numpy as np
from sklearn.base import ClassifierMixin, BaseEstimator, clone
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from transformers import pipeline
import logging

log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=log_fmt)


def _entropy_selection(probabs, leftover_ids, increment):
    """
    Selecting samples with the highest Shannon entropy over the
    predicted class probabilities.
    """
    entropy = (-probabs * np.log2(probabs)).sum(axis=1)
    new_ids = leftover_ids[np.argsort(entropy)[::-1][:increment]]
    return new_ids
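
# A minimal sketch of the selection behavior (toy values, not from the
# original module): probability rows closest to uniform have the highest
# entropy H = -sum(p * log2(p)) and are selected first.
#
# >>> probabs = np.array([[0.9, 0.1], [0.5, 0.5], [0.7, 0.3]])
# >>> _entropy_selection(probabs, np.array([10, 11, 12]), increment=2)
# array([11, 12])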


def _margin_sampling_selection(probabs, leftover_ids, increment):
    """
    Selecting samples as a smallest difference of probability values
    between the first and second most likely classes
    """
    probs_sorted = np.sort(probabs, axis=1)[:, ::-1]
    values = probs_sorted[:, 0] - probs_sorted[:, 1]
    new_ids = leftover_ids[np.argsort(values)[:increment]]
    return new_ids


def _random_selection(leftover_ids, increment, rng):
    """
    Random sample selection. Random State object is required.
    """
    new_ids = rng.choice(leftover_ids, increment, replace=False)
    return new_ids


def data_selection(probabs, leftover_ids, increment, rng, selection_strategy):
    if selection_strategy == 'entropy':
        return _entropy_selection(probabs, leftover_ids, increment)
    elif selection_strategy == 'margin sampling':
        return _margin_sampling_selection(probabs, leftover_ids, increment)
    elif selection_strategy == 'random':
        return _random_selection(leftover_ids, increment, rng)
    else:
        msg = f"Selection strategy {selection_strategy} is \
        not implemented. Possible values are \
        ['entropy', 'margin sampling', 'random']."
        raise ValueError(msg)
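
# A hedged usage sketch for the dispatcher (toy values, not from the
# original module). Margin sampling keeps the rows where the gap between
# the two most likely classes is smallest; the rng argument is only used
# by the 'random' strategy.
#
# >>> probabs = np.array([[0.9, 0.1], [0.5, 0.5], [0.7, 0.3]])
# >>> rng = np.random.RandomState(0)
# >>> data_selection(probabs, np.array([10, 11, 12]), 2, rng, 'margin sampling')
# array([11, 12])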


class ALZeroShotWrapper(ClassifierMixin, BaseEstimator):
    """Active Learning with Zero-Shot classification."""

    def __init__(
        self,
        classifier,
        max_iter=1000,
        selection_strategy='entropy',
        n_initial=100,
        increment=50,
        save_classifiers=False,
        auto_load=True,
        evaluation_metric=None,
        random_state=None,
        verbose=None
    ):
        """
        Performs Active Learning using Zero-Shot classification for
        obtaining a pseudo-ground truth. The wrapped classifier is fitted
        to the ZSCF's predictions and learns to emulate its behavior in a
        simpler way, which can be a good alternative for situations where
        computational power is limited.

        Parameters
        ----------
        classifier : sklearn obj or similar
            Classifier to be trained.

        max_iter : int or NoneType, default=1000
            Maximum number of iterations. If None, iterations continue
            until every sample has been labeled.

        selection_strategy : str, default='entropy'
            Strategy used to compute uncertainty. Can be one of
            'entropy', 'margin sampling' or 'random'.

        n_initial : int, default=100
            Number of initial training points.

        increment : int, default=50
            Number of additional instances per iteration.

        save_classifiers : bool, default=False
            If True, creates a list of classifiers (one per iteration) in
            the attribute ``self.classifiers_``.

        auto_load : bool, default=True
            Whether to use the best found classifier as the default
            classification method. If True, the trained classifier object
            is found at ``self.classifier_``.

        evaluation_metric : function or NoneType, default=None
            Evaluation metric used to evaluate the classification outputs
            on each iteration. If None, Overall Accuracy is used.

        random_state : int or RandomState, default=None
            Controls the random number generator used. Setting a value
            for this parameter makes the experiment reproducible.

        verbose : int, bool or NoneType, default=None
            Controls the verbosity during the training process.
        """
        self.classifier = classifier
        self.max_iter = max_iter
        self.selection_strategy = selection_strategy
        self.n_initial = n_initial
        self.increment = increment
        self.random_state = random_state

        # For finding the optimal classifier
        self.auto_load = auto_load
        self.save_classifiers = save_classifiers
        self.evaluation_metric = evaluation_metric

        self._logger = logging.getLogger(__name__)
        self._logger.propagate = bool(verbose)
    def fit(self, X, sequences, candidate_labels):
        self._logger.info('Started fitting process. Setting up models.')

        if self.evaluation_metric is None:
            self.evaluation_metric_ = accuracy_score
        else:
            self.evaluation_metric_ = self.evaluation_metric

        if self.save_classifiers:
            self.classifiers_ = []

        if self.auto_load:
            self.classifier_ = None

        iter_n = 0
        rng = np.random.RandomState(self.random_state)
        selection = np.zeros(X.shape[0], dtype=bool)
        self.zscf = pipeline("zero-shot-classification")
        self.y = np.full(X.shape[0], np.nan)
        self.label_encoder = LabelEncoder().fit(candidate_labels)

        # Probabilities over the unlabeled pool, computed at the end of
        # each iteration
        probabs = None

        while self.max_iter is None or iter_n < self.max_iter:
            self._logger.info(f'Iteration #{iter_n}')
            classifier = clone(self.classifier)

            # Add new samples to the labeled pool: random on the first
            # iteration, uncertainty-based afterwards. squeeze(axis=1)
            # keeps a 1-D array even when a single sample remains.
            leftover_ids = np.argwhere(~selection).squeeze(axis=1)
            ids = (
                data_selection(
                    probabs,
                    leftover_ids,
                    self.increment,
                    rng,
                    self.selection_strategy
                )
                if iter_n != 0
                else rng.choice(leftover_ids, self.n_initial, replace=False)
            )
            selection[ids] = True

            # Assign pseudo-ground truth labels based on the zero-shot
            # classifier's most likely label for each sequence
            self.y[ids] = self.label_encoder.transform([
                self.zscf(sequences[i], candidate_labels)['labels'][0]
                for i in ids
            ])

            # Train the classifier on the labeled pool
            classifier.fit(X[selection], self.y[selection])

            # Save the classifier for this iteration
            if self.save_classifiers:
                self.classifiers_.append((selection.sum(), classifier))

            # Replace the top classifier
            if self.auto_load:
                self.classifier_ = classifier

            iter_n += 1

            # Stop if all examples have been included
            if selection.all():
                break

            probabs = classifier.predict_proba(X[~selection])

            # Some selection strategies don't deal well with 0. values
            probabs = np.where(probabs == 0., 1e-10, probabs)

        return self

    def load_best_classifier(self, X, y):
        scores = []
        for _, classifier in self.classifiers_:
            y_pred = classifier.predict(X)
            scores.append(self.evaluation_metric_(y, y_pred))

        self.classifier_ = self.classifiers_[np.argmax(scores)][-1]
        return self

    def predict(self, X):
        return self.classifier_.predict(X)
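

# A minimal end-to-end usage sketch (hypothetical data, not part of the
# original module). It assumes a small bag-of-words feature matrix and a
# scikit-learn classifier; the zero-shot pipeline downloads a pretrained
# model on first use, and the initial random pool must receive at least
# two distinct pseudo-labels for the classifier to fit.
if __name__ == '__main__':
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.linear_model import LogisticRegression

    sequences = np.array([
        'the team won the championship game',
        'the stock market fell sharply today',
        'the new phone has a better camera',
        'voters head to the polls next week',
    ])
    candidate_labels = ['sports', 'business', 'technology', 'politics']

    # Toy bag-of-words features for the wrapped classifier
    X = CountVectorizer().fit_transform(sequences).toarray()

    wrapper = ALZeroShotWrapper(
        classifier=LogisticRegression(),
        max_iter=2,
        n_initial=2,
        increment=1,
        selection_strategy='entropy',
        random_state=42,
    )
    wrapper.fit(X, sequences, candidate_labels)

    # Predictions are label-encoded; map them back to the candidate labels
    predictions = wrapper.predict(X).astype(int)
    print(wrapper.label_encoder.inverse_transform(predictions))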