Module tiresias.core.classification

Expand source code
import numpy as np
import diffprivlib.models as dp
from tiresias.core.mechanisms import approximate_bounds

class GaussianNB(dp.GaussianNB):

    def __init__(self, epsilon=1, bounds=None, priors=None, var_smoothing=1e-9):
        super().__init__(epsilon, bounds, priors, var_smoothing)

    def fit(self, X, y, sample_weight=None):
        # If no bounds were supplied, spend half of the privacy budget on
        # estimating per-column bounds and clip the data to them; the
        # remaining half is used by the differentially private model.
        if not self.bounds:
            self.bounds = []
            self.epsilon /= 2.0
            for column in range(X.shape[1]):
                bounds = approximate_bounds(X[:,column], self.epsilon / X.shape[1])
                self.bounds.append(bounds)
                # Clip the column to the estimated bounds.
                X[:,column] = np.minimum(np.maximum(X[:,column], bounds[0]), bounds[1])
        return super().fit(X, y, sample_weight=sample_weight)

class LogisticRegression(dp.LogisticRegression):

    def fit(self, X, y, sample_weight=None):
        # If no data_norm was supplied, spend epsilon = 1.0 on a private
        # estimate of the maximum row norm (hence the assertion) and rescale
        # rows that exceed it; the remaining budget is used by the model.
        if not self.data_norm:
            assert self.epsilon > 1.0
            self.epsilon -= 1.0
            row_norms = np.linalg.norm(X, axis=1)
            _, max_norm = approximate_bounds(row_norms, 1.0)
            self.data_norm = max_norm
            for i in range(X.shape[0]):
                if np.linalg.norm(X[i]) > self.data_norm:
                    # Rescale the row to lie just inside the estimated norm.
                    X[i] = X[i] * (self.data_norm - 1e-5) / np.linalg.norm(X[i])
        return super().fit(X, y, sample_weight=sample_weight)

class TiresiasClassifier(dp.LogisticRegression):

    def __init__(self, epsilon):
        # Split the total budget evenly between training the candidate models
        # and selecting among them.
        self.epsilon_model = epsilon * 0.5
        self.epsilon_selection = epsilon * 0.5

    def fit(self, X, y):
        from sklearn.metrics import f1_score
        from sklearn.model_selection import train_test_split
        # Hold out part of the data to score the candidate models.
        X_train, X_test, y_train, y_test = train_test_split(X, y)

        # Train the candidates (one GaussianNB, several LogisticRegression
        # models) with the per-model budget and score them on the held-out
        # split; f1_score with its default average assumes binary labels.
        models, scores = [], []
        model = GaussianNB(epsilon=self.epsilon_model)
        models.append(model.fit(X_train, y_train))
        scores.append(f1_score(y_test, model.predict(X_test)))
        for C in [0.01, 0.1, 1.0, 10.0, 100.0]:
            model = LogisticRegression(epsilon=self.epsilon_model, C=C)
            models.append(model.fit(X_train, y_train))
            scores.append(f1_score(y_test, model.predict(X_test)))
        # Exponential-mechanism-style selection: each candidate is chosen with
        # probability proportional to exp(epsilon_selection * score / 2).
        probabilities = np.exp(self.epsilon_selection * np.array(scores) / 2)
        probabilities = probabilities / np.sum(probabilities)

        # Draw one candidate and refit it on the full data set.
        self.model = np.random.choice(models, p=probabilities, size=1)[0]
        self.model.fit(X, y)
    
    def predict(self, X):
        return self.model.predict(X)
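
A minimal usage sketch, assuming a binary target (the candidate scoring uses f1_score with its default binary average) and a total epsilon above 2, since each logistic-regression candidate must be able to spend 1.0 of its half-budget estimating data_norm:

import numpy as np
from tiresias.core.classification import TiresiasClassifier

# Toy binary classification data (illustrative only).
rng = np.random.RandomState(0)
X = rng.normal(size=(200, 4))
y = (X[:, 0] + X[:, 1] > 0).astype(int)

clf = TiresiasClassifier(epsilon=4.0)   # 2.0 for training candidates, 2.0 for selection
clf.fit(X, y)                           # trains candidates, privately picks one, refits it on X, y
print(clf.predict(X[:5]))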

Classes

class GaussianNB (epsilon=1, bounds=None, priors=None, var_smoothing=1e-09)

Gaussian Naive Bayes (GaussianNB) with differential privacy

Inherits the sklearn.naive_bayes.GaussianNB class from scikit-learn and adds noise to the learned means and variances to satisfy differential privacy. Adapted from the work presented in [VSB13]_.

Parameters

epsilon : float, default: 1.0
Privacy parameter ε for the model.
bounds : list or None, default: None
Bounds of the data, provided as a list of tuples, with one tuple per dimension. If not provided, the bounds are computed on the data when .fit() is first called, resulting in a PrivacyLeakWarning.
priors : array-like, shape (n_classes,)
Prior probabilities of the classes. If specified, the priors are not adjusted according to the data.
var_smoothing : float, optional (default=1e-9)
Portion of the largest variance of all features that is added to variances for calculation stability.

Attributes

class_prior_ : array, shape (n_classes,)
probability of each class.
class_count_ : array, shape (n_classes,)
number of training samples observed in each class.
theta_ : array, shape (n_classes, n_features)
mean of each feature per class
sigma_ : array, shape (n_classes, n_features)
variance of each feature per class
epsilon_ : float
absolute additive value to variances (unrelated to epsilon parameter for differential privacy)

References

.. [VSB13] Vaidya, Jaideep, Basit Shafiq, Anirban Basu, and Yuan Hong. "Differentially private naive bayes classification." In 2013 IEEE/WIC/ACM International Joint Conferences on Web Intelligence (WI) and Intelligent Agent Technologies (IAT), vol. 1, pp. 571-576. IEEE, 2013.
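
If the feature ranges are known ahead of time, supplying bounds keeps the whole privacy budget for the model itself; otherwise the fit() override below halves epsilon and spends the other half estimating per-column bounds. A minimal sketch; the bound values are illustrative assumptions chosen from domain knowledge, not derived from the data:

from sklearn.datasets import load_iris
from tiresias.core.classification import GaussianNB

X, y = load_iris(return_X_y=True)

# One (lower, upper) tuple per feature (illustrative values, in cm).
bounds = [(4.0, 8.0), (2.0, 4.5), (1.0, 7.0), (0.0, 2.5)]

clf = GaussianNB(epsilon=1.0, bounds=bounds).fit(X, y)
print(clf.predict(X[:2]))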

Expand source code
class GaussianNB(dp.GaussianNB):

    def __init__(self, epsilon=1, bounds=None, priors=None, var_smoothing=1e-9):
        super().__init__(epsilon, bounds, priors, var_smoothing)

    def fit(self, X, y, sample_weight=None):
        if not self.bounds:
            self.bounds = []
            self.epsilon /= 2.0
            for column in range(X.shape[1]):
                bounds = approximate_bounds(X[:,column], self.epsilon / X.shape[1])
                self.bounds.append(bounds)
                X[:,column] = np.minimum(np.maximum(X[:,column], bounds[0]), bounds[1])
        return super().fit(X, y, sample_weight=sample_weight)

Ancestors

  • diffprivlib.models.naive_bayes.GaussianNB
  • sklearn.naive_bayes.GaussianNB
  • sklearn.naive_bayes.BaseNB
  • abc.NewBase
  • sklearn.base.BaseEstimator
  • sklearn.base.ClassifierMixin

Methods

def fit(self, X, y, sample_weight=None)

Fit Gaussian Naive Bayes according to X, y

Parameters

X : array-like, shape (n_samples, n_features)
Training vectors, where n_samples is the number of samples and n_features is the number of features.
y : array-like, shape (n_samples,)
Target values.
sample_weight : array-like, shape (n_samples,), optional (default=None)
Weights applied to individual samples (1. for unweighted).

Added in version 0.17: Gaussian Naive Bayes supports fitting with sample_weight.

Returns

self : object
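
When bounds is not supplied, the override below halves epsilon, estimates each column's range with approximate_bounds, and clips the column to that range before delegating to diffprivlib. A small sketch of just the clipping step, assuming (as its use here suggests) that approximate_bounds returns a (lower, upper) pair:

import numpy as np
from tiresias.core.mechanisms import approximate_bounds

X = np.random.RandomState(0).normal(size=(100, 3))
epsilon = 1.0

epsilon_bounds = epsilon / 2.0                       # half the budget goes to bounds estimation
for column in range(X.shape[1]):
    lower, upper = approximate_bounds(X[:, column], epsilon_bounds / X.shape[1])
    X[:, column] = np.clip(X[:, column], lower, upper)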
 
Expand source code
def fit(self, X, y, sample_weight=None):
    if not self.bounds:
        self.bounds = []
        self.epsilon /= 2.0
        for column in range(X.shape[1]):
            bounds = approximate_bounds(X[:,column], self.epsilon / X.shape[1])
            self.bounds.append(bounds)
            X[:,column] = np.minimum(np.maximum(X[:,column], bounds[0]), bounds[1])
    return super().fit(X, y, sample_weight=sample_weight)
class LogisticRegression (epsilon=1.0, data_norm=None, tol=0.0001, C=1.0, fit_intercept=True, max_iter=100, verbose=0, warm_start=False, n_jobs=None, **unused_args)

Logistic Regression (aka logit, MaxEnt) classifier with differential privacy.

This class implements regularised logistic regression using SciPy's L-BFGS-B algorithm. ε-differential privacy is achieved relative to the maximum norm of the data, as determined by data_norm, by the Vector mechanism, which adds a Laplace-distributed random vector to the objective. Adapted from the work presented in [CMS11]_.

This class is a child of sklearn.linear_model.LogisticRegression, with amendments to allow for the implementation of differential privacy. Some parameters of scikit-learn's model have therefore had to be fixed, including:

- The only permitted solver is 'lbfgs'.  Specifying the ``solver`` option will result in a warning.
- Consequently, the only permitted penalty is 'l2'. Specifying the ``penalty`` option will result in a warning.
- In the multiclass case, only the one-vs-rest (OvR) scheme is permitted.  Specifying the ``multi_class`` option
  will result in a warning.

Parameters

epsilon : float, default: 1.0
Privacy parameter ε.
data_norm : float, default: None

The max l2 norm of any row of the data. This defines the spread of data that will be protected by differential privacy.

If not specified, the max norm is taken from the data when .fit() is first called, but will result in a PrivacyLeakWarning, as it reveals information about the data. To preserve differential privacy fully, data_norm should be selected independently of the data, i.e. with domain knowledge.

tol : float, default: 1e-4
Tolerance for stopping criteria.
C : float, default: 1.0
Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.
fit_intercept : bool, default: True
Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.
max_iter : int, default: 100
Maximum number of iterations taken for the solver to converge. For smaller epsilon (more noise), max_iter may need to be increased.
verbose : int, default: 0
Set to any positive number for verbosity.
warm_start : bool, default: False
When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution.
n_jobs : int or None, default: None
Number of CPU cores used when parallelising over classes. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors.
**unused_args : kwargs
Placeholder for parameters of sklearn.linear_model.LogisticRegression that are not used in diffprivlib. Specifying any of these parameters will raise a DiffprivlibCompatibilityWarning.

Attributes

classes_ : array, shape (n_classes, )
A list of class labels known to the classifier.
coef_ : array, shape (1, n_features) or (n_classes, n_features)

Coefficient of the features in the decision function.

coef_ is of shape (1, n_features) when the given problem is binary.

intercept_ : array, shape (1,) or (n_classes,)

Intercept (a.k.a. bias) added to the decision function.

If fit_intercept is set to False, the intercept is set to zero. intercept_ is of shape (1,) when the given problem is binary.

n_iter_ : array, shape (n_classes,) or (1, )
Actual number of iterations for all classes. If binary, it returns only 1 element.

Examples

>>> from sklearn.datasets import load_iris
>>> from diffprivlib.models import LogisticRegression
>>> X, y = load_iris(return_X_y=True)
>>> clf = LogisticRegression(data_norm=12, epsilon=2).fit(X, y)
>>> clf.predict(X[:2, :])
array([0, 0])
>>> clf.predict_proba(X[:2, :])
array([[7.35362932e-01, 2.16667422e-14, 2.64637068e-01],
       [9.08384378e-01, 3.47767052e-13, 9.16156215e-02]])

>>> clf.score(X, y)
0.5266666666666666

See also

sklearn.linear_model.LogisticRegression
The implementation of logistic regression in scikit-learn, upon which this implementation is built.
Vector
The mechanism used by the model to achieve differential privacy.

References

.. [CMS11] Chaudhuri, Kamalika, Claire Monteleoni, and Anand D. Sarwate. "Differentially private empirical risk minimization." Journal of Machine Learning Research 12, no. Mar (2011): 1069-1109.
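
In this module's subclass, leaving data_norm unset makes fit() spend epsilon = 1.0 on a private estimate of the maximum row norm (so the total epsilon must exceed 1.0) and rescale over-long rows in place. Supplying data_norm from domain knowledge avoids that cost. A minimal sketch; the norm value 12 is illustrative, as in the example above:

from sklearn.datasets import load_iris
from tiresias.core.classification import LogisticRegression

X, y = load_iris(return_X_y=True)

# data_norm chosen from domain knowledge: the full epsilon goes to the model.
clf = LogisticRegression(epsilon=1.0, data_norm=12).fit(X, y)

# Without data_norm, 1.0 of the budget is spent estimating the maximum row
# norm, and rows may be rescaled in place; pass a copy to keep X intact.
clf_auto = LogisticRegression(epsilon=2.0).fit(X.copy(), y)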

Expand source code
class LogisticRegression(dp.LogisticRegression):

    def fit(self, X, y, sample_weight=None):
        if not self.data_norm:
            assert self.epsilon > 1.0
            self.epsilon -= 1.0
            row_norms = np.linalg.norm(X, axis=1)
            _, max_norm = approximate_bounds(row_norms, 1.0)
            self.data_norm = max_norm
            for i in range(X.shape[0]):
                if np.linalg.norm(X[i]) > self.data_norm:
                    X[i] = X[i] * (self.data_norm - 1e-5) / np.linalg.norm(X[i])
        return super().fit(X, y, sample_weight=sample_weight)

Ancestors

  • diffprivlib.models.logistic_regression.LogisticRegression
  • sklearn.linear_model.logistic.LogisticRegression
  • sklearn.base.BaseEstimator
  • sklearn.linear_model.base.LinearClassifierMixin
  • sklearn.base.ClassifierMixin
  • sklearn.linear_model.base.SparseCoefMixin

Methods

def fit(self, X, y, sample_weight=None)

Fit the model according to the given training data.

Parameters

X : {array-like, sparse matrix}, shape (n_samples, n_features)
Training vector, where n_samples is the number of samples and n_features is the number of features.
y : array-like, shape (n_samples,)
Target vector relative to X.
sample_weight : ignored
Ignored by diffprivlib. Present for consistency with sklearn API.

Returns

self : class
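
For reference, the row-rescaling step performed by the override below can also be written with broadcasting. A small equivalent sketch in plain NumPy (clip_rows_to_norm is a hypothetical helper for illustration; the module itself does this in a Python loop inside fit()):

import numpy as np

def clip_rows_to_norm(X, data_norm):
    # Rescale any row whose l2 norm exceeds data_norm, mirroring the loop in fit().
    norms = np.linalg.norm(X, axis=1)
    too_long = norms > data_norm
    X = X.copy()
    X[too_long] = X[too_long] * (data_norm - 1e-5) / norms[too_long, None]
    return X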
 
Expand source code
def fit(self, X, y, sample_weight=None):
    if not self.data_norm:
        assert self.epsilon > 1.0
        self.epsilon -= 1.0
        row_norms = np.linalg.norm(X, axis=1)
        _, max_norm = approximate_bounds(row_norms, 1.0)
        self.data_norm = max_norm
        for i in range(X.shape[0]):
            if np.linalg.norm(X[i]) > self.data_norm:
                X[i] = X[i] * (self.data_norm - 1e-5) / np.linalg.norm(X[i])
    return super().fit(X, y, sample_weight=sample_weight)
class TiresiasClassifier (epsilon)

Differentially private classifier that selects among several candidate models.

Although it subclasses diffprivlib's LogisticRegression, this class overrides __init__, fit and predict entirely. The total privacy budget epsilon is split evenly between training the candidate models and selecting among them. On fit, the data is split with train_test_split, one GaussianNB model and five LogisticRegression models (C in {0.01, 0.1, 1.0, 10.0, 100.0}) are trained with the per-model budget, each candidate is scored on the held-out split with f1_score, and a single candidate is drawn with probability proportional to exp(epsilon_selection * score / 2), in the style of the exponential mechanism. The chosen model is then refit on the full training data and used for prediction.

Parameters

epsilon : float
Total privacy parameter ε. Half is allocated to training the candidate models (epsilon_model) and half to the selection step (epsilon_selection).

Attributes

model : object
The selected candidate model, refit on the full training data. Set by .fit() and used by .predict().
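
The selection step inside fit() weights each candidate by exp(epsilon_selection * score / 2) and normalises the weights into a probability distribution. A small numeric sketch with hypothetical held-out F1 scores (illustrative values only):

import numpy as np

scores = np.array([0.70, 0.55, 0.62, 0.74, 0.74, 0.60])   # hypothetical F1 scores, one per candidate
epsilon_selection = 1.0

weights = np.exp(epsilon_selection * scores / 2)
probabilities = weights / weights.sum()
print(probabilities.round(3))   # higher-scoring candidates are more likely to be drawn

Larger values of epsilon_selection concentrate the distribution on the best-scoring candidate; smaller values push it towards uniform.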

Expand source code
class TiresiasClassifier(dp.LogisticRegression):

    def __init__(self, epsilon):
        self.epsilon_model = epsilon * 0.5
        self.epsilon_selection = epsilon * 0.5

    def fit(self, X, y):
        from sklearn.metrics import f1_score
        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(X, y)

        models, scores = [], []
        model = GaussianNB(epsilon=self.epsilon_model)
        models.append(model.fit(X_train, y_train))
        scores.append(f1_score(y_test, model.predict(X_test)))
        for C in [0.01, 0.1, 1.0, 10.0, 100.0]:
            model = LogisticRegression(epsilon=self.epsilon_model, C=C)
            models.append(model.fit(X_train, y_train))
            scores.append(f1_score(y_test, model.predict(X_test)))
        probabilities = np.exp(self.epsilon_selection * np.array(scores) / 2)
        probabilities = probabilities / np.sum(probabilities)

        self.model = np.random.choice(models, p=probabilities, size=1)[0]
        self.model.fit(X, y)
    
    def predict(self, X):
        return self.model.predict(X)

Ancestors

  • diffprivlib.models.logistic_regression.LogisticRegression
  • sklearn.linear_model.logistic.LogisticRegression
  • sklearn.base.BaseEstimator
  • sklearn.linear_model.base.LinearClassifierMixin
  • sklearn.base.ClassifierMixin
  • sklearn.linear_model.base.SparseCoefMixin

Methods

def fit(self, X, y)

Fit the classifier: train the candidate models on a split of the training data, privately select one of them, and refit it on the full training data.

Parameters

X : array-like, shape (n_samples, n_features)
Training vector, where n_samples is the number of samples and n_features is the number of features.
y : array-like, shape (n_samples,)
Target vector relative to X.

Returns

None
The selected model is stored in self.model and used by .predict().
 
Expand source code
def fit(self, X, y):
    from sklearn.metrics import f1_score
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    models, scores = [], []
    model = GaussianNB(epsilon=self.epsilon_model)
    models.append(model.fit(X_train, y_train))
    scores.append(f1_score(y_test, model.predict(X_test)))
    for C in [0.01, 0.1, 1.0, 10.0, 100.0]:
        model = LogisticRegression(epsilon=self.epsilon_model, C=C)
        models.append(model.fit(X_train, y_train))
        scores.append(f1_score(y_test, model.predict(X_test)))
    probabilities = np.exp(self.epsilon_selection * np.array(scores) / 2)
    probabilities = probabilities / np.sum(probabilities)

    self.model = np.random.choice(models, p=probabilities, size=1)[0]
    self.model.fit(X, y)
def predict(self, X)

Predict class labels for samples in X.

Parameters

X : array_like or sparse matrix, shape (n_samples, n_features)
Samples.

Returns

C : array, shape [n_samples]
Predicted class label per sample.
Expand source code
def predict(self, X):
    return self.model.predict(X)