Module tiresias.core.classification
Source code
import numpy as np
import diffprivlib.models as dp

from tiresias.core.mechanisms import approximate_bounds


class GaussianNB(dp.GaussianNB):

    def __init__(self, epsilon=1, bounds=None, priors=None, var_smoothing=1e-9):
        super().__init__(epsilon, bounds, priors, var_smoothing)

    def fit(self, X, y, sample_weight=None):
        # If no bounds were given, spend half of the privacy budget on
        # estimating per-column bounds and clip the data to them.
        if not self.bounds:
            self.bounds = []
            self.epsilon /= 2.0
            for column in range(X.shape[1]):
                bounds = approximate_bounds(X[:, column], self.epsilon / X.shape[1])
                self.bounds.append(bounds)
                X[:, column] = np.minimum(np.maximum(X[:, column], bounds[0]), bounds[1])
        return super().fit(X, y, sample_weight=sample_weight)


class LogisticRegression(dp.LogisticRegression):

    def fit(self, X, y, sample_weight=None):
        # If no data norm was given, spend a budget of 1.0 on estimating the
        # maximum row norm and clip any rows that exceed it.
        if not self.data_norm:
            assert self.epsilon > 1.0
            self.epsilon -= 1.0
            row_norms = np.linalg.norm(X, axis=1)
            _, max_norm = approximate_bounds(row_norms, 1.0)
            self.data_norm = max_norm
            for i in range(X.shape[0]):
                if np.linalg.norm(X[i]) > self.data_norm:
                    X[i] = X[i] * (self.data_norm - 1e-5) / np.linalg.norm(X[i])
        return super().fit(X, y, sample_weight=sample_weight)


class TiresiasClassifier(dp.LogisticRegression):

    def __init__(self, epsilon):
        # Split the total budget evenly between model training and model selection.
        self.epsilon_model = epsilon * 0.5
        self.epsilon_selection = epsilon * 0.5

    def fit(self, X, y):
        from sklearn.metrics import f1_score
        from sklearn.model_selection import train_test_split

        X_train, X_test, y_train, y_test = train_test_split(X, y)
        models, scores = [], []

        # Candidate 1: differentially private Gaussian naive Bayes.
        model = GaussianNB(epsilon=self.epsilon_model)
        models.append(model.fit(X_train, y_train))
        scores.append(f1_score(y_test, model.predict(X_test)))

        # Candidates 2-6: differentially private logistic regression over a grid of C.
        for C in [0.01, 0.1, 1.0, 10.0, 100.0]:
            model = LogisticRegression(epsilon=self.epsilon_model, C=C)
            models.append(model.fit(X_train, y_train))
            scores.append(f1_score(y_test, model.predict(X_test)))

        # Pick one candidate via the exponential mechanism over the validation
        # F1 scores, then refit the winner on the full dataset.
        probabilities = np.exp(self.epsilon_selection * np.array(scores) / 2)
        probabilities = probabilities / np.sum(probabilities)
        self.model = np.random.choice(models, p=probabilities, size=1)[0]
        self.model.fit(X, y)
        return self  # follow the scikit-learn convention of returning the fitted estimator

    def predict(self, X):
        return self.model.predict(X)
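As a quick orientation, here is a minimal end-to-end sketch of the module's main entry point. The dataset, random seed, and epsilon values are illustrative, not taken from the project. Note that, because the LogisticRegression candidates are built without data_norm, the per-model budget (epsilon * 0.5) must exceed 1.0.

import numpy as np
from sklearn.model_selection import train_test_split
from tiresias.core.classification import TiresiasClassifier

# Hypothetical binary classification data; TiresiasClassifier scores its
# candidates with the default (binary) f1_score.
rng = np.random.RandomState(42)
X = rng.normal(size=(500, 5))
y = (X[:, 0] - X[:, 2] > 0).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# A total budget of 4.0 gives each candidate model epsilon_model = 2.0 and
# the exponential-mechanism selection step epsilon_selection = 2.0.
clf = TiresiasClassifier(epsilon=4.0)
clf.fit(X_train, y_train)
print((clf.predict(X_test) == y_test).mean())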
Classes
class GaussianNB (epsilon=1, bounds=None, priors=None, var_smoothing=1e-09)
Gaussian Naive Bayes (GaussianNB) with differential privacy.

Inherits the sklearn.naive_bayes.GaussianNB class from scikit-learn and adds noise to the learned means and variances to satisfy differential privacy. Adapted from the work presented in [VSB13].

Parameters

epsilon : float, default: 1.0
    Privacy parameter ε for the model.
bounds : list or None, default: None
    Bounds of the data, provided as a list of tuples, with one tuple per dimension. If not provided, the bounds are computed on the data when .fit() is first called, resulting in a PrivacyLeakWarning.
priors : array-like, shape (n_classes,)
    Prior probabilities of the classes. If specified, the priors are not adjusted according to the data.
var_smoothing : float, optional (default=1e-9)
    Portion of the largest variance of all features that is added to variances for calculation stability.

Attributes

class_prior_ : array, shape (n_classes,)
    Probability of each class.
class_count_ : array, shape (n_classes,)
    Number of training samples observed in each class.
theta_ : array, shape (n_classes, n_features)
    Mean of each feature per class.
sigma_ : array, shape (n_classes, n_features)
    Variance of each feature per class.
epsilon_ : float
    Absolute additive value to variances (unrelated to the epsilon parameter for differential privacy).
References
[VSB13] Vaidya, Jaideep, Basit Shafiq, Anirban Basu, and Yuan Hong. "Differentially private naive bayes classification." In 2013 IEEE/WIC/ACM International Joint Conferences on Web Intelligence (WI) and Intelligent Agent Technologies (IAT), vol. 1, pp. 571-576. IEEE, 2013.
Ancestors
- diffprivlib.models.naive_bayes.GaussianNB
- sklearn.naive_bayes.GaussianNB
- sklearn.naive_bayes.BaseNB
- abc.NewBase
- sklearn.base.BaseEstimator
- sklearn.base.ClassifierMixin
Methods
def fit(self, X, y, sample_weight=None)
Fit Gaussian Naive Bayes according to X, y.

Parameters

X : array-like, shape (n_samples, n_features)
    Training vectors, where n_samples is the number of samples and n_features is the number of features.
y : array-like, shape (n_samples,)
    Target values.
sample_weight : array-like, shape (n_samples,), optional (default=None)
    Weights applied to individual samples (1. for unweighted).
    Added in version 0.17: Gaussian Naive Bayes supports fitting with sample_weight.

Returns

self : object
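To illustrate how .fit() handles missing bounds, the sketch below uses synthetic data (shapes, values, and the bounds format assume the list-of-tuples convention described in the Parameters above; none of it comes from the project's own examples). With epsilon=1.0 and four features, half of the budget is kept for the model and 0.5 / 4 = 0.125 is spent per column on approximate_bounds.

import numpy as np
from tiresias.core.classification import GaussianNB

# Hypothetical synthetic data: 100 samples, 4 features, binary labels.
rng = np.random.RandomState(0)
X = rng.normal(size=(100, 4))
y = (X[:, 0] + X[:, 1] > 0).astype(int)

# No bounds supplied: fit() halves epsilon (1.0 -> 0.5 for the model) and
# spends 0.5 / 4 = 0.125 per column on approximate_bounds before clipping.
clf = GaussianNB(epsilon=1.0)
clf.fit(X, y)
print(clf.epsilon)   # 0.5 after the split
print(clf.bounds)    # one (lower, upper) tuple per feature

# Supplying bounds up front keeps the full budget for the model.
clf2 = GaussianNB(epsilon=1.0, bounds=[(-4, 4)] * 4).fit(X, y)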
class LogisticRegression (epsilon=1.0, data_norm=None, tol=0.0001, C=1.0, fit_intercept=True, max_iter=100, verbose=0, warm_start=False, n_jobs=None, **unused_args)
Logistic Regression (aka logit, MaxEnt) classifier with differential privacy.

This class implements regularised logistic regression using Scipy's L-BFGS-B algorithm. ε-differential privacy is achieved relative to the maximum norm of the data, as determined by data_norm, by the Vector mechanism, which adds a Laplace-distributed random vector to the objective. Adapted from the work presented in [CMS11].

This class is a child of sklearn.linear_model.LogisticRegression, with amendments to allow for the implementation of differential privacy. Some parameters of scikit-learn's model have therefore had to be fixed, including:

- The only permitted solver is 'lbfgs'. Specifying the solver option will result in a warning.
- Consequently, the only permitted penalty is 'l2'. Specifying the penalty option will result in a warning.
- In the multiclass case, only the one-vs-rest (OvR) scheme is permitted. Specifying the multi_class option will result in a warning.
Parameters

epsilon : float, default: 1.0
    Privacy parameter ε.
data_norm : float, default: None
    The max l2 norm of any row of the data. This defines the spread of data that will be protected by differential privacy. If not specified, the max norm is taken from the data when .fit() is first called, but will result in a PrivacyLeakWarning, as it reveals information about the data. To preserve differential privacy fully, data_norm should be selected independently of the data, i.e. with domain knowledge.
tol : float, default: 1e-4
    Tolerance for stopping criteria.
C : float, default: 1.0
    Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.
fit_intercept : bool, default: True
    Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.
max_iter : int, default: 100
    Maximum number of iterations taken for the solver to converge. For smaller epsilon (more noise), max_iter may need to be increased.
verbose : int, default: 0
    Set to any positive number for verbosity.
warm_start : bool, default: False
    When set to True, reuse the solution of the previous call to fit as initialization; otherwise, just erase the previous solution.
n_jobs : int or None, default: None
    Number of CPU cores used when parallelising over classes. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors.
**unused_args : kwargs
    Placeholder for parameters of sklearn.linear_model.LogisticRegression that are not used in diffprivlib. Specifying any of these parameters will raise a DiffprivlibCompatibilityWarning.
Attributes

classes_ : array, shape (n_classes,)
    A list of class labels known to the classifier.
coef_ : array, shape (1, n_features) or (n_classes, n_features)
    Coefficient of the features in the decision function. coef_ is of shape (1, n_features) when the given problem is binary.
intercept_ : array, shape (1,) or (n_classes,)
    Intercept (a.k.a. bias) added to the decision function. If fit_intercept is set to False, the intercept is set to zero. intercept_ is of shape (1,) when the given problem is binary.
n_iter_ : array, shape (n_classes,) or (1,)
    Actual number of iterations for all classes. If binary, it returns only 1 element.
Examples
>>> from sklearn.datasets import load_iris
>>> from diffprivlib.models import LogisticRegression
>>> X, y = load_iris(return_X_y=True)
>>> clf = LogisticRegression(data_norm=12, epsilon=2).fit(X, y)
>>> clf.predict(X[:2, :])
array([0, 0])
>>> clf.predict_proba(X[:2, :])
array([[7.35362932e-01, 2.16667422e-14, 2.64637068e-01],
       [9.08384378e-01, 3.47767052e-13, 9.16156215e-02]])
>>> clf.score(X, y)
0.5266666666666666
See also

sklearn.linear_model.LogisticRegression
    The implementation of logistic regression in scikit-learn, upon which this implementation is built.
Vector
    The mechanism used by the model to achieve differential privacy.
References
[CMS11] Chaudhuri, Kamalika, Claire Monteleoni, and Anand D. Sarwate. "Differentially private empirical risk minimization." Journal of Machine Learning Research 12, no. Mar (2011): 1069-1109.
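Unlike the diffprivlib example above, this module's subclass can also estimate data_norm privately when it is omitted: .fit() then requires epsilon > 1.0, spends 1.0 of the budget on approximate_bounds over the row norms, and clips rows above the estimate. A minimal sketch; the epsilon values are illustrative, not from the project.

from sklearn.datasets import load_iris
from tiresias.core.classification import LogisticRegression

X, y = load_iris(return_X_y=True)

# data_norm omitted: fit() asserts epsilon > 1.0, uses 1.0 of the budget to
# estimate the maximum row norm, and clips rows that exceed it.
clf = LogisticRegression(epsilon=2.0)
clf.fit(X, y)
print(clf.data_norm)   # privately estimated max row norm

# Supplying data_norm up front (e.g. from domain knowledge) keeps the full
# budget for the model, as in the diffprivlib doctest above.
clf2 = LogisticRegression(epsilon=2.0, data_norm=12).fit(X, y)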
Ancestors
- diffprivlib.models.logistic_regression.LogisticRegression
- sklearn.linear_model.logistic.LogisticRegression
- sklearn.base.BaseEstimator
- sklearn.linear_model.base.LinearClassifierMixin
- sklearn.base.ClassifierMixin
- sklearn.linear_model.base.SparseCoefMixin
Methods
def fit(self, X, y, sample_weight=None)
Fit the model according to the given training data.

Parameters

X : {array-like, sparse matrix}, shape (n_samples, n_features)
    Training vector, where n_samples is the number of samples and n_features is the number of features.
y : array-like, shape (n_samples,)
    Target vector relative to X.
sample_weight : ignored
    Ignored by diffprivlib. Present for consistency with sklearn API.

Returns

self : class
class TiresiasClassifier (epsilon)
Differentially private classifier that selects among several candidate models using the exponential mechanism.

The total privacy budget is split evenly between model training (epsilon_model) and model selection (epsilon_selection). When .fit() is called, the data is split into a training and a validation set; a GaussianNB model and LogisticRegression models with C in {0.01, 0.1, 1.0, 10.0, 100.0} are each trained on the training set with a budget of epsilon_model. The candidates are scored by F1 on the validation set, one candidate is drawn with probability proportional to exp(epsilon_selection * score / 2), and the chosen model is refitted on the full dataset. Predictions are delegated to the selected model.

Parameters

epsilon : float
    Total privacy parameter ε, split evenly between model training and model selection. Since the LogisticRegression candidates are constructed without data_norm, epsilon_model (= epsilon / 2) must exceed 1.0.
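The selection step is an exponential mechanism over the validation F1 scores. A minimal sketch of the probability computation, using made-up scores (not from the project):

import numpy as np

# Hypothetical validation F1 scores for the six candidate models
# (GaussianNB plus five LogisticRegression settings of C).
scores = np.array([0.62, 0.55, 0.71, 0.74, 0.70, 0.66])
epsilon_selection = 0.5  # e.g. half of a total budget of 1.0

# Exponential-mechanism weights, as in TiresiasClassifier.fit
probabilities = np.exp(epsilon_selection * scores / 2)
probabilities /= probabilities.sum()
print(probabilities.round(3))
# With such a small epsilon the distribution is close to uniform;
# larger budgets concentrate mass on the best-scoring candidate.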
Ancestors
- diffprivlib.models.logistic_regression.LogisticRegression
- sklearn.linear_model.logistic.LogisticRegression
- sklearn.base.BaseEstimator
- sklearn.linear_model.base.LinearClassifierMixin
- sklearn.base.ClassifierMixin
- sklearn.linear_model.base.SparseCoefMixin
Methods
def fit(self, X, y)
Fit the model according to the given training data.

Parameters

X : {array-like, sparse matrix}, shape (n_samples, n_features)
    Training vector, where n_samples is the number of samples and n_features is the number of features.
y : array-like, shape (n_samples,)
    Target vector relative to X.

Returns

self : object
def predict(self, X)
Predict class labels for samples in X.

Parameters

X : array-like or sparse matrix, shape (n_samples, n_features)
    Samples.

Returns

C : array, shape [n_samples]
    Predicted class label per sample.