Module `tiresias.core.mechanisms`

Expand source code

import numpy as np

def count(x, epsilon, delta):
    """
    This function computes the differentially private estimate of the count 
    using the Laplace mechanism.
    """
    return laplace_noise(len(x), sensitivity=1, epsilon=epsilon)

def laplace_noise(x, sensitivity, epsilon):
    """
    This function returns a differentially private estimate of the value `x` 
    given the sensitivity and epsilon parameters.
    """
    return np.random.laplace(loc=x, scale=sensitivity/epsilon)

def approximate_bounds(x, epsilon=1.0, scale=0.1, base=2, bins=128, p=0.9999):
    """
    This function estimates the upper and lower bounds of the data using a 
    modification of the histogram approach from [1].

    [1] https://github.com/google/differential-privacy/
    """
    # Find `threshold` such that `P(x < threshold) = p`
    # where x ~ Laplace(1.0 / epsilon).
    threshold = -np.log(2 - 2 * p) / epsilon
    
    # To produce N bins, we need N-1 cutoffs
    #    ...|...|...|...
    # where bin[i] corresponds to (cutoff[i-1], cutoff[i])
    cutoffs = scale * np.power(base, np.arange(0, bins//2-1))
    cutoffs = (-cutoffs[::-1]).tolist() + [0] + cutoffs.tolist()
    
    # Assign each value to a bin in the log histogram
    histogram = np.random.laplace(size=bins, scale=1.0 / epsilon)
    x = sorted(x.tolist())
    for i in range(bins-1):
        while len(x) > 0 and x[0] < cutoffs[i]:
            x.pop(0)
            histogram[i] += 1
    histogram[-1] += len(x)
    
    # Get the bins that are above the threshold
    try:
        has_value = (histogram > threshold).nonzero()[0]
        first, last = has_value[0] - 1, has_value[-1]
        return cutoffs[first], cutoffs[last]
    except:
        raise RuntimeError("Bounds approximation failed.")

def mean(x, epsilon, delta, bounds=False):
    """
    This function computes the differentially private estimate of the average 
    using the noisy average approch from [1]. If the bounds are not provided 
    by the user, then they are estimated using the histogram approach from [2].

    [1] Differential Privacy: From Theory to Practice
    [2] https://github.com/google/differential-privacy/
    """
    x = np.array(x)
    if not bounds:
        epsilon = epsilon / 2.0
        bounds = approximate_bounds(x, epsilon)
    low, high = bounds
    x = np.minimum(np.maximum(x, low), high)
    noise = np.random.laplace() * (high - low) / epsilon
    mean = (np.sum(x) + noise) / len(x)
    return min(max(low, mean), high)

def sum(x, epsilon, delta, bounds=False):
    """
    This function computes the differentially private estimate of the sum. If 
    the bounds are not provided by the user, then they are estimated using the 
    histogram approach from [1].

    [1] https://github.com/google/differential-privacy/
    """
    x = np.array(x)
    if not bounds:
        epsilon = epsilon / 2.0
        bounds = approximate_bounds(x, epsilon)
    low, high = bounds
    x = np.minimum(np.maximum(x, low), high)
    noise = np.random.laplace() * (high - low) / epsilon
    return np.sum(x) + noise

def median(x, epsilon, delta):
    """
    This function computes the differentially private estimate of the median 
    using the approach proposed in [1]. It uses the Laplace mechanism on page 
    10 and the smooth sensitivity of the median derivation found on page 12.
    
    The resulting value is (epsilon-delta) differentially private. By default,
    if not specified, delta is set to `1/(100*len(x))` so that there is a 99%
    chance that differential privacy is satisfied for any individual.

    [1] http://www.cse.psu.edu/~ads22/pubs/NRS07/NRS07-full-draft-v1.pdf
    """
    alpha = epsilon / 2.0
    beta = epsilon / (2.0 * np.log(2.0 / delta))
    
    x = np.array(x)
    x.sort()
    m = (len(x) + 1) // 2
    smooth_sensitivity = []
    for k in range(0, len(x)-m):
        local_sensitivity = max(x[m+t] - x[m+t-k-1] for t in range(0, k+1))
        smooth_sensitivity.append(np.exp(-k * beta) * local_sensitivity)
    smooth_sensitivity = max(smooth_sensitivity)
    
    return np.median(x) + smooth_sensitivity/alpha * np.random.laplace()

def median_gaussian(x, epsilon, delta):
    """
    This function computes the differentially private estimate of the median 
    using the approach proposed in [1]. It uses the Gaussian mechanism on page 
    10 and the smooth sensitivity of the median derivation found on page 12.

    The resulting value is (epsilon-delta) differentially private. By default,
    if not specified, delta is set to `1/(100*len(x))` so that there is a 99%
    chance that differential privacy is satisfied for any individual.

    [1] http://www.cse.psu.edu/~ads22/pubs/NRS07/NRS07-full-draft-v1.pdf
    """
    alpha = epsilon / (5.0 * np.sqrt(2.0 * np.log(2.0/delta)))
    beta = epsilon / (4.0 * (1.0 + np.log(2.0/delta)))
    
    x = np.array(x)
    x.sort()
    m = (len(x) + 1) // 2
    smooth_sensitivity = []
    for k in range(0, len(x)-m):
        local_sensitivity = max(x[m+t] - x[m+t-k-1] for t in range(0, k+1))
        smooth_sensitivity.append(np.exp(-k * beta) * local_sensitivity)
    smooth_sensitivity = max(smooth_sensitivity)

    return np.median(x) + smooth_sensitivity/alpha * np.random.normal()

def sample_and_aggregate(x, func, epsilon, nb_partitions, delta):
    """
    This function computes the differentially private estimate of a function 
    using the sample and aggregate approach in [1]. We obtain an estimate of
    the function value by computing it on samples of the data and then use 
    the differentially private median to aggregate the results.
    """
    results = []
    x = np.array(x)
    np.random.shuffle(x)
    for partition in np.array_split(x, nb_partitions):
        results.append(func(partition))
    return median(np.array(results), epsilon, delta)

def mean_sample_and_aggregate(x, epsilon, delta):
    """
    This function computes the differentially private estimate of the average 
    using the smooth sensitivity and sample and aggregate approach.
    """
    nb_partitions = int(np.sqrt(len(x)))
    return sample_and_aggregate(x, np.mean, epsilon, nb_partitions, delta)

def sum_sample_and_aggregate(x, epsilon, delta):
    """
    This function computes the differentially private estimate of the sum 
    using the smooth sensitivity and sample and aggregate approach.
    """
    nb_partitions = int(np.sqrt(len(x)))
    return nb_partitions * sample_and_aggregate(x, np.sum, epsilon, nb_partitions, delta)

def finite_categorical(x, domain, epsilon):
    """
    This function applies randomized response to a categorical variable. The
    input can be either a np.array or a single value. There is no restriction
    on the value type but in most scenarios, the value will be either an int 
    or a string.
    """
    assert len(set(domain)) == len(domain)

    if type(x) == np.ndarray:
        for _x in x:
            assert _x in domain
        p = (np.exp(epsilon) - 1) / (len(domain) - 1 + np.exp(epsilon))
        flags = np.random.random(size=x.shape) > p
        x[flags] = np.random.choice(list(domain), size=np.sum(flags))
        return x

    assert x in domain
    if epsilon == float("inf"):
        return x
    p = (np.exp(epsilon) - 1) / (len(domain) - 1 + np.exp(epsilon))
    if np.random.random() < p:
        return x
    return np.random.choice(list(domain))

def bounded_continuous(x, low, high, epsilon):
    """
    This function applies randomized response to a bounded continuous variable. 
    The input `x` can be either a np.array or a single value.
    """
    if type(x) == np.ndarray:
        assert (x >= low).all() and (x <= high).all()
        if epsilon == float("inf"):
            return x
        return x + np.random.laplace(scale=(high-low)/epsilon, size=x.shape)

    assert x >= low and x <= high
    if epsilon == float("inf"):
        return x
    return x + np.random.laplace(scale=(high-low)/epsilon)

Functions

def approximate_bounds(x, epsilon=1.0, scale=0.1, base=2, bins=128, p=0.9999)

This function estimates the upper and lower bounds of the data using a modification of the histogram approach from [1].

[1] https://github.com/google/differential-privacy/

Expand source code

def approximate_bounds(x, epsilon=1.0, scale=0.1, base=2, bins=128, p=0.9999):
    """
    This function estimates the upper and lower bounds of the data using a 
    modification of the histogram approach from [1].

    [1] https://github.com/google/differential-privacy/
    """
    # Find `threshold` such that `P(x < threshold) = p`
    # where x ~ Laplace(1.0 / epsilon).
    threshold = -np.log(2 - 2 * p) / epsilon
    
    # To produce N bins, we need N-1 cutoffs
    #    ...|...|...|...
    # where bin[i] corresponds to (cutoff[i-1], cutoff[i])
    cutoffs = scale * np.power(base, np.arange(0, bins//2-1))
    cutoffs = (-cutoffs[::-1]).tolist() + [0] + cutoffs.tolist()
    
    # Assign each value to a bin in the log histogram
    histogram = np.random.laplace(size=bins, scale=1.0 / epsilon)
    x = sorted(x.tolist())
    for i in range(bins-1):
        while len(x) > 0 and x[0] < cutoffs[i]:
            x.pop(0)
            histogram[i] += 1
    histogram[-1] += len(x)
    
    # Get the bins that are above the threshold
    try:
        has_value = (histogram > threshold).nonzero()[0]
        first, last = has_value[0] - 1, has_value[-1]
        return cutoffs[first], cutoffs[last]
    except:
        raise RuntimeError("Bounds approximation failed.")

def bounded_continuous(x, low, high, epsilon)

This function applies randomized response to a bounded continuous variable. The input x can be either a np.array or a single value.

Expand source code

def bounded_continuous(x, low, high, epsilon):
    """
    This function applies randomized response to a bounded continuous variable. 
    The input `x` can be either a np.array or a single value.
    """
    if type(x) == np.ndarray:
        assert (x >= low).all() and (x <= high).all()
        if epsilon == float("inf"):
            return x
        return x + np.random.laplace(scale=(high-low)/epsilon, size=x.shape)

    assert x >= low and x <= high
    if epsilon == float("inf"):
        return x
    return x + np.random.laplace(scale=(high-low)/epsilon)

def count(x, epsilon, delta)

This function computes the differentially private estimate of the count using the Laplace mechanism.

Expand source code

def count(x, epsilon, delta):
    """
    This function computes the differentially private estimate of the count 
    using the Laplace mechanism.
    """
    return laplace_noise(len(x), sensitivity=1, epsilon=epsilon)

def finite_categorical(x, domain, epsilon)

This function applies randomized response to a categorical variable. The input can be either a np.array or a single value. There is no restriction on the value type but in most scenarios, the value will be either an int or a string.

Expand source code

def finite_categorical(x, domain, epsilon):
    """
    This function applies randomized response to a categorical variable. The
    input can be either a np.array or a single value. There is no restriction
    on the value type but in most scenarios, the value will be either an int 
    or a string.
    """
    assert len(set(domain)) == len(domain)

    if type(x) == np.ndarray:
        for _x in x:
            assert _x in domain
        p = (np.exp(epsilon) - 1) / (len(domain) - 1 + np.exp(epsilon))
        flags = np.random.random(size=x.shape) > p
        x[flags] = np.random.choice(list(domain), size=np.sum(flags))
        return x

    assert x in domain
    if epsilon == float("inf"):
        return x
    p = (np.exp(epsilon) - 1) / (len(domain) - 1 + np.exp(epsilon))
    if np.random.random() < p:
        return x
    return np.random.choice(list(domain))

def laplace_noise(x, sensitivity, epsilon)

This function returns a differentially private estimate of the value x given the sensitivity and epsilon parameters.

Expand source code

def laplace_noise(x, sensitivity, epsilon):
    """
    This function returns a differentially private estimate of the value `x` 
    given the sensitivity and epsilon parameters.
    """
    return np.random.laplace(loc=x, scale=sensitivity/epsilon)

def mean(x, epsilon, delta, bounds=False)

This function computes the differentially private estimate of the average using the noisy average approch from [1]. If the bounds are not provided by the user, then they are estimated using the histogram approach from [2].

[1] Differential Privacy: From Theory to Practice [2] https://github.com/google/differential-privacy/

Expand source code

def mean(x, epsilon, delta, bounds=False):
    """
    This function computes the differentially private estimate of the average 
    using the noisy average approch from [1]. If the bounds are not provided 
    by the user, then they are estimated using the histogram approach from [2].

    [1] Differential Privacy: From Theory to Practice
    [2] https://github.com/google/differential-privacy/
    """
    x = np.array(x)
    if not bounds:
        epsilon = epsilon / 2.0
        bounds = approximate_bounds(x, epsilon)
    low, high = bounds
    x = np.minimum(np.maximum(x, low), high)
    noise = np.random.laplace() * (high - low) / epsilon
    mean = (np.sum(x) + noise) / len(x)
    return min(max(low, mean), high)

def mean_sample_and_aggregate(x, epsilon, delta)

This function computes the differentially private estimate of the average using the smooth sensitivity and sample and aggregate approach.

Expand source code

def mean_sample_and_aggregate(x, epsilon, delta):
    """
    This function computes the differentially private estimate of the average 
    using the smooth sensitivity and sample and aggregate approach.
    """
    nb_partitions = int(np.sqrt(len(x)))
    return sample_and_aggregate(x, np.mean, epsilon, nb_partitions, delta)

def median(x, epsilon, delta)

This function computes the differentially private estimate of the median using the approach proposed in [1]. It uses the Laplace mechanism on page 10 and the smooth sensitivity of the median derivation found on page 12.

The resulting value is (epsilon-delta) differentially private. By default, if not specified, delta is set to 1/(100*len(x)) so that there is a 99% chance that differential privacy is satisfied for any individual.

[1] http://www.cse.psu.edu/~ads22/pubs/NRS07/NRS07-full-draft-v1.pdf

Expand source code

def median(x, epsilon, delta):
    """
    This function computes the differentially private estimate of the median 
    using the approach proposed in [1]. It uses the Laplace mechanism on page 
    10 and the smooth sensitivity of the median derivation found on page 12.
    
    The resulting value is (epsilon-delta) differentially private. By default,
    if not specified, delta is set to `1/(100*len(x))` so that there is a 99%
    chance that differential privacy is satisfied for any individual.

    [1] http://www.cse.psu.edu/~ads22/pubs/NRS07/NRS07-full-draft-v1.pdf
    """
    alpha = epsilon / 2.0
    beta = epsilon / (2.0 * np.log(2.0 / delta))
    
    x = np.array(x)
    x.sort()
    m = (len(x) + 1) // 2
    smooth_sensitivity = []
    for k in range(0, len(x)-m):
        local_sensitivity = max(x[m+t] - x[m+t-k-1] for t in range(0, k+1))
        smooth_sensitivity.append(np.exp(-k * beta) * local_sensitivity)
    smooth_sensitivity = max(smooth_sensitivity)
    
    return np.median(x) + smooth_sensitivity/alpha * np.random.laplace()

def median_gaussian(x, epsilon, delta)

This function computes the differentially private estimate of the median using the approach proposed in [1]. It uses the Gaussian mechanism on page 10 and the smooth sensitivity of the median derivation found on page 12.

[1] http://www.cse.psu.edu/~ads22/pubs/NRS07/NRS07-full-draft-v1.pdf

Expand source code

def median_gaussian(x, epsilon, delta):
    """
    This function computes the differentially private estimate of the median 
    using the approach proposed in [1]. It uses the Gaussian mechanism on page 
    10 and the smooth sensitivity of the median derivation found on page 12.

    The resulting value is (epsilon-delta) differentially private. By default,
    if not specified, delta is set to `1/(100*len(x))` so that there is a 99%
    chance that differential privacy is satisfied for any individual.

    [1] http://www.cse.psu.edu/~ads22/pubs/NRS07/NRS07-full-draft-v1.pdf
    """
    alpha = epsilon / (5.0 * np.sqrt(2.0 * np.log(2.0/delta)))
    beta = epsilon / (4.0 * (1.0 + np.log(2.0/delta)))
    
    x = np.array(x)
    x.sort()
    m = (len(x) + 1) // 2
    smooth_sensitivity = []
    for k in range(0, len(x)-m):
        local_sensitivity = max(x[m+t] - x[m+t-k-1] for t in range(0, k+1))
        smooth_sensitivity.append(np.exp(-k * beta) * local_sensitivity)
    smooth_sensitivity = max(smooth_sensitivity)

    return np.median(x) + smooth_sensitivity/alpha * np.random.normal()

def sample_and_aggregate(x, func, epsilon, nb_partitions, delta)

This function computes the differentially private estimate of a function using the sample and aggregate approach in [1]. We obtain an estimate of the function value by computing it on samples of the data and then use the differentially private median to aggregate the results.

Expand source code

def sample_and_aggregate(x, func, epsilon, nb_partitions, delta):
    """
    This function computes the differentially private estimate of a function 
    using the sample and aggregate approach in [1]. We obtain an estimate of
    the function value by computing it on samples of the data and then use 
    the differentially private median to aggregate the results.
    """
    results = []
    x = np.array(x)
    np.random.shuffle(x)
    for partition in np.array_split(x, nb_partitions):
        results.append(func(partition))
    return median(np.array(results), epsilon, delta)

def sum(x, epsilon, delta, bounds=False)

This function computes the differentially private estimate of the sum. If the bounds are not provided by the user, then they are estimated using the histogram approach from [1].

[1] https://github.com/google/differential-privacy/

Expand source code

def sum(x, epsilon, delta, bounds=False):
    """
    This function computes the differentially private estimate of the sum. If 
    the bounds are not provided by the user, then they are estimated using the 
    histogram approach from [1].

    [1] https://github.com/google/differential-privacy/
    """
    x = np.array(x)
    if not bounds:
        epsilon = epsilon / 2.0
        bounds = approximate_bounds(x, epsilon)
    low, high = bounds
    x = np.minimum(np.maximum(x, low), high)
    noise = np.random.laplace() * (high - low) / epsilon
    return np.sum(x) + noise

def sum_sample_and_aggregate(x, epsilon, delta)

This function computes the differentially private estimate of the sum using the smooth sensitivity and sample and aggregate approach.

Expand source code

def sum_sample_and_aggregate(x, epsilon, delta):
    """
    This function computes the differentially private estimate of the sum 
    using the smooth sensitivity and sample and aggregate approach.
    """
    nb_partitions = int(np.sqrt(len(x)))
    return nb_partitions * sample_and_aggregate(x, np.sum, epsilon, nb_partitions, delta)