1
|
4
|
from typing import Tuple
|
2
|
|
|
3
|
4
|
import numpy as np
|
4
|
|
|
5
|
|
|
6
|
4
|
def generate_simple_label_matrix(
    n: int, m: int, cardinality: int, abstain_multiplier: float = 1.0
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Sample a synthetic label matrix together with its generating parameters.

    For each of the ``m`` labeling functions (LFs) a random conditional
    probability table P(LF=l | Y=y) is drawn, then ``n`` true labels are drawn
    uniformly, and finally every LF vote is sampled from the table column
    selected by the data point's true label.

    Parameters
    ----------
    n
        Number of data points
    m
        Number of labeling functions
    cardinality
        Number of true classes (abstain is not counted)
    abstain_multiplier
        Scale applied to the unnormalized abstain weights before the tables
        are normalized; larger values make the LFs abstain more often

    Returns
    -------
    Tuple[np.ndarray, np.ndarray, np.ndarray]
        ``P`` of shape ``(m, cardinality + 1, cardinality)`` holding the LF
        conditional probability tables (output index 0 is abstain),
        ``Y`` of shape ``(n,)`` holding true labels in ``[0, cardinality)``,
        and ``L`` of shape ``(n, m)`` holding LF votes, where ``-1`` is an
        abstain and ``0 .. cardinality - 1`` are class votes
    """
    # LF output space = abstain (index 0) followed by the real classes
    num_outputs = cardinality + 1

    # Axis order of P: labeling function, LF output label, true class label
    P = np.empty((m, num_outputs, cardinality))
    for lf_idx in range(m):
        table = np.random.rand(num_outputs, cardinality)

        # Add weight where the (non-abstain) output equals the true class so
        # that the LFs lean towards being correct rather than adversarial
        table[1:, :] += (cardinality - 1) * np.eye(cardinality)

        # Scale the abstain row; simulates LFs that label only sparsely
        table[0, :] *= abstain_multiplier

        # Rescale every column (one per true class) so it sums to one
        column_totals = table.sum(axis=0)
        P[lf_idx] = table @ np.diag(1 / column_totals)

    # True labels are drawn uniformly, i.e. classes are assumed balanced
    Y = np.random.choice(cardinality, n)

    # Sample each vote from the LF's distribution given the true label, then
    # shift by one so that output index 0 (abstain) is reported as -1
    L = np.empty((n, m), dtype=int)
    for row, true_label in enumerate(Y):
        for col in range(m):
            outcome = np.random.choice(num_outputs, p=P[col, :, true_label])
            L[row, col] = outcome - 1

    return P, Y, L
|