1

1

from typing import Tuple

2



3

1

import numpy as np

4



5



6

1

def generate_simple_label_matrix(
    n: int, m: int, cardinality: int, abstain_multiplier: float = 1.0
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Generate a synthetic label matrix with true parameters and labels.

    This function generates a set of labeling function (LF) conditional
    probability tables, P(LF=l | Y=y), stored as a matrix P, and true labels Y,
    and then generates the resulting label matrix L by sampling each LF output
    from the table column selected by the data point's true label.

    Randomness comes from the global ``np.random`` state, so seed it with
    ``np.random.seed`` for reproducible output.

    Parameters
    ----------
    n
        Number of data points
    m
        Number of labeling functions
    cardinality
        Cardinality of true labels (i.e. not including abstains)
    abstain_multiplier
        Factor to multiply the (unnormalized) probability of abstaining by

    Returns
    -------
    Tuple[np.ndarray, np.ndarray, np.ndarray]
        A tuple containing the LF conditional probabilities P with shape
        ``(m, cardinality + 1, cardinality)``, the true labels Y with shape
        ``(n,)`` and values in ``[0, cardinality)``, and the output label
        matrix L with shape ``(n, m)`` and values in ``[-1, cardinality)``,
        where -1 denotes an abstain.
    """
    # Generate the conditional probability tables for the LFs.
    # The first axis is LF, second is LF output label, third is true class label.
    # Row 0 of the LF output axis is the abstain outcome; rows 1.. are classes.
    P = np.empty((m, cardinality + 1, cardinality))
    for i in range(m):
        p = np.random.rand(cardinality + 1, cardinality)

        # Bias the LFs towards being non-adversarial: boost the probability of
        # emitting the correct class (the diagonal of the non-abstain rows).
        p[1:, :] += (cardinality - 1) * np.eye(cardinality)

        # Optionally scale the abstain probability by some multiplier; this is
        # to simulate the common setting where LFs label very sparsely.
        p[0, :] *= abstain_multiplier

        # Normalize each column (one per true class) so it sums to one.
        # Broadcasting the reciprocal is equivalent to right-multiplying by
        # diag(1 / column_sums) without building the dense diagonal matrix.
        P[i] = p * (1 / p.sum(axis=0))

    # Generate the true data point labels.
    # Note: assuming balanced classes to start.
    Y = np.random.choice(cardinality, n)

    # Generate the label matrix L. The element-wise loop is kept on purpose so
    # the sequence of random draws stays the same for seeded callers.
    L = np.empty((n, m), dtype=int)
    for i in range(n):
        for j in range(m):
            # Sample an index in [0, cardinality], then shift by one so that the
            # abstain outcome (index 0) becomes -1 and classes become 0..k-1.
            L[i, j] = np.random.choice(cardinality + 1, p=P[j, :, Y[i]]) - 1
    return P, Y, L
