Addresses #1602. Added a method to analysis/error_analysis that wraps the get_label_buckets functionality. Given a bucket, a NumPy array x of your data, and the corresponding label array(s) y, it returns x restricted to the instances that fall in that bucket.
1  2 
from typing import Tuple 
2  
3  2 
import numpy as np 
4  
5  
6  2 
def generate_simple_label_matrix(
    n: int, m: int, cardinality: int, abstain_multiplier: float = 1.0
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Generate a synthetic label matrix with true parameters and labels.

    This function generates a set of labeling function conditional probability
    tables, P(LF=l | Y=y), stored as a matrix P, and true labels Y, and then
    generates the resulting label matrix L.

    All sampling goes through the global ``np.random`` state, so call
    ``np.random.seed`` beforehand for reproducible output.

    Parameters
    ----------
    n
        Number of data points
    m
        Number of labeling functions
    cardinality
        Cardinality of true labels (i.e. not including abstains)
    abstain_multiplier
        Factor to multiply the probability of abstaining by

    Returns
    -------
    Tuple[np.ndarray, np.ndarray, np.ndarray]
        A tuple containing the LF conditional probabilities P with shape
        ``(m, cardinality + 1, cardinality)``, the true labels Y with shape
        ``(n,)`` and values in ``[0, cardinality)``, and the output label
        matrix L with shape ``(n, m)`` and values in ``[-1, cardinality)``,
        where -1 denotes an abstain
    """
    # Generate the conditional probability tables for the LFs
    # The first axis is LF, second is LF output label, third is true class label
    # Note that we include abstains in the LF output space (row 0), and that we
    # bias the conditional probabilities towards being non-adversarial
    P = np.empty((m, cardinality + 1, cardinality))
    for i in range(m):
        p = np.random.rand(cardinality + 1, cardinality)

        # Bias the LFs to being non-adversarial: boost the entries where the
        # (non-abstain) LF output agrees with the true class
        p[1:, :] += (cardinality - 1) * np.eye(cardinality)

        # Optionally increase the abstain probability by some multiplier; note
        # this is to simulate the common setting where LFs label very sparsely
        p[0, :] *= abstain_multiplier

        # Normalize the conditional probabilities table so that every column
        # (one per true class) sums to one
        P[i] = p @ np.diag(1 / p.sum(axis=0))

    # Generate the true datapoint labels
    # Note: Assuming balanced classes to start
    Y = np.random.choice(cardinality, n)

    # Generate the label matrix L
    L = np.empty((n, m), dtype=int)
    for i in range(n):
        for j in range(m):
            # Sample an index in the LF output space (0 is abstain) and shift
            # it by one so that abstains become -1 and classes start at 0
            L[i, j] = np.random.choice(cardinality + 1, p=P[j, :, Y[i]]) - 1
    return P, Y, L
Read our documentation on viewing source code.