microsoft / graspologic
1
# Copyright 2020 NeuroData (http://neurodata.io)
2
#
3
# Licensed under the Apache License, Version 2.0 (the "License");
4 2
# you may not use this file except in compliance with the License.
5
# You may obtain a copy of the License at
6 2
#
7 2
#     http://www.apache.org/licenses/LICENSE-2.0
8
#
9 2
# Unless required by applicable law or agreed to in writing, software
10 2
# distributed under the License is distributed on an "AS IS" BASIS,
11 2
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 2
# See the License for the specific language governing permissions and
13 2
# limitations under the License.
14 2

15 2
import warnings
16 2

17 2
import numpy as np
18 2
from scipy import stats
19

20 2
from ..embed import select_dimension, AdjacencySpectralEmbed
21 2
from ..utils import import_graph
22 2
from .base import BaseInference
23 2
from sklearn.utils import check_array
24
from sklearn.metrics import pairwise_distances
25 2
from sklearn.metrics.pairwise import pairwise_kernels
26
from sklearn.metrics.pairwise import PAIRED_DISTANCES
27
from sklearn.metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS
28 2
from hyppo.ksample import KSample
29
from hyppo._utils import gaussian
30

31
# Strings accepted for ``metric``: any sklearn paired distance, any sklearn
# pairwise kernel, or "gaussian" (hyppo's median-bandwidth gaussian kernel).
_VALID_DISTANCES = list(PAIRED_DISTANCES)
_VALID_KERNELS = list(PAIRWISE_KERNEL_FUNCTIONS) + ["gaussian"]
_VALID_METRICS = _VALID_DISTANCES + _VALID_KERNELS

# Backend two-sample tests supported by hyppo's KSample wrapper.
_VALID_TESTS = ["cca", "dcorr", "hhg", "rv", "hsic", "mgc"]
37

38

39
class LatentDistributionTest(BaseInference):
    """Two-sample hypothesis test for the problem of determining whether two random
    dot product graphs have the same distributions of latent positions.

    This test can operate on two graphs where there is no known matching
    between the vertices of the two graphs, or even when the number of vertices
    is different. Currently, testing is only supported for undirected graphs.

    Read more in the :ref:`tutorials <inference_tutorials>`

    Parameters
    ----------
    test : str (default="dcorr")
        Backend hypothesis test to use, one of ["cca", "dcorr", "hhg", "rv", "hsic", "mgc"].
        These tests are typically used for independence testing, but here they
        are used for a two-sample hypothesis test on the latent positions of
        two graphs. See :class:`hyppo.ksample.KSample` for more information.

    metric : str or function (default="euclidean")
        Distance or a kernel metric to use, either a callable or a valid string.
        If a callable, then it should behave similarly to either
        :func:`sklearn.metrics.pairwise_distances` or to
        :func:`sklearn.metrics.pairwise.pairwise_kernels`.
        If a string, then it should be either one of the keys in either
        `sklearn.metrics.pairwise.PAIRED_DISTANCES` or in
        `sklearn.metrics.pairwise.PAIRWISE_KERNEL_FUNCTIONS`, or "gaussian",
        which will use a gaussian kernel with an adaptively selected bandwidth.
        It is recommended to use kernels (e.g. "gaussian") with the
        kernel-based hsic test and distances (e.g. "euclidean") with all other
        tests.

    n_components : int or None (default=None)
        Number of embedding dimensions. If None, the optimal embedding
        dimensions are found by the Zhu and Ghodsi algorithm.
        See :func:`~graspy.embed.selectSVD` for more information.
        This argument is ignored if input_graph=False.

    n_bootstraps : int (default=200)
        Number of bootstrap iterations for the backend hypothesis test.
        See :class:`hyppo.ksample.KSample` for more information.

    workers : int (default=1)
        Number of workers to use. If more than 1, parallelizes the code.
        Supply -1 to use all cores available to the Process.

    size_correction : bool (default=True)
        Ignored when the two graphs have the same number of vertices. The test
        degrades in validity as the number of vertices of the two graphs
        diverge from each other, unless a correction is performed.

        - True
            Whenever the two graphs have different numbers of vertices,
            estimates the plug-in estimator for the variance and uses it to
            correct the embedding of the larger graph.
        - False
            Does not perform any modifications (not recommended).

    pooled : bool (default=False)
        Ignored whenever the two graphs have the same number of vertices or
        size_correction is set to False. In order to correct the adjacency
        spectral embedding used in the test, it is needed to estimate the
        variance for each of the latent position estimates in the larger graph,
        which requires to compute different sample moments. These moments can
        be computed either over the larger graph (False), or over both graphs
        (True). Setting it to True should not affect the behavior of the test
        under the null hypothesis, but it is not clear whether it has more
        power or less power under which alternatives. Generally not
        recommended, as it is untested and included for experimental purposes.

    input_graph : bool (default=True)
        Flag whether to expect two full graphs, or the embeddings.

        - True
            .fit and .fit_predict() expect graphs, either as NetworkX graph objects
            or as adjacency matrices, provided as ndarrays of size (n, n) and
            (m, m). They will be embedded using adjacency spectral embeddings.
        - False
            .fit() and .fit_predict() expect adjacency spectral embeddings of
            the graphs, they must be ndarrays of size (n, d) and (m, d), where
            d must be same. n_components attribute is ignored in this case.

    Attributes
    ----------
    null_distribution_ : ndarray, shape (n_bootstraps, )
        The distribution of T statistics generated under the null.

    sample_T_statistic_ : float
        The observed difference between the embedded latent positions of the
        two input graphs.

    p_value_ : float
        The overall p value from the test.

    References
    ----------
    .. [1] Tang, M., Athreya, A., Sussman, D. L., Lyzinski, V., & Priebe, C. E. (2017).
        "A nonparametric two-sample hypothesis testing problem for random graphs."
        Bernoulli, 23(3), 1599-1630.

    .. [2] Panda, S., Palaniappan, S., Xiong, J., Bridgeford, E., Mehta, R., Shen, C., & Vogelstein, J. (2019).
        "hyppo: A Comprehensive Multivariate Hypothesis Testing Python Package."
        arXiv:1907.02088.

    .. [3] Alyakin, A., Agterberg, J., Helm, H., Priebe, C. (2020).
       "Correcting a Nonparametric Two-sample Graph Hypothesis Test for Graphs with Different Numbers of Vertices"
       arXiv:2008.09434

    """

    def __init__(
        self,
        test="dcorr",
        metric="euclidean",
        n_components=None,
        n_bootstraps=200,
        workers=1,
        size_correction=True,
        pooled=False,
        input_graph=True,
    ):
        # Validate all hyperparameters eagerly so misuse fails at construction
        # time rather than deep inside .fit().
        if not isinstance(test, str):
            msg = "test must be a str, not {}".format(type(test))
            raise TypeError(msg)
        elif test not in _VALID_TESTS:
            msg = "Unknown test {}. Valid tests are {}".format(test, _VALID_TESTS)
            raise ValueError(msg)

        if not isinstance(metric, str) and not callable(metric):
            msg = "Metric must be str or callable, not {}".format(type(metric))
            raise TypeError(msg)
        elif metric not in _VALID_METRICS and not callable(metric):
            msg = "Unknown metric {}. Valid metrics are {}, or a callable".format(
                metric, _VALID_METRICS
            )
            raise ValueError(msg)

        if n_components is not None:
            if not isinstance(n_components, int):
                msg = "n_components must be an int, not {}.".format(type(n_components))
                raise TypeError(msg)

        if not isinstance(n_bootstraps, int):
            msg = "n_bootstraps must be an int, not {}".format(type(n_bootstraps))
            raise TypeError(msg)
        elif n_bootstraps < 0:
            msg = "{} is invalid number of bootstraps, must be non-negative"
            raise ValueError(msg.format(n_bootstraps))

        if not isinstance(workers, int):
            msg = "workers must be an int, not {}".format(type(workers))
            raise TypeError(msg)

        if not isinstance(size_correction, bool):
            msg = "size_correction must be a bool, not {}".format(type(size_correction))
            raise TypeError(msg)

        if not isinstance(pooled, bool):
            msg = "pooled must be a bool, not {}".format(type(pooled))
            raise TypeError(msg)

        if not isinstance(input_graph, bool):
            msg = "input_graph must be a bool, not {}".format(type(input_graph))
            raise TypeError(msg)

        super().__init__(n_components=n_components)

        # Resolve the metric into a callable with the signature hyppo expects.
        # Warn (but do not fail) on distance/kernel combinations that are
        # known to be suboptimal for the chosen backend test.
        if callable(metric):
            metric_func = metric
        else:
            if metric in _VALID_DISTANCES:
                if test == "hsic":
                    msg = (
                        f"{test} is a kernel-based test, but {metric} "
                        "is a distance. results may not be optimal. it is "
                        "recommended to use either a different test or one of "
                        f"the kernels: {_VALID_KERNELS} as a metric."
                    )
                    warnings.warn(msg, UserWarning)

                # bind metric as a default to avoid late-binding issues
                def metric_func(X, Y=None, metric=metric, workers=None):
                    return pairwise_distances(X, Y, metric=metric, n_jobs=workers)

            else:
                # metric is a kernel: either "gaussian" (hyppo's
                # median-bandwidth kernel) or an sklearn pairwise kernel
                if test != "hsic":
                    msg = (
                        f"{test} is a distance-based test, but {metric} "
                        "is a kernel. results may not be optimal. it is "
                        "recommended to use either hsic as a test or one of "
                        f"the distances: {_VALID_DISTANCES} as a metric."
                    )
                    warnings.warn(msg, UserWarning)

                if metric == "gaussian":
                    metric_func = gaussian
                else:

                    def metric_func(X, Y=None, metric=metric, workers=None):
                        return pairwise_kernels(X, Y, metric=metric, n_jobs=workers)

        self.test = KSample(test, compute_distance=metric_func)
        self.n_bootstraps = n_bootstraps
        self.workers = workers
        self.size_correction = size_correction
        self.pooled = pooled
        self.input_graph = input_graph

    def _embed(self, A1, A2):
        """Embed both adjacency matrices with ASE into a shared dimension.

        If ``n_components`` is unset, picks the larger of the two elbows
        suggested by :func:`select_dimension` so neither graph is
        under-embedded.
        """
        if self.n_components is None:
            num_dims1 = select_dimension(A1)[0][-1]
            num_dims2 = select_dimension(A2)[0][-1]
            self.n_components = max(num_dims1, num_dims2)

        ase = AdjacencySpectralEmbed(n_components=self.n_components)
        X1_hat = ase.fit_transform(A1)
        X2_hat = ase.fit_transform(A2)

        # ASE returns a (out, in) tuple for directed graphs; concatenate the
        # two halves so each vertex is a single vector. Mixing one directed
        # and one undirected graph is not meaningful, so reject it.
        if isinstance(X1_hat, tuple) and isinstance(X2_hat, tuple):
            X1_hat = np.concatenate(X1_hat, axis=-1)
            X2_hat = np.concatenate(X2_hat, axis=-1)
        elif isinstance(X1_hat, tuple) ^ isinstance(X2_hat, tuple):
            msg = (
                "input graphs do not have same directedness. "
                "consider symmetrizing the directed graph."
            )
            raise ValueError(msg)

        return X1_hat, X2_hat

    def _sample_modified_ase(self, X, Y, pooled=False):
        """Apply the size correction of [3] to the larger embedding.

        Inflates the variance of the larger graph's estimated latent
        positions by sampling from the plug-in estimate of their asymptotic
        covariance, so the two samples are comparable despite different sizes.
        """
        N, M = len(X), len(Y)

        # return if graphs are same order, else ensure X is the larger graph
        if N == M:
            return X, Y
        elif M > N:
            reverse_order = True
            X, Y = Y, X
            N, M = M, N
        else:
            reverse_order = False

        # estimate the central limit theorem variance
        if pooled:
            two_samples = np.concatenate([X, Y], axis=0)
            get_sigma = _fit_plug_in_variance_estimator(two_samples)
        else:
            get_sigma = _fit_plug_in_variance_estimator(X)
        X_sigmas = get_sigma(X) * (N - M) / (N * M)

        # increase the variance of X by sampling from the asy dist
        X_sampled = np.zeros(X.shape)
        # TODO may be parallelized, but requires keeping track of random state
        for i in range(N):
            X_sampled[i, :] = X[i, :] + stats.multivariate_normal.rvs(cov=X_sigmas[i])

        # return the embeddings in the appropriate order
        return (Y, X_sampled) if reverse_order else (X_sampled, Y)

    def fit(self, A1, A2):
        """
        Fits the test to the two input graphs

        Parameters
        ----------
        A1, A2 : variable (see description)
            The two graphs, or their embeddings to run a hypothesis test on.
            Expected variable type and shape depends on input_graph attribute:

            - input_graph=True
                expects two unembedded graphs either as NetworkX graph objects, or as
                two np.ndarrays, representing the adjacency matrices. In this
                case will be embedded using adjacency spectral embedding.
            - input_graph=False
                expects two already embedded graphs. In this case they must be
                arrays of shape (n, d) and (m, d), where d, the number of
                components, must be shared.

            Note that regardless of how the graphs are passed, they need not
            have the same number of vertices.

        Returns
        -------
        self
        """
        if self.input_graph:
            A1 = import_graph(A1)
            A2 = import_graph(A2)

            X1_hat, X2_hat = self._embed(A1, A2)
        else:
            # check for nx objects, since they are castable to arrays,
            # but we don't want that
            if not isinstance(A1, np.ndarray):
                msg = (
                    f"Embedding of the first graph is of type {type(A1)}, not "
                    "np.ndarray. If input_graph is False, the inputs need to be "
                    "adjacency spectral embeddings, with shapes (n, d) and "
                    "(m, d), passed as np.ndarrays."
                )
                raise TypeError(msg)
            if not isinstance(A2, np.ndarray):
                msg = (
                    f"Embedding of the second graph is of type {type(A2)}, not an "
                    "array. If input_graph is False, the inputs need to be "
                    "adjacency spectral embeddings, with shapes (n, d) and "
                    "(m, d), passed as np.ndarrays."
                )
                raise TypeError(msg)

            if A1.ndim != 2:
                msg = (
                    "Embedding array of the first graph does not have two dimensions. "
                    "If input_graph is False, the inputs need to be adjacency "
                    "spectral embeddings, with shapes (n, d) and (m, d)"
                )
                raise ValueError(msg)
            if A2.ndim != 2:
                msg = (
                    "Embedding array of the second graph does not have two dimensions. "
                    "If input_graph is False, the inputs need to be adjacency "
                    "spectral embeddings, with shapes (n, d) and (m, d)"
                )
                raise ValueError(msg)
            if A1.shape[1] != A2.shape[1]:
                msg = (
                    "Two input embeddings have different number of components. "
                    "If input_graph is False, the inputs need to be adjacency "
                    "spectral embeddings, with shapes (n, d) and (m, d)"
                )
                raise ValueError(msg)

            # checking for inf values
            X1_hat = check_array(A1)
            X2_hat = check_array(A2)

        # resolve the per-dimension sign ambiguity of spectral embeddings
        X1_hat, X2_hat = _median_sign_flips(X1_hat, X2_hat)

        if self.size_correction:
            X1_hat, X2_hat = self._sample_modified_ase(
                X1_hat, X2_hat, pooled=self.pooled
            )

        data = self.test.test(
            X1_hat, X2_hat, reps=self.n_bootstraps, workers=self.workers, auto=False
        )

        self.null_distribution_ = self.test.indep_test.null_dist
        self.sample_T_statistic_ = data[0]
        self.p_value_ = data[1]

        return self

    def fit_predict(self, A1, A2):
        """
        Fits the test to the two input graphs and returns the p-value

        Parameters
        ----------
        A1, A2 : variable (see description)
            The two graphs, or their embeddings to run a hypothesis test on.
            Expected variable type and shape depends on input_graph attribute:

            - input_graph=True
                expects two unembedded graphs either as NetworkX graph objects, or as
                two np.ndarrays, representing the adjacency matrices. In this
                case will be embedded using adjacency spectral embedding.
            - input_graph=False
                expects two already embedded graphs. In this case they must be
                arrays of shape (n, d) and (m, d), where d, the number of
                components, must be shared.

            Note that regardless of how the graphs are passed, they need not
            have the same number of vertices.

        Returns
        -------
        p_value_ : float
            The overall p value from the test
        """
        # abstract method overwritten in order to have a custom doc string
        self.fit(A1, A2)
        return self.p_value_
428 2

429

430
def _median_sign_flips(X1, X2):
431
    X1_medians = np.median(X1, axis=0)
432
    X2_medians = np.median(X2, axis=0)
433
    val = np.multiply(X1_medians, X2_medians)
434
    t = (val > 0) * 2 - 1
435
    X1 = np.multiply(t.reshape(-1, 1).T, X1)
436
    return X1, X2
437

438

439
def _fit_plug_in_variance_estimator(X):
440
    """
441
    Takes in ASE of a graph and returns a function that estimates
442
    the variance-covariance matrix at a given point using the
443
    plug-in estimator from the RDPG Central Limit Theorem.
444

445 2
    Parameters
446
    ----------
447
    X : np.ndarray, shape (n, d)
448 2
        adjacency spectral embedding of a graph
449 2

450 2
    Returns
451
    -------
452 2
    plug_in_variance_estimtor: functions
453
        a function that estimates variance (see below)
454
    """
455

456
    n = len(X)
457

458
    # precompute the Delta and the middle term matrix part
459
    delta = 1 / (n) * (X.T @ X)
460
    delta_inverse = np.linalg.inv(delta)
461
    middle_term_matrix = np.einsum("bi,bo->bio", X, X)
462

463
    def plug_in_variance_estimator(x):
464
        """
465
        Takes in a point of a matrix of points in R^d and returns an
466
        estimated covariance matrix for each of the points
467

468 2
        Parameters:
469 0
        -----------
470
        x: np.ndarray, shape (n, d)
471
            points to estimate variance at
472
            if 1-dimensional - reshaped to (1, d)
473

474
        Returns:
475
        -------
476 2
        covariances: np.ndarray, shape (n, d, d)
477 2
            n estimated variance-covariance matrices of the points provided
478 2
        """
479 2
        if x.ndim < 2:
480
            x = x.reshape(1, -1)
481 2
        # the following two lines are a properly vectorized version of
482
        # middle_term = 0
483
        # for i in range(n):
484
        #     middle_term += np.multiply.outer((x @ X[i] - (x @ X[i]) ** 2),
485
        #                                      np.outer(X[i], X[i]))
486
        # where the matrix part does not involve x and has been computed above
487
        middle_term_scalar = x @ X.T - (x @ X.T) ** 2
488
        middle_term = np.tensordot(middle_term_scalar, middle_term_matrix, axes=1)
489
        covariances = delta_inverse @ (middle_term / n) @ delta_inverse
490
        return covariances
491

492
    return plug_in_variance_estimator

Read our documentation on viewing source code .

Loading