migrate to ghactions for ci/cd
1 |
# -*- coding: utf-8 -*-
|
|
2 | 3 |
"""
|
3 |
Implementation of the modified Gromov–Hausdorff (mGH) distance
|
|
4 |
between compact metric spaces induced by unweighted graphs. This
|
|
5 |
code complements the results from "Efficient estimation of a
|
|
6 |
Gromov–Hausdorff distance between unweighted graphs" by V. Oles et
|
|
7 |
al. (https://arxiv.org/pdf/1909.09772). The mGH distance was first
|
|
8 |
defined in "Some properties of Gromov–Hausdorff distances" by F.
|
|
9 |
Mémoli (Discrete & Computational Geometry, 2012).
|
|
10 |
|
|
11 |
Author: Vladyslav Oles
|
|
12 |
|
|
13 |
===================================================================
|
|
14 |
|
|
15 |
Usage examples:
|
|
16 |
|
|
17 |
1) Estimating the mGH distance between 4-clique and single-vertex
|
|
18 |
graph from their adjacency matrices. Note that it suffices to fill
|
|
19 |
only the upper triangle of an adjacency matrix.
|
|
20 |
|
|
21 |
>>> AG = [[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0, 0, 0, 0]]
|
|
22 |
>>> AH = [[0]]
|
|
23 |
>>> lb, ub = gromov_hausdorff(AG, AH)
|
|
24 |
>>> lb, ub
|
|
25 |
(0.5, 0.5)
|
|
26 |
|
|
27 |
2) Estimating the mGH distance between cycle graphs of length 2 and
|
|
28 |
5 from their adjacency matrices. Note that the adjacency matrices
|
|
29 |
can be given in both dense and sparse SciPy formats.
|
|
30 |
|
|
31 |
>>> AI = np.array([[0, 1], [0, 0]])
|
|
32 |
>>> AJ = sps.csr_matrix(([1] * 5, ([0, 0, 1, 2, 3], [1, 4, 2, 3, 4])), shape=(5, 5))
|
|
33 |
>>> lb, ub = gromov_hausdorff(AI, AJ)
|
|
34 |
>>> lb, ub
|
|
35 |
(0.5, 1.0)
|
|
36 |
|
|
37 |
3) Estimating all pairwise mGH distances between multiple graphs
|
|
38 |
from their adjacency matrices as an iterable.
|
|
39 |
|
|
40 |
>>> As = [AG, AH, AI, AJ]
|
|
41 |
>>> lbs, ubs = gromov_hausdorff(As)
|
|
42 |
>>> lbs
|
|
43 |
array([[0. , 0.5, 0.5, 0.5],
|
|
44 |
[0.5, 0. , 0.5, 1. ],
|
|
45 |
[0.5, 0.5, 0. , 0.5],
|
|
46 |
[0.5, 1. , 0.5, 0. ]])
|
|
47 |
>>> ubs
|
|
48 |
array([[0. , 0.5, 0.5, 0.5],
|
|
49 |
[0.5, 0. , 0.5, 1. ],
|
|
50 |
[0.5, 0.5, 0. , 1. ],
|
|
51 |
[0.5, 1. , 1. , 0. ]])
|
|
52 |
|
|
53 |
===================================================================
|
|
54 |
|
|
55 |
Notations:
|
|
56 |
|
|
57 |
|X| denotes the number of elements in set X.
|
|
58 |
|
|
59 |
X → Y denotes the set of all mappings of set X into set Y.
|
|
60 |
|
|
61 |
V(G) denotes vertex set of graph G.
|
|
62 |
|
|
63 |
mGH(X, Y) denotes the modified Gromov–Hausdorff distance between
|
|
64 |
compact metric spaces X and Y.
|
|
65 |
|
|
66 |
row_i(A) denotes the i-th row of matrix A.
|
|
67 |
|
|
68 |
PSPS^n(A) denotes the set of all permutation similarities of
|
|
69 |
all n×n principal submatrices of square matrix A.
|
|
70 |
|
|
71 |
PSPS^n_{i←j}(A) denotes the set of all permutation similarities of
|
|
72 |
all n×n principal submatrices of square matrix A whose i-th row is
|
|
73 |
comprised of the entries in row_j(A).
|
|
74 |
|
|
75 |
===================================================================
|
|
76 |
|
|
77 |
Glossary:
|
|
78 |
|
|
79 |
Distance matrix of metric space X is a |X|×|X| matrix whose
|
|
80 |
(i, j)-th entry holds the distance between i-th and j-th points of
|
|
81 |
X. By the properties of a metric, distance matrices are symmetric
|
|
82 |
and non-negative, their diagonal entries are 0 and off-diagonal
|
|
83 |
entries are positive.
|
|
84 |
|
|
85 |
Curvature is a generalization of distance matrix that allows
|
|
86 |
repetitions in the underlying points of a metric space. Curvature
|
|
87 |
of an n-tuple of points from metric space X is an n×n matrix whose
|
|
88 |
(i, j)-th entry holds the distance between the points from i-th and
|
|
89 |
j-th positions of the tuple. Since these points need not be
|
|
90 |
distinct, the off-diagonal entries of a curvature can equal 0.
|
|
91 |
|
|
92 |
n-th curvature set of metric space X is the set of all curvatures
|
|
93 |
of X that are of size n×n.
|
|
94 |
|
|
95 |
d-bounded curvature for some d > 0 is a curvature whose
|
|
96 |
off-diagonal entries are all ≥ d.
|
|
97 |
|
|
98 |
Positive-bounded curvature is a curvature whose off-diagonal
|
|
99 |
entries are all positive, i.e. the points in the underlying tuple
|
|
100 |
are distinct. Equivalently, positive-bounded curvatures are
|
|
101 |
distance matrices on the subsets of a metric space.
|
|
102 |
"""
|
|
103 | 3 |
import numpy as np |
104 | 3 |
import warnings |
105 | 3 |
import scipy.sparse as sps |
106 | 3 |
from scipy.sparse.csgraph import shortest_path, connected_components |
107 |
|
|
108 |
|
|
109 | 3 |
__all__ = ["gromov_hausdorff"] |
110 |
|
|
111 |
|
|
112 |
# To sample √|X| * log (|X| + 1) mappings from X → Y by default.
|
|
113 | 3 |
DEFAULT_MAPPING_SAMPLE_SIZE_ORDER = np.array([.5, 1]) |
114 |
|
|
115 |
|
|
116 | 3 |
def gromov_hausdorff( |
117 |
AG, AH=None, mapping_sample_size_order=DEFAULT_MAPPING_SAMPLE_SIZE_ORDER): |
|
118 |
"""
|
|
119 |
Estimate the mGH distance between simple unweighted graphs,
|
|
120 |
represented as compact metric spaces based on their shortest
|
|
121 |
path lengths.
|
|
122 |
|
|
123 |
Parameters
|
|
124 |
-----------
|
|
125 |
AG: np.array (|V(G)|×|V(G)|)
|
|
126 |
(Sparse) adjacency matrix of graph G, or an iterable of
|
|
127 |
adjacency matrices if AH=None.
|
|
128 |
AH: np.array (|V(H)|×|V(H)|)
|
|
129 |
(Sparse) adjacency matrix of graph H, or None.
|
|
130 |
mapping_sample_size_order: np.array (2)
|
|
131 |
Parameter that regulates the number of mappings to sample when
|
|
132 |
tightening upper bound of the mGH distance.
|
|
133 |
|
|
134 |
Returns
|
|
135 |
--------
|
|
136 |
lb: float
|
|
137 |
Lower bound of the mGH distance, or a square matrix holding
|
|
138 |
lower bounds of pairwise mGH distances if AH=None.
|
|
139 |
ub: float
|
|
140 |
Upper bound of the mGH distance, or a square matrix holding
|
|
141 |
upper bounds of pairwise mGH distances if AH=None.
|
|
142 |
"""
|
|
143 |
# Form iterable with adjacency matrices.
|
|
144 |
if AH is None: |
|
145 |
if len(AG) < 2: |
|
146 |
raise ValueError("'estimate_between_unweighted_graphs' needs at least" |
|
147 |
"2 graphs to discriminate") |
|
148 | 3 |
As = AG |
149 |
else: |
|
150 | 3 |
As = (AG, AH) |
151 |
|
|
152 | 3 |
N = len(As) |
153 |
# Find lower and upper bounds of each pairwise mGH distance between
|
|
154 |
# the graphs.
|
|
155 | 3 |
lbs = np.zeros((N, N)) |
156 | 3 |
ubs = np.zeros((N, N)) |
157 |
for i in range(N): |
|
158 |
for j in range(i + 1, N): |
|
159 |
# Transform adjacency matrices of a pair of graphs to
|
|
160 |
# distance matrices.
|
|
161 | 3 |
DX = make_distance_matrix_from_adjacency_matrix(As[i]) |
162 | 3 |
DY = make_distance_matrix_from_adjacency_matrix(As[j]) |
163 |
# Find lower and upper bounds of the mGH distance between
|
|
164 |
# the pair of graphs.
|
|
165 | 3 |
lbs[i, j], ubs[i, j] = estimate( |
166 |
DX, DY, mapping_sample_size_order=mapping_sample_size_order) |
|
167 |
|
|
168 |
if AH is None: |
|
169 |
# Symmetrize matrices with lower and upper bounds of pairwise
|
|
170 |
# mGH distances between the graphs.
|
|
171 | 3 |
lower_triangle_indices = np.tril_indices(N, -1) |
172 | 3 |
lbs[lower_triangle_indices] = lbs.T[lower_triangle_indices] |
173 | 3 |
ubs[lower_triangle_indices] = ubs.T[lower_triangle_indices] |
174 |
|
|
175 | 3 |
return lbs, ubs |
176 |
else: |
|
177 | 3 |
return lbs[0, 1], ubs[0, 1] |
178 |
|
|
179 |
|
|
180 | 3 |
def make_distance_matrix_from_adjacency_matrix(AG): |
181 |
"""
|
|
182 |
Represent simple unweighted graph as compact metric space (with
|
|
183 |
integer distances) based on its shortest path lengths.
|
|
184 |
|
|
185 |
Parameters
|
|
186 |
-----------
|
|
187 |
AG: np.array (|V(G)|×|V(G)|)
|
|
188 |
(Sparse) adjacency matrix of simple unweighted graph G.
|
|
189 |
|
|
190 |
Returns
|
|
191 |
--------
|
|
192 |
DG: np.array (|V(G)|×|V(G)|)
|
|
193 |
(Dense) distance matrix of the compact metric space
|
|
194 |
representation of G based on its shortest path lengths.
|
|
195 |
"""
|
|
196 |
# Convert adjacency matrix to SciPy format if needed.
|
|
197 |
if not sps.issparse(AG) and not isinstance(AG, np.ndarray): |
|
198 |
AG = np.asarray(AG) |
|
199 |
|
|
200 |
# Compile distance matrix of the graph based on its shortest path
|
|
201 |
# lengths.
|
|
202 | 3 |
DG = shortest_path(AG, directed=False, unweighted=True) |
203 |
# Ensure compactness of metric space, represented by distance
|
|
204 |
# matrix.
|
|
205 |
if np.any(np.isinf(DG)): |
|
206 |
warnings.warn("disconnected graph is approximated by its largest connected component") |
|
207 |
# Extract largest connected component of the graph.
|
|
208 |
_, components_by_vertex = connected_components(AG, directed=False) |
|
209 |
components, component_sizes = np.unique(components_by_vertex, return_counts=True) |
|
210 |
largest_component = components[np.argmax(component_sizes)] |
|
211 |
DG = DG[components_by_vertex == largest_component] |
|
212 |
|
|
213 |
# Cast distance matrix to optimal integer type.
|
|
214 | 3 |
DG = cast_distance_matrix_to_optimal_int_type(DG) |
215 |
|
|
216 | 3 |
return DG |
217 |
|
|
218 |
|
|
219 | 3 |
def cast_distance_matrix_to_optimal_int_type(DX): |
220 |
"""
|
|
221 |
Given a metric space X induced by simple unweighted graph,
|
|
222 |
cast its distance matrix to the smallest signed integer type,
|
|
223 |
sufficient to hold all its entries.
|
|
224 |
|
|
225 |
Parameters
|
|
226 |
-----------
|
|
227 |
DX: np.array (|X|×|X|)
|
|
228 |
Distance matrix of X.
|
|
229 |
|
|
230 |
Returns
|
|
231 |
--------
|
|
232 |
DX: np.array (|X|×|X|)
|
|
233 |
Distance matrix of X, cast to optimal type.
|
|
234 |
"""
|
|
235 | 3 |
max_distance = np.max(DX) |
236 |
# Type is signed integer to allow subtractions.
|
|
237 | 3 |
optimal_int_type = determine_optimal_int_type(max_distance) |
238 | 3 |
DX = DX.astype(optimal_int_type) |
239 |
|
|
240 | 3 |
return DX |
241 |
|
|
242 |
|
|
243 | 3 |
def determine_optimal_int_type(value): |
244 |
"""
|
|
245 |
Determine smallest signed integer type sufficient to hold a value.
|
|
246 |
|
|
247 |
Parameters
|
|
248 |
-----------
|
|
249 |
value: non-negative integer
|
|
250 |
|
|
251 |
Returns
|
|
252 |
--------
|
|
253 |
optimal_int_type: np.dtype
|
|
254 |
Optimal signed integer type to hold the value.
|
|
255 |
"""
|
|
256 |
feasible_int_types = (int_type for int_type in [np.int8, np.int16, np.int32, np.int64] |
|
257 |
if value <= np.iinfo(int_type).max) |
|
258 | 3 |
try: |
259 | 3 |
optimal_int_type = next(feasible_int_types) |
260 |
except StopIteration: |
|
261 |
raise ValueError("value {} too large to be stored as unsigned integer") |
|
262 |
|
|
263 | 3 |
return optimal_int_type |
264 |
|
|
265 |
|
|
266 | 3 |
def estimate(DX, DY, mapping_sample_size_order=DEFAULT_MAPPING_SAMPLE_SIZE_ORDER): |
267 |
"""
|
|
268 |
For X, Y metric spaces induced by simple unweighted graphs, find
|
|
269 |
lower and upper bounds of mGH(X, Y).
|
|
270 |
|
|
271 |
Parameters
|
|
272 |
----------
|
|
273 |
DX: np.array (|X|×|X|)
|
|
274 |
(Integer) distance matrix of X.
|
|
275 |
DY: np.array (|Y|×|Y|)
|
|
276 |
(Integer) distance matrix of Y.
|
|
277 |
mapping_sample_size_order: np.array (2)
|
|
278 |
Parameter that regulates the number of mappings to sample when
|
|
279 |
tightening upper bound of mGH(X, Y).
|
|
280 |
|
|
281 |
Returns
|
|
282 |
--------
|
|
283 |
lb: float
|
|
284 |
Lower bound of mGH(X, Y).
|
|
285 |
ub: float
|
|
286 |
Upper bound of mGH(X, Y).
|
|
287 |
"""
|
|
288 |
# Ensure distance matrices are of integer type.
|
|
289 |
if not np.issubdtype(DX.dtype, np.integer) or not np.issubdtype(DY.dtype, np.integer): |
|
290 |
raise ValueError("non-integer metrics are not yet supported") |
|
291 |
# Cast distance matrices to signed integer type to allow
|
|
292 |
# subtractions.
|
|
293 |
if np.issubdtype(DX.dtype, np.uint): |
|
294 |
DX = cast_distance_matrix_to_optimal_int_type(DX) |
|
295 |
if np.issubdtype(DY.dtype, np.uint): |
|
296 |
DY = cast_distance_matrix_to_optimal_int_type(DY) |
|
297 |
# Estimate mGH(X, Y).
|
|
298 | 3 |
double_lb = find_lb(DX, DY) |
299 | 3 |
double_ub = find_ub( |
300 |
DX, DY, mapping_sample_size_order=mapping_sample_size_order, double_lb=double_lb) |
|
301 |
|
|
302 | 3 |
return 0.5 * double_lb, 0.5 * double_ub |
303 |
|
|
304 |
|
|
305 | 3 |
def find_lb(DX, DY): |
306 |
"""
|
|
307 |
For X, Y metric spaces induced by simple unweighted graphs, find
|
|
308 |
lower bound of mGH(X, Y).
|
|
309 |
|
|
310 |
Parameters
|
|
311 |
----------
|
|
312 |
DX: np.array (|X|×|X|)
|
|
313 |
(Integer) distance matrix of X.
|
|
314 |
DY: np.array (|Y|×|Y|)
|
|
315 |
(Integer) distance matrix of Y.
|
|
316 |
|
|
317 |
Returns
|
|
318 |
--------
|
|
319 |
double_lb: float
|
|
320 |
Lower bound of 2*mGH(X, Y).
|
|
321 |
"""
|
|
322 | 3 |
diam_X = np.max(DX) |
323 | 3 |
diam_Y = np.max(DY) |
324 | 3 |
max_diam = max(diam_X, diam_Y) |
325 |
# Obtain trivial lower bound of 2*mGH(X, Y) from
|
|
326 |
# 1) mGH(X, Y) ≥ 0.5*|diam X - diam Y|;
|
|
327 |
# 2) if X and Y are not isometric, mGH(X, Y) ≥ 0.5.
|
|
328 | 3 |
trivial_double_lb = max(abs(diam_X - diam_Y), int(len(DX) != len(DY))) |
329 |
# Initialize lower bound of 2*mGH(X, Y).
|
|
330 | 3 |
double_lb = trivial_double_lb |
331 |
# Try tightening the lower bound.
|
|
332 | 3 |
d = max_diam |
333 |
while d > double_lb: |
|
334 |
# Try proving 2*mGH(X, Y) ≥ d using d-bounded curvatures of
|
|
335 |
# X and Y of size 3×3 or larger. 2×2 curvatures are already
|
|
336 |
# accounted for in trivial lower bound.
|
|
337 |
|
|
338 |
if d <= diam_X: |
|
339 | 3 |
K = find_largest_size_bounded_curvature(DX, diam_X, d) |
340 |
if len(K) > 2 and confirm_lb_using_bounded_curvature(d, K, DY, max_diam): |
|
341 |
double_lb = d |
|
342 |
if d > double_lb and d <= diam_Y: |
|
343 | 3 |
L = find_largest_size_bounded_curvature(DY, diam_Y, d) |
344 |
if len(L) > 2 and confirm_lb_using_bounded_curvature(d, L, DX, max_diam): |
|
345 |
double_lb = d |
|
346 |
|
|
347 | 3 |
d -= 1 |
348 |
|
|
349 | 3 |
return double_lb |
350 |
|
|
351 |
|
|
352 | 3 |
def find_largest_size_bounded_curvature(DX, diam_X, d): |
353 |
"""
|
|
354 |
Find a largest-size d-bounded curvature of metric space X induced
|
|
355 |
by simple unweighted graph.
|
|
356 |
|
|
357 |
Parameters
|
|
358 |
----------
|
|
359 |
DX: np.array (|X|×|X|)
|
|
360 |
(Integer) distance matrix of X.
|
|
361 |
diam_X: int
|
|
362 |
Largest distance in X.
|
|
363 |
|
|
364 |
Returns
|
|
365 |
--------
|
|
366 |
K: np.array (n×n)
|
|
367 |
d-bounded curvature of X of largest size; n ≤ |X|.
|
|
368 |
"""
|
|
369 |
# Initialize curvature K with the distance matrix of X.
|
|
370 | 3 |
K = DX |
371 |
while np.any(K[np.triu_indices_from(K, 1)] < d): |
|
372 |
# Pick a row (and column) with highest number of off-diagonal
|
|
373 |
# distances < d, then with smallest sum of off-diagonal
|
|
374 |
# distances ≥ d.
|
|
375 | 3 |
K_rows_sortkeys = -np.sum(K < d, axis=0) * (len(K) * diam_X) + \ |
376 |
np.sum(np.ma.masked_less(K, d), axis=0).data |
|
377 | 3 |
row_to_remove = np.argmin(K_rows_sortkeys) |
378 |
# Remove the row and column from K.
|
|
379 | 3 |
K = np.delete(K, row_to_remove, axis=0) |
380 | 3 |
K = np.delete(K, row_to_remove, axis=1) |
381 |
|
|
382 | 3 |
return K |
383 |
|
|
384 |
|
|
385 | 3 |
def confirm_lb_using_bounded_curvature(d, K, DY, max_diam): |
386 |
"""
|
|
387 |
For X, Y metric spaces induced by simple unweighted graph, try to
|
|
388 |
confirm 2*mGH(X, Y) ≥ d using K, a d-bounded curvature of X.
|
|
389 |
|
|
390 |
Parameters
|
|
391 |
----------
|
|
392 |
d: int
|
|
393 |
Lower bound candidate for 2*mGH(X, Y).
|
|
394 |
K: np.array (n×n)
|
|
395 |
d-bounded curvature of X; n ≥ 3.
|
|
396 |
DY: np.array (|Y|×|Y|)
|
|
397 |
Integer distance matrix of Y.
|
|
398 |
max_diam: int
|
|
399 |
Largest distance in X and Y.
|
|
400 |
|
|
401 |
Returns
|
|
402 |
--------
|
|
403 |
lb_is_confirmed: bool
|
|
404 |
Whether confirmed that 2*mGH(X, Y) ≥ d.
|
|
405 |
"""
|
|
406 |
# If K exceeds DY in size, the Hausdorff distance between the n-th
|
|
407 |
# curvature sets of X and Y is ≥ d, entailing 2*mGH(X, Y) ≥ d (from
|
|
408 |
# Theorem A).
|
|
409 | 3 |
lb_is_confirmed = len(K) > len(DY) or \ |
410 |
confirm_lb_using_bounded_curvature_row(d, K, DY, max_diam) |
|
411 |
|
|
412 | 3 |
return lb_is_confirmed |
413 |
|
|
414 |
|
|
415 | 3 |
def confirm_lb_using_bounded_curvature_row(d, K, DY, max_diam): |
416 |
"""
|
|
417 |
For X, Y metric spaces induced by simple unweighted graph, and K,
|
|
418 |
a d-bounded curvature of X, try to confirm 2*mGH(X, Y) ≥ d using
|
|
419 |
some row of K.
|
|
420 |
|
|
421 |
Parameters
|
|
422 |
----------
|
|
423 |
d: int
|
|
424 |
Lower bound candidate for 2*mGH(X, Y).
|
|
425 |
K: np.array (n×n)
|
|
426 |
d-bounded curvature of X; n ≥ 3.
|
|
427 |
DY: np.array (|Y|×|Y|)
|
|
428 |
Integer distance matrix of Y; n ≤ |Y|.
|
|
429 |
max_diam: int
|
|
430 |
Largest distance in X and Y.
|
|
431 |
|
|
432 |
Returns
|
|
433 |
--------
|
|
434 |
lb_is_confirmed: bool
|
|
435 |
Whether confirmed that 2*mGH(X, Y) ≥ d.
|
|
436 |
"""
|
|
437 | 3 |
lb_is_confirmed = False |
438 |
# Represent row of K as distance distributions, and retain those
|
|
439 |
# that are maximal by the entry-wise partial order.
|
|
440 | 3 |
K_max_rows_distance_distributions = find_unique_max_distributions( |
441 |
represent_distance_matrix_rows_as_distributions(K, max_diam)) |
|
442 |
# Represent rows of DY as distance distributions.
|
|
443 | 3 |
DY_rows_distance_distributions = represent_distance_matrix_rows_as_distributions(DY, max_diam) |
444 |
# For each i ∈ 1,...,n, check if ||row_i(K) - row_i(L)||_∞ ≥ d
|
|
445 |
# ∀L ∈ PSPS^n(DY), which entails that the Hausdorff distance
|
|
446 |
# between the n-th curvature sets of X and Y is ≥ d, and therefore
|
|
447 |
# 2*mGH(X, Y) ≥ d (from Theorem B).
|
|
448 | 3 |
i = 0 |
449 |
while not lb_is_confirmed and i < len(K_max_rows_distance_distributions): |
|
450 | 3 |
lb_is_confirmed = True |
451 |
# For fixed i, check if ||row_i(K) - row_i(L)||_∞ ≥ d
|
|
452 |
# ∀L ∈ PSPS^n_{i←j}(DY) ∀j ∈ 1,...,|Y|, which is equivalent to
|
|
453 |
# ||row_i(K) - row_i(L)||_∞ ≥ d ∀L ∈ PSPS^n(DY).
|
|
454 | 3 |
j = 0 |
455 |
while lb_is_confirmed and j < len(DY_rows_distance_distributions): |
|
456 |
# For fixed i and j, checking ||row_i(K) - row_i(L)||_∞ ≥ d
|
|
457 |
# ∀L ∈ PSPS^n_{i←j}(DY) is equivalent to solving a linear
|
|
458 |
# (bottleneck) assignment feasibility problem between the
|
|
459 |
# entries of row_i(K) and row_j(DY).
|
|
460 | 3 |
lb_is_confirmed = not check_assignment_feasibility( |
461 |
K_max_rows_distance_distributions[i], DY_rows_distance_distributions[j], d) |
|
462 | 3 |
j += 1 |
463 |
|
|
464 | 3 |
i += 1 |
465 |
|
|
466 | 3 |
return lb_is_confirmed |
467 |
|
|
468 | 3 |
def represent_distance_matrix_rows_as_distributions(DX, max_d): |
469 |
"""
|
|
470 |
Given a metric space X induced by simple unweighted graph,
|
|
471 |
represent each row of its distance matrix as the frequency
|
|
472 |
distribution of its entries. Entry 0 in each row is omitted.
|
|
473 |
|
|
474 |
Parameters
|
|
475 |
----------
|
|
476 |
DX: np.array (n×n)
|
|
477 |
(Integer) distance matrix of X.
|
|
478 |
max_d: int
|
|
479 |
Upper bound of the entries in DX.
|
|
480 |
|
|
481 |
Returns
|
|
482 |
--------
|
|
483 |
DX_rows_distributons: np.array (n×max_d)
|
|
484 |
Each row holds frequencies of each distance from 1 to
|
|
485 |
max_d in the corresponding row of DX. Namely, the (i, j)-th
|
|
486 |
entry holds the frequency of distance (max_d - j) in row_i(DX).
|
|
487 |
"""
|
|
488 |
# Add imaginary part to distinguish identical distances from
|
|
489 |
# different rows of D.
|
|
490 | 3 |
unique_distances, distance_frequencies = np.unique( |
491 |
DX + 1j * np.arange(len(DX))[:, None], return_counts=True) |
|
492 |
# Type is signed integer to allow subtractions.
|
|
493 | 3 |
optimal_int_type = determine_optimal_int_type(len(DX)) |
494 | 3 |
DX_rows_distributons = np.zeros((len(DX), max_d + 1), dtype=optimal_int_type) |
495 |
# Construct index pairs for distance frequencies, so that the
|
|
496 |
# frequencies of larger distances appear on the left.
|
|
497 | 3 |
distance_frequencies_index_pairs = \ |
498 |
(np.imag(unique_distances).astype(optimal_int_type), |
|
499 |
max_d - np.real(unique_distances).astype(max_d.dtype)) |
|
500 |
# Fill frequency distributions of the rows of DX.
|
|
501 | 3 |
DX_rows_distributons[distance_frequencies_index_pairs] = distance_frequencies |
502 |
# Remove (unit) frequency of distance 0 from each row.
|
|
503 | 3 |
DX_rows_distributons = DX_rows_distributons[:, :-1] |
504 |
|
|
505 | 3 |
return DX_rows_distributons |
506 |
|
|
507 |
|
|
508 | 3 |
def find_unique_max_distributions(distributions): |
509 |
"""
|
|
510 |
Given frequency distributions of entries in M positive integer
|
|
511 |
vectors of size p, find unique maximal vectors under the following
|
|
512 |
(entry- wise) partial order: for v, u vectors, v < u if and only if
|
|
513 |
there exists a bijection f: {1,...,p} → {1,...,p} such that
|
|
514 |
v_k < u_{f(k)} ∀k ∈ {1,...,p}.
|
|
515 |
|
|
516 |
Parameters
|
|
517 |
----------
|
|
518 |
distributions: np.array (M×max_d)
|
|
519 |
Frequency distributions of entries in the m vectors; the
|
|
520 |
entries are bounded from above by max_d.
|
|
521 |
|
|
522 |
Returns
|
|
523 |
--------
|
|
524 |
unique_max_distributions: np.array (m×max_d)
|
|
525 |
Unique frequency distributions of the maximal vectors; m ≤ M.
|
|
526 |
"""
|
|
527 | 3 |
pairwise_distribution_differences = \ |
528 |
np.cumsum(distributions - distributions[:, None, :], axis=2) |
|
529 | 3 |
pairwise_distribution_less_thans = np.logical_and( |
530 |
np.all(pairwise_distribution_differences >= 0, axis=2), |
|
531 |
np.any(pairwise_distribution_differences > 0, axis=2)) |
|
532 | 3 |
distributions_are_max = ~np.any(pairwise_distribution_less_thans, axis=1) |
533 | 3 |
try: |
534 | 3 |
unique_max_distributions = np.unique(distributions[distributions_are_max], axis=0) |
535 |
except AttributeError: |
|
536 |
# `np.unique` is not implemented in NumPy 1.12 (Python 3.4).
|
|
537 |
unique_max_distributions = np.vstack( |
|
538 |
{tuple(distribution) for distribution in distributions[distributions_are_max]}) |
|
539 |
|
|
540 | 3 |
return unique_max_distributions |
541 |
|
|
542 |
|
|
543 | 3 |
def check_assignment_feasibility(v_distribution, u_distribution, d): |
544 |
"""
|
|
545 |
For positie integer vectors v of size p and u of size q ≥ p, check
|
|
546 |
if there exists injective f: {1,...,p} → {1,...,q}, such that
|
|
547 |
|v_k - u_{f(k)}| < d ∀k ∈ {1,...,p}
|
|
548 |
|
|
549 |
Parameters
|
|
550 |
----------
|
|
551 |
v_distribution: np.array (max_d)
|
|
552 |
Frequency distribution of entries in v; the entries are bounded
|
|
553 |
from above by max_d.
|
|
554 |
u_distribution: np.array (max_d)
|
|
555 |
Frequency distribution of entries in u; the entries are bounded
|
|
556 |
from above by max_d.
|
|
557 |
d: int
|
|
558 |
d > 0.
|
|
559 |
|
|
560 |
Returns
|
|
561 |
--------
|
|
562 |
is_assignment_feasible: bool
|
|
563 |
Whether such injective f: {1,...,p} → {1,...,q} exists.
|
|
564 |
"""
|
|
565 | 3 |
def next_i_and_j(min_i, min_j): |
566 |
# Find reversed v distribution index of smallest v entries yet
|
|
567 |
# to be assigned. Then find index in reversed u distribution of
|
|
568 |
# smallest u entries to which the b entries can be assigned to.
|
|
569 | 3 |
try: |
570 |
i = next(i for i in range(min_i, len(reversed_v_distribution)) |
|
571 |
if reversed_v_distribution[i] > 0) |
|
572 | 3 |
except StopIteration: |
573 |
# All v entries are assigned.
|
|
574 | 3 |
i = None |
575 | 3 |
j = min_j |
576 |
else: |
|
577 | 3 |
j = next_j(i, max(i - (d - 1), min_j)) |
578 |
|
|
579 | 3 |
return i, j |
580 |
|
|
581 | 3 |
def next_j(i, min_j): |
582 |
# Find reversed u distribution index of smallest u entries to
|
|
583 |
# which v entries, corresponding to a given reversed v
|
|
584 |
# distribution index, can be assigned to.
|
|
585 | 3 |
try: |
586 |
j = next(j for j in range(min_j, min(i + (d - 1), |
|
587 |
len(reversed_u_distribution) - 1) + 1) |
|
588 |
if reversed_u_distribution[j] > 0) |
|
589 |
except StopIteration: |
|
590 |
# No u entries left to assign the particular v entries to.
|
|
591 |
j = None |
|
592 |
|
|
593 | 3 |
return j |
594 |
|
|
595 |
# Copy to allow modifications and stay pure; reverse to be
|
|
596 |
# compatible with distributions of different size.
|
|
597 | 3 |
reversed_v_distribution = list(v_distribution[::-1]) |
598 | 3 |
reversed_u_distribution = list(u_distribution[::-1]) |
599 |
# Injectively assign v entries to u entries if their difference
|
|
600 |
# is < d, going from smallest to largest entries in both v and u,
|
|
601 |
# until all v entries are assigned or such assignment proves
|
|
602 |
# infeasible.
|
|
603 | 3 |
i, j = next_i_and_j(0, 0) |
604 |
while i is not None and j is not None: |
|
605 |
if reversed_v_distribution[i] <= reversed_u_distribution[j]: |
|
606 | 3 |
reversed_u_distribution[j] -= reversed_v_distribution[i] |
607 | 3 |
reversed_v_distribution[i] = 0 |
608 | 3 |
i, j = next_i_and_j(i, j) |
609 |
else: |
|
610 |
reversed_v_distribution[i] -= reversed_u_distribution[j] |
|
611 |
reversed_u_distribution[j] = 0 |
|
612 |
j = next_j(i, j) |
|
613 |
|
|
614 |
# The assignment is feasible if and only if for some injective f,
|
|
615 |
# |v_k - u_{f(k)}| < d ∀k ∈ {1,...,p}.
|
|
616 | 3 |
is_assignment_feasible = j is not None |
617 |
|
|
618 | 3 |
return is_assignment_feasible |
619 |
|
|
620 | 3 |
def find_ub(DX, DY, mapping_sample_size_order=DEFAULT_MAPPING_SAMPLE_SIZE_ORDER, double_lb=0): |
621 |
"""
|
|
622 |
For X, Y metric spaces, find an upper bound of mGH(X, Y).
|
|
623 |
|
|
624 |
Parameters
|
|
625 |
----------
|
|
626 |
DX: np.array (|X|×|X|)
|
|
627 |
Distance matrix of X.
|
|
628 |
DY: np.array (|Y|×|Y|)
|
|
629 |
Distance matrix of Y.
|
|
630 |
mapping_sample_size_order: np.array (2)
|
|
631 |
Parameter that regulates the number of mappings to sample when
|
|
632 |
tightening the upper bound.
|
|
633 |
double_lb: float
|
|
634 |
Lower bound of 2*mGH(X, Y).
|
|
635 |
|
|
636 |
Returns
|
|
637 |
--------
|
|
638 |
double_ub: float
|
|
639 |
Upper bound of 2*mGH(X, Y).
|
|
640 |
"""
|
|
641 |
# Find upper bound of smallest distortion of a mapping in X → Y.
|
|
642 | 3 |
ub_of_X_to_Y_min_distortion = find_ub_of_min_distortion( |
643 |
DX, DY, mapping_sample_size_order=mapping_sample_size_order, goal_distortion=double_lb) |
|
644 |
# Find upper bound of smallest distortion of a mapping in Y → X.
|
|
645 | 3 |
ub_of_Y_to_X_min_distortion = find_ub_of_min_distortion( |
646 |
DY, DX, mapping_sample_size_order=mapping_sample_size_order, |
|
647 |
goal_distortion=ub_of_X_to_Y_min_distortion) |
|
648 |
|
|
649 | 3 |
return max(ub_of_X_to_Y_min_distortion, ub_of_Y_to_X_min_distortion) |
650 |
|
|
651 |
|
|
652 | 3 |
def find_ub_of_min_distortion(DX, DY, |
653 |
mapping_sample_size_order=DEFAULT_MAPPING_SAMPLE_SIZE_ORDER, |
|
654 |
goal_distortion=0): |
|
655 |
"""
|
|
656 |
For X, Y metric spaces, find an upper bound of smallest distortion
|
|
657 |
of a mapping in X → Y by heuristically constructing some mappings
|
|
658 |
and choosing the smallest distortion in the sample.
|
|
659 |
|
|
660 |
Parameters
|
|
661 |
----------
|
|
662 |
DX: np.array (|X|×|X|)
|
|
663 |
Distance matrix of X.
|
|
664 |
DY: np.array (|Y|×|Y|)
|
|
665 |
Distance matrix of Y.
|
|
666 |
mapping_sample_size_order: np.array (2)
|
|
667 |
Exponents of |X| and log (|X|+1) in their product that defines
|
|
668 |
how many mappings from X → Y to sample.
|
|
669 |
goal_distortion: float
|
|
670 |
No need to look for distortion smaller than this.
|
|
671 |
|
|
672 |
Returns
|
|
673 |
--------
|
|
674 |
ub_of_min_distortion: float
|
|
675 |
Upper bound of smallest distortion of a mapping in X → Y.
|
|
676 |
"""
|
|
677 |
# Compute the numper of mappings to sample.
|
|
678 | 3 |
n_mappings_to_sample = int(np.ceil(np.prod( |
679 |
np.array([len(DX), np.log(len(DX) + 1)]) ** mapping_sample_size_order))) |
|
680 |
# Construct each mapping in X → Y in |X| steps by choosing the image
|
|
681 |
# of π(i)-th point in X at i-th step, where π is randomly sampled
|
|
682 |
# |X|-permutation. Image of each point is chosen to minimize the
|
|
683 |
# intermediate distortion at each step.
|
|
684 |
permutations_generator = (np.random.permutation(len(DX)) for _ in range(n_mappings_to_sample)) |
|
685 | 3 |
ub_of_min_distortion = np.inf |
686 | 3 |
goal_distortion_is_matched = False |
687 | 3 |
all_sampled_permutations_are_tried = False |
688 | 3 |
pi = next(permutations_generator) |
689 |
while not goal_distortion_is_matched and not all_sampled_permutations_are_tried: |
|
690 | 3 |
mapped_xs_images, distortion = construct_mapping(DX, DY, pi) |
691 | 3 |
ub_of_min_distortion = min(distortion, ub_of_min_distortion) |
692 |
if ub_of_min_distortion <= goal_distortion: |
|
693 | 3 |
goal_distortion_is_matched = True |
694 |
|
|
695 | 3 |
try: |
696 | 3 |
pi = next(permutations_generator) |
697 | 3 |
except StopIteration: |
698 | 3 |
all_sampled_permutations_are_tried = True |
699 |
|
|
700 | 3 |
return ub_of_min_distortion |
701 |
|
|
702 |
|
|
703 | 3 |
def construct_mapping(DX, DY, pi): |
704 |
"""
|
|
705 |
# For X, Y metric spaces and |X|-permutation π, construct a mapping
|
|
706 |
from X → Y in |X| steps by choosing the image of π(i)-th point in X
|
|
707 |
at i-th step. The image of each point is chosen to minimize the
|
|
708 |
intermediate distortion at the corresponding step.
|
|
709 |
|
|
710 |
DX: np.array (|X|×|X|)
|
|
711 |
Distance matrix of X.
|
|
712 |
DY: np.array (|Y|×|Y|)
|
|
713 |
Distance matrix of Y.
|
|
714 |
pi: np.array (|X|)
|
|
715 |
|X|-permutation specifying the order in which the points in X
|
|
716 |
are mapped.
|
|
717 |
Returns
|
|
718 |
--------
|
|
719 |
mapped_xs_images: list
|
|
720 |
image of the constructed mapping.
|
|
721 |
distortion: float
|
|
722 |
distortion of the constructed mapping.
|
|
723 |
"""
|
|
724 |
# Map π(1)-th point in X to a random point in Y, due to the
|
|
725 |
# lack of better criterion.
|
|
726 | 3 |
mapped_xs = [pi[0]] |
727 | 3 |
mapped_xs_images = [np.random.choice(len(DY))] |
728 | 3 |
distortion = 0 |
729 |
# Map π(i)-th point in X for each i = 2,...,|X|.
|
|
730 |
for x in pi[1:]: |
|
731 |
# Choose point in Y that minimizes the distortion after
|
|
732 |
# mapping π(i)-th point in X to it.
|
|
733 | 3 |
bottlenecks_from_mapping_x = np.max( |
734 |
np.abs(DX[x, mapped_xs] - DY[:, mapped_xs_images]), axis=1) |
|
735 | 3 |
y = np.argmin(bottlenecks_from_mapping_x) |
736 |
# Map π(i)-th point in X to the chosen point in Y.
|
|
737 | 3 |
mapped_xs.append(x) |
738 | 3 |
mapped_xs_images.append(y) |
739 | 3 |
distortion = max(bottlenecks_from_mapping_x[y], distortion) |
740 |
|
|
741 | 3 |
return mapped_xs_images, distortion |
Read our documentation on viewing source code .