#
# Copyright (c) 2020 Jack Poulson
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# A python utility for generating low-rank approximate minimizers to
#
# L(X, Y) = (1 / 2) || entrywise_sqrt(W) o (A - X Y') ||_F^2 +
# (lambda / 2) (|| X ||_F^2 + || Y ||_F^2),
#
# where A is a given sparse matrix and lambda >= 0 is a regularization
# coefficient.
#
# We make use of a simple alternating weighted least squares scheme that
# solves for the optimal Y given the current estimate of X, then solves for the
# optimal X given the current estimate of Y, and repeats for a small number of
# iterations.
#
# Each iteration consists of either m or n weighted least squares problems with
# a Gramian matrix of size rank x rank.
#
import collections
import math
import numpy as np
import pickle
import scipy.linalg
import time
from scipy.sparse import csr_matrix, dok_matrix
from sklearn.cluster import KMeans
# A bijective mapping between a set of key strings and their indices.
# We do not yet assume Python 3.7, so we cannot use the 'defaults' argument.
KeyIndexBijection = collections.namedtuple('KeyIndexBijection',
                                           ['key_to_index', 'index_to_key'])
# An analogue of an enum for the different algorithms which can be used to
# generate a representative subset (skeleton) of a subset of embedding vectors.
# Greedy farthest-point sampling over cosine distance (see get_skeleton).
MAX_DISTANCE_SKELETON = 0
# Column-pivoted QR factorization of the embedding submatrix (see
# get_skeleton).
QR_SKELETON = 1
def hilbert_schmidt_inner_product(left_matrix,
                                  right_matrix,
                                  row_weights=None,
                                  column_weights=None):
    r'''Returns the Hilbert-Schmidt inner product of two real matrices.

    The Hilbert-Schmidt inner product of m x n matrices A and B is:

      <A, B> = \sum_{i = 0}^{m - 1} \sum_{j = 0}^{n - 1} A_{i, j} B_{i, j}.

    The rows and columns can optionally be scaled via row and/or column
    weights via:

      <A, B>_{R, C} = \sum_{i, j} r_i A_{i, j} B_{i, j} c_j.

    Args:
      left_matrix: The left argument to the inner product [np.ndarray].
      right_matrix: The right argument to the inner product [np.ndarray].
      row_weights: Row weighting of the inner product.
      column_weights: Column weighting of the inner product.

    Returns:
      The Hilbert-Schmidt inner product of the two matrices.

    Raises:
      Exception: If either argument is not an np.ndarray or the shapes
        disagree.
    '''
    if not isinstance(left_matrix, np.ndarray):
        raise Exception('Expected left matrix to be np.ndarray')
    if not isinstance(right_matrix, np.ndarray):
        raise Exception('Expected right matrix to be np.ndarray')
    num_rows, num_columns = left_matrix.shape
    if right_matrix.shape[0] != num_rows:
        raise Exception('Inconsistent matrix heights.')
    if right_matrix.shape[1] != num_columns:
        raise Exception('Inconsistent matrix widths.')
    # Vectorized evaluation: form the entrywise products once and apply the
    # optional row/column scalings via broadcasting.
    products = left_matrix * right_matrix
    if row_weights is not None:
        products = np.asarray(row_weights)[:, None] * products
    if column_weights is not None:
        products = products * np.asarray(column_weights)[None, :]
    return products.sum()
def frobenius_norm_squared(matrix, row_weights=None, column_weights=None):
    r'''Returns the square of the Frobenius norm of a real matrix.

    The squared Frobenius norm of an m x n matrix A, || A ||_F^2, is given by:

      || A ||_F^2 = \sum_{i = 0}^{m - 1} \sum_{j = 0}^{n - 1} A_{i, j}^2.

    The rows and columns can optionally be scaled via the given row and
    column weights via:

      || sqrt(r) A sqrt(c) ||_F^2 = \sum_{i, j} r_i A_{i, j}^2 c_j.

    Args:
      matrix: The matrix to compute the squared norm of. It can be either
        an [np.ndarray] or a [scipy.sparse.csr_matrix].
      row_weights: Row weighting of the inner product.
      column_weights: Column weighting of the inner product.

    Returns:
      The square of the Frobenius norm of the matrix.

    Raises:
      Exception: If the matrix is neither a csr_matrix nor an ndarray.
    '''
    if isinstance(matrix, scipy.sparse.csr_matrix):
        # Only the stored entries contribute; square them all at once.
        values_squared = matrix.data * matrix.data
        if row_weights is not None:
            # Expand the per-row weights to one weight per stored entry.
            entries_per_row = np.diff(matrix.indptr)
            values_squared = values_squared * np.repeat(
                np.asarray(row_weights), entries_per_row)
        if column_weights is not None:
            # matrix.indices holds the column index of each stored entry.
            values_squared = values_squared * np.asarray(
                column_weights)[matrix.indices]
        return values_squared.sum()
    elif isinstance(matrix, np.ndarray):
        values_squared = matrix * matrix
        if row_weights is not None:
            values_squared = np.asarray(row_weights)[:, None] * values_squared
        if column_weights is not None:
            values_squared = values_squared * np.asarray(
                column_weights)[None, :]
        return values_squared.sum()
    else:
        raise Exception('Unsupported matrix type')
def get_objective(matrix,
                  left_factor,
                  right_factor,
                  zero_weight,
                  frobenius_regularization,
                  row_weights=None,
                  column_weights=None):
    r'''Evaluates the objective function of the Bayesian, low-rank model.

    Returns an evaluation of the objective function:

      L(X, Y) = (1 / 2) || entrywise_sqrt(W) o (A - X Y') ||_F^2 +
                (frobenius_regularization / 2) (|| X ||_F^2 + || Y ||_F^2),

    where A is an m x n sparse matrix,

      W = R [(1 - zero_weight) indicator(A) + zero_weight e_m e_n'] C,

    where indicator(A) is the binary matrix which is one over the entry
    pattern of A, R is the diagonal matrix defined by the row scaling, C is
    the diagonal matrix defined by the column scaling, 'X' is an m x rank
    matrix (the 'left_factor'), and 'Y' is an n x rank matrix (the
    'right_factor').

    The prediction mismatch penalty can be decomposed as:

      || entrywise_sqrt(W) o (A - X Y') ||_F^2 =
          (1 - zero_weight)
              || sqrt(R) indicator(A) sqrt(C) o (A - X Y') ||_F^2 +
          zero_weight || sqrt(R) (A - X Y') sqrt(C) ||_F^2.

    The first term can be evaluated in time O(rank * nnz(A)), whereas the
    second term requires further decomposition. Writing <., .> for the
    Hilbert-Schmidt inner product:

      || sqrt(R) (A - X Y') sqrt(C) ||_F^2 =
          || sqrt(R) A sqrt(C) - (sqrt(R) X) (sqrt(C) Y)' ||_F^2 =
          || sqrt(R) A sqrt(C) ||_F^2 - 2 <(A' R X) sqrt(C), sqrt(C) Y> +
          <sqrt(C) Y (X' R X), sqrt(C) Y>.

    In summary, we evaluate the objective function as the sum of three
    components:

      sparse_loss(X, Y) = ((1 - zero_weight) / 2)
          || sqrt(R) indicator(A) sqrt(C) o (A - X Y') ||_F^2,

    which we can evaluate by directly forming prediction mismatches
    (A - X Y')_{i, j} for (i, j) in the sparsity pattern of A,

      background_loss(X, Y) = (zero_weight / 2) [
          || sqrt(R) A sqrt(C) ||_F^2 - 2 <A' R X, Y>_C + <Y (X' R X), Y>_C],

    and the regularization term,

      reg_term(X, Y) = (frobenius_regularization / 2)
          (|| X ||_F^2 + || Y ||_F^2).

    Args:
      matrix: The sparse matrix to approximate with low-rank model. It should
        be a [scipy.sparse.csr_matrix].
      left_factor: The matrix 'X' in the low-rank approximation A ~= X Y'.
        It should be an [np.ndarray].
      right_factor: The matrix 'Y' in the low-rank approximation A ~= X Y'.
        It should be an [np.ndarray].
      zero_weight: The loss function weighting on the zeroes of sparse matrix.
      frobenius_regularization: The Frobenius-norm regularization coefficient.
      row_weights: Row weighting of the inner product.
      column_weights: Column weighting of the inner product.

    Returns:
      The objective value of the approximate model, the sparse loss, the
      background loss, and the regularization term.
    '''
    dtype = matrix.dtype.type
    if not isinstance(matrix, csr_matrix):
        raise Exception('Expected matrix to be scipy.sparse.csr_matrix')
    if not isinstance(left_factor, np.ndarray):
        raise Exception('Expected left factor to be an np.ndarray')
    if not isinstance(right_factor, np.ndarray):
        raise Exception('Expected right factor to be an np.ndarray')
    num_rows, num_columns = matrix.shape
    if left_factor.shape[0] != num_rows:
        raise Exception('Left factor was the incorrect height')
    if right_factor.shape[0] != num_columns:
        raise Exception('Right factor was the incorrect height')
    _, rank = left_factor.shape
    if right_factor.shape[1] != rank:
        raise Exception('Factors had inconsistent shapes.')
    # Incorporate the sparse contribution to the loss:
    #
    #   sparse_loss(X, Y) = ((1 - zero_weight) / 2)
    #       || sqrt(R) indicator(A) sqrt(C) o (A - X Y') ||_F^2.
    #
    # This walks the CSR structure directly: for each stored entry (i, j),
    # the prediction is the inner product of row i of X with row j of Y.
    sparse_loss = dtype(0)
    for i in range(num_rows):
        row_weight = 1 if row_weights is None else row_weights[i]
        for ind in range(matrix.indptr[i], matrix.indptr[i + 1]):
            j = matrix.indices[ind]
            value = matrix.data[ind]
            column_weight = 1 if column_weights is None else column_weights[j]
            prediction = np.inner(left_factor[i, :], right_factor[j, :])
            error_squared = (value - prediction) * (value - prediction)
            sparse_loss += row_weight * error_squared * column_weight
    sparse_loss *= (1 - zero_weight) / 2
    # Incorporate the background portion of the loss:
    #
    #   background_loss(X, Y) = (zero_weight / 2) [
    #       || sqrt(R) A sqrt(C) ||_F^2 - 2 <A' R X, Y>_C + <Y (X' R X), Y>_C].
    #
    background_loss = frobenius_norm_squared(matrix,
                                             row_weights=row_weights,
                                             column_weights=column_weights)
    # scaled_left_factor = R X, so that A' (R X) below realizes A' R X.
    scaled_left_factor = left_factor.copy()
    if row_weights is not None:
        scaled_left_factor = row_weights[:, None] * scaled_left_factor
    # Subtract 2 <A' R X, Y>_C, weighting the rows of the n x rank operands
    # by the column weights c.
    background_loss -= 2 * hilbert_schmidt_inner_product(
        matrix.transpose() * scaled_left_factor,
        right_factor,
        row_weights=column_weights)
    # scaled_left_gramian = X' R X (rank x rank), then add <Y (X' R X), Y>_C.
    scaled_left_gramian = left_factor.transpose() @ scaled_left_factor
    background_loss += hilbert_schmidt_inner_product(
        right_factor @ scaled_left_gramian,
        right_factor,
        row_weights=column_weights)
    background_loss *= zero_weight / 2
    # Incorporate the regularization term:
    #
    #   reg_term(X, Y) = (frobenius_regularization / 2)
    #       (|| X ||_F^2 + || Y ||_F^2).
    #
    reg_term = (frobenius_regularization /
                2) * (frobenius_norm_squared(left_factor) +
                      frobenius_norm_squared(right_factor))
    objective = sparse_loss + background_loss + reg_term
    return (objective, sparse_loss, background_loss, reg_term)
def update_left_factor(matrix,
                       left_factor,
                       right_factor,
                       zero_weight,
                       frobenius_regularization,
                       row_weights=None,
                       column_weights=None):
    r'''Optimally updates the left factor (in place) given a frozen right factor.

    Given the objective function

      L(X, Y) = (1 / 2) || entrywise_sqrt(W) o (A - X Y') ||_F^2 +
                (frobenius_regularization / 2) (|| X ||_F^2 + || Y ||_F^2),

    where

      W = R [(1 - zero_weight) indicator(A) + zero_weight e_m e_n'] C,

    the first-order optimality condition with respect to row i of X yields
    the rank x rank symmetric positive-definite system

      (Y' diag(W(i, :)) Y + frobenius_regularization I) X(i, :)' =
          ((W o A) Y)(i, :)'.

    We can decompose the i'th row of the weight matrix as

      r_i [(1 - zero_weight) indicator(A(i, :)) + zero_weight e_n'] C =
          (1 - zero_weight) (r_i c') o indicator(A(i, :)) + zero_weight r_i c'.

    Thus, we can precompute the row-independent background gramian

      background_gramian = Y' C Y

    and form the i'th scaled row gramian as

      zero_weight r_i background_gramian +
      (1 - zero_weight) r_i Y' diag(c o indicator(A(i, :))) Y.

    The formation of the right-hand side, (W o A) Y, is equivalent to:

      (R A C) Y = R (A (C Y)).

    Args:
      matrix: The sparse matrix to approximate with our model.
      left_factor: The left factor in the low-rank approximation, which is
        overwritten in place with its optimal update.
      right_factor: The right factor in the low-rank approximation.
      zero_weight: The loss coefficient for the mismatch of the predictions on
        zero entries in the target sparse matrix.
      frobenius_regularization: The coefficient for the Frobenius-norm
        regularization of the left and right factors.
      row_weights: Row weighting of the inner product.
      column_weights: Column weighting of the inner product.
    '''
    dtype = right_factor.dtype
    num_rows, rank = left_factor.shape
    # Form the background gramian, Y' C Y, which we will update with sparse
    # rank-one updates.
    scaled_right_factor = right_factor.copy()
    if column_weights is not None:
        scaled_right_factor = column_weights[:, None] * scaled_right_factor
    background_gramian = right_factor.transpose() @ scaled_right_factor
    # Form the right-hand side matrix, (W o A) Y. Due to the structure of W,
    # this is equivalent to R (A (C Y)), and we have preformed C Y.
    rhs = matrix * scaled_right_factor
    if row_weights is not None:
        rhs = row_weights[:, None] * rhs
    # Solve the weighted least squares systems to get the new left factor.
    for i in range(num_rows):
        row_weight = 1 if row_weights is None else row_weights[i]
        # Scalar multiplication already allocates a fresh array, so no
        # explicit copy of the background gramian is needed.
        gramian = (zero_weight * row_weight) * background_gramian
        first_ind = matrix.indptr[i]
        last_ind = matrix.indptr[i + 1]
        num_updates = last_ind - first_ind
        updates = np.empty((rank, num_updates), dtype)
        for offset in range(num_updates):
            ind = first_ind + offset
            j = matrix.indices[ind]
            column_weight = 1 if column_weights is None else column_weights[j]
            # Carry sqrt of the weight so updates @ updates' applies it fully.
            updates[:, offset] = math.sqrt(column_weight) * right_factor[j, :]
        # TODO(Jack Poulson): Call BLAS ssyrk or dsyrk instead.
        update_scale = (1 - zero_weight) * row_weight
        gramian = gramian + update_scale * (updates @ updates.transpose())
        for k in range(rank):
            gramian[k, k] += frobenius_regularization
        # 'sym_pos=True' was deprecated and removed from scipy.linalg.solve;
        # 'assume_a="pos"' is the supported equivalent (Cholesky-based solve).
        new_row = scipy.linalg.solve(gramian,
                                     rhs[i, :],
                                     assume_a='pos',
                                     lower=True)
        left_factor[i, :] = new_row
def update_right_factor(matrix_trans,
                        left_factor,
                        right_factor,
                        zero_weight,
                        frobenius_regularization,
                        row_weights=None,
                        column_weights=None):
    r'''Optimally updates the right factor (in place) given a frozen left factor.

    Given the objective function

      L(X, Y) = (1 / 2) || entrywise_sqrt(W) o (A - X Y') ||_F^2 +
                (frobenius_regularization / 2) (|| X ||_F^2 + || Y ||_F^2),

    where

      W = R [(1 - zero_weight) indicator(A) + zero_weight e_m e_n'] C,

    the first-order optimality condition with respect to row j of Y yields
    the rank x rank symmetric positive-definite system

      (X' diag(W(:, j)) X + frobenius_regularization I) Y(j, :)' =
          ((W o A)' X)(j, :)'.

    We can decompose the j'th column of the weight matrix as

      R [(1 - zero_weight) indicator(A(:, j)) + zero_weight e_m] c_j =
          (1 - zero_weight) (r c_j) o indicator(A(:, j)) + zero_weight r c_j.

    Thus, we can precompute the column-independent background gramian

      background_gramian = X' R X

    and form the j'th scaled column gramian as

      zero_weight c_j background_gramian +
      (1 - zero_weight) c_j X' diag(r o indicator(A(:, j))) X.

    The formation of the right-hand side, (W o A)' X, is equivalent to:

      (R A C)' X = C (A' (R X)).

    Args:
      matrix_trans: The transpose of the sparse matrix to approximate with
        our model.
      left_factor: The left factor in the low-rank approximation.
      right_factor: The right factor in the low-rank approximation, which is
        overwritten in place with its optimal update.
      zero_weight: The loss coefficient for the mismatch of the predictions on
        zero entries in the target sparse matrix.
      frobenius_regularization: The coefficient for the Frobenius-norm
        regularization of the left and right factors.
      row_weights: Row weighting of the inner product.
      column_weights: Column weighting of the inner product.
    '''
    dtype = left_factor.dtype
    num_columns, rank = right_factor.shape
    # Form the background gramian, X' R X, which we will update with sparse
    # rank-one updates.
    scaled_left_factor = left_factor.copy()
    if row_weights is not None:
        scaled_left_factor = row_weights[:, None] * scaled_left_factor
    background_gramian = left_factor.transpose() @ scaled_left_factor
    # Form the right-hand side matrix, (W o A)' X. Due to the structure of W,
    # this is equivalent to C (A' (R X)), and we have preformed R X.
    rhs = matrix_trans * scaled_left_factor
    if column_weights is not None:
        rhs = column_weights[:, None] * rhs
    # Solve the weighted least squares systems to get the new right factor.
    for j in range(num_columns):
        column_weight = 1 if column_weights is None else column_weights[j]
        # Scalar multiplication already allocates a fresh array, so no
        # explicit copy of the background gramian is needed.
        gramian = (zero_weight * column_weight) * background_gramian
        first_ind = matrix_trans.indptr[j]
        last_ind = matrix_trans.indptr[j + 1]
        num_updates = last_ind - first_ind
        updates = np.empty((rank, num_updates), dtype)
        for offset in range(num_updates):
            ind = first_ind + offset
            i = matrix_trans.indices[ind]
            weight_correction = 1 if row_weights is None else row_weights[i]
            # Carry sqrt of the weight so updates @ updates' applies it fully.
            updates[:, offset] = math.sqrt(weight_correction) * left_factor[i, :]
        # TODO(Jack Poulson): Call BLAS ssyrk or dsyrk instead.
        update_scale = (1 - zero_weight) * column_weight
        gramian = gramian + update_scale * (updates @ updates.transpose())
        for k in range(rank):
            gramian[k, k] += frobenius_regularization
        # 'sym_pos=True' was deprecated and removed from scipy.linalg.solve;
        # 'assume_a="pos"' is the supported equivalent (Cholesky-based solve).
        new_row = scipy.linalg.solve(gramian,
                                     rhs[j, :],
                                     assume_a='pos',
                                     lower=True)
        right_factor[j, :] = new_row
def generate_embeddings(matrix,
                        rank=100,
                        zero_weight=0.01,
                        frobenius_regularization=0.01,
                        num_iterations=10,
                        use_row_weights=False,
                        use_column_weights=True,
                        monitor_objective=False):
    '''Generates Bayesian, low-rank model for given sparse matrix.

    Returns an approximate minimizer for the objective function:

      L(X, Y) = (1 / 2) || entrywise_sqrt(W) o (A - X Y') ||_F^2 +
                (frobenius_regularization / 2) (|| X ||_F^2 + || Y ||_F^2),

    where A is an m x n sparse matrix,

      W = R [(1 - zero_weight) indicator(A) + zero_weight e_m e_n'] C,

    where indicator(A) is the binary matrix which is one over the entry
    pattern of A, 'X' is an m x rank matrix (the 'left_factor'), and 'Y' is
    an n x rank matrix (the 'right_factor').

    Args:
      matrix: The scipy.sparse.csr_matrix to approximate with low-rank model.
      rank: The rank of the low-rank model.
      zero_weight: The loss function weighting on the zeroes of sparse matrix.
      frobenius_regularization: The Frobenius-norm regularization coefficient.
      num_iterations: The number of iterations of the alternating weighted
        least squares algorithm to run.
      use_row_weights: If true, the loss for each row of the sparse matrix is
        downweighted using its one-norm.
      use_column_weights: If true, the loss for each column of the sparse
        matrix is downweighted using its one-norm.
      monitor_objective: If true, the objective is measured and printed at
        each iteration. Keep in mind that this can substantially increase the
        runtime (say, by 25% or so).

    Returns:
      The pairing of the left and right factors, X and Y.
    '''
    dtype = matrix.dtype
    num_rows, num_columns = matrix.shape
    start_time = time.time()
    # Form the transpose of the target matrix in CSR format so that we may
    # easily iterate through the nonzero pattern of its rows (the nonzero
    # pattern of the columns of the original matrix).
    matrix_trans = matrix.transpose().tocsr()
    # For now, we assume that all entries are non-negative so that sums are
    # equivalent to one norms.
    column_sums = matrix.sum(axis=0)
    row_sums = matrix.sum(axis=1)
    # TODO(Jack Poulson): Add a configuration option for how the row and
    # column weights are generated. At the moment, they are the pseudoinverse
    # of the one-norms of the rows and columns of the scores matrix.
    row_weights = np.ones((num_rows, ), dtype)
    if use_row_weights:
        for i in range(num_rows):
            if row_sums[i, 0] > 0:
                row_weights[i] = 1 / row_sums[i, 0]
    column_weights = np.ones((num_columns, ), dtype)
    if use_column_weights:
        for j in range(num_columns):
            if column_sums[0, j] > 0:
                column_weights[j] = 1 / column_sums[0, j]
    # Initialize the entries of the model as independent standard normals.
    # The draw is vectorized and cast to the matrix's own dtype (the previous
    # entry-by-entry loop hard-coded np.float32 regardless of 'dtype').
    left_factor = np.random.normal(size=(num_rows, rank)).astype(dtype)
    right_factor = np.random.normal(size=(num_columns, rank)).astype(dtype)
    if monitor_objective:
        objective, sparse_loss, background_loss, reg_term = get_objective(
            matrix,
            left_factor,
            right_factor,
            zero_weight,
            frobenius_regularization,
            row_weights=row_weights,
            column_weights=column_weights)
        print('Initial objective: {}, sparse: {}, background: {}, reg: {}'.
              format(objective, sparse_loss, background_loss, reg_term))
    for iteration in range(num_iterations):
        left_start = time.time()
        update_left_factor(matrix,
                           left_factor,
                           right_factor,
                           zero_weight,
                           frobenius_regularization,
                           row_weights=row_weights,
                           column_weights=column_weights)
        left_end = time.time()
        left_time = left_end - left_start
        print(' left update {} took {} seconds.'.format(iteration, left_time))
        right_start = time.time()
        update_right_factor(matrix_trans,
                            left_factor,
                            right_factor,
                            zero_weight,
                            frobenius_regularization,
                            row_weights=row_weights,
                            column_weights=column_weights)
        right_end = time.time()
        right_time = right_end - right_start
        print(' right update {} took {} seconds.'.format(
            iteration, right_time))
        if monitor_objective:
            objective_start = time.time()
            objective, sparse_loss, background_loss, reg_term = get_objective(
                matrix,
                left_factor,
                right_factor,
                zero_weight,
                frobenius_regularization,
                row_weights=row_weights,
                column_weights=column_weights)
            objective_end = time.time()
            objective_time = objective_end - objective_start
            print(
                ' objective {}: {}, sparse: {}, background: {}, reg: {} [evaluation: {} seconds]'
                .format(iteration, objective, sparse_loss, background_loss,
                        reg_term, objective_time))
    end_time = time.time()
    run_time = end_time - start_time
    print('Rank-{} embedding generation took {} seconds.'.format(
        rank, run_time))
    return (left_factor, right_factor)
def normalize_embeddings(embeddings, squash_exponent=1.):
    '''Squashes the two-norms of the embedding rows in place.

    Args:
      embeddings: The tall-skinny embedding matrix to normalize the rows of.
      squash_exponent: Each row of the embedding is divided by its two-norm
        raised to this power. If the value is set to one, then the resulting
        vector will have unit norm; if the value is zero, no normalization
        is performed. Rows whose scale is zero (e.g., all-zero rows with a
        positive exponent) are left unchanged rather than divided by zero.
    '''
    norms = np.linalg.norm(embeddings, axis=1)
    scales = norms**squash_exponent
    # Guard against division by zero for zero-norm rows.
    scales[scales == 0] = 1
    embeddings /= scales[:, None]
def nearest_neighbors(embeddings, key, key_index_bijection, num_neighbors=20):
    '''Returns lists of keys, indices, and cosine-similarities of neighbors.

    Args:
      embeddings: The tall-skinny matrix of embeddings.
      key: The key of the embedding to query neighbors of.
      key_index_bijection: The KeyIndexBijection between keys and indices.
      num_neighbors: The maximum number of neighbors to return.

    Returns:
      The tuple of the list of keys and indices of the nearest neighbors and
      their cosine similarity to the query embedding.
    '''
    query_row = key_index_bijection.key_to_index[key]
    similarities = embeddings @ embeddings[query_row, :].transpose()
    # Rank candidates by descending similarity, reserving one extra slot in
    # case the query item itself appears and must be dropped.
    ranked = (np.argsort(similarities))[::-1][:num_neighbors + 1]
    neighbor_keys = []
    neighbor_indices = []
    neighbor_cosines = []
    for candidate in ranked:
        candidate_key = key_index_bijection.index_to_key[candidate]
        if candidate_key == key:
            continue
        neighbor_keys.append(candidate_key)
        neighbor_indices.append(candidate)
        neighbor_cosines.append(similarities[candidate])
    return (neighbor_keys[:num_neighbors], neighbor_indices[:num_neighbors],
            neighbor_cosines[:num_neighbors])
def get_kmeans(embeddings, num_clusters=5000, num_seeds=10):
    '''Fits and returns a scikit-learn KMeans model over the embedding rows.

    Args:
      embeddings: A matrix whose rows will be clustered with k-means.
      num_clusters: The number of clusters to build.
      num_seeds: The number of different initializations to use.

    Returns:
      An sklearn.cluster.KMeans estimator for the rows of the embedding
      matrix.
    '''
    kmeans = KMeans(init='k-means++',
                    n_clusters=num_clusters,
                    n_init=num_seeds)
    fit_start = time.time()
    kmeans.fit(embeddings)
    fit_seconds = time.time() - fit_start
    print('{}-way KMeans took {} seconds.'.format(num_clusters, fit_seconds))
    return kmeans
def save_estimator(estimator, filename):
    '''Pickles an sklearn estimator to the given file.

    Args:
      estimator: The estimator to pickle.
      filename: The filename to pickle into.
    '''
    with open(filename, 'wb') as sink:
        pickle.dump(estimator, sink)
def load_estimator(filename):
    '''Unpickles and returns an sklearn estimator from the given file.

    Args:
      filename: The filename containing the pickled estimator.

    Returns:
      The unpickled estimator.
    '''
    with open(filename, 'rb') as source:
        return pickle.load(source)
def merge_cluster_pair(embeddings, cluster_indices, cluster_distances):
    '''Uses maximum linkage cluster distance to merge the closest pair.

    The distance between two clusters is the maximum, over all cross pairs of
    their members, of 1 - <x, y>. (This is a cosine distance assuming the
    embedding rows are unit-normalized -- TODO confirm against callers.)

    Args:
      embeddings: The tall-skinny matrix whose rows we are clustering.
      cluster_indices: The list of indices of each cluster.
      cluster_distances: The dictionary mapping pairs of cluster keys to their
        cluster distance, with the smaller index occurring first in each
        pairing. It should be passed in empty if it does not contain the full
        list of cluster interaction distances; this routine fills it on the
        first call and incrementally maintains it across subsequent calls.

    Returns:
      The tuple of the two merged cluster indices and their distance.
    '''
    # First call: densely initialize all pairwise cluster distances.
    if len(cluster_distances) == 0:
        distance_start = time.time()
        num_clusters = len(cluster_indices)
        print(' Initializing distances between {} clusters.'.format(
            num_clusters))
        for id0 in range(num_clusters):
            print(' Cluster source {} / {}'.format(id0, num_clusters))
            list0 = cluster_indices[id0]
            embeddings0 = embeddings[list0, :]
            # TODO(Jack Poulson): Incorporate matrix-matrix multiplication
            # to accelerate the inner products via batching. A batch size of
            # 128 or 256 might be appropriate.
            for id1 in range(id0 + 1, num_clusters):
                distance_key = (id0, id1)
                list1 = cluster_indices[id1]
                if max(len(list0), len(list1)) < 1000:
                    # Both clusters are small: form all cross inner products
                    # with a single matrix product.
                    embeddings1 = embeddings[list1, :]
                    inner_products = embeddings0 @ embeddings1.transpose()
                    min_inner_product = np.amin(inner_products)
                    max_distance = 1.0 - min_inner_product
                else:
                    # A cluster is large: process one member of list1 at a
                    # time to bound the temporary inner-product array.
                    max_distance = 0
                    for ind1 in list1:
                        ind1_inner_products = (
                            embeddings0 @ embeddings[ind1, :].transpose())
                        # Get the minimum inner product between ind1 and list0.
                        min_ind1_inner_product = min(ind1_inner_products)
                        # Get the maximum distance between ind1 and list0.
                        max_ind1_distance = 1.0 - min_ind1_inner_product
                        max_distance = max(max_distance, max_ind1_distance)
                cluster_distances[distance_key] = max_distance
        distance_end = time.time()
        distance_time = distance_end - distance_start
        print('Distance initialization took {} seconds.'.format(distance_time))
    # Greedily pick the pair of clusters with minimum linkage distance.
    merge_pair = min(cluster_distances, key=cluster_distances.get)
    merge_parent, merge_child = merge_pair
    if merge_parent >= merge_child:
        raise Exception('Expected distance keys to also have minimum first.')
    min_distance = cluster_distances[merge_pair]
    # Incrementally repair the distance dictionary: every pair touching either
    # merged cluster must be combined into a single max-linkage entry.
    old_pairs = list(cluster_distances.keys())
    for pair in old_pairs:
        parent, child = pair
        collision_with_merge = (parent == merge_parent or parent == merge_child
                                or child == merge_parent
                                or child == merge_child)
        if not collision_with_merge:
            continue
        # This interaction was already merged.
        if not pair in cluster_distances:
            continue
        # We can analytically update the cluster distances.
        if pair == merge_pair:
            # We will be merging these two clusters; the second of the two
            # will no longer exist.
            del cluster_distances[pair]
        elif parent == merge_parent:
            # merge_parent = parent < child, but we do not know the relation
            # between child and merge_child.
            other_pair = (min(merge_child, child), max(merge_child, child))
            if other_pair in cluster_distances:
                # The minimum index matches with the index being merged into. We
                # need to update this case to:
                #
                #   max(cluster_distances(merge_parent, child),
                #       cluster_distances(merge_child, child)),
                #
                # and then delete cluster_distances(merge_child, child).
                cluster_distances[pair] = max(cluster_distances[pair],
                                              cluster_distances[other_pair])
                del cluster_distances[other_pair]
        elif child == merge_parent:
            # parent < child = merge_parent < merge_child.
            other_pair = (parent, merge_child)
            if other_pair in cluster_distances:
                cluster_distances[pair] = max(cluster_distances[pair],
                                              cluster_distances[other_pair])
                del cluster_distances[other_pair]
        elif parent == merge_child:
            # merge_parent < merge_child = parent < child.
            # NOTE(review): this branch keeps the (merge_child, child) key and
            # deletes (merge_parent, child), so a key referencing the retired
            # merge_child cluster can survive depending on which pair is
            # visited first in 'old_pairs' -- verify whether the surviving key
            # should instead be rewritten under merge_parent.
            other_pair = (merge_parent, child)
            if other_pair in cluster_distances:
                cluster_distances[pair] = max(cluster_distances[pair],
                                              cluster_distances[other_pair])
                del cluster_distances[other_pair]
        elif child == merge_child:
            # parent < merge_child = child, but do not generally know the
            # ordering of merge_parent and parent.
            other_pair = (min(merge_parent, parent), max(merge_parent, parent))
            if other_pair in cluster_distances:
                cluster_distances[pair] = max(cluster_distances[pair],
                                              cluster_distances[other_pair])
                del cluster_distances[other_pair]
        else:
            raise Exception('Impossible scenario.')
    return (merge_parent, merge_child, min_distance)
def get_cluster_indices(cluster_labels):
    '''Converts the cluster labels into a list of each cluster's indices.

    Args:
      cluster_labels: The mapping from the item index to the cluster index.

    Returns:
      The indices of each cluster.
    '''
    num_clusters = np.amax(cluster_labels) + 1
    buckets = [[] for _ in range(num_clusters)]
    for item, label in enumerate(cluster_labels):
        buckets[label].append(item)
    return buckets
def get_cluster_merge_tree(embeddings, cluster_labels):
    '''Uses maximum linkage cluster distance to form merge tree.

    Args:
      embeddings: The tall-skinny matrix whose rows we are clustering.
      cluster_labels: The mapping from the item index to the cluster index.

    Returns:
      The tree representing the cluster merging process and the indices of
      each cluster.
    '''
    num_clusters = np.amax(cluster_labels) + 1
    num_merges = num_clusters - 1
    cluster_indices = get_cluster_indices(cluster_labels)
    # Record the greedy sequence of maximum-linkage merges; the distance
    # dictionary is filled on the first call and maintained thereafter.
    cluster_distances = {}
    merges = []
    for merge_number in range(num_merges):
        print(' Performing merge {} of {}'.format(merge_number, num_merges))
        parent_cluster, child_cluster, _ = merge_cluster_pair(
            embeddings, cluster_indices, cluster_distances)
        merges.append((parent_cluster, child_cluster))
    # Convert the merge sequence into parent/children links.
    merge_tree = [{'children': [], 'parent': -1} for _ in range(num_clusters)]
    for parent, child in merges:
        merge_tree[parent]['children'].append(child)
        merge_tree[child]['parent'] = parent
    return merge_tree, cluster_indices
# TODO(Jack Poulson): Accumulate using a helper function rather than performing
# expensive sequences of concatenations.
def get_cluster_subtree(merge_tree, subtree_root):
    '''Returns the list of clusters in a cluster subtree.

    Args:
      merge_tree: The tree structure representing the cluster tree.
      subtree_root: The cluster id to use as the root of the subtree.

    Returns:
      The post-ordered list of clusters in the subtree with the given root.
    '''
    subtree = []
    for child in merge_tree[subtree_root]['children']:
        subtree.extend(get_cluster_subtree(merge_tree, child))
    subtree.append(subtree_root)
    return subtree
def get_cluster_subtree_indices(merge_tree, cluster_indices, subtree_root):
    '''Returns the list of item indices in a cluster subtree.

    Args:
      merge_tree: The tree structure representing the cluster tree.
      cluster_indices: The list of item indices of each cluster.
      subtree_root: The cluster id to use as the root of the subtree.

    Returns:
      The post-ordered list of indices in the entire cluster subtree with the
      given root.
    '''
    return [
        index for cluster in get_cluster_subtree(merge_tree, subtree_root)
        for index in cluster_indices[cluster]
    ]
def get_skeleton(embeddings,
                 indices,
                 max_samples,
                 threshold,
                 skeleton_type=MAX_DISTANCE_SKELETON,
                 verbose=False):
    '''Returns a representative subset of a submatrix of the embeddings.

    Args:
      embeddings: The matrix whose rows are the item embeddings.
      indices: The index set of rows to find a skeleton for.
      max_samples: The maximum number of samples for forming the skeleton.
      threshold: In the case of a QR skeleton, this is the minimum allowed
        absolute value of a diagonal entry of 'R' from QR for its
        corresponding entry to be allowed in the sample. For a maximum-
        distance skeleton, this is the minimum allowable cosine-similarity
        distance of two samples.
      skeleton_type: Either MAX_DISTANCE_SKELETON (greedy farthest-point
        sampling) or QR_SKELETON (column-pivoted QR).
      verbose: Whether information should be printed about the skeleton
        generation process.

    Returns:
      The skeleton (global) indices representing/spanning the given subset.
    '''
    if skeleton_type == MAX_DISTANCE_SKELETON:
        num_indices = len(indices)
        embeddings_subset = embeddings[indices, :].copy()
        # Randomly choose the first point.
        skeleton_indices = indices.copy()
        pivot = np.random.randint(low=0, high=num_indices)
        num_kept = 1
        # Pivot the chosen point into first position.
        embeddings_subset[[0, pivot], :] = embeddings_subset[[pivot, 0], :]
        skeleton_indices[0], skeleton_indices[pivot] = (
            skeleton_indices[pivot], skeleton_indices[0])
        # Initialize the distances of the remaining points from the sample.
        # We can lazily delete entries by setting their distance to a value
        # below zero (i.e., -1).
        distances = -1 * np.ones((num_indices, ))
        for i in range(1, num_indices):
            inner_product = np.inner(embeddings_subset[0, :],
                                     embeddings_subset[i, :])
            distances[i] = 1. - inner_product
        # Sample until we exhaust our allowance or all remaining points are
        # sufficiently close to the existing sample.
        for i in range(1, max_samples):
            pivot = np.argmax(distances)
            pivot_distance = distances[pivot]
            if verbose:
                print('Pivot distance: {}'.format(pivot_distance))
            if pivot_distance < threshold:
                if verbose:
                    print('Early-exiting because {} < {}'.format(
                        pivot_distance, threshold))
                break
            num_kept += 1
            embeddings_subset[[i, pivot], :] = embeddings_subset[[pivot, i], :]
            skeleton_indices[i], skeleton_indices[pivot] = (
                skeleton_indices[pivot], skeleton_indices[i])
            distances[i], distances[pivot] = distances[pivot], distances[i]
            # The newly-kept point joins the sample; lazily delete its slot.
            distances[i] = -1
            # Update the remaining distances against the newly-kept point.
            for j in range(i + 1, num_indices):
                inner_product = np.inner(embeddings_subset[i, :],
                                         embeddings_subset[j, :])
                distances[j] = min(distances[j], 1. - inner_product)
        del skeleton_indices[num_kept:]
        return skeleton_indices
    else:
        submatrix_trans = embeddings[indices, :].transpose().copy()
        num_rows, num_columns = submatrix_trans.shape
        max_samples = min(max_samples, min(num_rows, num_columns))
        _, R, P = scipy.linalg.qr(submatrix_trans,
                                  mode='economic',
                                  pivoting=True)
        # Accept pivots until the diagonal of R drops below the threshold.
        num_samples = 0
        for i in range(max_samples):
            if np.abs(R[i, i]) < threshold:
                if verbose:
                    print(' Early-exiting because |{}| < {}'.format(
                        np.abs(R[i, i]), threshold))
                break
            num_samples += 1
        # 'indices' may be a plain list, so we cannot use the syntax
        # 'indices[P[:num_samples]]'.
        skeleton_indices = [indices[P[i]] for i in range(num_samples)]
        return skeleton_indices
def main():
    '''Runs a tiny end-to-end factorization demo on a 10 x 10 sparse matrix.'''
    dtype = np.float32
    entries = {(0, 0): 10, (4, 6): 3, (8, 9): -2}
    A_dok_matrix = dok_matrix((10, 10), dtype=dtype)
    for position, value in entries.items():
        A_dok_matrix[position] = value
    left_factor, right_factor = generate_embeddings(
        A_dok_matrix.tocsr(),
        rank=3,
        zero_weight=0,
        frobenius_regularization=1e-3,
        num_iterations=10)
    prediction = left_factor @ right_factor.transpose()
    print('Left factor: {}'.format(left_factor))
    print('Right factor: {}'.format(right_factor))
    print('Prediction: {}'.format(prediction))


if __name__ == '__main__':
    main()