# base.py
# Author: Jacob Schreiber <jmschreiber91@gmail.com> 

"""
This file contains code that implements the core of the submodular selection
algorithms.
"""

import numpy
from tqdm import tqdm

from ..optimizers import BaseOptimizer
from ..optimizers import NaiveGreedy
from ..optimizers import LazyGreedy
from ..optimizers import ApproximateLazyGreedy
from ..optimizers import TwoStageGreedy
from ..optimizers import StochasticGreedy
from ..optimizers import BidirectionalGreedy
from ..optimizers import GreeDi
from ..optimizers import SieveGreedy
from ..optimizers import OPTIMIZERS

from ..utils import PriorityQueue
from ..utils import check_random_state
from ..utils import _calculate_pairwise_distances

from scipy.sparse import csr_matrix


class BaseSelection(object):
	"""The base selection object.

	This object defines the structures that all submodular selection algorithms
	should follow. All algorithms will have the same public methods and the
	same attributes.

	Parameters
	----------
	n_samples : int
		The number of samples to return.

	initial_subset : list, numpy.ndarray or None, optional
		If provided, this should be a list of indices into the data matrix
		to use as the initial subset, or a group of examples that may not
		be in the provided data that should be used as the initial subset.
		If indices, the provided array should be one-dimensional. If a
		group of examples, the data should be two-dimensional.

	optimizer : string or optimizers.BaseOptimizer, optional
		The optimization approach to use for the selection. Default is
		'lazy', which makes selections using the lazy (or accelerated)
		greedy algorithm. Must be one of

			'naive' : the naive greedy algorithm
			'lazy' : the lazy (or accelerated) greedy algorithm
			'approximate-lazy' : the approximate lazy greedy algorithm
			'two-stage' : starts with naive and switches to lazy
			'stochastic' : the stochastic greedy algorithm
			'greedi' : the GreeDi distributed algorithm
			'bidirectional' : the bidirectional greedy algorithm

		or an optimizer object.

	optimizer_kwds : dict, optional
		A dictionary of arguments to pass into the optimizer object. The
		keys of this dictionary should be the names of the parameters in
		the optimizer and the values in the dictionary should be the values
		that these parameters take. Default is {}.

	reservoir : numpy.ndarray or None
		The reservoir to use when calculating gains in the sieve greedy
		streaming optimization algorithm in the `partial_fit` method.
		Currently only used for graph-based functions. If a numpy array
		is passed in, it will be used as the reservoir. If None is passed
		in, reservoir sampling will be used to collect one. Default is None.

	max_reservoir_size : int 
		The maximum size that the reservoir can take. If a reservoir is passed
		in, this value is set to the size of that array. Default is 1000.

	n_jobs : int
		The number of threads to use when performing computation in parallel.
		Currently, this parameter is exposed but does not actually do anything.
		This will be fixed soon.

	random_state : int or RandomState or None, optional
		The random seed to use for the random selection process. Only used
		for stochastic greedy.

	verbose : bool
		Whether to print output during the selection process.

	Attributes
	----------
	n_samples : int
		The number of samples to select.

	ranking : numpy.array int
		The selected samples in the order they were chosen, with the first
		number in the ranking corresponding to the index of the first
		sample selected by the greedy procedure.

	gains : numpy.array float
		The gain of each sample in the returned set when it was added to the
		growing subset. The first number corresponds to the gain of the first
		added sample, the second corresponds to the gain of the second added
		sample, and so forth.
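
	Examples
	--------
	A minimal usage sketch. `BaseSelection` itself is abstract, so this
	assumes the concrete `FeatureBasedSelection` subclass exported by
	apricot:

	>>> import numpy
	>>> from apricot import FeatureBasedSelection
	>>> X = numpy.random.RandomState(0).uniform(0, 1, size=(100, 5))
	>>> selector = FeatureBasedSelection(n_samples=10, optimizer='lazy')
	>>> X_subset = selector.fit_transform(X)
	>>> X_subset.shape
	(10, 5)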
	"""

	def __init__(self, n_samples, initial_subset=None, optimizer='lazy', 
		optimizer_kwds={}, reservoir=None, max_reservoir_size=1000, 
		n_jobs=1, random_state=None, verbose=False):
		if n_samples <= 0:
			raise ValueError("n_samples must be a positive value.")

		if not isinstance(initial_subset, (list, numpy.ndarray)) and initial_subset is not None: 
			raise ValueError("initial_subset must be a list, numpy array, or None")
		if isinstance(initial_subset, (list, numpy.ndarray)):
			initial_subset = numpy.array(initial_subset)

		if not isinstance(optimizer, BaseOptimizer):
			if optimizer not in OPTIMIZERS:
				raise ValueError("optimizer must be an optimizer object or " \
					"one of {}.".format(list(OPTIMIZERS.keys())))

		if isinstance(optimizer, BaseOptimizer):
			optimizer.function = self

		if verbose not in (True, False):
			raise ValueError("verbose must be True or False")

		self.n_samples = n_samples
		self.metric = 'ignore'
		self.random_state = check_random_state(random_state)
		self.optimizer = optimizer
		self.optimizer_kwds = optimizer_kwds
		self.n_jobs = n_jobs
		self.verbose = verbose
		self.initial_subset = initial_subset

		self.ranking = None
		self.idxs = None
		self.gains = None
		self.subset = None
		self.sparse = None
		self._X = None
		
		self.sieve_current_values_ = None
		self.n_seen_ = 0
		self.reservoir_size = 0 if reservoir is None else reservoir.shape[0]
		self.reservoir = reservoir
		self.max_reservoir_size = max_reservoir_size if reservoir is None else reservoir.shape[0]
		self.update_reservoir_ = reservoir is None

	def fit(self, X, y=None, sample_weight=None, sample_cost=None):
		"""Run submodular optimization to select a subset of examples.

		This method is a wrapper for the full submodular optimization process.
		It takes in some data set (and optionally labels that are ignored
		during this process) and selects `n_samples` from it in the greedy
		manner specified by the optimizer.

		This method will return the selector object itself, not the transformed
		data set. The `transform` method will then transform a data set to the
		selected points, or alternatively one can use the ranking stored in
		the `self.ranking` attribute. The `fit_transform` method will perform
		both optimization and selection and return the selected items.

		Parameters
		----------
		X : list or numpy.ndarray, shape=(n, d)
			The data set to transform. Must be numeric.

		y : list or numpy.ndarray or None, shape=(n,), optional
			The labels associated with each example. Ignored during the fit
			step but accepted to maintain compatibility with sklearn
			pipelines. Default is None.

		sample_weight : list or numpy.ndarray or None, shape=(n,), optional
			The weight of each example. Currently ignored in apricot but
			included to maintain compatibility with sklearn pipelines. 

		sample_cost : list or numpy.ndarray or None, shape=(n,), optional
			The cost of each item. If set, indicates that optimization should
			be performed with respect to a knapsack constraint.

		Returns
		-------
		self : BaseSelection
			The fit step returns this selector object.
		"""

		allowed_dtypes = (list, numpy.ndarray, csr_matrix)

		if not isinstance(X, allowed_dtypes):
			raise ValueError("X must be either a list of lists, a 2D numpy " \
				"array, or a scipy.sparse.csr_matrix.")
		if isinstance(X, numpy.ndarray) and len(X.shape) != 2:
			raise ValueError("X must have exactly two dimensions.")
		if numpy.min(X) < 0.0 and numpy.max(X) > 0.:
			raise ValueError("X cannot contain both negative and positive " \
				"values; it must be entirely non-negative or entirely " \
				"non-positive.")
		if self.n_samples > X.shape[0]:
			raise ValueError("Cannot select more examples than the number in" \
				" the data set.")

		if not self.sparse:
			if X.dtype != 'float64':
				X = X.astype('float64')

		if isinstance(self.optimizer, str):
			optimizer = OPTIMIZERS[self.optimizer](function=self, 
				verbose=self.verbose, random_state=self.random_state,
				**self.optimizer_kwds)
		else:
			optimizer = self.optimizer

		self._X = X if self._X is None else self._X
		self._initialize(X)

		if self.verbose:
			self.pbar = tqdm(total=self.n_samples, unit_scale=True)

		optimizer.select(X, self.n_samples, sample_cost=sample_cost)

		if self.verbose:
			self.pbar.close()

		self.ranking = numpy.array(self.ranking)
		self.gains = numpy.array(self.gains)
		return self

	def partial_fit(self, X, y=None, sample_weight=None, sample_cost=None):
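		"""Run streaming submodular optimization to select a subset.

		This method implements streaming selection using the sieve greedy
		algorithm. Unlike `fit`, which requires the full data set up front,
		this method can be called repeatedly on successive batches of data
		and maintains a running selection across calls. If the current
		optimizer is not already a `SieveGreedy` object, one is created.

		Parameters
		----------
		X : list or numpy.ndarray, shape=(n, d)
			The next batch of examples. Must be numeric.

		y : list or numpy.ndarray or None, shape=(n,), optional
			The labels associated with each example. Ignored during the
			selection but accepted to maintain compatibility with sklearn
			pipelines. Default is None.

		sample_weight : list or numpy.ndarray or None, shape=(n,), optional
			The weight of each example. Currently ignored in apricot but
			included to maintain compatibility with sklearn pipelines.

		sample_cost : list or numpy.ndarray or None, shape=(n,), optional
			The cost of each item. If set, indicates that optimization should
			be performed with respect to a knapsack constraint.

		Returns
		-------
		self : BaseSelection
			The partial fit step returns this selector object.
		"""
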
		allowed_dtypes = (list, numpy.ndarray, csr_matrix)

		if not isinstance(X, allowed_dtypes):
			raise ValueError("X must be either a list of lists, a 2D numpy " \
				"array, or a scipy.sparse.csr_matrix.")
		if isinstance(X, numpy.ndarray) and len(X.shape) != 2:
			raise ValueError("X must have exactly two dimensions.")

		if not self.sparse:
			if X.dtype != 'float64':
				X = X.astype('float64')

		if not isinstance(self.optimizer, SieveGreedy):
			self.optimizer = OPTIMIZERS['sieve'](function=self, 
				verbose=self.verbose, random_state=self.random_state,
				**self.optimizer_kwds)

		self._X = X if self._X is None else self._X
		self._initialize(X)

		if self.verbose:
			self.pbar = tqdm(total=self.n_samples, unit_scale=True)

		self.optimizer.select(X, self.n_samples, sample_cost=sample_cost)

		if self.verbose:
			self.pbar.close()

		self.ranking = numpy.array(self.ranking)
		self.gains = numpy.array(self.gains)
		self._X = None
		return self

	def transform(self, X, y=None, sample_weight=None):
		"""Transform a data set to include only the selected examples.

		This method will return a selection of X and optionally selections
		of y and sample_weight. The default setting is to select items based
		on the ranking determined in the `fit` step with examples in the same
		order as that ranking. Optionally, the whole data set can be returned,
		with the weights corresponding to samples that were not selected set
		to 0. This setting can be controlled by setting `pipeline=True`. 

		Parameters
		----------
		X : list or numpy.ndarray, shape=(n, d)
			The data set to transform. Must be numeric.

		y : list or numpy.ndarray or None, shape=(n,), optional
			The labels to transform. If passed in this function will return
			both the data and the corresponding labels for the rows that have
			been selected. Default is None. 

		sample_weight : list or numpy.ndarray or None, shape=(n,), optional
			The sample weights to transform. If passed in, this function will
			return the selected sample weights along with the selected data
			and labels (or None if no labels were passed in). Default is None.

		Returns
		-------
		X_subset : numpy.ndarray, shape=(n_samples, d)
			A subset of the data such that n_samples <= n, where n_samples is
			the integer provided at initialization.

		y_subset : numpy.ndarray, shape=(n_samples,), optional
			The labels that match with the indices of the samples if y is
			passed in. Only returned if passed in.

		sample_weight_subset : numpy.ndarray, shape=(n_samples,), optional
			The weight of each example.
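
		Examples
		--------
		A minimal sketch, assuming a concrete subclass (here apricot's
		`FeatureBasedSelection`) has already been fit:

		>>> import numpy
		>>> from apricot import FeatureBasedSelection
		>>> X = numpy.random.RandomState(0).uniform(0, 1, size=(100, 5))
		>>> y = numpy.arange(100)
		>>> selector = FeatureBasedSelection(n_samples=10).fit(X)
		>>> X_subset, y_subset = selector.transform(X, y)
		>>> X_subset.shape, y_subset.shape
		((10, 5), (10,))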
		"""

		r = self.ranking

		if sample_weight is not None:
			if y is None:
				return X[r], None, sample_weight[r]
			else:
				return X[r], y[r], sample_weight[r]

		else:
			if y is None:
				return X[r]
			else:
				return X[r], y[r]

	def fit_transform(self, X, y=None, sample_weight=None, sample_cost=None):
		"""Run optimization and select a subset of examples.

		This method will first perform the `fit` step and then perform the
		`transform` step, returning a transformed data set. 

		Parameters
		----------
		X : list or numpy.ndarray, shape=(n, d)
			The data set to transform. Must be numeric.

		y : list or numpy.ndarray or None, shape=(n,), optional
			The labels to transform. If passed in this function will return
			both the data and the corresponding labels for the rows that have
			been selected. Default is None. 

		sample_weight : list or numpy.ndarray or None, shape=(n,), optional
			The sample weights to transform. If passed in, this function will
			return the selected sample weights along with the selected data
			and labels (or None if no labels were passed in). Default is None.

		sample_cost : list or numpy.ndarray or None, shape=(n,), optional
			The cost of each item. If set, indicates that optimization should
			be performed with respect to a knapsack constraint.

		Returns
		-------
		X_subset : numpy.ndarray, shape=(n_samples, d)
			A subset of the data such that n_samples <= n, where n_samples is
			the integer provided at initialization.

		y_subset : numpy.ndarray, shape=(n_samples,), optional
			The labels that match with the indices of the samples if y is
			passed in. Only returned if passed in.

		sample_weight_subset : numpy.ndarray, shape=(n_samples,), optional
			The weight of each example.
		"""

		return self.fit(X, y=y, sample_weight=sample_weight, 
			sample_cost=sample_cost).transform(X, y=y, 
			sample_weight=sample_weight)

	def _initialize(self, X, idxs=None):
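		"""Reset the internal bookkeeping before a round of selection.

		This sets up the ranking, the per-selection gains, the mask of
		already-selected examples, and the running `current_values` array
		used when computing gains, and validates any user-provided initial
		subset.
		"""
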
		n, d = X.shape
		self._X = X if self._X is None else self._X

		self.sparse = isinstance(X, csr_matrix)
		self.ranking = []
		self.gains = []
		self.subset = numpy.zeros((0, self._X.shape[1]), dtype='float64')

		self.current_values = numpy.zeros(d, dtype='float64')
		self.current_concave_values = numpy.zeros(d, dtype='float64')
		self.mask = numpy.zeros(n, dtype='int8')

		if self.initial_subset is not None:
			if self.initial_subset.ndim == 1:
				if self.initial_subset.dtype == bool:
					self.initial_subset = numpy.where(self.initial_subset == 1)[0]
				
				if len(self.initial_subset) + self.n_samples > X.shape[0]:
					raise ValueError("When using indices for the initial " \
						"subset, the number of samples to select must be " \
						"at most the size of the data set minus the size " \
						"of the initial subset, i.e., n_samples <= " \
						"X.shape[0] - initial_subset.shape[0].")

				if self.initial_subset.max() >= X.shape[0]:
					raise ValueError("When passing in integer indices for the " \
						"initial subset, the maximum index must be smaller " \
						"than the size of the data set.")
				elif self.initial_subset.min() < 0:
					raise ValueError("When passing in integer indices for the " \
						"initial subset, the minimum index cannot be negative.")
				
				self.mask[self.initial_subset] = 1

		self.idxs = numpy.where(self.mask == 0)[0]

	def _calculate_gains(self, X, idxs=None):
		raise NotImplementedError

	def _calculate_sieve_gains(self, X, thresholds, idxs):
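		"""Allocate or grow the per-threshold state used by sieve greedy.

		The sieve greedy streaming algorithm maintains one candidate subset
		per threshold. This method lazily allocates the arrays holding the
		current values, selections, gains, and stored subsets for each
		threshold, and extends them when new thresholds are added.
		"""
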
		n = X.shape[0]
		d = X.shape[1] if self.reservoir is None else self.max_reservoir_size
		l = len(thresholds)

		if self.sieve_current_values_ is None:
			self.sieve_current_values_ = numpy.zeros((l, d), 
				dtype='float64')
			self.sieve_selections_ = numpy.zeros((l, self.n_samples), 
				dtype='int64') - 1
			self.sieve_gains_ = numpy.zeros((l, self.n_samples), 
				dtype='float64') - 1
			self.sieve_n_selected_ = numpy.zeros(l, 
				dtype='int64')
			self.sieve_total_gains_ = numpy.zeros(l, 
				dtype='float64')
			self.sieve_subsets_ = numpy.zeros((l, self.n_samples, 
				self._X.shape[1]), dtype='float64')
		else:
			j = l - self.sieve_current_values_.shape[0]
			if j > 0:
				self.sieve_current_values_ = numpy.vstack([
					self.sieve_current_values_, numpy.zeros((j, d), 
						dtype='float64')])
				self.sieve_selections_ = numpy.vstack([
					self.sieve_selections_, numpy.zeros((j, self.n_samples), 
						dtype='int64') - 1])
				self.sieve_gains_ = numpy.vstack([self.sieve_gains_, 
					numpy.zeros((j, self.n_samples), dtype='float64')])
				self.sieve_n_selected_ = numpy.concatenate([
					self.sieve_n_selected_, numpy.zeros(j, dtype='int64')])
				self.sieve_total_gains_ = numpy.concatenate([
					self.sieve_total_gains_, numpy.zeros(j, dtype='float64')])
				self.sieve_subsets_ = numpy.concatenate([self.sieve_subsets_, 
					numpy.zeros((j, self.n_samples, self._X.shape[1]), 
						dtype='float64')])

	def _select_next(self, X, gain, idx):
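		"""Record the selection of example `idx` with the given gain.

		This appends the index and its gain to the growing ranking, marks
		the example as selected in the mask, and appends its feature vector
		to the stored subset unless a precomputed similarity matrix is
		being used.
		"""
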
		self.ranking.append(idx)
		self.gains.append(gain)
		self.mask[idx] = True
		self.idxs = numpy.where(self.mask == 0)[0]

		if self.sparse:
			X = self._X[idx:idx+1].toarray()
		else:
			X = self._X[idx:idx+1]

		if self.metric != 'precomputed':
			self.subset = numpy.concatenate([self.subset, X])


class BaseGraphSelection(BaseSelection):
	"""The base graph selection object.

	This object defines the structures that all submodular selection algorithms
	should follow if they operate on a graph, such as pairwise similarity 
	measurements. All algorithms will have the same public methods and the same 
	attributes.

	NOTE: All pairwise values in your data must be positive for these
	selection methods to work.

	This implementation allows users to pass in either their own symmetric
	square matrix of similarity values, or a data matrix as normal and a function
	that calculates these pairwise values.

	Parameters
	----------
	n_samples : int
		The number of samples to return.

	metric : str
		The method for converting a data matrix into a square symmetric matrix
		of pairwise similarities. If a string, can be any of the metrics
		implemented in sklearn (see https://scikit-learn.org/stable/modules/
		generated/sklearn.metrics.pairwise_distances.html), including
		"precomputed" if one has already generated a similarity matrix. Note
		that sklearn calculates distance matrices whereas apricot operates on
		similarity matrices, and so a distances.max() - distances transformation
		is performed on the resulting distances. For backward compatibility,
		'corr' will be read as 'correlation'.

	initial_subset : list, numpy.ndarray or None
		If provided, this should be a list of indices into the data matrix
		to use as the initial subset, or a group of examples that may not
		be in the provided data that should be used as the initial subset.
		If indices, the provided array should be one-dimensional. If a
		group of examples, the data should be two-dimensional.

	optimizer : string or optimizers.BaseOptimizer, optional
		The optimization approach to use for the selection. Default is
		'two-stage', which makes selections using the naive greedy algorithm
		initially and then switches to the lazy greedy algorithm. Must be
		one of

			'naive' : the naive greedy algorithm
			'lazy' : the lazy (or accelerated) greedy algorithm
			'approximate-lazy' : the approximate lazy greedy algorithm
			'two-stage' : starts with naive and switches to lazy
			'stochastic' : the stochastic greedy algorithm
			'greedi' : the GreeDi distributed algorithm
			'bidirectional' : the bidirectional greedy algorithm

		or an optimizer object.

	optimizer_kwds : dict, optional
		A dictionary of arguments to pass into the optimizer object. The
		keys of this dictionary should be the names of the parameters in
		the optimizer and the values in the dictionary should be the values
		that these parameters take. Default is {}.

	n_neighbors : int or None
		When constructing a similarity matrix, the number of nearest neighbors
		whose similarity values will be kept. The result is a sparse similarity
		matrix which can significantly speed up computation at the cost of
		accuracy. Default is None.

	reservoir : numpy.ndarray or None
		The reservoir to use when calculating gains in the sieve greedy
		streaming optimization algorithm in the `partial_fit` method.
		Currently only used for graph-based functions. If a numpy array
		is passed in, it will be used as the reservoir. If None is passed
		in, reservoir sampling will be used to collect one. Default is None.

	max_reservoir_size : int 
		The maximum size that the reservoir can take. If a reservoir is passed
		in, this value is set to the size of that array. Default is 1000.

	n_jobs : int
		The number of threads to use when performing computation in parallel.
		Currently, this parameter is exposed but does not actually do anything.
		This will be fixed soon.

	random_state : int or RandomState or None, optional
		The random seed to use for the random selection process. Only used
		for stochastic greedy.

	verbose : bool
		Whether to print output during the selection process.

	Attributes
	----------
	n_samples : int
		The number of samples to select.

	metric : callable
		A function that takes in a data matrix and converts it to a square
		symmetric matrix.

	ranking : numpy.array int
		The selected samples in the order they were chosen.

	gains : numpy.array float
		The gain of each sample in the returned set when it was added to the
		growing subset. The first number corresponds to the gain of the first
		added sample, the second corresponds to the gain of the second added
		sample, and so forth.
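
	Examples
	--------
	A minimal usage sketch. `BaseGraphSelection` itself is abstract, so this
	assumes the concrete `FacilityLocationSelection` subclass exported by
	apricot:

	>>> import numpy
	>>> from apricot import FacilityLocationSelection
	>>> X = numpy.random.RandomState(0).uniform(0, 1, size=(100, 5))
	>>> selector = FacilityLocationSelection(n_samples=10, metric='euclidean')
	>>> X_subset = selector.fit_transform(X)
	>>> X_subset.shape
	(10, 5)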
	"""

	def __init__(self, n_samples, metric='euclidean', 
		initial_subset=None, optimizer='two-stage', optimizer_kwds={},
		n_neighbors=None, reservoir=None, max_reservoir_size=1000, 
		n_jobs=1, random_state=None, verbose=False):

		super(BaseGraphSelection, self).__init__(n_samples=n_samples, 
			initial_subset=initial_subset, optimizer=optimizer, 
			optimizer_kwds=optimizer_kwds, reservoir=reservoir, 
			max_reservoir_size=max_reservoir_size, n_jobs=n_jobs, 
			random_state=random_state, verbose=verbose)

		# 'corr' is accepted as an alias for 'correlation' for backward
		# compatibility. A naive str.replace would corrupt 'correlation'
		# itself ('corr' -> 'correlation' yields 'correlationelation').
		self.metric = 'correlation' if metric == 'corr' else metric
		self.n_neighbors = n_neighbors


	def fit(self, X, y=None, sample_weight=None, sample_cost=None):
		"""Run submodular optimization to select a subset of examples.

		This method is a wrapper for the full submodular optimization process.
		It takes in some data set (and optionally labels that are ignored
		during this process) and selects `n_samples` from it in the greedy
		manner specified by the optimizer.

		This method will return the selector object itself, not the transformed
		data set. The `transform` method will then transform a data set to the
		selected points, or alternatively one can use the ranking stored in
		the `self.ranking` attribute. The `fit_transform` method will perform
		both optimization and selection and return the selected items.

		Parameters
		----------
		X : list or numpy.ndarray, shape=(n, d)
			The data set to transform. Must be numeric.

		y : list or numpy.ndarray or None, shape=(n,), optional
			The labels associated with each example. Ignored during the fit
			step but accepted to maintain compatibility with sklearn
			pipelines. Default is None.

		sample_weight : list or numpy.ndarray or None, shape=(n,), optional
			The weight of each example. Currently ignored in apricot but
			included to maintain compatibility with sklearn pipelines. 

		sample_cost : list or numpy.ndarray or None, shape=(n,), optional
			The cost of each item. If set, indicates that optimization should
			be performed with respect to a knapsack constraint.

		Returns
		-------
		self : BaseGraphSelection
			The fit step returns this selector object.
		"""

		if isinstance(X, csr_matrix) and self.metric not in ("precomputed", "ignore"):
			raise ValueError("When passing in a sparse matrix, it must be a " \
				"precomputed similarity matrix; otherwise, pass in a dense " \
				"feature matrix.")
		if self.metric == 'precomputed' and X.shape[0] != X.shape[1]:
			raise ValueError("Precomputed similarity matrices " \
				"must be square and symmetric.")

		X_pairwise = _calculate_pairwise_distances(X, metric=self.metric, 
			n_neighbors=self.n_neighbors)
	
		self._X = X
		return super(BaseGraphSelection, self).fit(X_pairwise, y=y,
			sample_weight=sample_weight, sample_cost=sample_cost)

	def partial_fit(self, X, y=None, sample_weight=None, sample_cost=None):
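		"""Run streaming selection on a batch using a sampled reservoir.

		Graph-based functions need pairwise similarities, which are not
		available in full in a streaming setting. This method maintains a
		reservoir of previously seen examples (collected via reservoir
		sampling unless one was provided at initialization), computes the
		similarities between the incoming batch and that reservoir, and
		passes the result to the base streaming selection.
		"""
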
		if self.reservoir is None:
			self.reservoir = numpy.empty((self.max_reservoir_size, X.shape[1]))

		if self.update_reservoir_:
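			# Reservoir sampling (Algorithm R): the first max_reservoir_size
			# examples fill the reservoir; each later example overwrites a
			# uniformly chosen slot with decreasing probability.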
			for i in range(X.shape[0]):
				if self.reservoir_size < self.max_reservoir_size:
					self.reservoir[self.reservoir_size] = X[i]
					self.reservoir_size += 1
				else:
					# The incoming example is the (n_seen_ + i + 1)-th seen
					# overall, so draw uniformly over that many slots.
					r = self.random_state.choice(self.n_seen_ + i + 1)
					if r < self.max_reservoir_size:
						self.reservoir[r] = X[i]

		X_pairwise = _calculate_pairwise_distances(X, 
			Y=self.reservoir[:self.reservoir_size], metric=self.metric)

		self._X = X
		super(BaseGraphSelection, self).partial_fit(X_pairwise, y=y, 
			sample_weight=sample_weight, sample_cost=sample_cost)

		self.current_values = numpy.zeros(self.reservoir_size,
			dtype='float64')
		self.n_seen_ += X.shape[0]

		return self

	def _initialize(self, X_pairwise, idxs=None):
		super(BaseGraphSelection, self)._initialize(X_pairwise, idxs=idxs)

	def _calculate_gains(self, X_pairwise):
		super(BaseGraphSelection, self)._calculate_gains(X_pairwise)

	def _calculate_sieve_gains(self, X, thresholds, idxs):
		super(BaseGraphSelection, self)._calculate_sieve_gains(X, thresholds,
			idxs)

	def _select_next(self, X_pairwise, gain, idx):
		super(BaseGraphSelection, self)._select_next(X_pairwise, gain, idx)