Source code for irspack.recommenders.knn

import enum
from abc import abstractmethod
from typing import Optional, Union

from .._threading import get_n_threads
from ..definitions import InteractionMatrix
from ..optimization.parameter_range import (
    CategoricalRange,
    UniformFloatRange,
    default_tune_range_knn,
    default_tune_range_knn_with_weighting,
)
from ..utils import okapi_BM_25_weight, remove_diagonal, tf_idf_weight
from ._knn import (
    AsymmetricSimilarityComputer,
    CosineSimilarityComputer,
    JaccardSimilarityComputer,
    TverskyIndexComputer,
)
from .base import BaseSimilarityRecommender, RecommenderConfig


class FeatureWeightingScheme(str, enum.Enum):
    NONE = "NONE"
    TF_IDF = "TF_IDF"
    BM_25 = "BM_25"


class BaseKNNConfig(RecommenderConfig):
    shrinkage: float = 0.0
    top_k: int = 100
    n_threads: Optional[int] = None


class BaseKNNRecommender(BaseSimilarityRecommender):
    default_tune_range = default_tune_range_knn

    def __init__(
        self,
        X_train_all: InteractionMatrix,
        shrinkage: float = 0.0,
        top_k: int = 100,
        n_threads: Optional[int] = None,
        feature_weighting: str = "NONE",
        bm25_k1: float = 1.2,
        bm25_b: float = 0.75,
    ):
        super().__init__(X_train_all)
        self.shrinkage = shrinkage
        self.top_k = top_k
        self.feature_weighting = FeatureWeightingScheme(feature_weighting)
        self.bm25_k1 = bm25_k1
        self.bm25_b = bm25_b
        self.n_threads = get_n_threads(n_threads)

    @abstractmethod
    def _create_computer(
        self, X: InteractionMatrix
    ) -> Union[
        CosineSimilarityComputer,
        AsymmetricSimilarityComputer,
        JaccardSimilarityComputer,
        TverskyIndexComputer,
    ]:
        raise NotImplementedError("")

    def _learn(self) -> None:
        if self.feature_weighting == FeatureWeightingScheme.NONE:
            X_weighted = self.X_train_all
        elif self.feature_weighting == FeatureWeightingScheme.TF_IDF:
            X_weighted = tf_idf_weight(self.X_train_all)
        elif self.feature_weighting == FeatureWeightingScheme.BM_25:
            X_weighted = okapi_BM_25_weight(self.X_train_all, self.bm25_k1, self.bm25_b)
        else:
            raise RuntimeError("Unknown weighting scheme.")

        computer = self._create_computer(X_weighted.T)
        self._W = remove_diagonal(
            computer.compute_similarity(self.X_train_all.T, self.top_k)
        ).tocsc()


class BaseCosineKNNConfig(BaseKNNConfig):
    feature_weighting: str = "NONE"
    bm25_k1: float = 1.2
    bm25_b: float = 0.75


class CosineKNNConfig(BaseCosineKNNConfig):
    normalize: bool = False


[docs]class CosineKNNRecommender(BaseKNNRecommender): r"""K-nearest neighbor recommender system based on cosine similarity. That is, the similarity matrix ``W`` is given by (column-wise top-k restricted) .. math:: \mathrm{W}_{i,j} = \begin{cases} \frac{\sum_{u} X_{ui} X_{uj}}{||X_{*i}||_2 ||X_{*j}||_2 + \mathrm{shrinkage}} & (\text{if normalize = True}) \\ \sum_{u} X_{ui} X_{uj} & (\text{if normalize = False}) \end{cases} Args: X_train_all (Union[scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]): Input interaction matrix. shrinkage (float, optional): The shrinkage parameter for regularization. Defaults to 0.0. normalize (bool, optional): Whether to normalize the similarity. Defaults to False. top_k (int, optional): Specifies the maximal number of allowed neighbors. Defaults to 100. feature_weighting (str, optional): Specifies how to weight the feature. Must be one of: - "NONE" : no feature weighting - "TF_IDF" : TF-IDF weighting - "BM_25" : `Okapi BM-25 weighting <https://en.wikipedia.org/wiki/Okapi_BM25>`_ Defaults to "NONE". bm25_k1 (float, optional): The k1 parameter for BM25. Ignored if ``feature_weighting`` is not "BM_25". Defaults to 1.2. bm25_b (float, optional): The b parameter for BM25. Ignored if ``feature_weighting`` is not "BM_25". Defaults to 0.75. n_threads (Optional[int], optional): Specifies the number of threads to use for the computation. If ``None``, the environment variable ``"IRSPACK_NUM_THREADS_DEFAULT"`` will be looked up, and if the variable is not set, it will be set to ``os.cpu_count()``. Defaults to None. """ config_class = CosineKNNConfig default_tune_range = default_tune_range_knn_with_weighting.copy() + [ CategoricalRange("normalize", [False, True]) ]
[docs] def __init__( self, X_train_all: InteractionMatrix, shrinkage: float = 0.0, normalize: bool = False, top_k: int = 100, feature_weighting: str = "NONE", bm25_k1: float = 1.2, bm25_b: float = 0.75, n_threads: Optional[int] = None, ) -> None: super().__init__( X_train_all, shrinkage, top_k, n_threads, feature_weighting=feature_weighting, bm25_k1=bm25_k1, bm25_b=bm25_b, ) self.normalize = normalize
def _create_computer(self, X: InteractionMatrix) -> CosineSimilarityComputer: return CosineSimilarityComputer( X, self.shrinkage, self.normalize, self.n_threads )
class AsymmetricCosineKNNConfig(BaseCosineKNNConfig): alpha: float = 0.5
[docs]class AsymmetricCosineKNNRecommender(BaseKNNRecommender): r"""K-nearest neighbor recommender system based on asymmetric cosine similarity. That is, the similarity matrix ``W`` is given by (column-wise top-k restricted) .. math:: \mathrm{W}_{i,j} = \frac{\sum_{u} X_{ui} X_{uj}}{||X_{*i}||^{2\alpha}_2 ||X_{*j}||^{2(1-\alpha)}_2 + \mathrm{shrinkage}} Args: X_train_all (Union[scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]): Input interaction matrix. shrinkage (float, optional): The shrinkage parameter for regularization. Defaults to 0.0. alpha (bool, optional): Specifies :math:`\alpha`. Defaults to 0.5. top_k (int, optional): Specifies the maximal number of allowed neighbors. Defaults to 100. feature_weighting (str, optional): Specifies how to weight the feature. Must be one of: - "NONE" : no feature weighting - "TF_IDF" : TF-IDF weighting - "BM_25" : `Okapi BM-25 weighting <https://en.wikipedia.org/wiki/Okapi_BM25>`_ Defaults to "NONE". bm25_k1 (float, optional): The k1 parameter for BM25. Ignored if ``feature_weighting`` is not "BM_25". Defaults to 1.2. bm25_b (float, optional): The b parameter for BM25. Ignored if ``feature_weighting`` is not "BM_25". Defaults to 0.75. n_threads (Optional[int], optional): Specifies the number of threads to use for the computation. If ``None``, the environment variable ``"IRSPACK_NUM_THREADS_DEFAULT"`` will be looked up, and if the variable is not set, it will be set to ``os.cpu_count()``. Defaults to None. """ config_class = AsymmetricCosineKNNConfig default_tune_range = default_tune_range_knn_with_weighting.copy() + [ UniformFloatRange("alpha", 0, 1) ]
[docs] def __init__( self, X_train_all: InteractionMatrix, shrinkage: float = 0.0, alpha: float = 0.5, top_k: int = 100, feature_weighting: str = "NONE", bm25_k1: float = 1.2, bm25_b: float = 0.75, n_threads: Optional[int] = None, ): super().__init__( X_train_all, shrinkage, top_k, n_threads, feature_weighting=feature_weighting, bm25_k1=bm25_k1, bm25_b=bm25_b, ) self.alpha = alpha
def _create_computer(self, X: InteractionMatrix) -> AsymmetricSimilarityComputer: return AsymmetricSimilarityComputer( X, self.shrinkage, self.alpha, self.n_threads )
class JaccardKNNConfig(BaseKNNConfig): pass
[docs]class JaccardKNNRecommender(BaseKNNRecommender): r"""K-nearest neighbor recommender system based on Jaccard similarity. That is, the similarity matrix ``W`` is given by (column-wise top-k restricted) .. math:: \mathrm{W}_{i,j} = \frac{ |U_i \cap U_j |}{ |U_i \cup U_j| + \mathrm{shrinkage}} Args: X_train_all (Union[scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]): Input interaction matrix. shrinkage (float, optional): The shrinkage parameter for regularization. Defaults to 0.0. top_k (int, optional): Specifies the maximal number of allowed neighbors. Defaults to 100. n_threads (Optional[int], optional): Specifies the number of threads to use for the computation. If ``None``, the environment variable ``"IRSPACK_NUM_THREADS_DEFAULT"`` will be looked up, and if the variable is not set, it will be set to ``os.cpu_count()``. Defaults to None. """ config_class = JaccardKNNConfig default_tune_range = default_tune_range_knn.copy()
[docs] def __init__( self, X_train_all: InteractionMatrix, shrinkage: float = 0.0, top_k: int = 100, n_threads: Optional[int] = None, ) -> None: super().__init__(X_train_all, shrinkage, top_k, n_threads)
def _create_computer(self, X: InteractionMatrix) -> JaccardSimilarityComputer: return JaccardSimilarityComputer(X, self.shrinkage, self.n_threads)
class TverskyIndexKNNConfig(BaseKNNConfig): alpha: float = 0.5 beta: float = 0.5
[docs]class TverskyIndexKNNRecommender(BaseKNNRecommender): r"""K-nearest neighbor recommender system based on Tversky Index. That is, the similarity matrix ``W`` is given by (column-wise top-k restricted) .. math:: \mathrm{W}_{i,j} = \frac{ |U_i \cap U_j |}{ |U_i \cap U_j | + \alpha |U_i \setminus U_j| + \beta |U_j \setminus U_i| + \mathrm{shrinkage}} Args: X_train_all (Union[scipy.sparse.csr_matrix, scipy.sparse.csc_matrix]): Input interaction matrix. shrinkage (float, optional): The shrinkage parameter for regularization. Defaults to 0.0. alpha (float, optional): :math:`alpha` parameter. Defaults to 0.5. beta (float, optional): :math:`beta` parameter. Defaults to 0.5. top_k (int, optional): Specifies the maximal number of allowed neighbors. Defaults to 100. n_threads (Optional[int], optional): Specifies the number of threads to use for the computation. If ``None``, the environment variable ``"IRSPACK_NUM_THREADS_DEFAULT"`` will be looked up, and if the variable is not set, it will be set to ``os.cpu_count()``. Defaults to None. """ config_class = TverskyIndexKNNConfig default_tune_range = default_tune_range_knn.copy() + [ UniformFloatRange("alpha", 0, 2), UniformFloatRange("beta", 0, 2), ]
[docs] def __init__( self, X_train_all: InteractionMatrix, shrinkage: float = 0.0, alpha: float = 0.5, beta: float = 0.5, top_k: int = 100, n_threads: Optional[int] = None, ) -> None: super().__init__(X_train_all, shrinkage, top_k, n_threads) self.alpha = alpha self.beta = beta
def _create_computer(self, X: InteractionMatrix) -> TverskyIndexComputer: return TverskyIndexComputer( X, self.shrinkage, self.alpha, self.beta, self.n_threads )