import uuid
from typing import Any, Dict, List, Tuple
import numpy as np
import numpy.typing as npt
import pandas as pd
from scipy import sparse as sps
from ..definitions import OptionalRandomState
from ..utils.random import convert_randomstate
from .userwise import UserTrainTestInteractionPair, _split_list
[docs]def holdout_specific_interactions(
df: pd.DataFrame,
user_column: str,
item_column: str,
interaction_indicator: np.ndarray,
validatable_user_ratio_val: float = 0.2,
validatable_user_ratio_test: float = 0.2,
random_state: OptionalRandomState = None,
) -> Tuple[List[Any], Dict[str, UserTrainTestInteractionPair]]:
"""Holds-out (part of) the interactions specified by the users.
All the users will be split into two category:
1. Those who have an interaction in the specified subset.
We denote them as "validatable" users.
2. Those who don't.
We split the users in 1. into three parts (train, validation, test)-users, and hold-out the specified interactions.
The interactions of non-validatable users will be part of the train dataset.
This split will be useful when want to:
- recommend only part of the items (e.g., rather unpopular ones) to the users.
In this case, the held-out interactions will be the ones with these specific items.
- split the dataframe by a certain timepoint, and ensure that no information after
that timepoint contaminates the training set.
Args:
df:
The data source.
user_column:
The column name of the users.
item_column:
The column name of the items.
interaction_indicator:
Specifies where in ``df`` the held-out interactions are.
validatable_user_ratio_val:
The ratio of "validation-set users" in the "validatable users". Defaults to 0.2.
validatable_user_ration_test:
The ratio of "test-set users" in the "validatable users". Defaults to 0.2.
random_state:
The random seed used to split validatable users into three. Defaults to `None`.
Returns:
A tuple consiting of
* The aligned list of all the items.
* A dictionary with train/val/test user pairs.
"""
v_user_ratio_train = 1 - validatable_user_ratio_val - validatable_user_ratio_test
if v_user_ratio_train < -1e-10:
raise ValueError(
"validatable_use_ratio_val + validatable_user_ratio_test exceeds 1."
)
df = df[[user_column, item_column]].copy()
unique_item_ids, item_index = np.unique(df[item_column], return_inverse=True)
item_index_colname = str(uuid.uuid1())
df[item_index_colname] = item_index
flg_colname = str(uuid.uuid1())
flg_column = np.zeros(df.shape[0])
flg_column[interaction_indicator] = 1
df[flg_colname] = flg_column
v_train_users: npt.ArrayLike
v_val_users: npt.ArrayLike
v_test_users: npt.ArrayLike
validatable_users = np.unique(df[flg_column > 0][user_column])
n_validatable_users: int = validatable_users.shape[0]
val_test_ratio = 1 - v_user_ratio_train
rns = convert_randomstate(random_state)
if val_test_ratio >= 1.0:
v_train_users = np.ndarray((0,), dtype=validatable_users.dtype)
v_val_test_users = validatable_users
else:
v_val_test_users, v_train_users = _split_list(
validatable_users,
int(v_user_ratio_train * n_validatable_users),
rns,
)
v_test_users, v_val_users = _split_list(
v_val_test_users,
int(len(v_val_test_users) * validatable_user_ratio_val / val_test_ratio),
rns,
)
df_train = pd.concat(
[
df[df[user_column].isin(v_train_users)],
df[~df[user_column].isin(validatable_users)],
]
)
df_val = df[df[user_column].isin(v_val_users)]
df_test = df[df[user_column].isin(v_test_users)]
def df_to_dataset(
df: pd.DataFrame, train: bool = False
) -> UserTrainTestInteractionPair:
uindex, uid_unique = df[user_column].factorize(sort=True)
iindex = df[item_index_colname].values
if train:
X = sps.csr_matrix(
(np.ones(df.shape[0], dtype=np.float64), (uindex, iindex)),
shape=(len(uid_unique), len(unique_item_ids)),
)
return UserTrainTestInteractionPair(uid_unique, X_train=X, X_test=None)
(learn_index,) = np.where((df[flg_colname] == 0).values)
(predict_index,) = np.where((df[flg_colname] > 0).values)
X_learn = sps.csr_matrix(
(
np.ones(learn_index.shape[0], dtype=np.float64),
(uindex[learn_index], iindex[learn_index]),
),
shape=(len(uid_unique), len(unique_item_ids)),
)
X_predict = sps.csr_matrix(
(
np.ones(predict_index.shape[0], dtype=np.float64),
(uindex[predict_index], iindex[predict_index]),
),
shape=(len(uid_unique), len(unique_item_ids)),
)
return UserTrainTestInteractionPair(
uid_unique, X_train=X_learn, X_test=X_predict
)
dataset = {
"train": df_to_dataset(df_train, train=True),
"val": df_to_dataset(df_val, train=False),
"test": df_to_dataset(df_test, train=False),
}
return unique_item_ids, dataset