import re
from pathlib import Path
from typing import List, Tuple
import pandas as pd
from .base import BaseMovieLenstDataLoader
[docs]class MovieLens100KDataManager(BaseMovieLenstDataLoader):
r"""Manages MovieLens 100K dataset.
Args:
zippath:
Where the zipped data is located. If `None`, assumes the path to be `~/.ml-1m.zip`.
If the designated path does not exist, you will be prompted for the permission to download the data.
Defaults to `None`.
force_download:
If `True`, the class will not prompt for the permission and start downloading immediately.
"""
DOWNLOAD_URL = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
DEFAULT_PATH = Path("~/.ml-100k.zip").expanduser()
INTERACTION_PATH = "ml-100k/u.data"
USER_INFO_PATH = "ml-100k/u.user"
ITEM_INFO_PATH = "ml-100k/u.item"
GENRE_PATH = "ml-100k/u.genre"
[docs] def read_interaction(self) -> pd.DataFrame:
with self._read_as_istream(self.INTERACTION_PATH) as ifs:
data = pd.read_csv(
ifs,
sep="\t",
header=None,
names=["userId", "movieId", "rating", "timestamp"],
)
data["timestamp"] = pd.to_datetime(data["timestamp"], unit="s")
return data
def read_user_info(self) -> pd.DataFrame:
with self._read_as_istream(self.USER_INFO_PATH) as ifs:
return pd.read_csv(
ifs,
sep="|",
header=None,
names=["userId", "age", "gender", "occupation", "zipcode"],
).set_index("userId")
def _read_genre(self) -> List[str]:
with self._read_as_istream(self.GENRE_PATH) as ifs:
items = ifs.read().decode("latin-1").split()
return [re.sub(r"\|\d+$", "", i.strip()) for i in items]
def read_item_info(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
with self._read_as_istream(self.ITEM_INFO_PATH) as ifs:
df = pd.read_csv(ifs, sep="|", header=None, encoding="latin-1")
genres = self._read_genre()
df.columns = [
"movieId",
"title",
"release_date",
"video_release_date",
"URL",
] + genres
movie_ids = df.movieId.values
df["release_date"] = pd.to_datetime(df.release_date)
genre_df = pd.DataFrame(
[
dict(movieId=movie_ids[row], genre=genres[col])
for row, col in zip(*df[genres].values.nonzero())
]
)
df = df.set_index("movieId")
return df.drop(columns=genres), genre_df