Source code for irspack.dataset.movielens.ML1M

from pathlib import Path

import pandas as pd

from .base import BaseMovieLenstDataLoader


class MovieLens1MDataManager(BaseMovieLenstDataLoader):
    r"""Data manager for the MovieLens 1M dataset.

    Args:
        zippath:
            Location of the zipped data. When `None`, the path is assumed to
            be `~/.ml-1m.zip`. If nothing exists at the designated path, you
            will be asked for permission to download the data.
            Defaults to `None`.
        force_download:
            When `True`, the permission prompt is skipped and the download
            starts immediately.
    """

    DOWNLOAD_URL = "http://files.grouplens.org/datasets/movielens/ml-1m.zip"
    DEFAULT_PATH = Path("~/.ml-1m.zip").expanduser()

    # Member paths inside the zip archive.
    INTERACTION_PATH = "ml-1m/ratings.dat"
    ITEM_INFO_PATH = "ml-1m/movies.dat"
    USER_INFO_PATH = "ml-1m/users.dat"

    def read_interaction(self) -> pd.DataFrame:
        """Load the rating records found at ``INTERACTION_PATH``.

        Returns:
            A frame with the columns ``userId``, ``movieId``, ``rating`` and
            ``timestamp``; ``timestamp`` is converted with
            ``pd.to_datetime(..., unit="s")``.
        """
        with self._read_as_istream(self.INTERACTION_PATH) as stream:
            # HACK: the real field separator is "::", but a multi-character
            # separator would push pandas onto the much slower python engine.
            # Splitting on a single ":" instead yields an empty (NaN) column
            # between every pair of real fields, so only the even-numbered
            # columns carry data.
            raw = pd.read_csv(stream, sep=":", header=None)
            ratings = raw[[0, 2, 4, 6]].copy()

            ratings.columns = ["userId", "movieId", "rating", "timestamp"]
            ratings["timestamp"] = pd.to_datetime(ratings.timestamp, unit="s")
            return ratings

    def read_item_info(self) -> pd.DataFrame:
        """Load the movie table found at ``ITEM_INFO_PATH``.

        Returns:
            A frame indexed by ``movieId`` with the columns ``title``,
            ``genres`` and ``release_year``. ``release_year`` is the numeric
            value of the digits inside the parentheses that end the title.
        """
        with self._read_as_istream(self.ITEM_INFO_PATH) as stream:
            movies = pd.read_csv(
                stream,
                sep="::",
                header=None,
                encoding="latin-1",
                names=["movieId", "title", "genres"],
                engine="python",
            )
            # Titles end with "(<digits>)"; capture those digits as text first,
            # then convert them to numbers.
            year_text = movies.title.str.extract(
                r"^.*\((?P<release_year>\d+)\)\s*$"
            ).release_year
            movies["release_year"] = pd.to_numeric(year_text)
            return movies.set_index("movieId")

    def read_user_info(self) -> pd.DataFrame:
        """Load the user table found at ``USER_INFO_PATH``.

        Returns:
            A frame indexed by ``userId`` with the columns ``gender``, ``age``,
            ``occupation`` and ``zipcode``.
        """
        with self._read_as_istream(self.USER_INFO_PATH) as stream:
            users = pd.read_csv(
                stream,
                sep="::",
                header=None,
                names=["userId", "gender", "age", "occupation", "zipcode"],
                engine="python",
            )
            return users.set_index("userId")