Source code for dataset_io.sampler

#
# samplar.py
#
# Copyright (c) 2016-2017 Junpei Kawamoto
#
# This file is part of rgmining-dataset-io.
#
# rgmining-dataset-io is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# rgmining-dataset-io is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with rgmining-dataset-io. If not, see <http://www.gnu.org/licenses/>.
#
# pylint: disable=no-member
"""Provide samplers for generating random ratings.

This module provides two samplers which generate random ratings.
One sampler is :class:`dataset_io.sampler.UniformSampler`.
This sampler generate ratings from an uniform distribution.
The other one is :class:`dataset_io.sampler.RatingBasedSampler`.
This sampler generate ratings from a rating distribution computed from a real
dataset provided by Amazon.com.

Note that ganerated ratings are normalized into [0, 1].
"""
import csv
from os import path
from numpy import random

_DATA_FILE = "rating.csv"


def _fullpath(filename):
    """Compute the full path of a given filename.

    Args:
      filename: Filename.

    Returns:
      The full path of the given filename.
    """
    return path.join(path.dirname(__file__), filename)


[docs]class UniformSampler(object): """Sampling review scores from a uniform distribution. This sampler is a callable object. To generate random ratings, .. code-block:: python sampler = UniformSampler() for rating in sampler(): # use the rating. Note that in the above example, sampler never ends and break is required to stop the generation. """ __slots__ = () def __call__(self): """Create a iterator returns sampled values. Yields: A normalized uniform random review score. """ while True: yield random.randint(0, 4) / 4.
[docs]class RatingBasedSampler(object): """Sampling review scores from a distribution based on actual reviews. This sampler generate ratings from a rating distribution computed from a real dataset provided by Amazon.com. According to the dataset, the distribution is the followings; .. csv-table:: :header: rating, the number of reviews 1, 167137 2, 122025 3, 189801 4, 422698 5, 1266919 This sampler is a callable object. To generate random ratings, .. code-block:: python sampler = UniformSampler() for rating in sampler(): # use the rating. Note that in the above example, sampler never ends and break is required to stop the generation. """ __slots__ = ("_dist", "_keys") def __init__(self): self._dist = {} with open(_fullpath(_DATA_FILE)) as fp: for k, v in csv.reader(fp): self._dist[int(k)] = float(v) total = sum(self._dist.values()) for k in self._dist: self._dist[k] /= total self._keys = sorted(self._dist.keys()) def __call__(self): """Create a iterator returns sampled values. Yields: A normalized uniform random review score. """ while True: U = random.random() for k in self._keys: U -= self._dist[k] if U < 0: yield (k - 1) / 4.