Source code for tripadvisor

#
# tripadvisor.py
#
# Copyright (c) 2017 Junpei Kawamoto
#
# This file is part of rgmining-tripadvisor-dataset.
#
# rgmining-tripadvisor-dataset is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# rgmining-tripadvisor-dataset is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with rgmining-tripadvisor-dataset.  If not, see <http://www.gnu.org/licenses/>.
#
# pylint: disable=invalid-name
"""This module provides a loading function of the Trip Advisor Dataset.

It also provides a helper function, :meth:`print_state`, to output a state of a graph
object.
To use both functions, the graph object must implement the
:ref:`graph interface <dataset-io:graph-interface>`.
"""
from __future__ import division
from contextlib import closing
import datetime
import json
from os.path import exists, join
import site
import sys
import tarfile


_DATE_FORMAT = "%B %d, %Y"
"""Date format in the dataset.
"""


def _files(tar):
    """Iterate over the regular-file members of an opened tar archive.

    Members are pulled one at a time with ``tar.next()``; anything for which
    ``isfile()`` is false (directories, links, ...) is skipped.

    Args:
      tar: an opened :class:`tarfile.TarFile` object.

    Yields:
      The member information object of each regular file, in the order
      ``tar.next()`` returns them.
    """
    while True:
        member = tar.next()
        if not member:
            # A falsy result from ``next`` marks the end of the archive.
            return
        if member.isfile():
            yield member


def _candidate_paths(base):
    """Yield, in lookup order, the places the dataset archive may live.

    This is a generator so that ``site.getuserbase()`` is only evaluated
    when none of the earlier locations exists.
    """
    yield join(".", base)
    yield join(sys.prefix, "rgmining", "data", base)
    yield join(sys.prefix, "local", "rgmining", "data", base)
    yield join(site.getuserbase(), "rgmining", "data", base)


def load(graph):
    """Load the Trip Advisor dataset to a given graph object.

    The graph object must implement the
    :ref:`graph interface <dataset-io:graph-interface>`.

    The archive ``TripAdvisorJson.tar.bz2`` is searched for, in order, in
    the current directory, ``<sys.prefix>/rgmining/data``,
    ``<sys.prefix>/local/rgmining/data`` and
    ``<site.getuserbase()>/rgmining/data``; the first existing file is used.

    Every regular file in the archive is parsed as a JSON document. Its
    ``HotelInfo.HotelID`` becomes a product, and each entry of ``Reviews``
    becomes a review whose score is ``Ratings.Overall`` divided by 5 and
    whose date is reformatted as ``YYYYMMDD``.

    Args:
      graph: an instance of bipartite graph.

    Returns:
      The graph instance *graph*.

    Raises:
      IOError: if the archive exists in none of the searched locations
        (``errno`` is ``ENOENT``).
    """
    import errno  # Local import: only needed to build the not-found error.

    base = "TripAdvisorJson.tar.bz2"

    searched = []
    for path in _candidate_paths(base):
        if exists(path):
            break
        searched.append(path)
    else:
        # Same exception class and ``filename`` (the last location tried) as
        # letting ``tarfile.open`` fail, but the message names every place
        # that was checked.
        raise IOError(
            errno.ENOENT,
            "Trip Advisor dataset archive not found; searched: {0}".format(
                ", ".join(searched)),
            path)

    reviewers = {}  # Reviewer objects keyed by the "ReviewID" value.
    with tarfile.open(path) as tar:
        for info in _files(tar):
            with closing(tar.extractfile(info)) as fp:
                obj = json.load(fp)

                target = obj["HotelInfo"]["HotelID"]
                product = graph.new_product(name=target)

                for review in obj["Reviews"]:
                    name = review["ReviewID"]
                    score = float(review["Ratings"]["Overall"]) / 5.

                    try:
                        date = datetime.datetime.strptime(
                            review["Date"], _DATE_FORMAT).strftime("%Y%m%d")
                    except ValueError:
                        # A date that does not match _DATE_FORMAT is
                        # recorded as missing rather than aborting the load.
                        date = None

                    if name not in reviewers:
                        reviewers[name] = graph.new_reviewer(name=name)
                    graph.add_review(reviewers[name], product, score, date)

    return graph