Source code for deeppavlov.dataset_readers.dstc2_reader

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import copy
import json
from pathlib import Path
from typing import Dict, List

from overrides import overrides

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.data.utils import download_decompress, mark_done
from deeppavlov.core.common.log import get_logger


# Module-level logger named after this module; used below to report
# dataset downloads and file loading.
log = get_logger(__name__)


@register('dstc2_reader')
class DSTC2DatasetReader(DatasetReader):
    """
    Contains labelled dialogs from Dialog State Tracking Challenge 2
    (http://camdial.org/~mh521/dstc/).

    The following modifications have been made to the original dataset:

       1. added api calls to restaurant database

          - example: ``{"text": "api_call area=\"south\" food=\"dontcare\"
            pricerange=\"cheap\"", "dialog_acts": ["api_call"]}``.

       2. new actions

          - bot dialog actions were concatenated into one action
            (example: ``{"dialog_acts": ["ask", "request"]}`` ->
            ``{"dialog_acts": ["ask_request"]}``)

          - if a slot key was associated with the dialog action, the new act
            was a concatenation of an act and a slot key (example:
            ``{"dialog_acts": ["ask"], "slot_vals": ["area"]}`` ->
            ``{"dialog_acts": ["ask_area"]}``)

       3. new train/dev/test split

          - original dstc2 consisted of three different MDP policies, the original
            train and dev datasets (consisting of two policies) were merged and
            randomly split into train/dev/test

       4. minor fixes

          - fixed several dialogs, where actions were wrongly annotated
          - uppercased first letter of bot responses
          - unified punctuation for bot responses
    """

    # Archive that is downloaded when the dataset files are missing.
    url = 'http://files.deeppavlov.ai/datasets/dstc2_v2.tar.gz'

    @staticmethod
    def _data_fname(datatype):
        """Return the file name of the ``'trn'``, ``'val'`` or ``'tst'`` split."""
        assert datatype in ('trn', 'val', 'tst'), "wrong datatype name"
        return 'dstc2-{}.jsonlist'.format(datatype)

    @classmethod
    @overrides
    def read(self, data_path: str, dialogs: bool = False) -> Dict[str, List]:
        """
        Downloads ``'dstc2_v2.tar.gz'`` archive from ipavlov internal server,
        decompresses and saves files to ``data_path``.

        The download is skipped when all three split files are already
        present in ``data_path``.

        Parameters:
            data_path: path to save DSTC2 dataset
            dialogs: flag which indicates whether to output list of turns or
             list of dialogs

        Returns:
            dictionary that contains ``'train'`` field with dialogs from
            ``'dstc2-trn.jsonlist'``, ``'valid'`` field with dialogs from
            ``'dstc2-val.jsonlist'`` and ``'test'`` field with dialogs from
            ``'dstc2-tst.jsonlist'``. Each field is a list of tuples ``(x_i, y_i)``
            or, if ``dialogs`` is set, a list of dialogs where every dialog is
            itself a list of such tuples.
        """
        # NOTE: this is a classmethod, so ``self`` is bound to the class here.
        required_files = (self._data_fname(dt) for dt in ('trn', 'val', 'tst'))
        if not all(Path(data_path, f).exists() for f in required_files):
            log.info('[downloading data from {} to {}]'.format(self.url, data_path))
            download_decompress(self.url, data_path)
            mark_done(data_path)

        # Output field name -> split abbreviation used in the file names.
        splits = (('train', 'trn'), ('valid', 'val'), ('test', 'tst'))
        return {
            field: self._read_from_file(
                Path(data_path, self._data_fname(datatype)), dialogs)
            for field, datatype in splits
        }

    @classmethod
    def _read_from_file(cls, file_path, dialogs=False):
        """
        Returns data from single file.

        Parameters:
            file_path: path to a ``.jsonlist`` file
            dialogs: if true, group the ``(x, y)`` tuples into per-dialog lists

        Returns:
            flat list of ``(x, y)`` tuples, or list of per-dialog lists of
            such tuples when ``dialogs`` is set.
        """
        log.info("[loading dialogs from {}]".format(file_path))

        utterances, responses, dialog_indices =\
            cls._get_turns(cls._iter_file(file_path), with_indices=True)

        data = list(map(cls._format_turn, zip(utterances, responses)))

        if dialogs:
            return [data[idx['start']:idx['end']] for idx in dialog_indices]
        return data

    @staticmethod
    def _format_turn(turn):
        """
        Convert an ``(utterance, response)`` pair of raw turns into ``(x, y)``.

        ``x`` holds the utterance ``'text'``, its ``'dialog_acts'`` under the
        ``'intents'`` key and, when present, ``'db_result'`` and
        ``'episode_done'``. ``y`` holds the response ``'text'`` and the
        ``'act'`` of its first dialog act.
        """
        utterance, response = turn[0], turn[1]
        x = {'text': utterance['text'],
             'intents': utterance['dialog_acts']}
        if utterance.get('db_result') is not None:
            x['db_result'] = utterance['db_result']
        if utterance.get('episode_done'):
            x['episode_done'] = True
        y = {'text': response['text'],
             'act': response['dialog_acts'][0]['act']}
        return (x, y)

    @staticmethod
    def _iter_file(file_path):
        """
        Yield one parsed JSON object per line of ``file_path``.

        Blank lines are yielded as empty dicts; ``_get_turns`` treats them as
        dialog separators.
        """
        # The context manager guarantees the handle is closed once the
        # generator is exhausted or discarded.
        with open(file_path, 'rt', encoding='utf8') as f:
            for ln in f:
                if ln.strip():
                    yield json.loads(ln)
                else:
                    yield {}

    @staticmethod
    def _get_turns(data, with_indices=False):
        """
        Split a stream of turns into aligned utterance and response lists.

        Turns of speaker 1 are collected as utterances and turns of speaker 2
        as responses. An empty turn marks the end of a dialog. When speaker 2
        talks twice in a row, an utterance is synthesized so that both lists
        stay aligned: an empty one if the dialog has just started, otherwise
        a copy of the previous utterance carrying the ``'db_result'`` taken
        from the previous response.

        Parameters:
            data: iterable of turn dicts; each non-empty turn must have a
             ``'speaker'`` key (it is removed from the turn)
            with_indices: whether to also return the dialog boundaries

        Returns:
            ``(utterances, responses)`` or, if ``with_indices`` is set,
            ``(utterances, responses, dialog_indices)`` where every index is
            a dict with ``'start'`` and ``'end'`` positions of a dialog.

        Raises:
            RuntimeError: if the turn sequence is malformed.
        """
        utterances = []
        responses = []
        dialog_indices = []
        n = 0  # number of utterances in all dialogs closed so far
        num_dialog_utter, num_dialog_resp = 0, 0
        episode_done = True  # true until the first turn of a dialog is seen
        for turn in data:
            if not turn:
                # Dialog separator: close the current dialog.
                if num_dialog_utter != num_dialog_resp:
                    raise RuntimeError("Datafile in the wrong format.")
                episode_done = True
                n += num_dialog_utter
                dialog_indices.append({
                    'start': n - num_dialog_utter,
                    'end': n,
                })
                num_dialog_utter, num_dialog_resp = 0, 0
            else:
                speaker = turn.pop('speaker')
                if speaker == 1:
                    if episode_done:
                        turn['episode_done'] = True
                    utterances.append(turn)
                    num_dialog_utter += 1
                elif speaker == 2:
                    if num_dialog_utter - 1 == num_dialog_resp:
                        # Regular case: response to the pending utterance.
                        responses.append(turn)
                    elif num_dialog_utter - 1 < num_dialog_resp:
                        # Speaker 2 talks without a pending utterance.
                        if episode_done:
                            # Dialog opened by speaker 2: pair the response
                            # with an empty utterance.
                            responses.append(turn)
                            utterances.append({
                                "text": "",
                                "dialog_acts": [],
                                "episode_done": True}
                            )
                        else:
                            # Repeat the last utterance and move the database
                            # result of the previous response into it.
                            new_turn = copy.deepcopy(utterances[-1])
                            if 'db_result' not in responses[-1]:
                                raise RuntimeError("Every api_call action should have"
                                                   " db_result, turn = {}"
                                                   .format(responses[-1]))
                            new_turn['db_result'] = responses[-1].pop('db_result')
                            utterances.append(new_turn)
                            responses.append(turn)
                        num_dialog_utter += 1
                    else:
                        raise RuntimeError("there cannot be two successive turns of"
                                           " speaker 1")
                    num_dialog_resp += 1
                else:
                    raise RuntimeError("Only speakers 1 and 2 are supported")
                episode_done = False

        # The input may end without a trailing separator. Index the complete
        # (utterance, response) pairs of the last dialog so that it is not
        # silently dropped when the data is grouped by dialog.
        if num_dialog_resp:
            dialog_indices.append({
                'start': n,
                'end': n + num_dialog_resp,
            })

        if with_indices:
            return utterances, responses, dialog_indices
        return utterances, responses