# Source code for deeppavlov.dataset_readers.basic_classification_reader

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from logging import getLogger
from pathlib import Path

import pandas as pd

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.data.utils import download

log = getLogger(__name__)


@register('basic_classification_reader')
class BasicClassificationDatasetReader(DatasetReader):
    """
    Class provides reading dataset in .csv format
    """

    def read(self, data_path: str, url: str = None,
             format: str = "csv", class_sep: str = None,
             *args, **kwargs) -> dict:
        """
        Read dataset from data_path directory.
        Reading files are all data_types + extension
        (i.e for data_types=["train", "valid"] files "train.csv" and "valid.csv" form
        data_path will be read)

        Args:
            data_path: directory with files
            url: download data files if data_path not exists or empty
            format: extension of files. Set of Values: ``"csv", "json"``
            class_sep: string separator of labels in column with labels
            sep (str): delimeter for ``"csv"`` files. Default: None -> only one class per sample
            header (int): row number to use as the column names
            names (array): list of column names to use
            orient (str): indication of expected JSON string format
            lines (boolean): read the file as a json object per line. Default: ``False``

        Returns:
            dictionary with types from data_types.
            Each field of dictionary is a list of tuples (x_i, y_i)
        """
        data_types = ["train", "valid", "test"]

        # Derive the default train file name from ``format`` so that a file
        # downloaded below is found again by the reading loop (a hard-coded
        # 'train.csv' default would be missed when format == 'json').
        train_file = kwargs.get('train', 'train.{}'.format(format))

        if not Path(data_path, train_file).exists():
            if url is None:
                raise Exception(
                    "data path {} does not exist or is empty, and download url parameter not specified!".format(
                        data_path))
            log.info("Loading train data from {} to {}".format(url, data_path))
            download(source_url=url, dest_file_path=Path(data_path, train_file))

        data = {data_type: [] for data_type in data_types}
        for data_type in data_types:
            file_name = kwargs.get(data_type, '{}.{}'.format(data_type, format))
            # An explicit ``None`` for a data_type skips that split entirely.
            if file_name is None:
                continue

            file = Path(data_path).joinpath(file_name)
            if not file.exists():
                log.warning("Cannot find {} file".format(file))
                continue

            # Forward only the pandas options relevant to the chosen format.
            if format == 'csv':
                keys = ('sep', 'header', 'names')
                options = {k: kwargs[k] for k in keys if k in kwargs}
                df = pd.read_csv(file, **options)
            elif format == 'json':
                keys = ('orient', 'lines')
                options = {k: kwargs[k] for k in keys if k in kwargs}
                df = pd.read_json(file, **options)
            else:
                raise Exception('Unsupported file format: {}'.format(format))

            x = kwargs.get("x", "text")
            y = kwargs.get('y', 'labels')

            # Single label per sample ("label") vs. multi-label split on
            # class_sep (["label", "label", ...]).
            if class_sep is None:
                to_label = str
            else:
                to_label = lambda value: str(value).split(class_sep)

            if isinstance(x, list):
                # each sample is a tuple ([text, text, ...], label(s))
                data[data_type] = [([row[x_] for x_ in x], to_label(row[y]))
                                   for _, row in df.iterrows()]
            else:
                # each sample is a tuple (text, label(s))
                data[data_type] = [(row[x], to_label(row[y]))
                                   for _, row in df.iterrows()]

        return data