# Source code for deeppavlov.dataset_readers.basic_classification_reader

# Copyright 2017 Neural Networks and Deep Learning lab, MIPT
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from logging import getLogger
from pathlib import Path

import pandas as pd
from overrides import overrides

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.data.utils import download

# Module-level logger, named after this module.
log = getLogger(__name__)

@register('basic_classification_reader')
class BasicClassificationDatasetReader(DatasetReader):
    """
    Class provides reading dataset in .csv or .json format
    """

    @overrides
    def read(self, data_path: str, url: str = None,
             format: str = "csv", class_sep: str = None,
             *args, **kwargs) -> dict:
        """
        Read dataset from data_path directory.

        By default the files read are ``<data_type>.<format>`` for every
        data type in ``["train", "valid", "test"]`` (i.e. for ``format="csv"``
        the files "train.csv", "valid.csv" and "test.csv" from data_path).
        A data type whose file does not exist is left as an empty list and
        a warning is logged.

        Args:
            data_path: directory with files
            url: download the train file from this url if it does not exist
                in data_path
            format: extension of files. Set of Values: ``"csv", "json"``
            class_sep: string separator of labels in column with labels.
                Default: None -> only one class per sample
            train (str): file name of the train split.
                Default: ``"train.<format>"``
            valid (str): file name of the valid split.
                Default: ``"valid.<format>"``
            test (str): file name of the test split.
                Default: ``"test.<format>"``
            x (str or list): name of the column with input data, or a list
                of column names when a sample consists of several inputs.
                Default: ``"text"``
            y (str): name of the column with labels. Default: ``"labels"``
            sep (str): delimiter for ``"csv"`` files
            header (int): row number to use as the column names
                (``"csv"`` only)
            names (array): list of column names to use (``"csv"`` only)
            orient (str): indication of expected JSON string format
            lines (boolean): read the file as a json object per line
                (``"json"`` only)

        Returns:
            dictionary with keys "train", "valid" and "test". Each field of
            the dictionary is a list of tuples (x_i, y_i), where x_i is the
            value of column ``x`` (or a list of values when ``x`` is a list)
            and y_i is the label converted to ``str`` (or a list of strings
            when ``class_sep`` is given)

        Raises:
            Exception: if the train file does not exist and ``url`` is None,
                or if an existing data file has an unsupported ``format``
        """
        data_types = ["train", "valid", "test"]

        # Derive the default train file name from ``format`` so that it agrees
        # with the per-split default used in the loop below; a hard-coded
        # "train.csv" made ``format="json"`` datasets look missing.
        train_file = kwargs.get('train', 'train.{}'.format(format))

        if not Path(data_path, train_file).exists():
            if url is None:
                raise Exception(
                    "data path {} does not exist or is empty, and download url parameter not specified!".format(
                        data_path))
            log.info("Loading train data from {} to {}".format(url, data_path))
            download(source_url=url, dest_file_path=Path(data_path, train_file))

        # Column selectors are the same for every split.
        x = kwargs.get("x", "text")
        y = kwargs.get('y', 'labels')
        multi_input = isinstance(x, list)

        data = {"train": [], "valid": [], "test": []}
        for data_type in data_types:
            file_name = kwargs.get(data_type, '{}.{}'.format(data_type, format))
            file = Path(data_path).joinpath(file_name)
            if not file.exists():
                log.warning("Cannot find {} file".format(file))
                continue

            # Forward only the pandas options that are relevant for the format.
            if format == 'csv':
                keys = ('sep', 'header', 'names')
                reader = pd.read_csv
            elif format == 'json':
                keys = ('orient', 'lines')
                reader = pd.read_json
            else:
                raise Exception('Unsupported file format: {}'.format(format))
            options = {k: kwargs[k] for k in keys if k in kwargs}
            df = reader(file, **options)

            samples = []
            for _, row in df.iterrows():
                # each sample is a tuple ("text", "label"), or
                # ("text", ["label", "label", ...]) when class_sep is given;
                # "text" is a list of values when several x columns are used
                features = [row[x_] for x_ in x] if multi_input else row[x]
                label = str(row[y])
                if class_sep is not None:
                    label = label.split(class_sep)
                samples.append((features, label))
            data[data_type] = samples

        return data