#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013 Zygmunt ZajÄ…c <zygmunt@fastml.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Corpus in CSV format.

"""


from __future__ import with_statement

import logging
import csv
import itertools

from gensim import interfaces

logger = logging.getLogger('gensim.corpora.csvcorpus')


class CsvCorpus(interfaces.CorpusABC):
    """
    Corpus in CSV format. The CSV delimiter, headers etc. are guessed automatically
    based on the file content.

    All row values are expected to be ints/floats.

    """

    def __init__(self, fname, labels):
        """
        Initialize the corpus from a file.
        `labels` = are class labels present in the input file? => skip the first column

        """
        logger.info("loading corpus from %s" % fname)
        self.fname = fname
        self.length = None
        self.labels = labels

        # load the first few lines, to guess the CSV dialect
        head = ''.join(itertools.islice(open(self.fname), 5))
        self.headers = csv.Sniffer().has_header(head)
        self.dialect = csv.Sniffer().sniff(head)
        logger.info("sniffed CSV delimiter=%r, headers=%s" % (self.dialect.delimiter, self.headers))

    def __iter__(self):
        """
        Iterate over the corpus, returning one sparse vector at a time.

        """
        reader = csv.reader(open(self.fname), self.dialect)
        if self.headers:
            next(reader)    # skip the headers

        line_no = -1
        for line_no, line in enumerate(reader):
            if self.labels:
                line.pop(0)  # ignore the first column = class label
            yield list(enumerate(map(float, line)))

        self.length = line_no + 1  # store the total number of CSV rows = documents

# endclass CsvCorpus