# Python source code of dataset

#!/usr/bin/env python
# File: dataset_float.py
# Author: Kiri Wagstaff, 9/24/14
#
# Superclass for unlabeled data sets with floating point values:
# readers and plotters
#
# Copyright 2013-2015, by the California Institute of Technology. ALL
# RIGHTS RESERVED.  United States Government Sponsorship
# acknowledged. Any commercial use must be negotiated with the Office
# of Technology Transfer at the California Institute of Technology.
#
# This software may be subject to U.S. export control laws and
# regulations.  By accepting this document, the user agrees to comply
# with all applicable U.S. export laws and regulations.  User has the
# responsibility to obtain export licenses, or other export authority
# as may be required before exporting such information to foreign
# countries or providing access to foreign persons.

import os, sys, re
import csv, numpy, pylab, math
from dataset import Dataset
from log import printt


class FloatDataset(Dataset):
  # Superclass for floating point data sets (itself a subclass of Dataset)

  def __init__(self, filename=None, name='Floatdata', initfilename=None):
    """FloatDataset(filename=None, name='Floatdata', initfilename=None)
       -> FloatDataset

    Creates a new FloatDataset based on the data in filename
    and with the name given.
    Name is used in output filename prefixes.
    Assumes CSV, floating point values, and no class labels.
    Commented lines (with #) are ignored.

    filename:     path of the CSV data file.
    name:         name of the data set.
    initfilename: path of an optional initialization data file.

    All three arguments are forwarded unchanged to Dataset.__init__;
    nothing is read from disk by this constructor's own code.
    NOTE(review): presumably Dataset.__init__ stores them as
    self.filename, self.name and self.initfilename, which the other
    methods of this class read -- confirm against dataset.py.

    This top-level class can't do anything itself,
    but needs to be instantiated with one of its subclasses.
    """

    Dataset.__init__(self, filename, name, initfilename)


  @classmethod
  def  read_csv(cls, filename, nskip):
    """read_csv(filename, nskip)

    Read in the contents of the (CSV formatted) file
    and return values to can populate fields of a FloatDataset object.
    nskip specifies how many columns to skip to get to real data.
    """

    data   = []
    labels = []
    xvals  = []
    with open(filename, 'r') as csvfile:
      lines = csvfile.readlines()

      # If there's a header (begins with #), use it to
      # populate the feature names
      if lines[0][0] == '#':
        print 'Populating xvals from data file header.'
        header = lines[0][1:].strip()
        xvals = numpy.array(map(float,header.split(',')))

      for line in lines:
        # Skip over empty or commented lines
        if line.strip() == '' or line[0] == '#':
          continue
        attributes = re.split(',', line.strip())

        data += [[float(x) for x in attributes[nskip:]]]
        if nskip > 0: # Use the first column as a label
          labels.append(attributes[0])
        else:  # fake labels
          labels.append('None')

    data = numpy.array(data)


    return (xvals, data, labels)


  def  readin(self, nskip):
    """readin()
    """
    
    (self.xvals, self.data, self.labels) = FloatDataset.read_csv(self.filename,
                                                                 nskip)

    self.data = self.data.T  # features x samples

    # If there was no header with feature names, just create an empty xvals
    if self.xvals == []:
      self.xvals = numpy.arange(self.data.shape[0]).reshape(-1,1)

    # Read in the init data file, if present
    if self.initfilename != '':
      printt('Reading initialization data set from %s' % self.initfilename)
      (_, self.initdata, _) = FloatDataset.read_csv(self.initfilename, nskip)
      self.initdata = self.initdata.T  # features x samples


  def  plot_item_triangles(self, m, ind, x, r, k, label, U,
                           rerr, feature_weights, band_ind):
    """plot_item_triangles(self, m, ind, x, r, k, label, U,
       rerr, feature_weights, band_ind)

    Plot selection m (index ind, data in x) with triangles to
    mark the largest residual values on band_ind indices
    (i.e., you need to pick them before calling this).

    To use this, define plot_item() in your data set's class
    to call this function instead.
    """

    if x == [] or r == []: 
      print "Error: No data in x and/or r."
      return
  
    pylab.clf()
    # xvals, x, and r need to be column vectors
    pylab.plot(self.xvals, r, 'r-',  linewidth=0.5, label='Expected')
    pylab.plot(self.xvals, x, 'b.-', linewidth=1,   label='Observations')
    # Boost font sizes for axis and tick labels
    pylab.xlabel(self.xlabel) #, fontsize=16)
    pylab.ylabel(self.ylabel) #, fontsize=16)
    '''
    pylab.xticks(fontsize=16)
    pylab.yticks(fontsize=16)
    '''
    pylab.title('DEMUD selection %d (%s), item %d, using K=%d' % \
                (m, label, ind, k))
    pylab.legend(fontsize=10)

    # width of triangles to plot
    width = (self.xvals.max() - self.xvals.min())/100.0
    
    for band in band_ind:
      w = float(self.xvals[band])
      reproj = r[band]
      # Draw a triangle that points up if r > x
      # or down if r < x
      pylab.fill([w-width, w+width, w],
                 [reproj,  reproj,  x[band]],
                 '0.6', zorder=1)

    outdir  = os.path.join('results', self.name)
    if not os.path.exists(outdir):
      os.mkdir(outdir)
    figfile = os.path.join(outdir, 'sel-%d-k-%d-(%s).pdf' % (m, k, label))
    pylab.savefig(figfile)
    print 'Wrote plot to %s' % figfile
    pylab.close()

    
  def select_bands(self, x, r, frac_annotate):
    """select_bands(self, x, r, frac_annotate) -> list of band indices

    Select which bands to highlight (largest magnitude residual).

    x:             observed values (1-D array or column vector).
    r:             expected values, same number of elements as x.
    frac_annotate: fraction of the bands to select; the number
                   selected is floor(frac_annotate * number of bands),
                   capped at the number of bands.

    Returns a list of indices whose absolute residual |x - r| is at
    least as large as the smallest of the top residuals (so ties at
    the threshold can yield a few extra indices).  Returns an empty
    list when there is no data, when all residuals are zero, or when
    the requested fraction rounds down to zero bands.
    """

    # Flatten both inputs so that 1-D arrays and column vectors are
    # handled alike and the threshold below is a plain scalar.
    abs_res = numpy.absolute(numpy.ravel(x) - numpy.ravel(r))
    if abs_res.size == 0:
      return []

    mx = abs_res.max()
    mn = abs_res.min()
    print('Absolute residuals: min %.2g, max %.2g.\n' % (mn, mx))
    # Absolute values are >= 0, so max == 0 means every residual is 0
    if mx == 0:
      return []

    num_annotate = int(math.floor(frac_annotate * len(abs_res)))
    num_annotate = min(num_annotate, len(abs_res))
    # Nothing requested.  This must be handled explicitly: indexing
    # with -0 would pick the *smallest* residual as the threshold and
    # therefore mark every band.
    if num_annotate <= 0:
      return []

    thresh = numpy.sort(abs_res)[-num_annotate]

    print('Marking top %.3f%% of residuals (%d above %.2g).' % \
        (frac_annotate * 100, num_annotate, thresh))

    return numpy.where(abs_res >= thresh)[0].tolist()