// // OpenForecast - open source, general-purpose forecasting package. // Copyright (C) 2002-2011 Steven R. Gould // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // package net.sourceforge.openforecast.input; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.FileNotFoundException; import java.io.IOException; import java.io.StreamTokenizer; import java.io.StringReader; import java.util.ArrayList; import net.sourceforge.openforecast.DataPoint; import net.sourceforge.openforecast.DataSet; import net.sourceforge.openforecast.Observation; /** * Defines a Builder that can be used to construct a DataSet from CSV (Comma * Separated Values) file or input stream. This class makes for a quick and * easy "import" of data from a variety of other applications such as * Microsoft Excel. * * <p>The last value on each row is assumed to represent the dependent * variable. For example, if the independent variables are represented by x1, * x2, x3 and so on, and the dependent variable is represented by y, then a * row should be of the form: * * <pre> * x1, x2, ..., xi, y * </pre> * * <p>For example, the following represents data points (1,3), (2,5), (3,6), * and (4,7): * * <pre> * 1, 3 * 2, 5 * 3, 6 * 4, 7 * </pre> * * <p>where the values 3, 5, 6 and 7 are the observed values of the dependent * variable corresponding to the associated values of the independent variables * with the values 1, 2, 3, and 4 respectively. By default, the independent * variables - just one in this example - would be named "x1", ..., * "x<em>i</em>", etc. To override this behavior, you can specify a "header * row" containing names for the individual variables (the "columns" of data). * * <p>Using the previous example, if the x<sub>i</sub> represented time, we * could define the data input as follows: * * <pre> * time, observation * 1, 3 * 2, 5 * 3, 6 * 4, 7 * </pre> * * <p>This would name the dependent variable in this case, "time", instead of * the default, "x1". * @author Steven R. Gould * @since 0.4 */ public class CSVBuilder extends AbstractBuilder { /** * Constant defining the character used to separate values. */ private final static char SEPARATOR = ','; /** * Indicates that it is unknown whether or not the file has a header row. * @see #hasHeaderRow */ private static int HAS_HEADER_ROW_UNKNOWN = -1; /** * Indicates that the file does not have a header row. * @see #hasHeaderRow */ private static int HAS_HEADER_ROW_FALSE = 0; /** * Indicates that the file does have a header row. * @see #hasHeaderRow */ private static int HAS_HEADER_ROW_TRUE = 1; /** * Set to true only if the first line/row of the current input source is * to be treated as a header row. */ private int hasHeaderRow = HAS_HEADER_ROW_UNKNOWN; /** * Stores the file reader from which data is to be read by the build * method. */ private FileReader fileReader; /** * Constructs a new CSVBuilder that reads its input from the named file. The * fields will be named "x1", "x2", "x3", etc. unless the first non-comment * row contains one or more non-numeric fields in which case the first row * will be assumed to be a header row. * @param filename the name of the CSV file to read the input from. * @throws FileNotFoundException if the file does not exist, is a * directory rather than a regular file, or for some other reason cannot * be opened for reading. * @throws SecurityException if a security manager exists and its * <code>checkRead</code> method denies read access to the file. */ public CSVBuilder( String filename ) throws FileNotFoundException { this( new FileReader(filename) ); } /** * Constructs a new CSVBuilder that reads its input from the named file, * and treats the first row of data as a header row containing field names. * @param filename the name of the CSV file to read the input from. * @param hasHeaderRow set to true if the CSV file has a header row. * @throws FileNotFoundException if the file does not exist, is a * directory rather than a regular file, or for some other reason cannot * be opened for reading. * @throws SecurityException if a security manager exists and its * <code>checkRead</code> method denies read access to the file. */ public CSVBuilder( String filename, boolean hasHeaderRow ) throws FileNotFoundException { this( new FileReader(filename), hasHeaderRow ); } /** * Constructs a new CSVBuilder that reads its input from the named file. The * fields will be named "x1", "x2", "x3", etc. unless the first non-comment * row contains one or more non-numeric fields in which case the first row * will be assumed to be a header row. * @param file the File object specifying the CSV file to read the input * from. * @throws FileNotFoundException if the file does not exist, is a * directory rather than a regular file, or for some other reason cannot * be opened for reading. * @throws SecurityException if a security manager exists and its * <code>checkRead</code> method denies read access to the file. */ public CSVBuilder( File file ) throws FileNotFoundException { this( new FileReader(file) ); } /** * Constructs a new CSVBuilder that reads its input from the named file. * The fields will be named "x1", "x2", "x3", etc. * @param file the File object specifying the CSV file to read the input * from. * @param hasHeaderRow set to true if the CSV file has a header row. * @throws FileNotFoundException if the file does not exist, is a * directory rather than a regular file, or for some other reason cannot * be opened for reading. * @throws SecurityException if a security manager exists and its * <code>checkRead</code> method denies read access to the file. */ public CSVBuilder( File file, boolean hasHeaderRow ) throws FileNotFoundException { this( new FileReader(file), hasHeaderRow ); } /** * Constructs a new CSVBuilder that reads its input from the named file * input stream. The fields will be named "x1", "x2", "x3", etc. unless the * first non-comment row contains one or more non-numeric fields in which * case the first row will be assumed to be a header row. * @param reader the FileReader object specifying the CSV file reader to * read the input from. */ public CSVBuilder( FileReader reader ) { this.fileReader = reader; hasHeaderRow = HAS_HEADER_ROW_UNKNOWN; } /** * Constructs a new CSVBuilder that reads its input from the named file * input stream. The fields will be named "x1", "x2", "x3", etc. * @param reader the FileReader object specifying the CSV file reader to * read the input from. * @param hasHeaderRow set to true if the CSV file input stream has a * header row. */ public CSVBuilder( FileReader reader, boolean hasHeaderRow ) { this.fileReader = reader; this.hasHeaderRow = hasHeaderRow ? HAS_HEADER_ROW_TRUE : HAS_HEADER_ROW_FALSE; } /** * Retrieves a DataSet - a collection of DataPoints - from the current * input source. The DataSet should contain all DataPoints defined by * the input source. * * <p>In general, build will attempt to convert all lines/rows in the CSV * input to data points. The exceptions are as follows: * <ul> * <li>Blank lines (lines containing only whitespace) will be ignored, * and can be used for spacing in the input.</li> * <li>Lines beginning with a '#' will be treated as comments, and will * be ignored.</li> * <li>If a header row is included - as specified in one of the * constructors - then it will be treated as containing field/variable * names for use by the DataSet.</li> * </ul> * @return a DataSet built from the current input source. * @throws IOException if an error occurred reading from the CSV file. */ public DataSet build() throws IOException { DataSet dataSet = new DataSet(); boolean firstLineRead = false; BufferedReader reader = new BufferedReader( fileReader ); String line; do { // Get next line (trimmed) line = reader.readLine(); if ( line == null ) continue; line = line.trim(); // Skip blank lines if ( line.length() == 0 ) continue; // Skip comment lines if ( line.startsWith( "#" ) ) continue; if ( !firstLineRead ) { firstLineRead = true; if ( hasHeaderRow != HAS_HEADER_ROW_FALSE ) { try { // Treat first line as header readHeaderRow( line ); continue; } catch ( NoHeaderException nhex ) { // No header row found, so treat it // as the first row of data } } // Calculate how many independent values per line // TODO: Fix this to handle quoted commas int n = 0; for ( int pos=0; (pos=line.indexOf(SEPARATOR,pos)) > 0; pos++ ) n++; setNumberOfVariables( n ); } DataPoint dp = build( line ); dataSet.add( dp ); } while ( line != null ); // line == null when EOF is reached return dataSet; } /** * Parses the given line to extract the variable names. * @param line a String representing the line to parse for variable names. * @throws NoHeaderException if all the "column names" appear to be * numeric and therefore look like data values. */ private void readHeaderRow( String line ) throws NoHeaderException { // Temporary store for the variable names ArrayList<String> vars = new ArrayList<String>(); boolean allData = true; int pos = 0; while ( pos < line.length() ) { // Get position of next quote int nextQuote = line.indexOf("\"", pos); // Get position of next separator int nextSeparator = line.indexOf(SEPARATOR, pos); // if no next separator, then we're done // since we ignore the name of the independent variable if ( nextSeparator < 0 ) break; if ( nextQuote < 0 || nextQuote > nextSeparator ) { // Treat chars from pos to next separator as a label String name = line.substring(pos, nextSeparator); vars.add( name ); try { // Skip over next set of characters Double.parseDouble( name ); } catch ( NumberFormatException nfex ) { allData = false; } pos = nextSeparator+1; continue; } // Handle quoted strings allData = false; int secondQuote = line.indexOf("\"",nextQuote+1); String name = line.substring(nextQuote+1,secondQuote); vars.add( name ); // We actually ignore any chars outside of quotes, yet // before the next separator pos = line.indexOf(SEPARATOR,secondQuote)+1; } // If header row appears to be all numeric values, // then parse it as first line of data if ( allData ) throw new NoHeaderException("Header line looks like first line of data values"); // Add variable names extracted to this Builder's list of // variable names int n = vars.size(); for ( int i=0; i<n; i++ ) addVariable( ((String)vars.get(i)).trim() ); } /** * Builds a DataPoint from the given CSV line. This method should only be * used to parse a line that is expected to be made up of numeric data * only. Use {@link #readHeaderRow} to read a header row if one is expected. * @param line the input line of comma separated values to parse and use * to construct a new DataPoint. * @return a DataPoint object with values as specified by the given input * String. */ private DataPoint build( String line ) throws IOException { Observation dataPoint = new Observation( 0.0 ); StreamTokenizer tokenizer = new StreamTokenizer( new StringReader( line ) ); tokenizer.commentChar( '#' ); tokenizer.eolIsSignificant( true ); tokenizer.parseNumbers(); int i = 0; int n = getNumberOfVariables(); int lastToken = SEPARATOR; do { // Read next token tokenizer.nextToken(); switch ( tokenizer.ttype ) { case '\t': case ' ': // Skip whitespace continue; case SEPARATOR: // Check for two adjacent commas if ( lastToken != SEPARATOR ) break; // Two adjacent commas. Assume 0.0 between them tokenizer.nval = 0.0; // Fall through, and handle as a number case StreamTokenizer.TT_NUMBER: // Handle numbers appropriately as data // If this is the last value on the line, treat it // as the dependent variable value if ( i == n ) dataPoint.setDependentValue(tokenizer.nval); else dataPoint.setIndependentValue(getVariableName(i), tokenizer.nval); i++; break; case StreamTokenizer.TT_WORD: throw new IOException( "Invalid input in CSV file. Number expected, found '"+tokenizer.sval+"'"); case StreamTokenizer.TT_EOL: case StreamTokenizer.TT_EOF: break; default: } lastToken = tokenizer.ttype; } while ( tokenizer.ttype != StreamTokenizer.TT_EOF ); return dataPoint; } /** * Private exception class used to handle case when we're not sure if the * first row is a data row or a header row. We try to parse it as a header * row. If a NoHeaderException is thrown, then the first row is treated as * data. */ private class NoHeaderException extends Exception { private static final long serialVersionUID = 500L; public NoHeaderException( String msg ) { super( msg ); } } } // Local Variables: // tab-width: 4 // End;