Java Code Examples for org.apache.commons.csv.CSVParser#iterator()

The following examples show how to use org.apache.commons.csv.CSVParser#iterator(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You can also check out related API usage in the sidebar.
Example 1
Source File: SymSpellSearchBenchMark.java    From customized-symspell with MIT License 7 votes vote down vote up
/**
 * Reads the first field of every record in a space-delimited classpath resource.
 *
 * <p>An {@link IOException} raised while opening or closing the resource is reported
 * on {@code System.err}; the queries collected so far are still returned.
 *
 * @param queryFile name of the resource, resolved through this class's class loader
 * @return the first field of each record, in file order
 */
private List<String> readQueries(String queryFile) {
  List<String> queries = new ArrayList<>();
  URL queryResourceUrl = this.getClass().getClassLoader().getResource(queryFile);
  // The parser holds the stream opened from the URL, so it must be closed when done.
  try (CSVParser qparser = CSVParser
      .parse(queryResourceUrl, Charset.forName("UTF-8"),
          CSVFormat.DEFAULT.withDelimiter(' '))) {
    java.util.Iterator<CSVRecord> csvIterator = qparser.iterator();
    while (csvIterator.hasNext()) {
      CSVRecord csvRecord = csvIterator.next();
      queries.add(csvRecord.get(0));
    }
  } catch (IOException ex) {
    System.err.println("Error occured " + ex);
  }
  return queries;
}
 
Example 2
Source File: DelimitedTextReader.java    From marklogic-contentpump with Apache License 2.0 6 votes vote down vote up
/**
 * Opens the file behind the given split and sets up the CSV parser and its
 * record iterator on this reader.
 *
 * <p>Nothing is initialised when {@code openFile} returns {@code null}.
 *
 * @param inSplit the split to read; cast to {@code FileSplit} when a generated
 *                id needs the split's start offset
 * @throws IOException declared for the file-opening and reader-construction calls
 * @throws InterruptedException declared by the signature
 */
protected void initParser(InputSplit inSplit) throws IOException,
    InterruptedException {
    fileIn = openFile(inSplit, true);
    if (fileIn != null) {
        instream = new InputStreamReader(fileIn, encoding);

        // Reset progress tracking for the new split.
        bytesRead = 0;
        fileLen = inSplit.getLength();

        // Only when no URI column is configured do we need an id source.
        if (uriName == null) {
            generateId = conf.getBoolean(CONF_INPUT_GENERATE_URI, false);
            if (!generateId) {
                uriId = 0;
            } else {
                String idSeed = file.toUri().getPath() + "-"
                    + ((FileSplit) inSplit).getStart();
                idGen = new IdGenerator(idSeed);
            }
        }

        parser = new CSVParser(instream,
            CSVParserFormatter.getFormat(delimiter, encapsulator, true, true));
        parserIterator = parser.iterator();
    }
}
 
Example 3
Source File: SymSpellSearchBenchMark.java    From customized-symspell with MIT License 5 votes vote down vote up
/**
 * Loads a space-delimited classpath resource into the given data holder.
 *
 * <p>Each record contributes one {@code DictionaryItem} built from field 0 (the term)
 * and field 1 parsed as a double; the third constructor argument is always {@code 0d}.
 *
 * @param dataResourceName name of the resource, resolved through this class's class loader
 * @param dataHolder receiver of the created items
 * @throws IOException if the resource cannot be opened or closed
 * @throws SpellCheckException declared for {@code dataHolder.addItem}
 */
private void indexData(String dataResourceName, DataHolder dataHolder)
    throws IOException, SpellCheckException {
  URL resourceUrl = this.getClass().getClassLoader().getResource(dataResourceName);
  // The parser holds the stream opened from the URL, so close it even if addItem throws.
  try (CSVParser parser = CSVParser
      .parse(resourceUrl, Charset.forName("UTF-8"), CSVFormat.DEFAULT.withDelimiter(' '))) {
    java.util.Iterator<CSVRecord> csvIterator = parser.iterator();
    while (csvIterator.hasNext()) {
      CSVRecord csvRecord = csvIterator.next();
      dataHolder
          .addItem(new DictionaryItem(csvRecord.get(0), Double.valueOf(csvRecord.get(1)), 0d));
    }
  }
}
 
Example 4
Source File: SymSpellIndexBenchMark.java    From customized-symspell with MIT License 5 votes vote down vote up
/**
 * Loads a space-delimited classpath resource into the given data holder.
 *
 * <p>Each record contributes one {@code DictionaryItem} built from field 0 (the term)
 * and field 1 parsed as a double; the third constructor argument is always {@code 0d}.
 *
 * @param dataResourceName name of the resource, resolved through this class's class loader
 * @param dataHolder receiver of the created items
 * @throws IOException if the resource cannot be opened or closed
 * @throws SpellCheckException declared for {@code dataHolder.addItem}
 */
private void indexData(String dataResourceName, DataHolder dataHolder)
    throws IOException, SpellCheckException {
  URL resourceUrl = this.getClass().getClassLoader().getResource(dataResourceName);
  // The parser holds the stream opened from the URL, so close it even if addItem throws.
  try (CSVParser parser = CSVParser
      .parse(resourceUrl, Charset.forName("UTF-8"), CSVFormat.DEFAULT.withDelimiter(' '))) {
    java.util.Iterator<CSVRecord> csvIterator = parser.iterator();
    while (csvIterator.hasNext()) {
      CSVRecord csvRecord = csvIterator.next();
      dataHolder
          .addItem(new DictionaryItem(csvRecord.get(0), Double.valueOf(csvRecord.get(1)), 0d));
    }
  }
}
 
Example 5
Source File: CSVIngester.java    From macrobase with Apache License 2.0 5 votes vote down vote up
/**
 * Reads up to {@code limit} rows from the configured CSV file, keeping only the
 * records accepted by {@code includeRow(record, preds)}.
 *
 * <p>The file is opened through a gzip decoder when the configured compression is
 * {@code GZIP}, and read directly otherwise. The first record is treated as the header
 * and stored in {@code schema}.
 *
 * @param baseQuery not consulted by this implementation
 * @param preds predicates handed to {@code includeRow} for every record
 * @param limit maximum number of accepted rows to return
 * @param offset not consulted by this implementation
 *               NOTE(review): rows are always read from the start of the file —
 *               confirm whether callers expect {@code offset} to skip rows
 * @return the accepted rows, each holding one {@code ColumnValue} per header column
 * @throws Exception if the file cannot be opened or parsed
 */
@Override
public RowSet getRows(String baseQuery,
                      Map<String, String> preds,
                      int limit,
                      int offset) throws Exception{

    filename = conf.getString(MacroBaseConf.CSV_INPUT_FILE);
    Compression compression = conf.getCsvCompression();

    if (compression == Compression.GZIP) {
        InputStream fileStream = new FileInputStream(filename);
        try {
            InputStream gzipStream = new GZIPInputStream(fileStream);
            Reader decoder = new InputStreamReader(gzipStream);
            csvParser = new CSVParser(decoder, CSVFormat.DEFAULT.withHeader());
        } catch (Exception e) {
            // No parser took ownership of the file handle, so release it here
            // instead of leaking it (e.g. when the file is not valid gzip).
            try {
                fileStream.close();
            } catch (Exception closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    } else {
        File csvFile = new File(filename);
        csvParser = CSVParser.parse(csvFile, Charset.defaultCharset(), CSVFormat.DEFAULT.withHeader());
    }
    schema = csvParser.getHeaderMap();
    Iterator<CSVRecord> rawIterator = csvParser.iterator();
    int rowCount = 0;

    List<RowSet.Row> rows = Lists.newArrayList();
    while (rawIterator.hasNext() && rowCount < limit) {
        CSVRecord record = rawIterator.next();

        if (includeRow(record, preds)) {
            // Build the column list only for rows that are actually kept.
            List<ColumnValue> columnValues = Lists.newArrayList();
            for (Map.Entry<String, Integer> se : schema.entrySet()) {
                columnValues.add(new ColumnValue(se.getKey(),record.get(se.getValue())));
            }

            rows.add(new RowSet.Row(columnValues));
            rowCount++;
        }
    }
    return new RowSet(rows);
}
 
Example 6
Source File: Step8bTaskValidationGoldAnnotator.java    From argument-reasoning-comprehension-task with Apache License 2.0 4 votes vote down vote up
/**
 * Loads manually corrected instances from the two post-validation CSV files.
 *
 * <p>Each file is read in groups of three consecutive records. In the first record,
 * field 0 is the instance id and field 1 the correct label; a group whose label field
 * is {@code "x"} or empty is skipped. In the second and third records, field 2 holds a
 * numeric label followed by a non-word separator and the warrant text. The warrant
 * whose label equals the correct label becomes the original warrant, the other the
 * alternative.
 *
 * @return corrected instances keyed by instance id, sorted by id
 * @throws IOException if a file cannot be read, or ends in the middle of a
 *                     three-record group
 */
public static Map<String, CorrectedInstance> loadCorrectedInstancesFromCSV()
        throws IOException
{
    Map<String, CorrectedInstance> result = new TreeMap<>();
    // read corrections
    List<String> fileNames = Arrays.asList("mturk/annotation-task/97-post-validation.csv",
            "mturk/annotation-task/97-post-validation2.csv");
    for (String fileName : fileNames) {
        // The parser owns the underlying file handle; close it once the file is consumed.
        try (CSVParser csvParser = CSVParser
                .parse(new File(fileName), Charset.forName("utf-8"), CSVFormat.RFC4180)) {

            Iterator<CSVRecord> iterator = csvParser.iterator();

            while (iterator.hasNext()) {
                CSVRecord firstLine = iterator.next();
                CSVRecord secondLine = nextLineOfGroup(iterator, fileName);
                CSVRecord thirdLine = nextLineOfGroup(iterator, fileName);

                String id = firstLine.get(0);
                boolean skipRecord = "x".equals(firstLine.get(1)) || firstLine.get(1).isEmpty();

                if (!skipRecord) {
                    int correctLabel = Integer.parseInt(firstLine.get(1));

                    // "<label><separator><text>": split once at the first non-word char
                    String[] secondParts = secondLine.get(2).split("\\W", 2);
                    int secondLineLabel = Integer.parseInt(secondParts[0]);
                    String secondLineText = secondParts[1];

                    String[] thirdParts = thirdLine.get(2).split("\\W", 2);
                    int thirdLineLabel = Integer.parseInt(thirdParts[0]);
                    String thirdLineText = thirdParts[1];

                    System.out.println(correctLabel);
                    System.out.println(secondLineLabel + ", " + secondLineText);
                    System.out.println(thirdLineLabel + ", " + thirdLineText);

                    String originalWarrant;
                    String alternativeWarrant;
                    if (correctLabel == secondLineLabel) {
                        originalWarrant = secondLineText;
                        alternativeWarrant = thirdLineText;
                    }
                    else {
                        originalWarrant = thirdLineText;
                        alternativeWarrant = secondLineText;
                    }

                    CorrectedInstance correctedInstance = new CorrectedInstance(originalWarrant,
                            alternativeWarrant);

                    result.put(id, correctedInstance);
                }
            }
        }

        System.out.println(result.size());
    }
    return result;
}

/**
 * Returns the next record of a three-record group, failing with a descriptive
 * error instead of a bare NoSuchElementException when the file is truncated.
 */
private static CSVRecord nextLineOfGroup(Iterator<CSVRecord> iterator, String fileName)
        throws IOException
{
    if (!iterator.hasNext()) {
        throw new IOException("Incomplete three-line record group at end of " + fileName);
    }
    return iterator.next();
}
 
Example 7
Source File: DelimitedTextInputFormat.java    From marklogic-contentpump with Apache License 2.0 4 votes vote down vote up
/**
 * Computes the input splits for the job and, when split-input mode is enabled,
 * attaches the header row of the delimited file to every split.
 *
 * <p>The splits from the superclass are returned unchanged when split-input mode is off.
 * They are also returned unchanged when there are at least {@code SPLIT_COUNT_LIMIT}
 * of them, in which case split-input mode is switched off in the job configuration.
 * Otherwise the header is parsed from each split that starts at offset 0, and
 * that header is reused for the following splits until the next offset-0 split.
 *
 * @param job the job context supplying the configuration
 * @return the original splits, or {@code DelimitedSplit}s carrying header information
 * @throws IOException if a file cannot be opened or read, or if a header record
 *                     yields fewer values than its reported size
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    boolean delimSplit = isSplitInput(job.getConfiguration());
    //if delimSplit is true, size of each split is determined by
    //Math.max(minSize, Math.min(maxSize, blockSize)) in FileInputFormat
    List<InputSplit> splits = super.getSplits(job);
    if (!delimSplit) {
        return splits;
    }

    if (splits.size()>= SPLIT_COUNT_LIMIT) {
        //if #splits > 1 million, there is enough parallelism
        //therefore no point to split
        LOG.warn("Exceeding SPLIT_COUNT_LIMIT, input_split is off:"
            + SPLIT_COUNT_LIMIT);
        DefaultStringifier.store(job.getConfiguration(), false, ConfigConstants.CONF_SPLIT_INPUT);
        return splits;
    }
    // add header info into splits
    List<InputSplit> populatedSplits = new ArrayList<InputSplit>();
    LOG.info(splits.size() + " DelimitedSplits generated");
    Configuration conf = job.getConfiguration();
    char delimiter =0;
    ArrayList<Text> hlist = new ArrayList<Text>();
    for (InputSplit file: splits) {
        FileSplit fsplit = ((FileSplit)file);
        Path path = fsplit.getPath();
        FileSystem fs = path.getFileSystem(conf);

        if (fsplit.getStart() == 0) {
            // parse the inSplit, get the header
            FSDataInputStream fileIn = fs.open(path);
            try {
                String delimStr = conf.get(ConfigConstants.CONF_DELIMITER,
                    ConfigConstants.DEFAULT_DELIMITER);
                if (delimStr.length() == 1) {
                    delimiter = delimStr.charAt(0);
                } else {
                    // Report the rejected configuration value, not the previous delimiter
                    LOG.error("Incorrect delimitor: " + delimStr
                        + ". Expects single character.");
                }
                String encoding = conf.get(
                    MarkLogicConstants.OUTPUT_CONTENT_ENCODING,
                    MarkLogicConstants.DEFAULT_OUTPUT_CONTENT_ENCODING);
                InputStreamReader instream = new InputStreamReader(fileIn, encoding);
                CSVParser parser = new CSVParser(instream, CSVParserFormatter.
                    getFormat(delimiter, DelimitedTextReader.encapsulator,
                        true, true));
                Iterator<CSVRecord> it = parser.iterator();

                if (it.hasNext()) {
                    CSVRecord record = it.next();
                    Iterator<String> recordIterator = record.iterator();
                    int recordSize = record.size();
                    String[] header = new String[recordSize];
                    for (int i = 0; i < recordSize; i++) {
                        if (recordIterator.hasNext()) {
                            header[i] = recordIterator.next();
                        } else {
                            throw new IOException("Record size doesn't match the real size");
                        }
                    }

                    EncodingUtil.handleBOMUTF8(header, 0);

                    // Replace the header carried over from the previous file
                    hlist.clear();
                    for (String s : header) {
                        hlist.add(new Text(s));
                    }
                }
            } finally {
                // Closing the raw stream also releases the reader and parser built
                // on top of it, including when header parsing fails part-way.
                fileIn.close();
            }
        }

        DelimitedSplit ds = new DelimitedSplit(new TextArrayWritable(
            hlist.toArray(new Text[hlist.size()])), path,
            fsplit.getStart(), fsplit.getLength(),
            fsplit.getLocations());
        populatedSplits.add(ds);
    }

    return populatedSplits;
}
 
Example 8
Source File: CSVIngester.java    From macrobase with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the data stream, first filling it from the configured CSV file when
 * {@code loaded} is false.
 *
 * <p>The file is opened through a gzip decoder when the configured compression is
 * {@code GZIP}, and read directly otherwise. The header is stored in {@code schema} and
 * each column name is registered with the encoder under its 1-based position.
 * Records for which {@code parseRecord} throws {@link NumberFormatException} are
 * counted in {@code badRows} and skipped.
 *
 * <p>NOTE(review): {@code loaded} is not set in this method — confirm it is set
 * elsewhere, otherwise every call re-reads the file into {@code dataStream}.
 *
 * @return the stream holding all successfully parsed rows
 * @throws Exception if the file cannot be opened or parsed
 */
@Override
public MBStream<Datum> getStream() throws Exception {
    if(!loaded) {
        filename = conf.getString(MacroBaseConf.CSV_INPUT_FILE);
        Compression compression = conf.getCsvCompression();

        if (compression == Compression.GZIP) {
            InputStream fileStream = new FileInputStream(filename);
            try {
                InputStream gzipStream = new GZIPInputStream(fileStream);
                Reader decoder = new InputStreamReader(gzipStream);
                csvParser = new CSVParser(decoder, CSVFormat.DEFAULT.withHeader());
            } catch (Exception e) {
                // No parser took ownership of the file handle, so release it here
                // instead of leaking it (e.g. when the file is not valid gzip).
                try {
                    fileStream.close();
                } catch (Exception closeFailure) {
                    e.addSuppressed(closeFailure);
                }
                throw e;
            }
        } else {
            File csvFile = new File(filename);
            csvParser = CSVParser.parse(csvFile, Charset.defaultCharset(), CSVFormat.DEFAULT.withHeader());
        }
        schema = csvParser.getHeaderMap(); //equal to resultSet.getmetadata or smt

        for (Map.Entry<String, Integer> se : schema.entrySet()) {
            conf.getEncoder().recordAttributeName(se.getValue() + 1, se.getKey()); //numbering off each column for encoding
        }

        // Load all records into memory to filter out rows with missing data
        Iterator<CSVRecord> rawIterator = csvParser.iterator();

        int numRows = 0;
        while (rawIterator.hasNext()) {
            try {
                CSVRecord record = rawIterator.next();
                Datum curRow = parseRecord(record);
                dataStream.add(curRow);
                numRows++;
            } catch (NumberFormatException e) {
                // Malformed numeric field: count the row and move on
                badRows++;
            }
        }
        log.info("{}/{} rows successfully parsed ({} malformed rows)", numRows, numRows + badRows, badRows);
    }

    return dataStream;
}
 
Example 9
Source File: TestDataProvider.java    From preDict with GNU Lesser General Public License v3.0 3 votes vote down vote up
/**
 * Loads test data from a colon-delimited CSV resource with the following columns:
 * 
 * <pre>
 * 0 = correct word
 * 1 = true if this is a desired match,
 *     false if this is a false-positive match
 * 2 = comma separated list of similar words
 * </pre>
 * 
 * Column 0 of every record is added to {@code baseWords}; the comma-separated
 * entries of column 2 are added to {@code queries}. Column 1 is not read here.
 * 
 * @param resourceName
 *            name of the resource, resolved through this class's class loader
 * @throws IOException
 *             if the resource cannot be opened or closed
 */
public TestDataProvider(String resourceName) throws IOException {
	URL resourceUrl = this.getClass().getClassLoader().getResource(resourceName);
	// The parser holds the stream opened from the URL, so it must be closed when done.
	try (CSVParser parser = CSVParser.parse(resourceUrl, Charset.forName("UTF-8"),
			CSVFormat.DEFAULT.withDelimiter(':'))) {
		Iterator<CSVRecord> csvIterator = parser.iterator();
		while (csvIterator.hasNext()) {
			CSVRecord csvRecord = csvIterator.next();
			baseWords.add(csvRecord.get(0));
			queries.addAll(Arrays.asList(csvRecord.get(2).split(",")));
		}
	}
}