com.univocity.parsers.common.TextParsingException Java Examples

The following examples show how to use com.univocity.parsers.common.TextParsingException. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CompliantTextRecordReader.java    From dremio-oss with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the next batch of records from the underlying text reader.
 *
 * <p>Records are parsed one at a time until either {@code numRowsPerBatch}
 * records have been read or {@code reader.parseNext()} reports that there is
 * nothing more to parse. The batch is then finished on the reader.
 *
 * @return number of records parsed into this batch
 */
@Override
public int next() {
  reader.resetForNextBatch();

  try {
    int rowsRead = 0;
    for (; rowsRead < numRowsPerBatch; rowsRead++) {
      // Stop without counting as soon as the reader reports no further record.
      if (!reader.parseNext()) {
        break;
      }
    }
    reader.finishBatch();
    return rowsRead;
  } catch (IOException | TextParsingException failure) {
    // Surface read/parse problems as a user-facing data read error with file context.
    throw UserException.dataReadError(failure)
      .addContext("Failure while reading file %s. Happened at or shortly before byte position %d.",
        split.getPath(), reader.getPos())
      .build(logger);
  }
}
 
Example #2
Source File: CompliantTextBatchReader.java    From Bats with Apache License 2.0 5 votes vote down vote up
/**
 * Generates the next record batch.
 *
 * <p>Records are parsed until the writer reports it is full or the reader
 * reports that no more input is available, then the batch is finished.
 *
 * @return {@code true} only if the last parse call reported more input and the
 *         writer holds at least one row; {@code false} otherwise, including
 *         for the batch that hits EOF
 */
@Override
public boolean next() {
  reader.resetForNextBatch();

  try {
    boolean more = false;
    while (! writer.isFull()) {
      more = reader.parseNext();
      if (! more) {
        break;
      }
    }
    reader.finishBatch();

    // Return false on the batch that hits EOF. The scan operator
    // knows to process any rows in this final batch.

    return more && writer.rowCount() > 0;
  } catch (IOException | TextParsingException e) {
    // A UserException wrapped as the cause is rethrown as-is so it is not
    // wrapped a second time. instanceof is false for null, so no explicit
    // null check is needed.
    final Throwable cause = e.getCause();
    if (cause instanceof UserException) {
      throw (UserException) cause;
    }
    throw UserException.dataReadError(e)
        .addContext("Failure while reading file %s. Happened at or shortly before byte position %d.",
          split.getPath(), reader.getPos())
        .build(logger);
  }
}
 
Example #3
Source File: CsvReaderTest.java    From tablesaw with Apache License 2.0 5 votes vote down vote up
/**
 * Reading CSV content with {@code maxCharsPerColumn} set to 8 is expected to
 * fail with a {@link TextParsingException}.
 */
@Test
public void testMaxCharsPerColumnException() {
  // The last value, "1234567890", has 10 characters: more than the limit set below.
  final String csvText = "Text" + LINE_END + "\"short\"" + LINE_END + "1234567890" + LINE_END;
  final Reader input = new StringReader(csvText);
  final int columnCharLimit = 8;

  assertThrows(
      TextParsingException.class,
      () -> Table.read().csv(CsvReadOptions.builder(input).maxCharsPerColumn(columnCharLimit)));
}
 
Example #4
Source File: CsvReaderTest.java    From tablesaw with Apache License 2.0 5 votes vote down vote up
/**
 * Reading {@code ../data/10001_columns.csv} without a header row is expected
 * to fail with a {@link TextParsingException}.
 */
// NOTE(review): no @Test annotation is visible on this method in this listing - confirm
// whether it was dropped, since other tests shown from the same file carry one.
public void testReadMaxColumnsExceeded() {
  final String csvPath = "../data/10001_columns.csv";

  assertThrows(
      TextParsingException.class,
      () -> Table.read().csv(CsvReadOptions.builder(csvPath).header(false)));
}
 
Example #5
Source File: CsvReaderTest.java    From tablesaw with Apache License 2.0 5 votes vote down vote up
/**
 * Reading CSV content with {@code maxCharsPerColumn} set to 8 is expected to
 * fail with a {@link TextParsingException}.
 */
@Test
public void testMaxCharsPerColumnException() {
  // The last value, "1234567890", has 10 characters: more than the limit set below.
  final String csvText = "Text" + LINE_END + "\"short\"" + LINE_END + "1234567890" + LINE_END;
  final Reader input = new StringReader(csvText);
  final int columnCharLimit = 8;

  assertThrows(
      TextParsingException.class,
      () -> Table.read().csv(CsvReadOptions.builder(input).maxCharsPerColumn(columnCharLimit)));
}
 
Example #6
Source File: CsvReaderTest.java    From tablesaw with Apache License 2.0 5 votes vote down vote up
/**
 * Reading {@code ../data/10001_columns.csv} without a header row is expected
 * to fail with a {@link TextParsingException}.
 */
// NOTE(review): no @Test annotation is visible on this method in this listing - confirm
// whether it was dropped, since other tests shown from the same file carry one.
public void testReadMaxColumnsExceeded() {
  final String csvPath = "../data/10001_columns.csv";

  assertThrows(
      TextParsingException.class,
      () -> Table.read().csv(CsvReadOptions.builder(csvPath).header(false)));
}
 
Example #7
Source File: TextReader.java    From Bats with Apache License 2.0 4 votes vote down vote up
/**
 * Recursive function invoked when a quote is encountered. Function also
 * handles the case when there are non-white space characters in the field
 * after the quoted value.
 *
 * <p>Bytes are appended to {@code output} until a quote is followed by the
 * delimiter, the new-line byte, or whitespace. A quote preceded by the
 * quote-escape byte is appended as a literal quote.
 *
 * @param prev  previous byte read
 * @throws IOException for input file read errors
 * @throws TextParsingException if an unescaped quote is found while
 *         {@code parseUnescapedQuotes} is disabled, or if the quoted value is
 *         followed by something other than the delimiter or a new line
 */
private void parseQuotedValue(byte prev) throws IOException {
  final byte newLine = this.newLine;
  final byte delimiter = this.delimiter;
  final TextOutput output = this.output;
  final TextInput input = this.input;
  final byte quote = this.quote;

  ch = input.nextCharNoNewLineCheck();

  while (!(prev == quote && (ch == delimiter || ch == newLine || isWhite(ch)))) {
    if (ch != quote) {
      if (prev == quote) { // unescaped quote detected
        if (parseUnescapedQuotes) {
          output.append(quote);
          output.append(ch);
          parseQuotedValue(ch);
          break;
        } else {
          // The quote is a byte; convert it to a char so the message shows the
          // character itself rather than its decimal code.
          throw new TextParsingException(
              context,
              "Unescaped quote character '"
                  + (char) (quote & 0xFF)
                  + "' inside quoted value of CSV field. To allow unescaped quotes, "
                  + "set 'parseUnescapedQuotes' to 'true' in the CSV parser settings. "
                  + "Cannot parse CSV input.");
        }
      }
      output.append(ch);
      prev = ch;
    } else if (prev == quoteEscape) {
      output.append(quote);
      prev = NULL_BYTE;
    } else {
      prev = ch;
    }
    ch = input.nextCharNoNewLineCheck();
  }

  // Handles whitespace after quoted value:
  // Whitespace is ignored (i.e., ch <= ' ') if it is not used as the delimiter (i.e., ch != delimiter)
  // For example, in tab-separated files (TSV files), '\t' is used as delimiter and should not be ignored
  // Content after whitespace may be parsed if 'parseUnescapedQuotes' is enabled.
  if (ch != newLine && ch <= ' ' && ch != delimiter) {
    final DrillBuf workBuf = this.workBuf;
    workBuf.resetWriterIndex();
    do {
      // saves whitespace after value
      workBuf.writeByte(ch);
      ch = input.nextChar();
      // found a new line, go to next record.
      if (ch == newLine) {
        return;
      }
    } while (ch <= ' ' && ch != delimiter);

    // there's more stuff after the quoted value, not only empty spaces.
    if (!(ch == delimiter || ch == newLine) && parseUnescapedQuotes) {

      output.append(quote);
      // replay the whitespace that was buffered after the closing quote
      for(int i =0; i < workBuf.writerIndex(); i++){
        output.append(workBuf.getByte(i));
      }
      // the next character is not the escape character, put it there
      if (ch != quoteEscape) {
        output.append(ch);
      }
      // sets this character as the previous character (may be escaping)
      // calls recursively to keep parsing potentially quoted content
      parseQuotedValue(ch);
    }
  }

  if (!(ch == delimiter || ch == newLine)) {
    // Bytes are converted to chars so the message shows characters, not decimal codes.
    throw new TextParsingException(context, "Unexpected character '" + (char) (ch & 0xFF)
        + "' following quoted value of CSV field. Expecting '" + (char) (delimiter & 0xFF)
        + "'. Cannot parse CSV input.");
  }
}
 
Example #8
Source File: TextReader.java    From Bats with Apache License 2.0 4 votes vote down vote up
/**
 * Helper method to handle exceptions caught while processing text files and generate better error messages associated with
 * the exception.
 *
 * <p>A {@link TextParsingException} is rethrown unchanged. Any other exception
 * is converted to a {@code UserException} carrying the current line and record
 * as context. For exceptions that are not already a {@code UserException}, the
 * message is built from hints found in the input read since the last mark:
 * over-long content, embedded line separators, and null characters.
 *
 * @param ex  Exception raised
 * @return never returns normally; every path through this method throws
 * @throws IOException for input file read errors
 */
private TextParsingException handleException(Exception ex) throws IOException {

  if (ex instanceof TextParsingException) {
    throw (TextParsingException) ex;
  }

  // Diagnostic hints are accumulated here. A builder is used rather than
  // "message += ..." on a null String, which would prefix the text with "null".
  final StringBuilder message = new StringBuilder();
  String tmp = input.getStringSinceMarkForError();
  // Guard against missing content so the error handler itself cannot fail with an NPE.
  if (tmp != null) {
    final char[] chars = tmp.toCharArray();
    final int length = chars.length;
    if (length > settings.getMaxCharsPerColumn()) {
      message.append("Length of parsed input (").append(length)
          .append(") exceeds the maximum number of characters defined in your parser settings (")
          .append(settings.getMaxCharsPerColumn()).append("). ");
    }

    if (tmp.contains("\n") || tmp.contains("\r")) {
      tmp = displayLineSeparators(tmp, true);
      String lineSeparator = displayLineSeparators(settings.getLineSeparatorString(), false);
      message.append("\nIdentified line separator characters in the parsed content. This may be the cause of the error. The line separator in your parser settings is set to '")
          .append(lineSeparator).append("'. Parsed content:\n\t").append(tmp);
    }

    int nullCharacterCount = 0;
    // ensuring the StringBuilder won't grow over Integer.MAX_VALUE to avoid OutOfMemoryError
    int maxLength = length > Integer.MAX_VALUE / 2 ? Integer.MAX_VALUE / 2 - 1 : length;
    StringBuilder s = new StringBuilder(maxLength);
    for (int i = 0; i < maxLength; i++) {
      if (chars[i] == '\0') {
        // render each null character visibly as the two characters \0
        s.append('\\');
        s.append('0');
        nullCharacterCount++;
      } else {
        s.append(chars[i]);
      }
    }
    tmp = s.toString();

    if (nullCharacterCount > 0) {
      // "\\0" prints the two characters \0; an unescaped "\0" in a string
      // literal would embed an actual NUL character in the message.
      message.append("\nIdentified ")
          .append(nullCharacterCount)
          .append(" null characters ('\\0') on parsed content. This may indicate the data is corrupt or its encoding is invalid. Parsed content:\n\t")
          .append(tmp);
    }
  }

  UserException.Builder builder;
  if (ex instanceof UserException) {
    builder = ((UserException) ex).rebuild();
  } else {
    // Pass null, as the original did, when no hint was produced.
    builder = UserException
      .dataReadError(ex)
      .message(message.length() == 0 ? null : message.toString());
  }
  throw builder
    .addContext("Line", context.currentLine())
    .addContext("Record", context.currentRecord())
    .build(logger);
}
 
Example #9
Source File: TextReader.java    From dremio-oss with Apache License 2.0 4 votes vote down vote up
/**
 * Recursive function invoked when a quote is encountered. Function also
 * handles the case when there are non-white space characters in the field
 * after the quoted value.
 *
 * <p>Bytes are appended to {@code output} until a quote is followed by the
 * delimiter, the new-line byte, or whitespace. A quote preceded by the
 * quote-escape byte is appended as a literal quote.
 *
 * @param prev  previous byte read
 * @throws IOException declared by this method; presumably raised by the
 *         underlying input reads
 * @throws TextParsingException if an unescaped quote is found while
 *         {@code parseUnescapedQuotes} is disabled, or if the quoted value is
 *         followed by something other than the delimiter or a new line
 */
private void parseQuotedValue(byte prev) throws IOException {
  final byte newLine = this.newLine;
  final byte delimiter = this.delimiter;
  final TextOutput output = this.output;
  final TextInput input = this.input;
  final byte quote = this.quote;

  ch = input.nextCharNoNewLineCheck();

  while (!(prev == quote && (ch == delimiter || ch == newLine || isWhite(ch)))) {
    if (ch != quote) {
      if (prev == quote) { // unescaped quote detected
        if (parseUnescapedQuotes) {
          output.append(quote);
          output.append(ch);
          parseQuotedValue(ch);
          break;
        } else {
          // The quote is a byte; convert it to a char so the message shows the
          // character itself rather than its decimal code.
          throw new TextParsingException(
              context,
              "Unescaped quote character '"
                  + (char) (quote & 0xFF)
                  + "' inside quoted value of CSV field. To allow unescaped quotes, set 'parseUnescapedQuotes' to 'true' in the CSV parser settings. Cannot parse CSV input.");
        }
      }
      output.append(ch);
      prev = ch;
    } else if (prev == quoteEscape) {
      output.append(quote);
      prev = NULL_BYTE;
    } else {
      prev = ch;
    }
    ch = input.nextCharNoNewLineCheck();
  }

  // Handles whitespace after quoted value:
  // Whitespace is ignored (i.e., ch <= ' ') if it is not used as the delimiter (i.e., ch != delimiter)
  // For example, in tab-separated files (TSV files), '\t' is used as delimiter and should not be ignored
  // Content after whitespace may be parsed if 'parseUnescapedQuotes' is enabled.
  if (ch != newLine && ch <= ' ' && ch != delimiter) {
    final NettyArrowBuf workBuf = this.workBuf.asNettyBuffer();
    workBuf.resetWriterIndex();
    do {
      // saves whitespace after value
      workBuf.writeByte(ch);
      ch = input.nextChar();
      // found a new line, go to next record.
      if (ch == newLine) {
        return;
      }
    } while (ch <= ' ' && ch != delimiter);

    // there's more stuff after the quoted value, not only empty spaces.
    if (!(ch == delimiter || ch == newLine) && parseUnescapedQuotes) {

      output.append(quote);
      // replay the whitespace that was buffered after the closing quote
      for(int i =0; i < workBuf.writerIndex(); i++){
        output.append(workBuf.getByte(i));
      }
      // the next character is not the escape character, put it there
      if (ch != quoteEscape) {
        output.append(ch);
      }
      // sets this character as the previous character (may be escaping)
      // calls recursively to keep parsing potentially quoted content
      parseQuotedValue(ch);
    }
  }

  if (!(ch == delimiter || ch == newLine)) {
    // Bytes are converted to chars so the message shows characters, not decimal codes.
    throw new TextParsingException(context, "Unexpected character '" + (char) (ch & 0xFF)
        + "' following quoted value of CSV field. Expecting '" + (char) (delimiter & 0xFF)
        + "'. Cannot parse CSV input.");
  }
}