Java Code Examples for java.io.StreamTokenizer#ordinaryChar()

The following examples show how to use java.io.StreamTokenizer#ordinaryChar() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example 1

Source File: ArffLoader.java From moa with GNU General Public License v3.0

6 votes

/**
 * Instantiates a new arff loader.
 *
 * @param reader the reader
 * @param range the range
 */
public ArffLoader(Reader reader, Range range) {
    this.range = range;
    BufferedReader br = new BufferedReader(reader);

    //Init streamTokenizer
    streamTokenizer = new StreamTokenizer(br);
    streamTokenizer.resetSyntax();
    streamTokenizer.whitespaceChars(0, ' ');
    streamTokenizer.wordChars(' ' + 1, '\u00FF');
    streamTokenizer.whitespaceChars(',', ',');
    streamTokenizer.commentChar('%');
    streamTokenizer.quoteChar('"');
    streamTokenizer.quoteChar('\'');
    streamTokenizer.ordinaryChar('{');
    streamTokenizer.ordinaryChar('}');
    streamTokenizer.eolIsSignificant(true);

    this.instanceInformation = this.getHeader();

    if (range != null) { //is MultiLabel
        this.instanceInformation.setRangeOutputIndices(range);
    }

}

Example 2

Source File: ArffLoader.java From incubator-samoa with Apache License 2.0

6 votes

/**
 * Instantiates a new arff loader.
 *
 * @param reader the reader
 * @param range
 */
public ArffLoader(Reader reader, Range range) {
  this.range = range;
  BufferedReader br = new BufferedReader(reader);

  //Init streamTokenizer
  streamTokenizer = new StreamTokenizer(br);
  streamTokenizer.resetSyntax();
  streamTokenizer.whitespaceChars(0, ' ');
  streamTokenizer.wordChars(' ' + 1, '\u00FF');
  streamTokenizer.whitespaceChars(',', ',');
  streamTokenizer.commentChar('%');
  streamTokenizer.quoteChar('"');
  streamTokenizer.quoteChar('\'');
  streamTokenizer.ordinaryChar('{');
  streamTokenizer.ordinaryChar('}');
  streamTokenizer.eolIsSignificant(true);

  this.instanceInformation = this.getHeader();

  if (range != null) { //is MultiLabel
    this.instanceInformation.setRangeOutputIndices(range);
  }

}

Example 3

Source File: Algorithm.java From KEEL with GNU General Public License v3.0

6 votes

/** Function to initialize the stream tokenizer.
 *
 * @param tokenizer The tokenizer.
 */
protected void initTokenizer( StreamTokenizer tokenizer )
{
  tokenizer.resetSyntax();
  tokenizer.whitespaceChars( 0, ' ' );
  tokenizer.wordChars( ' '+1,'\u00FF' );
  tokenizer.whitespaceChars( ',',',' );
  tokenizer.quoteChar( '"' );
  tokenizer.quoteChar( '\''  );
  tokenizer.ordinaryChar( '=' );
  tokenizer.ordinaryChar( '{' );
  tokenizer.ordinaryChar( '}' );
  tokenizer.ordinaryChar( '[' );
  tokenizer.ordinaryChar( ']' );
  tokenizer.eolIsSignificant( true );
}

Example 4

Source File: ArffLoader.java From samoa with Apache License 2.0

6 votes

private void initStreamTokenizer(Reader reader) {
    BufferedReader br = new BufferedReader(reader);

    //Init streamTokenizer
    streamTokenizer = new StreamTokenizer(br);

    streamTokenizer.resetSyntax();
    streamTokenizer.whitespaceChars(0, ' ');
    streamTokenizer.wordChars(' ' + 1, '\u00FF');
    streamTokenizer.whitespaceChars(',', ',');
    streamTokenizer.commentChar('%');
    streamTokenizer.quoteChar('"');
    streamTokenizer.quoteChar('\'');
    streamTokenizer.ordinaryChar('{');
    streamTokenizer.ordinaryChar('}');
    streamTokenizer.eolIsSignificant(true);

    this.instanceInformation = this.getHeader();
    if (classAttribute < 0) {
        this.instanceInformation.setClassIndex(this.instanceInformation.numAttributes() - 1);
        //System.out.print(this.instanceInformation.classIndex());
    } else if (classAttribute > 0) {
        this.instanceInformation.setClassIndex(classAttribute - 1);
    }
}

Example 5

Source File: Parser.java From hadoop with Apache License 2.0

5 votes

Lexer(String s) {
  tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
  tok.quoteChar('"');
  tok.parseNumbers();
  tok.ordinaryChar(',');
  tok.ordinaryChar('(');
  tok.ordinaryChar(')');
  tok.wordChars('$','$');
  tok.wordChars('_','_');
}

Example 6

Source File: Parser.java From hadoop with Apache License 2.0

5 votes

Lexer(String s) {
  tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
  tok.quoteChar('"');
  tok.parseNumbers();
  tok.ordinaryChar(',');
  tok.ordinaryChar('(');
  tok.ordinaryChar(')');
  tok.wordChars('$','$');
  tok.wordChars('_','_');
}

Example 7

Source File: StringUtils.java From CloverETL-Engine with GNU Lesser General Public License v2.1

5 votes

/**
 * Abbreviates input char sequence by using first letter of each word composing the sequence.
 * Skips any spaces, punctuation and other special chars.
 * 
 * @param input	character sequence to abbreviate
 * @param elementLength how many characters from each word to use (if more than 1, then each element is divided by "_" in output
 * @param capitalize capitalize letters - i.e. convert to uppercase
 * @param useNumbers also include numbers in sequence
 * @return abbreviation of input sequence
 */
public static CharSequence abbreviateString(CharSequence input, int elementLength, boolean capitalize,boolean useNumbers){
	StringBuilder out=new StringBuilder();      
	StreamTokenizer st = new StreamTokenizer(new CharSequenceReader(input));
	st.ordinaryChar('.');
	st.ordinaryChar('-');
	try{
	while(st.nextToken() !=
        StreamTokenizer.TT_EOF) {
        switch(st.ttype) {
          case StreamTokenizer.TT_WORD:
        	  if (elementLength>1 && out.length()>0) out.append('_'); // append underscore to divide abbr.pieces
        	  if (capitalize){
        		  String s = st.sval.subSequence(0, elementLength).toString();
        		  out.append(s.toUpperCase());
        	  }else{
        		  out.append(st.sval.subSequence(0, elementLength));
        	  }
            break;
          case StreamTokenizer.TT_NUMBER:
        	  if (useNumbers) out.append((int)st.nval);
        	  break;
          default: 
        	  // do nothing
        }
	}
	}catch(IOException ex){
	}
	return out;
}

Example 8

Source File: Parser.java From big-c with Apache License 2.0

5 votes

Lexer(String s) {
  tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
  tok.quoteChar('"');
  tok.parseNumbers();
  tok.ordinaryChar(',');
  tok.ordinaryChar('(');
  tok.ordinaryChar(')');
  tok.wordChars('$','$');
  tok.wordChars('_','_');
}

Example 9

Source File: Parser.java From big-c with Apache License 2.0

5 votes

Lexer(String s) {
  tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
  tok.quoteChar('"');
  tok.parseNumbers();
  tok.ordinaryChar(',');
  tok.ordinaryChar('(');
  tok.ordinaryChar(')');
  tok.wordChars('$','$');
  tok.wordChars('_','_');
}

Example 10

Source File: NewAnalyzerTask.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Set the params (analyzerName only),  Comma-separate list of Analyzer class names.  If the Analyzer lives in
 * org.apache.lucene.analysis, the name can be shortened by dropping the o.a.l.a part of the Fully Qualified Class Name.
 * <p>
 * Analyzer names may also refer to previously defined AnalyzerFactory's.
 * <p>
 * Example Declaration: {"NewAnalyzer" NewAnalyzer(WhitespaceAnalyzer, SimpleAnalyzer, StopAnalyzer, standard.StandardAnalyzer) &gt;
 * <p>
 * Example AnalyzerFactory usage:
 * <pre>
 * -AnalyzerFactory(name:'whitespace tokenized',WhitespaceTokenizer)
 * -NewAnalyzer('whitespace tokenized')
 * </pre>
 * @param params analyzerClassName, or empty for the StandardAnalyzer
 */
@Override
public void setParams(String params) {
  super.setParams(params);
  final StreamTokenizer stok = new StreamTokenizer(new StringReader(params));
  stok.quoteChar('"');
  stok.quoteChar('\'');
  stok.eolIsSignificant(false);
  stok.ordinaryChar(',');
  try {
    while (stok.nextToken() != StreamTokenizer.TT_EOF) {
      switch (stok.ttype) {
        case ',': {
          // Do nothing
          break;
        }
        case '\'':
        case '\"':
        case StreamTokenizer.TT_WORD: {
          analyzerNames.add(stok.sval);
          break;
        }
        default: {
          throw new RuntimeException("Unexpected token: " + stok.toString());
        }
      }
    }
  } catch (RuntimeException e) {
    if (e.getMessage().startsWith("Line #")) {
      throw e;
    } else {
      throw new RuntimeException("Line #" + (stok.lineno() + getAlgLineNum()) + ": ", e);
    }
  } catch (Throwable t) {
    throw new RuntimeException("Line #" + (stok.lineno() + getAlgLineNum()) + ": ", t);
  }


}

Example 11

Source File: Jatalog.java From Jatalog with Apache License 2.0

5 votes

private static StreamTokenizer getTokenizer(Reader reader) throws IOException {
	StreamTokenizer scan = new StreamTokenizer(reader);
	scan.ordinaryChar('.'); // '.' looks like a number to StreamTokenizer by default
	scan.commentChar('%'); // Prolog-style % comments; slashSlashComments and slashStarComments can stay as well.
	scan.quoteChar('"');
	scan.quoteChar('\'');
	// WTF? You can't disable parsing of numbers unless you reset the syntax (http://stackoverflow.com/q/8856750/115589)
	//scan.parseNumbers(); 
	return scan;
}

Example 12

Source File: RunCART.java From KEEL with GNU General Public License v3.0

5 votes

/** Function to initialize the stream tokenizer.
 *
 * @param tokenizer		The tokenizer.
 */
private void initTokenizer(StreamTokenizer tokenizer) {
	tokenizer.resetSyntax();
	tokenizer.whitespaceChars(0, ' ');
	tokenizer.wordChars(' ' + 1, '\u00FF');
	tokenizer.whitespaceChars(',', ',');
	tokenizer.quoteChar('"');
	tokenizer.quoteChar('\'');
	tokenizer.ordinaryChar('=');
	tokenizer.ordinaryChar('{');
	tokenizer.ordinaryChar('}');
	tokenizer.ordinaryChar('[');
	tokenizer.ordinaryChar(']');
	tokenizer.eolIsSignificant(true);
}

Example 13

Source File: Lexer.java From jackrabbit-filevault with Apache License 2.0

5 votes

public Lexer(Reader r, String systemId) {
    this.systemId = systemId;
    st = new StreamTokenizer(r);

    st.eolIsSignificant(false);

    st.lowerCaseMode(false);

    st.slashSlashComments(true);
    st.slashStarComments(true);

    st.wordChars('a', 'z');
    st.wordChars('A', 'Z');
    st.wordChars(':', ':');
    st.wordChars('_', '_');

    st.quoteChar(SINGLE_QUOTE);
    st.quoteChar(DOUBLE_QUOTE);

    st.ordinaryChar(BEGIN_NODE_TYPE_NAME);
    st.ordinaryChar(END_NODE_TYPE_NAME);
    st.ordinaryChar(EXTENDS);
    st.ordinaryChar(LIST_DELIMITER);
    st.ordinaryChar(PROPERTY_DEFINITION);
    st.ordinaryChar(CHILD_NODE_DEFINITION);
    st.ordinaryChar(BEGIN_TYPE);
    st.ordinaryChar(END_TYPE);
    st.ordinaryChar(DEFAULT);
    st.ordinaryChar(CONSTRAINT);
}

Example 14

Source File: Parser.java From hadoop-gpu with Apache License 2.0

5 votes

Lexer(String s) {
  tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
  tok.quoteChar('"');
  tok.parseNumbers();
  tok.ordinaryChar(',');
  tok.ordinaryChar('(');
  tok.ordinaryChar(')');
  tok.wordChars('$','$');
  tok.wordChars('_','_');
}

Example 15

Source File: STExample.java From icafe with Eclipse Public License 1.0

4 votes

public static void main(String args[]) throws IOException {
      Hashtable<String, Double> variables = new Hashtable<String, Double>();
      @SuppressWarnings("deprecation")
StreamTokenizer st = new StreamTokenizer(System.in);
      st.eolIsSignificant(true);
      st.lowerCaseMode(true);
      st.ordinaryChar('/');
      st.ordinaryChar('-');

      while (true) {
          Expression res;
          int c = StreamTokenizer.TT_EOL;
          String varName = null;

          System.out.println("Enter an expression...");
          try {
              while (true) {
                  c = st.nextToken();
                  if (c == StreamTokenizer.TT_EOF) {
                      System.exit(1);
                  } else if (c == StreamTokenizer.TT_EOL) {
                      continue;
                  } else if (c == StreamTokenizer.TT_WORD) {
                      if (st.sval.compareTo("dump") == 0) {
                          dumpVariables(variables);
                          continue;
                      } else if (st.sval.compareTo("clear") == 0) {
                          variables = new Hashtable<String, Double>();
                          continue;
                      } else if (st.sval.compareTo("quit") == 0) {
                          System.exit(0);
                      } else if (st.sval.compareTo("exit") == 0) {
                          System.exit(0);
                      } else if (st.sval.compareTo("help") == 0) {
                          help();
                          continue;
                      }
                      varName = st.sval;
                      c = st.nextToken();
                  }
                  break;
              }
              if (c != '=') {
                  throw new SyntaxError("missing initial '=' sign.");
              }
              res = ParseExpression.expression(st);
          } catch (SyntaxError se) {
              res = null;
              varName = null;
              System.out.println("\nSyntax Error detected! - "+se.getMsg());
              while (c != StreamTokenizer.TT_EOL)
                  c = st.nextToken();
              continue;
          }

          c = st.nextToken();
          if (c != StreamTokenizer.TT_EOL) {
              if (c == ')')
                  System.out.println("\nSyntax Error detected! - To many closing parens.");
              else
                  System.out.println("\nBogus token on input - "+c);
              while (c != StreamTokenizer.TT_EOL)
                  c = st.nextToken();
          } else {
              try {
                  Double z;
                  System.out.println("Parsed expression : "+res.unparse());
                  z = new Double(res.value(variables));
                  System.out.println("Value is : "+z);
                  if (varName != null) {
                      variables.put(varName, z);
                      System.out.println("Assigned to : "+varName);
                  }
              } catch (ExecError ee) {
                  System.out.println("Execution error, "+ee.getMsg()+"!");
              }
          }
      }
  }

Example 16

Source File: ReportStructureMatcher.java From pentaho-reporting with GNU Lesser General Public License v2.1

4 votes

public static NodeMatcher parse( final String s ) throws IOException {
  final StreamTokenizer tokenizer = new StreamTokenizer( new StringReader( s ) );
  tokenizer.wordChars( '0', '9' );
  tokenizer.ordinaryChar( '.' );
  tokenizer.ordinaryChar( ',' );
  tokenizer.ordinaryChars( 0, ' ' );

  ElementMatcher elementMatcher = null;
  NodeMatcher n = null;
  Type selectorType = Type.Start;
  int token;
  while ( ( token = tokenizer.nextToken() ) != StreamTokenizer.TT_EOF ) {
    if ( token == StreamTokenizer.TT_WORD || token == '*' ) {
      NodeMatcher matcher = null;

      switch ( selectorType ) {
        case Start:
          elementMatcher = createMatcher( tokenizer );
          matcher = elementMatcher;
          break;
        case Child:
          n = new ChildMatcher( n );
          elementMatcher = createMatcher( tokenizer );
          matcher = elementMatcher;
          break;
        case Descendant:
          n = new DescendantMatcher( n );
          elementMatcher = createMatcher( tokenizer );
          matcher = elementMatcher;
          break;
        case Id:
          if ( elementMatcher == null ) {
            if ( n != null ) {
              n = new DescendantMatcher( n );
            }
            elementMatcher = createMatcher( tokenizer );
            matcher = elementMatcher;
          }
          elementMatcher.add( new AttributeMatcher( AttributeNames.Xml.NAMESPACE, AttributeNames.Xml.ID,
              tokenizer.sval ) );
          break;
        case Class:
          if ( elementMatcher == null ) {
            if ( n != null ) {
              n = new DescendantMatcher( n );
            }
            elementMatcher = createMatcher( tokenizer );
            matcher = elementMatcher;
          }
          elementMatcher.add( new AttributeMatcher( AttributeNames.Core.NAMESPACE, AttributeNames.Core.STYLE_CLASS,
              tokenizer.sval ) );
          break;
        default:
          throw new IOException();
      }

      selectorType = Type.Element;

      if ( matcher != null ) {
        if ( n != null ) {
          n = new AndMatcher( matcher, n );
        } else {
          n = matcher;
        }
      }
    } else {
      if ( token == '>' ) {
        selectorType = Type.Child;
      }
      if ( token == '.' ) {
        selectorType = Type.Class;
      }
      if ( token == '#' ) {
        selectorType = Type.Id;
      }
      if ( Character.isWhitespace( token ) ) {
        if ( selectorType == Type.Class || selectorType == Type.Id ) {
          throw new IllegalStateException();
        }

        if ( selectorType != Type.Child ) {
          selectorType = Type.Descendant;
        }
      }
    }
  }
  return n;
}

Example 17

Source File: OldStreamTokenizerTest.java From j2objc with Apache License 2.0

4 votes

public void test_basicStringTokenizerMethods() throws IOException {
    String str = "Testing 12345 \n alpha \r\n omega";
    String strb = "-3.8 'BLIND mice' \r sEe /* how */ they run";
    StringReader aa = new StringReader(str);
    StringReader ba = new StringReader(strb);
    StreamTokenizer a = new StreamTokenizer(aa);
    StreamTokenizer b = new StreamTokenizer(ba);

    Assert.assertTrue(a.lineno() == 1);
    Assert.assertTrue(a.nextToken() == StreamTokenizer.TT_WORD);
    Assert.assertTrue(a.toString().equals("Token[Testing], line 1"));
    Assert.assertTrue(a.nextToken() == StreamTokenizer.TT_NUMBER);
    Assert.assertTrue(a.toString().equals("Token[n=12345.0], line 1"));
    Assert.assertTrue(a.nextToken() == StreamTokenizer.TT_WORD);
    Assert.assertTrue(a.toString().equals("Token[alpha], line 2"));
    Assert.assertTrue(a.nextToken() == StreamTokenizer.TT_WORD);
    Assert.assertTrue(a.toString().equals("Token[omega], line 3"));
    Assert.assertTrue(a.nextToken() == StreamTokenizer.TT_EOF);
    Assert.assertTrue(a.toString().equals("Token[EOF], line 3"));

    b.commentChar('u');
    b.eolIsSignificant(true);
    b.lowerCaseMode(true);
    b.ordinaryChar('y');
    b.slashStarComments(true);

    Assert.assertTrue(b.nextToken() == StreamTokenizer.TT_NUMBER);
    Assert.assertTrue(b.nval == -3.8);
    Assert.assertTrue(b.toString().equals("Token[n=-3.8], line 1"));
    Assert.assertTrue(b.nextToken() == 39); // '
    Assert.assertTrue(b.toString().equals("Token[BLIND mice], line 1"));
    Assert.assertTrue(b.nextToken() == 10); // \n
    Assert.assertTrue(b.toString().equals("Token[EOL], line 2"));
    Assert.assertTrue(b.nextToken() == StreamTokenizer.TT_WORD);
    Assert.assertTrue(b.toString().equals("Token[see], line 2"));
    Assert.assertTrue(b.nextToken() == StreamTokenizer.TT_WORD);
    Assert.assertTrue(b.toString().equals("Token[the], line 2"));
    Assert.assertTrue(b.nextToken() == 121); // y
    Assert.assertTrue(b.toString().equals("Token['y'], line 2"));
    Assert.assertTrue(b.nextToken() == StreamTokenizer.TT_WORD);
    Assert.assertTrue(b.toString().equals("Token[r], line 2"));
    Assert.assertTrue(b.nextToken() == StreamTokenizer.TT_EOF);
    Assert.assertTrue(b.toString().equals("Token[EOF], line 2"));
}

Example 18

Source File: UnparsedTag.java From hlsparserj with Apache License 2.0

4 votes

/**
 * Parses the tag line.
 * @param line playlist line item
 */
private void parseTagLine(final String line) {
    final Matcher lineMatcher = TAGPATTERN.matcher(line);

    // Create a matcher that uses the TAGPATTERN
    if (lineMatcher.find()) {
        tagName = lineMatcher.group(1);

        final String attributeList = lineMatcher.group(2);

        final StreamTokenizer tokenizer = new StreamTokenizer(new StringReader(attributeList));
        tokenizer.resetSyntax();
        tokenizer.wordChars(' ', 255);
        tokenizer.quoteChar('"');
        tokenizer.ordinaryChar(',');
        tokenizer.ordinaryChar('=');

        String attributeName = null;
        String attributeValue = null;
        int noNameCount = 0;
        do {
            int ttype;
            try {
                ttype = tokenizer.nextToken();
            } catch (IOException e) {
                // Should never get here because reading from String
                throw new IllegalStateException(e);
            }

            if (ttype == ',' || ttype == StreamTokenizer.TT_EOF) {
                if (attributeValue == null) {
                    // Not actually an attribute - just a single value
                    attributes.put("NONAME" + noNameCount, attributeName);
                    noNameCount++;
                    attributeName = null;
                } else {
                    attributes.put(attributeName, attributeValue);
                    attributeName = null;
                    attributeValue = null;
                }
            } else if (ttype == StreamTokenizer.TT_WORD || ttype == '"') {
                if (attributeName == null) {
                    attributeName = tokenizer.sval;
                } else {
                    attributeValue = tokenizer.sval;
                }
            }
        } while (tokenizer.ttype != StreamTokenizer.TT_EOF);

        // Set the URI if a URI attribute is present
        if (attributes.containsKey(URI_ATTR)) {
            uri = attributes.get(URI_ATTR);
        }
    } else {
        // If the line startex with #EXT but does not contain a colon it is a
        // tag with no attributes
        tagName = line.substring(1);
    }
}

Example 19

Source File: OldAndroidStreamTokenizerTest.java From j2objc with Apache License 2.0

4 votes

public void testStreamTokenizer() throws Exception {
    String str = "Testing 12345 \n alpha \r\n omega";
    String strb = "-3.8 'BLIND mice' \r sEe /* how */ they run";
    StringReader aa = new StringReader(str);
    StringReader ba = new StringReader(strb);
    StreamTokenizer a = new StreamTokenizer(aa);
    StreamTokenizer b = new StreamTokenizer(ba);

    assertEquals(1, a.lineno());
    assertEquals(StreamTokenizer.TT_WORD, a.nextToken());
    assertEquals("Token[Testing], line 1", a.toString());
    assertEquals(StreamTokenizer.TT_NUMBER, a.nextToken());
    assertEquals("Token[n=12345.0], line 1", a.toString());
    assertEquals(StreamTokenizer.TT_WORD, a.nextToken());
    assertEquals("Token[alpha], line 2", a.toString());
    assertEquals(StreamTokenizer.TT_WORD, a.nextToken());
    assertEquals("Token[omega], line 3", a.toString());
    assertEquals(StreamTokenizer.TT_EOF, a.nextToken());
    assertEquals("Token[EOF], line 3", a.toString());

    b.commentChar('u');
    b.eolIsSignificant(true);
    b.lowerCaseMode(true);
    b.ordinaryChar('y');
    b.slashStarComments(true);

    assertEquals(StreamTokenizer.TT_NUMBER, b.nextToken());
    assertEquals(-3.8, b.nval);
    assertEquals("Token[n=-3.8], line 1", b.toString());
    assertEquals(39, b.nextToken()); // '
    assertEquals("Token[BLIND mice], line 1", b.toString());
    assertEquals(10, b.nextToken()); // \n
    assertEquals("Token[EOL], line 2", b.toString());
    assertEquals(StreamTokenizer.TT_WORD, b.nextToken());
    assertEquals("Token[see], line 2", b.toString());
    assertEquals(StreamTokenizer.TT_WORD, b.nextToken());
    assertEquals("Token[the], line 2", b.toString());
    assertEquals(121, b.nextToken()); // y
    assertEquals("Token['y'], line 2", b.toString());
    assertEquals(StreamTokenizer.TT_WORD, b.nextToken());
    assertEquals("Token[r], line 2", b.toString());
    assertEquals(StreamTokenizer.TT_EOF, b.nextToken());
    assertEquals("Token[EOF], line 2", b.toString());

    // A harmony regression test
    byte[] data = new byte[]{(byte) '-'};
    StreamTokenizer tokenizer = new StreamTokenizer(new ByteArrayInputStream(data));
    tokenizer.nextToken();
    String result = tokenizer.toString();
    assertEquals("Token['-'], line 1", result);

    // another harmony regression test
    byte[] data2 = new byte[]{(byte) '"',
            (byte) 'H',
            (byte) 'e',
            (byte) 'l',
            (byte) 'l',
            (byte) 'o',
            (byte) '"'};
    StreamTokenizer tokenizer2 = new StreamTokenizer(new ByteArrayInputStream(data2));
    tokenizer2.nextToken();
    result = tokenizer2.toString();
    assertEquals("Token[Hello], line 1", result);
}

Example 20

Source File: URLRespectsRobots.java From BUbiNG with Apache License 2.0

4 votes

/** Parses the argument as if it were the content of a <code>robots.txt</code> file,
 * and returns a sorted array of prefixes of URLs that the agent should not follow.
 *
 * @param content the content of the  <code>robots.txt</code> file.
 * @param userAgent the string representing the user agent of interest.
 * @return an array of character arrays, which are prefixes of the URLs not to follow, in sorted order.
 */
public static char[][] parseRobotsReader(final Reader content, final String userAgent) throws IOException {
	/* The set of disallowed paths specifically aimed at userAgent. */
	Set<String> set = new ObjectOpenHashSet<>();
	/* The set of disallowed paths specifically aimed at *. */
	Set<String> setStar = new ObjectOpenHashSet<>();
	/* True if the currently examined record is targetted to us. */
	boolean doesMatter = false;
	/* True if we have seen a section targetted to our agent. */
	boolean specific = false;
	/* True if we have seen a section targetted to *. */
	boolean generic = false;
	/* True if we are in a star section. */
	boolean starSection = false;

	StreamTokenizer st = new StreamTokenizer(new FastBufferedReader(content));
	int token;

	st.resetSyntax();
	st.eolIsSignificant(true); // We need EOLs to separate records
	st.wordChars(33, 255); // All characters may appear
	st.whitespaceChars(0, 32);
	st.ordinaryChar('#'); // We must manually simulate comments 8^(
	st.lowerCaseMode(false);

	while (true) {
		int lineFirstToken = st.nextToken();
		if (lineFirstToken == StreamTokenizer.TT_EOF) break;

			switch (lineFirstToken) {
				// Blank line: a new block is starting
			case StreamTokenizer.TT_EOL:
				doesMatter = false;
				break;

			// Comment or number: ignore until the end of line
			case StreamTokenizer.TT_NUMBER:
			case '#':
				do {
					token = st.nextToken();
				} while (token != StreamTokenizer.TT_EOL && token != StreamTokenizer.TT_EOF);
				break;

			// A string
			case StreamTokenizer.TT_WORD:
				if (st.sval.equalsIgnoreCase("user-agent:")) {
					token = st.nextToken();
					if (token == StreamTokenizer.TT_WORD)
						if (StringUtils.startsWithIgnoreCase(userAgent, st.sval)) {
							doesMatter = true;
							specific = true;
							starSection = false;
						}
						else if (st.sval.equals("*")) {
							starSection = true;
							generic = true;
						} else starSection = false;
					// Ignore the rest of the line
					while (token != StreamTokenizer.TT_EOL && token != StreamTokenizer.TT_EOF)
						token = st.nextToken();
				} else if (st.sval.equalsIgnoreCase("disallow:")) {
					token = st.nextToken();
					//System.out.println(st.sval + " " + starSection + " " + set + " " + setStar);
					if (token == StreamTokenizer.TT_EOL) {
						if (doesMatter) set.clear();
						else if (starSection) setStar.clear();
					} else if (token == StreamTokenizer.TT_WORD) {
						String disallowed = st.sval;
						if (disallowed.endsWith("*")) disallowed = disallowed.substring(0, disallowed.length()-1); // Someone (erroneously) uses * to denote any suffix
						if (doesMatter) set.add(disallowed);
						else if (starSection) setStar.add(disallowed);
					}
					// Ignore the rest of the line
					while (token != StreamTokenizer.TT_EOL && token != StreamTokenizer.TT_EOF)
						token = st.nextToken();
				} else if (LOGGER.isTraceEnabled()) LOGGER.trace("Line first token {} ununderstandable in robots.txt", st.sval);
				break;

			// Something else: a syntax error
			default:
				if (LOGGER.isTraceEnabled()) LOGGER.trace("Found unknown token type {} in robots.txt", Integer.valueOf(lineFirstToken));
		}
	}

	if (specific) return toSortedPrefixFreeCharArrays(set); // Some instructions specific to us
	if (! specific && generic) return toSortedPrefixFreeCharArrays(setStar); // No specific instruction, but some generic ones
	return toSortedPrefixFreeCharArrays(set);
}