Java Code Examples for java.io.StreamTokenizer#ordinaryChar()

The following examples show how to use java.io.StreamTokenizer#ordinaryChar(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ArffLoader.java    From moa with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Creates an ARFF loader on top of the given character stream and reads the
 * header from it right away.
 *
 * @param reader the character source; it is wrapped in a {@link BufferedReader}
 * @param range  handed to the header's {@code setRangeOutputIndices} when it is
 *               not {@code null}; may be {@code null}
 */
public ArffLoader(Reader reader, Range range) {
    this.range = range;

    // Configure the tokenizer locally, then publish it before the header is read.
    final StreamTokenizer tokens = new StreamTokenizer(new BufferedReader(reader));
    tokens.resetSyntax();

    // Base table: everything above the space is a word character,
    // the space and everything below it separates tokens.
    tokens.wordChars(' ' + 1, '\u00FF');
    tokens.whitespaceChars(0, ' ');

    // Overrides on top of the word range.
    tokens.whitespaceChars(',', ',');
    tokens.commentChar('%');
    for (char quote : new char[] {'"', '\''}) {
        tokens.quoteChar(quote);
    }
    for (char brace : new char[] {'{', '}'}) {
        tokens.ordinaryChar(brace);
    }
    tokens.eolIsSignificant(true);
    streamTokenizer = tokens;

    this.instanceInformation = this.getHeader();

    if (range == null) {
        return;
    }
    // A range is only supplied for the multi-label case.
    this.instanceInformation.setRangeOutputIndices(range);
}
 
Example 2
Source File: ArffLoader.java    From incubator-samoa with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a loader that tokenizes ARFF content from {@code reader} and
 * immediately loads the header.
 *
 * @param reader the character source, buffered internally
 * @param range  passed to the header's {@code setRangeOutputIndices} unless it
 *               is {@code null}
 */
public ArffLoader(Reader reader, Range range) {
  this.range = range;

  final char[] quoteMarks = {'"', '\''};
  final char[] standaloneSymbols = {'{', '}'};

  streamTokenizer = new StreamTokenizer(new BufferedReader(reader));
  streamTokenizer.resetSyntax();
  // Split the 0..255 range at the space character: separators below, words above.
  streamTokenizer.whitespaceChars(0, ' ');
  streamTokenizer.wordChars(' ' + 1, '\u00FF');
  // Line ends are reported as tokens.
  streamTokenizer.eolIsSignificant(true);
  // A comma separates tokens as well; '%' opens a comment.
  streamTokenizer.whitespaceChars(',', ',');
  streamTokenizer.commentChar('%');
  for (char mark : quoteMarks) {
    streamTokenizer.quoteChar(mark);
  }
  for (char symbol : standaloneSymbols) {
    streamTokenizer.ordinaryChar(symbol);
  }

  this.instanceInformation = this.getHeader();

  // A non-null range marks the multi-label case.
  final boolean multiLabel = (range != null);
  if (multiLabel) {
    this.instanceInformation.setRangeOutputIndices(range);
  }
}
 
Example 3
Source File: Algorithm.java    From KEEL with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Replaces the syntax table of the given tokenizer: characters up to and
 * including the space, plus the comma, separate tokens; everything else up to
 * {@code \u00FF} forms words; single and double quotes delimit strings; and
 * {@code = { } [ ]} are each returned as their own token. Line ends are
 * reported.
 *
 * @param tokenizer the tokenizer to configure
 */
protected void initTokenizer( StreamTokenizer tokenizer )
{
  final int firstWordChar = ' ' + 1;
  final char[] quotes = { '"', '\'' };
  final char[] punctuation = { '=', '{', '}', '[', ']' };

  tokenizer.resetSyntax();
  tokenizer.wordChars( firstWordChar, '\u00FF' );
  tokenizer.whitespaceChars( 0, ' ' );
  // The overrides below must follow the word range they carve characters out of.
  tokenizer.whitespaceChars( ',', ',' );
  for ( char quote : quotes )
  {
    tokenizer.quoteChar( quote );
  }
  for ( char symbol : punctuation )
  {
    tokenizer.ordinaryChar( symbol );
  }
  tokenizer.eolIsSignificant( true );
}
 
Example 4
Source File: ArffLoader.java    From samoa with Apache License 2.0 6 votes vote down vote up
/**
 * Creates and configures the stream tokenizer for the given reader, loads the
 * header through it and then sets the class index of the loaded header from
 * {@code classAttribute}.
 *
 * @param reader the character source, buffered internally
 */
private void initStreamTokenizer(Reader reader) {
    final StreamTokenizer tokens = new StreamTokenizer(new BufferedReader(reader));

    tokens.resetSyntax();
    // Words are everything above the space; the space and below separate tokens.
    tokens.wordChars(' ' + 1, '\u00FF');
    tokens.whitespaceChars(0, ' ');
    // Overrides: comma separates, '%' starts a comment, quotes delimit strings,
    // braces are returned as single-character tokens.
    tokens.whitespaceChars(',', ',');
    tokens.commentChar('%');
    for (char quote : new char[] {'"', '\''}) {
        tokens.quoteChar(quote);
    }
    for (char brace : new char[] {'{', '}'}) {
        tokens.ordinaryChar(brace);
    }
    tokens.eolIsSignificant(true);
    streamTokenizer = tokens;

    this.instanceInformation = this.getHeader();

    // Zero leaves the class index of the header untouched.
    if (classAttribute == 0) {
        return;
    }
    // Negative selects the last attribute; positive is a 1-based position.
    final int classIndex = (classAttribute < 0)
            ? this.instanceInformation.numAttributes() - 1
            : classAttribute - 1;
    this.instanceInformation.setClassIndex(classIndex);
}
 
Example 5
Source File: Parser.java    From hadoop with Apache License 2.0 5 votes vote down vote up
Lexer(String s) {
  tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
  tok.quoteChar('"');
  tok.parseNumbers();
  tok.ordinaryChar(',');
  tok.ordinaryChar('(');
  tok.ordinaryChar(')');
  tok.wordChars('$','$');
  tok.wordChars('_','_');
}
 
Example 6
Source File: Parser.java    From hadoop with Apache License 2.0 5 votes vote down vote up
Lexer(String s) {
  tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
  tok.quoteChar('"');
  tok.parseNumbers();
  tok.ordinaryChar(',');
  tok.ordinaryChar('(');
  tok.ordinaryChar(')');
  tok.wordChars('$','$');
  tok.wordChars('_','_');
}
 
Example 7
Source File: StringUtils.java    From CloverETL-Engine with GNU Lesser General Public License v2.1 5 votes vote down vote up
/**
 * Abbreviates the input character sequence by taking the leading characters of each word in it.
 * Spaces, punctuation and other special characters are skipped.
 * <p>
 * A word shorter than {@code elementLength} contributes all of its characters.
 *
 * @param input	character sequence to abbreviate
 * @param elementLength how many characters from each word to use (if more than 1, the pieces are
 *        separated by "_" in the output)
 * @param capitalize capitalize letters - i.e. convert to uppercase
 * @param useNumbers also include numbers (truncated to integers) in the output
 * @return abbreviation of the input sequence
 */
public static CharSequence abbreviateString(CharSequence input, int elementLength, boolean capitalize,boolean useNumbers){
	StringBuilder out = new StringBuilder();
	StreamTokenizer st = new StreamTokenizer(new CharSequenceReader(input));
	// '.' and '-' are reported as separate tokens instead of being folded into numbers.
	st.ordinaryChar('.');
	st.ordinaryChar('-');
	try {
		while (st.nextToken() != StreamTokenizer.TT_EOF) {
			switch (st.ttype) {
			case StreamTokenizer.TT_WORD:
				// append underscore to divide the abbreviation pieces
				if (elementLength > 1 && out.length() > 0) {
					out.append('_');
				}
				// Clamp to the word length: taking elementLength characters from a
				// shorter word would throw StringIndexOutOfBoundsException.
				String piece = st.sval.substring(0, Math.min(elementLength, st.sval.length()));
				out.append(capitalize ? piece.toUpperCase() : piece);
				break;
			case StreamTokenizer.TT_NUMBER:
				if (useNumbers) {
					out.append((int) st.nval);
				}
				break;
			default:
				// every other token (punctuation, quotes, ...) is skipped
				break;
			}
		}
	} catch (IOException ignored) {
		// Best effort: on a read failure return whatever has been collected so far.
	}
	return out;
}
 
Example 8
Source File: Parser.java    From big-c with Apache License 2.0 5 votes vote down vote up
Lexer(String s) {
  tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
  tok.quoteChar('"');
  tok.parseNumbers();
  tok.ordinaryChar(',');
  tok.ordinaryChar('(');
  tok.ordinaryChar(')');
  tok.wordChars('$','$');
  tok.wordChars('_','_');
}
 
Example 9
Source File: Parser.java    From big-c with Apache License 2.0 5 votes vote down vote up
Lexer(String s) {
  tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
  tok.quoteChar('"');
  tok.parseNumbers();
  tok.ordinaryChar(',');
  tok.ordinaryChar('(');
  tok.ordinaryChar(')');
  tok.wordChars('$','$');
  tok.wordChars('_','_');
}
 
Example 10
Source File: NewAnalyzerTask.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Set the params (analyzerName only), a comma-separated list of Analyzer class names.  If the Analyzer lives in
 * org.apache.lucene.analysis, the name can be shortened by dropping the o.a.l.a part of the Fully Qualified Class Name.
 * <p>
 * Analyzer names may also refer to previously defined AnalyzerFactory's.
 * <p>
 * Example Declaration: {"NewAnalyzer" NewAnalyzer(WhitespaceAnalyzer, SimpleAnalyzer, StopAnalyzer, standard.StandardAnalyzer) &gt;
 * <p>
 * Example AnalyzerFactory usage:
 * <pre>
 * -AnalyzerFactory(name:'whitespace tokenized',WhitespaceTokenizer)
 * -NewAnalyzer('whitespace tokenized')
 * </pre>
 * @param params analyzerClassName, or empty for the StandardAnalyzer
 * @throws RuntimeException if the parameter string contains an unexpected token or cannot be read; the
 *         message is prefixed with the line number
 */
@Override
public void setParams(String params) {
  super.setParams(params);
  final StreamTokenizer stok = new StreamTokenizer(new StringReader(params));
  stok.quoteChar('"');
  stok.quoteChar('\'');
  stok.eolIsSignificant(false);
  stok.ordinaryChar(',');
  try {
    while (stok.nextToken() != StreamTokenizer.TT_EOF) {
      switch (stok.ttype) {
        case ',': {
          // Separator between names: nothing to record
          break;
        }
        case '\'':
        case '\"':
        case StreamTokenizer.TT_WORD: {
          // Quoted or bare name
          analyzerNames.add(stok.sval);
          break;
        }
        default: {
          throw new RuntimeException("Unexpected token: " + stok.toString());
        }
      }
    }
  } catch (RuntimeException e) {
    // getMessage() may be null; guard so that the original failure is not
    // replaced by a NullPointerException thrown from this handler.
    final String message = e.getMessage();
    if (message != null && message.startsWith("Line #")) {
      // Already carries a line number: pass it through unchanged
      throw e;
    }
    throw new RuntimeException("Line #" + (stok.lineno() + getAlgLineNum()) + ": ", e);
  } catch (Throwable t) {
    throw new RuntimeException("Line #" + (stok.lineno() + getAlgLineNum()) + ": ", t);
  }
}
 
Example 11
Source File: Jatalog.java    From Jatalog with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a tokenizer over {@code reader} that keeps the default syntax table
 * except that {@code '.'} is an ordinary single-character token, {@code '%'}
 * starts a comment running to the end of the line, and both quote characters
 * delimit strings.
 *
 * @param reader the character source
 * @return the configured tokenizer
 * @throws IOException declared for callers; nothing is read here
 */
private static StreamTokenizer getTokenizer(Reader reader) throws IOException {
	final StreamTokenizer tokenizer = new StreamTokenizer(reader);
	for (char quote : new char[] {'\'', '"'}) {
		tokenizer.quoteChar(quote);
	}
	// Prolog-style % comments; the default slash comment handling is left as it is.
	tokenizer.commentChar('%');
	// By default '.' is treated as part of a number; make it a token of its own.
	// Number parsing as a whole stays enabled: it can only be switched off by
	// resetting the syntax table (http://stackoverflow.com/q/8856750/115589).
	tokenizer.ordinaryChar('.');
	return tokenizer;
}
 
Example 12
Source File: RunCART.java    From KEEL with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Resets the tokenizer's syntax table and installs the one used here:
 * the space and everything below it, plus the comma, are separators; the rest
 * up to {@code \u00FF} are word characters; {@code "} and {@code '} quote
 * strings; {@code = { } [ ]} are single-character tokens; line ends are
 * significant.
 *
 * @param tokenizer		the tokenizer to configure
 */
private void initTokenizer(StreamTokenizer tokenizer) {
	tokenizer.resetSyntax();
	tokenizer.eolIsSignificant(true);

	// '!' is the character right after the space, i.e. ' ' + 1.
	tokenizer.wordChars('!', 0xFF);
	tokenizer.whitespaceChars('\0', ' ');

	// Overrides applied on top of the word range.
	tokenizer.whitespaceChars(',', ',');
	for (char quote : "\"'".toCharArray()) {
		tokenizer.quoteChar(quote);
	}
	for (char symbol : "={}[]".toCharArray()) {
		tokenizer.ordinaryChar(symbol);
	}
}
 
Example 13
Source File: Lexer.java    From jackrabbit-filevault with Apache License 2.0 5 votes vote down vote up
public Lexer(Reader r, String systemId) {
    this.systemId = systemId;
    st = new StreamTokenizer(r);

    st.eolIsSignificant(false);

    st.lowerCaseMode(false);

    st.slashSlashComments(true);
    st.slashStarComments(true);

    st.wordChars('a', 'z');
    st.wordChars('A', 'Z');
    st.wordChars(':', ':');
    st.wordChars('_', '_');

    st.quoteChar(SINGLE_QUOTE);
    st.quoteChar(DOUBLE_QUOTE);

    st.ordinaryChar(BEGIN_NODE_TYPE_NAME);
    st.ordinaryChar(END_NODE_TYPE_NAME);
    st.ordinaryChar(EXTENDS);
    st.ordinaryChar(LIST_DELIMITER);
    st.ordinaryChar(PROPERTY_DEFINITION);
    st.ordinaryChar(CHILD_NODE_DEFINITION);
    st.ordinaryChar(BEGIN_TYPE);
    st.ordinaryChar(END_TYPE);
    st.ordinaryChar(DEFAULT);
    st.ordinaryChar(CONSTRAINT);
}
 
Example 14
Source File: Parser.java    From hadoop-gpu with Apache License 2.0 5 votes vote down vote up
Lexer(String s) {
  tok = new StreamTokenizer(new CharArrayReader(s.toCharArray()));
  tok.quoteChar('"');
  tok.parseNumbers();
  tok.ordinaryChar(',');
  tok.ordinaryChar('(');
  tok.ordinaryChar(')');
  tok.wordChars('$','$');
  tok.wordChars('_','_');
}
 
Example 15
Source File: STExample.java    From icafe with Eclipse Public License 1.0 4 votes vote down vote up
/**
 * Interactive loop: reads lines from standard input, each either a command
 * ({@code dump}, {@code clear}, {@code quit}, {@code exit}, {@code help}) or
 * an expression of the form {@code [name] = expression}. The parsed expression
 * and its value are printed, and the value is stored under {@code name} when a
 * name was given. Input is lower-cased by the tokenizer.
 * <p>
 * The process exits with status 1 when the input ends and with status 0 on
 * {@code quit} / {@code exit}.
 *
 * @param args unused
 * @throws IOException if reading standard input fails
 */
public static void main(String args[]) throws IOException {
    Hashtable<String, Double> variables = new Hashtable<String, Double>();
    @SuppressWarnings("deprecation")
    StreamTokenizer st = new StreamTokenizer(System.in);
    st.eolIsSignificant(true);
    st.lowerCaseMode(true);
    // '/' and '-' are operators here, not a comment start / number sign.
    st.ordinaryChar('/');
    st.ordinaryChar('-');

    while (true) {
        Expression res;
        int c = StreamTokenizer.TT_EOL;
        String varName = null;

        System.out.println("Enter an expression...");
        try {
            while (true) {
                c = st.nextToken();
                if (c == StreamTokenizer.TT_EOF) {
                    System.exit(1);
                } else if (c == StreamTokenizer.TT_EOL) {
                    // Empty line: keep waiting for input
                    continue;
                } else if (c == StreamTokenizer.TT_WORD) {
                    if (st.sval.compareTo("dump") == 0) {
                        dumpVariables(variables);
                        continue;
                    } else if (st.sval.compareTo("clear") == 0) {
                        variables = new Hashtable<String, Double>();
                        continue;
                    } else if (st.sval.compareTo("quit") == 0) {
                        System.exit(0);
                    } else if (st.sval.compareTo("exit") == 0) {
                        System.exit(0);
                    } else if (st.sval.compareTo("help") == 0) {
                        help();
                        continue;
                    }
                    // Any other word is the name of the variable to assign
                    varName = st.sval;
                    c = st.nextToken();
                }
                break;
            }
            if (c != '=') {
                throw new SyntaxError("missing initial '=' sign.");
            }
            res = ParseExpression.expression(st);
        } catch (SyntaxError se) {
            System.out.println("\nSyntax Error detected! - "+se.getMsg());
            // Discard the rest of the line. Also stop at end of input:
            // nextToken() keeps returning TT_EOF, so waiting for TT_EOL alone
            // would never terminate.
            while (c != StreamTokenizer.TT_EOL && c != StreamTokenizer.TT_EOF) {
                c = st.nextToken();
            }
            continue;
        }

        c = st.nextToken();
        if (c != StreamTokenizer.TT_EOL) {
            if (c == ')') {
                System.out.println("\nSyntax Error detected! - To many closing parens.");
            } else {
                System.out.println("\nBogus token on input - "+c);
            }
            // Same end-of-input guard as in the syntax error recovery above.
            while (c != StreamTokenizer.TT_EOL && c != StreamTokenizer.TT_EOF) {
                c = st.nextToken();
            }
        } else {
            try {
                Double z;
                System.out.println("Parsed expression : "+res.unparse());
                z = Double.valueOf(res.value(variables));
                System.out.println("Value is : "+z);
                if (varName != null) {
                    variables.put(varName, z);
                    System.out.println("Assigned to : "+varName);
                }
            } catch (ExecError ee) {
                System.out.println("Execution error, "+ee.getMsg()+"!");
            }
        }
    }
}
 
Example 16
Source File: ReportStructureMatcher.java    From pentaho-reporting with GNU Lesser General Public License v2.1 4 votes vote down vote up
/**
 * Parses a selector string into a matcher. Words and {@code *} create or
 * extend element matchers; {@code >}, {@code .}, {@code #} and whitespace
 * between them decide how the next word is combined with what was parsed so
 * far (child, style class, id, descendant).
 *
 * @param s the selector text
 * @return the resulting matcher, or {@code null} when the input contains no
 *         word or {@code *} token
 * @throws IOException if a word is encountered while the selector type has no
 *         handler in the switch below, or if tokenizing fails
 * @throws IllegalStateException if whitespace directly follows {@code .} or {@code #}
 */
public static NodeMatcher parse( final String s ) throws IOException {
  final StreamTokenizer tokenizer = new StreamTokenizer( new StringReader( s ) );
  tokenizer.wordChars( '0', '9' );
  tokenizer.ordinaryChar( '.' );
  tokenizer.ordinaryChar( ',' );
  // Whitespace is returned as ordinary tokens so that it can act as a combinator below.
  tokenizer.ordinaryChars( 0, ' ' );

  // The element matcher most recently created; id/class conditions are added to it.
  ElementMatcher elementMatcher = null;
  // The matcher built so far; this is what gets returned.
  NodeMatcher n = null;
  // How the next word token is to be interpreted.
  Type selectorType = Type.Start;
  int token;
  while ( ( token = tokenizer.nextToken() ) != StreamTokenizer.TT_EOF ) {
    if ( token == StreamTokenizer.TT_WORD || token == '*' ) {
      // Set only when this token produced a new element matcher.
      NodeMatcher matcher = null;

      switch ( selectorType ) {
        case Start:
          elementMatcher = createMatcher( tokenizer );
          matcher = elementMatcher;
          break;
        case Child:
          // Wrap what was parsed so far before starting the next element.
          n = new ChildMatcher( n );
          elementMatcher = createMatcher( tokenizer );
          matcher = elementMatcher;
          break;
        case Descendant:
          n = new DescendantMatcher( n );
          elementMatcher = createMatcher( tokenizer );
          matcher = elementMatcher;
          break;
        case Id:
          // No element matcher yet: create one from this token first.
          if ( elementMatcher == null ) {
            if ( n != null ) {
              n = new DescendantMatcher( n );
            }
            elementMatcher = createMatcher( tokenizer );
            matcher = elementMatcher;
          }
          elementMatcher.add( new AttributeMatcher( AttributeNames.Xml.NAMESPACE, AttributeNames.Xml.ID,
              tokenizer.sval ) );
          break;
        case Class:
          // Same handling as Id, but the condition is on the style class attribute.
          if ( elementMatcher == null ) {
            if ( n != null ) {
              n = new DescendantMatcher( n );
            }
            elementMatcher = createMatcher( tokenizer );
            matcher = elementMatcher;
          }
          elementMatcher.add( new AttributeMatcher( AttributeNames.Core.NAMESPACE, AttributeNames.Core.STYLE_CLASS,
              tokenizer.sval ) );
          break;
        default:
          // A word in any other state (e.g. directly after another element) is rejected.
          throw new IOException();
      }

      selectorType = Type.Element;

      // Combine a newly created element matcher with what was parsed before it.
      if ( matcher != null ) {
        if ( n != null ) {
          n = new AndMatcher( matcher, n );
        } else {
          n = matcher;
        }
      }
    } else {
      // Non-word token: it only changes how the next word is interpreted.
      if ( token == '>' ) {
        selectorType = Type.Child;
      }
      if ( token == '.' ) {
        selectorType = Type.Class;
      }
      if ( token == '#' ) {
        selectorType = Type.Id;
      }
      if ( Character.isWhitespace( token ) ) {
        // "." or "#" must be followed directly by a name.
        if ( selectorType == Type.Class || selectorType == Type.Id ) {
          throw new IllegalStateException();
        }

        // Whitespace after ">" keeps the child combinator; otherwise it means descendant.
        if ( selectorType != Type.Child ) {
          selectorType = Type.Descendant;
        }
      }
    }
  }
  return n;
}
 
Example 17
Source File: OldStreamTokenizerTest.java    From j2objc with Apache License 2.0 4 votes vote down vote up
/**
 * Walks two tokenizers through fixed inputs and checks the token type, the
 * {@code toString()} form and the line number after every step. The first
 * tokenizer uses the default syntax; the second one is customized with
 * {@code 'u'} as comment character, significant line ends, lower-case mode,
 * {@code 'y'} as ordinary character and slash-star comments.
 * <p>
 * Uses {@code assertEquals} so that a failure reports the expected and the
 * actual value instead of a bare "assertion failed".
 *
 * @throws IOException if tokenizing fails
 */
public void test_basicStringTokenizerMethods() throws IOException {
    String str = "Testing 12345 \n alpha \r\n omega";
    String strb = "-3.8 'BLIND mice' \r sEe /* how */ they run";
    StringReader aa = new StringReader(str);
    StringReader ba = new StringReader(strb);
    StreamTokenizer a = new StreamTokenizer(aa);
    StreamTokenizer b = new StreamTokenizer(ba);

    Assert.assertEquals(1, a.lineno());
    Assert.assertEquals(StreamTokenizer.TT_WORD, a.nextToken());
    Assert.assertEquals("Token[Testing], line 1", a.toString());
    Assert.assertEquals(StreamTokenizer.TT_NUMBER, a.nextToken());
    Assert.assertEquals("Token[n=12345.0], line 1", a.toString());
    Assert.assertEquals(StreamTokenizer.TT_WORD, a.nextToken());
    Assert.assertEquals("Token[alpha], line 2", a.toString());
    Assert.assertEquals(StreamTokenizer.TT_WORD, a.nextToken());
    Assert.assertEquals("Token[omega], line 3", a.toString());
    Assert.assertEquals(StreamTokenizer.TT_EOF, a.nextToken());
    Assert.assertEquals("Token[EOF], line 3", a.toString());

    b.commentChar('u');
    b.eolIsSignificant(true);
    b.lowerCaseMode(true);
    b.ordinaryChar('y');
    b.slashStarComments(true);

    Assert.assertEquals(StreamTokenizer.TT_NUMBER, b.nextToken());
    // Exact comparison, as before: the value is parsed from the literal "-3.8".
    Assert.assertEquals(-3.8, b.nval, 0.0);
    Assert.assertEquals("Token[n=-3.8], line 1", b.toString());
    Assert.assertEquals(39, b.nextToken()); // '
    Assert.assertEquals("Token[BLIND mice], line 1", b.toString());
    Assert.assertEquals(10, b.nextToken()); // \n
    Assert.assertEquals("Token[EOL], line 2", b.toString());
    Assert.assertEquals(StreamTokenizer.TT_WORD, b.nextToken());
    Assert.assertEquals("Token[see], line 2", b.toString());
    Assert.assertEquals(StreamTokenizer.TT_WORD, b.nextToken());
    Assert.assertEquals("Token[the], line 2", b.toString());
    Assert.assertEquals(121, b.nextToken()); // y
    Assert.assertEquals("Token['y'], line 2", b.toString());
    Assert.assertEquals(StreamTokenizer.TT_WORD, b.nextToken());
    Assert.assertEquals("Token[r], line 2", b.toString());
    Assert.assertEquals(StreamTokenizer.TT_EOF, b.nextToken());
    Assert.assertEquals("Token[EOF], line 2", b.toString());
}
 
Example 18
Source File: UnparsedTag.java    From hlsparserj with Apache License 2.0 4 votes vote down vote up
/**
 * Parses the tag line into the tag name and its attributes.
 * <p>
 * Attributes of the form {@code NAME=VALUE} are stored under their name.
 * A value without a name is stored under {@code "NONAME" + position}, where
 * the position counts the unnamed slots from zero. An empty slot (for example
 * after a trailing comma) still consumes a position but is not stored, so no
 * {@code null} values end up in the attribute map.
 *
 * @param line playlist line item
 */
private void parseTagLine(final String line) {
    final Matcher lineMatcher = TAGPATTERN.matcher(line);

    if (!lineMatcher.find()) {
        // The pattern did not match: treat the line as a tag without
        // attributes and drop its first character.
        tagName = line.substring(1);
        return;
    }

    tagName = lineMatcher.group(1);
    final String attributeList = lineMatcher.group(2);

    // Everything from the space upwards is a word character; only the double
    // quote, ',' and '=' have a special meaning.
    final StreamTokenizer tokenizer = new StreamTokenizer(new StringReader(attributeList));
    tokenizer.resetSyntax();
    tokenizer.wordChars(' ', 255);
    tokenizer.quoteChar('"');
    tokenizer.ordinaryChar(',');
    tokenizer.ordinaryChar('=');

    String attributeName = null;
    String attributeValue = null;
    int noNameCount = 0;
    do {
        int ttype;
        try {
            ttype = tokenizer.nextToken();
        } catch (IOException e) {
            // Should never get here because reading from String
            throw new IllegalStateException(e);
        }

        if (ttype == ',' || ttype == StreamTokenizer.TT_EOF) {
            // End of one attribute slot
            if (attributeValue != null) {
                attributes.put(attributeName, attributeValue);
            } else {
                // Not actually an attribute - just a single value. Skip the
                // put for an empty slot instead of storing a null value, but
                // keep counting so later positions are unchanged.
                if (attributeName != null) {
                    attributes.put("NONAME" + noNameCount, attributeName);
                }
                noNameCount++;
            }
            attributeName = null;
            attributeValue = null;
        } else if (ttype == StreamTokenizer.TT_WORD || ttype == '"') {
            // The first word of a slot is the name (or the lone value),
            // a following one is the value.
            if (attributeName == null) {
                attributeName = tokenizer.sval;
            } else {
                attributeValue = tokenizer.sval;
            }
        }
    } while (tokenizer.ttype != StreamTokenizer.TT_EOF);

    // Set the URI if a URI attribute is present
    if (attributes.containsKey(URI_ATTR)) {
        uri = attributes.get(URI_ATTR);
    }
}
 
Example 19
Source File: OldAndroidStreamTokenizerTest.java    From j2objc with Apache License 2.0 4 votes vote down vote up
/**
 * Steps a default tokenizer and a customized tokenizer through fixed inputs,
 * checking the token type and {@code toString()} output after each step, and
 * then runs two regression checks on the stream-based constructor.
 *
 * @throws Exception if tokenizing fails
 */
public void testStreamTokenizer() throws Exception {
    final String plainText = "Testing 12345 \n alpha \r\n omega";
    final String tunedText = "-3.8 'BLIND mice' \r sEe /* how */ they run";
    final StreamTokenizer plain = new StreamTokenizer(new StringReader(plainText));
    final StreamTokenizer tuned = new StreamTokenizer(new StringReader(tunedText));

    // Default syntax table.
    assertEquals(1, plain.lineno());
    assertEquals(StreamTokenizer.TT_WORD, plain.nextToken());
    assertEquals("Token[Testing], line 1", plain.toString());
    assertEquals(StreamTokenizer.TT_NUMBER, plain.nextToken());
    assertEquals("Token[n=12345.0], line 1", plain.toString());
    assertEquals(StreamTokenizer.TT_WORD, plain.nextToken());
    assertEquals("Token[alpha], line 2", plain.toString());
    assertEquals(StreamTokenizer.TT_WORD, plain.nextToken());
    assertEquals("Token[omega], line 3", plain.toString());
    assertEquals(StreamTokenizer.TT_EOF, plain.nextToken());
    assertEquals("Token[EOF], line 3", plain.toString());

    // Customized table: 'u' starts a comment, 'y' is a token of its own.
    tuned.slashStarComments(true);
    tuned.ordinaryChar('y');
    tuned.lowerCaseMode(true);
    tuned.eolIsSignificant(true);
    tuned.commentChar('u');

    assertEquals(StreamTokenizer.TT_NUMBER, tuned.nextToken());
    assertEquals(-3.8, tuned.nval);
    assertEquals("Token[n=-3.8], line 1", tuned.toString());
    assertEquals(39, tuned.nextToken()); // '
    assertEquals("Token[BLIND mice], line 1", tuned.toString());
    assertEquals(10, tuned.nextToken()); // \n
    assertEquals("Token[EOL], line 2", tuned.toString());
    assertEquals(StreamTokenizer.TT_WORD, tuned.nextToken());
    assertEquals("Token[see], line 2", tuned.toString());
    assertEquals(StreamTokenizer.TT_WORD, tuned.nextToken());
    assertEquals("Token[the], line 2", tuned.toString());
    assertEquals(121, tuned.nextToken()); // y
    assertEquals("Token['y'], line 2", tuned.toString());
    assertEquals(StreamTokenizer.TT_WORD, tuned.nextToken());
    assertEquals("Token[r], line 2", tuned.toString());
    assertEquals(StreamTokenizer.TT_EOF, tuned.nextToken());
    assertEquals("Token[EOF], line 2", tuned.toString());

    // A harmony regression test: a lone '-' read from a byte stream
    final byte[] minusOnly = {(byte) '-'};
    final StreamTokenizer minusTokenizer =
            new StreamTokenizer(new ByteArrayInputStream(minusOnly));
    minusTokenizer.nextToken();
    assertEquals("Token['-'], line 1", minusTokenizer.toString());

    // another harmony regression test: a quoted word read from a byte stream
    final byte[] quotedHello = {
        (byte) '"', (byte) 'H', (byte) 'e', (byte) 'l', (byte) 'l', (byte) 'o', (byte) '"'
    };
    final StreamTokenizer quotedTokenizer =
            new StreamTokenizer(new ByteArrayInputStream(quotedHello));
    quotedTokenizer.nextToken();
    assertEquals("Token[Hello], line 1", quotedTokenizer.toString());
}
 
Example 20
Source File: URLRespectsRobots.java    From BUbiNG with Apache License 2.0 4 votes vote down vote up
/** Parses the argument as if it were the content of a <code>robots.txt</code> file,
 * and returns a sorted array of prefixes of URLs that the agent should not follow.
 *
 * <p>Rules from sections addressed to <code>userAgent</code> are used when at least one such
 * section was seen; otherwise the rules from <code>*</code> sections are used, if any.
 * A <code>Disallow:</code> line without a path clears the rules collected so far for the
 * current section, whether the line is terminated by a newline or by the end of the input.
 *
 * @param content the content of the  <code>robots.txt</code> file.
 * @param userAgent the string representing the user agent of interest.
 * @return an array of character arrays, which are prefixes of the URLs not to follow, in sorted order.
 * @throws IOException if reading from <code>content</code> fails.
 */
public static char[][] parseRobotsReader(final Reader content, final String userAgent) throws IOException {
	/* The set of disallowed paths specifically aimed at userAgent. */
	Set<String> set = new ObjectOpenHashSet<>();
	/* The set of disallowed paths specifically aimed at *. */
	Set<String> setStar = new ObjectOpenHashSet<>();
	/* True if the currently examined record is targetted to us. */
	boolean doesMatter = false;
	/* True if we have seen a section targetted to our agent. */
	boolean specific = false;
	/* True if we have seen a section targetted to *. */
	boolean generic = false;
	/* True if we are in a star section. */
	boolean starSection = false;

	StreamTokenizer st = new StreamTokenizer(new FastBufferedReader(content));
	int token;

	st.resetSyntax();
	st.eolIsSignificant(true); // We need EOLs to separate records
	st.wordChars(33, 255); // All characters may appear
	st.whitespaceChars(0, 32);
	st.ordinaryChar('#'); // We must manually simulate comments 8^(
	st.lowerCaseMode(false);

	while (true) {
		int lineFirstToken = st.nextToken();
		if (lineFirstToken == StreamTokenizer.TT_EOF) break;

		switch (lineFirstToken) {
			// Blank line: a new block is starting
			case StreamTokenizer.TT_EOL:
				doesMatter = false;
				break;

			// Comment or number: ignore until the end of line
			case StreamTokenizer.TT_NUMBER:
			case '#':
				do {
					token = st.nextToken();
				} while (token != StreamTokenizer.TT_EOL && token != StreamTokenizer.TT_EOF);
				break;

			// A string
			case StreamTokenizer.TT_WORD:
				if (st.sval.equalsIgnoreCase("user-agent:")) {
					token = st.nextToken();
					if (token == StreamTokenizer.TT_WORD) {
						if (StringUtils.startsWithIgnoreCase(userAgent, st.sval)) {
							doesMatter = true;
							specific = true;
							starSection = false;
						} else if (st.sval.equals("*")) {
							starSection = true;
							generic = true;
						} else {
							starSection = false;
						}
					}
					// Ignore the rest of the line
					while (token != StreamTokenizer.TT_EOL && token != StreamTokenizer.TT_EOF)
						token = st.nextToken();
				} else if (st.sval.equalsIgnoreCase("disallow:")) {
					token = st.nextToken();
					if (token == StreamTokenizer.TT_EOL || token == StreamTokenizer.TT_EOF) {
						// Empty disallow: everything is allowed again. The end of the input
						// counts as well, so that a last line without a trailing newline
						// is not silently ignored.
						if (doesMatter) set.clear();
						else if (starSection) setStar.clear();
					} else if (token == StreamTokenizer.TT_WORD) {
						String disallowed = st.sval;
						if (disallowed.endsWith("*")) disallowed = disallowed.substring(0, disallowed.length()-1); // Someone (erroneously) uses * to denote any suffix
						if (doesMatter) set.add(disallowed);
						else if (starSection) setStar.add(disallowed);
					}
					// Ignore the rest of the line
					while (token != StreamTokenizer.TT_EOL && token != StreamTokenizer.TT_EOF)
						token = st.nextToken();
				} else if (LOGGER.isTraceEnabled()) LOGGER.trace("Line first token {} ununderstandable in robots.txt", st.sval);
				break;

			// Something else: a syntax error
			default:
				if (LOGGER.isTraceEnabled()) LOGGER.trace("Found unknown token type {} in robots.txt", Integer.valueOf(lineFirstToken));
		}
	}

	// No specific instruction, but some generic ones
	if (! specific && generic) return toSortedPrefixFreeCharArrays(setStar);
	// Either some instructions specific to us, or nothing applicable at all
	return toSortedPrefixFreeCharArrays(set);
}