Java Code Examples for org.apache.lucene.util.IOUtils#getDecodingReader()

The following examples show how to use org.apache.lucene.util.IOUtils#getDecodingReader(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.

Example 1

Source File: TokenSearch.java From datawave with Apache License 2.0

6 votes

/**
 * Load stopwords from the specified file located in the classpath.
 * <p>
 * If the name contains a <code>/</code>, e.g. <code>tmp/stopwords.txt</code>, it is used unchanged as the resource name. Otherwise the name is
 * resolved relative to the package of the {@code Factory} class.
 * <p>
 * The current thread's context classloader is used to load the resulting resource name.
 * 
 * @param filename
 *            the filename containing the stoplist to load, located using the rules described above.
 * @return a lucene {@code CharArraySet} containing the stopwords. This is configured to be case insensitive.
 * @throws IOException
 *             if there is a problem finding or loading the specified stop word file.
 */
public static CharArraySet loadStopWords(String filename) throws IOException {
    String pkg = Factory.class.getPackage().getName().replace('.', '/');
    String resource = filename.indexOf("/") > -1 ? filename : (pkg + "/" + filename);
    InputStream resourceStream = Thread.currentThread().getContextClassLoader().getResourceAsStream(resource);
    logger.info("Loading stopwords file " + filename + " from resource " + resource);
    if (resourceStream == null) {
        throw new FileNotFoundException("Unable to load stopword file as resource " + filename);
    }
    // try-with-resources closes the reader (and the stream it wraps) on every path, and keeps a
    // failure in close() from hiding an exception thrown while reading the word list.
    try (Reader reader = IOUtils.getDecodingReader(resourceStream, StandardCharsets.UTF_8)) {
        CharArraySet stopSet = new CharArraySet(16, true /* ignore case */);
        // lines starting with "#" are passed to the loader as the comment marker
        CharArraySet set = WordlistLoader.getWordSet(reader, "#", stopSet);
        logger.info("Loaded " + set.size() + " stopwords from " + filename + " (" + resource + ")");
        return set;
    }
}

Example 2

Source File: ICUTokenizerFactory.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Reads a break-iterator rule file as UTF-8 and compiles it into a {@link RuleBasedBreakIterator}.
 * <p>
 * Lines starting with {@code #} are dropped, but a newline is still emitted in their place, so the
 * rule text handed to the iterator has as many lines as the file.
 *
 * @param filename name of the rule resource to open through {@code loader}
 * @param loader   resource loader used to open {@code filename}
 * @return a break iterator built from the rule text
 * @throws IOException if the resource cannot be opened or read
 */
private BreakIterator parseRules(String filename, ResourceLoader loader) throws IOException {
  StringBuilder rules = new StringBuilder();
  InputStream rulesStream = loader.openResource(filename);
  // try-with-resources: the reader (and the wrapped stream) is released even when readLine() fails
  try (BufferedReader reader = new BufferedReader(
      IOUtils.getDecodingReader(rulesStream, StandardCharsets.UTF_8))) {
    String line;
    while ((line = reader.readLine()) != null) {
      if (!line.startsWith("#")) {
        rules.append(line);
      }
      // appended for comment lines too, keeping the line count unchanged
      rules.append('\n');
    }
  }
  return new RuleBasedBreakIterator(rules.toString());
}

Example 3

Source File: RegexRulesPasswordProvider.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Parses a rule file from a stream and returns a map of all rules found.
 * <p>
 * The stream is decoded as UTF-8 and read line by line. Everything from the first {@code #} on a
 * line is treated as a comment and discarded; lines that are blank after that are skipped. Each
 * remaining line is split at its first {@code =} into a regex key (left) and a password (right),
 * both trimmed. Lines with no {@code =}, with an empty key, or with a key that is not a valid
 * regex are logged as warnings and skipped.
 *
 * @param is input stream for the file; it is closed before this method returns, on success or failure
 * @return the parsed rules, in the order they were added
 * @throws RuntimeException wrapping the {@link IOException} if reading the stream fails
 */
public static LinkedHashMap<Pattern,String> parseRulesFile(InputStream is) {
  LinkedHashMap<Pattern,String> rules = new LinkedHashMap<>();
  // try-with-resources closes the reader and the underlying stream even if reading fails
  try (BufferedReader br = new BufferedReader(IOUtils.getDecodingReader(is, StandardCharsets.UTF_8))) {
    String line;
    int linenum = 0;
    while ((line = br.readLine()) != null) {
      linenum++;
      // Remove comments. indexOf is used rather than split("#") because split returns an empty
      // array for a line made only of '#' characters, which left the comment text in place.
      int hash = line.indexOf('#');
      if (hash >= 0) {
        line = line.substring(0, hash);
      }
      line = line.trim();
      if (line.isEmpty()) {
        continue;
      }
      int sep = line.indexOf('=');
      if (sep <= 0) {
        // no '=' at all, or nothing before it to use as the regex key
        log.warn("Wrong format of password line {}", linenum);
        continue;
      }
      String pass = line.substring(sep + 1).trim();
      String regex = line.substring(0, sep).trim();
      try {
        Pattern pattern = Pattern.compile(regex);
        rules.put(pattern, pass);
      } catch (PatternSyntaxException pse) {
        log.warn("Key of line {} was not a valid regex pattern{}", linenum, pse);
      }
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  return rules;
}

Example 4

Source File: FileBasedQueryMaker.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Builds the query list from the file named by the {@code file.query.maker.file} config entry.
 * <p>
 * The name is first tried as a filesystem path and, if no such file exists, as a classpath
 * resource; either way the content is read as UTF-8. Blank lines and lines starting with
 * {@code #} are ignored. Lines that fail to parse are reported on {@code System.err} and skipped.
 * If the config entry is absent the result is empty.
 */
@Override
protected Query[] prepareQueries() throws Exception {

  Analyzer analyzer = NewAnalyzerTask.createAnalyzer(
      config.get("analyzer", "org.apache.lucene.analysis.standard.StandardAnalyzer"));
  String field = config.get("file.query.maker.default.field", DocMaker.BODY_FIELD);
  QueryParser parser = new QueryParser(field, analyzer);
  parser.setAllowLeadingWildcard(true);

  List<Query> parsed = new ArrayList<>();
  String fileName = config.get("file.query.maker.file", null);
  if (fileName == null) {
    return parsed.toArray(new Query[parsed.size()]);
  }

  // note: we use a decoding reader, so if your queries are screwed up you know
  Path queryFile = Paths.get(fileName);
  Reader source;
  if (Files.exists(queryFile)) {
    source = Files.newBufferedReader(queryFile, StandardCharsets.UTF_8);
  } else {
    // not on disk: fall back to looking it up as a classpath resource
    InputStream resource = FileBasedQueryMaker.class.getClassLoader().getResourceAsStream(fileName);
    source = (resource == null) ? null : IOUtils.getDecodingReader(resource, StandardCharsets.UTF_8);
  }

  if (source == null) {
    System.err.println("No Reader available for: " + fileName);
    return parsed.toArray(new Query[parsed.size()]);
  }

  try {
    BufferedReader lines = new BufferedReader(source);
    int lineNum = 0;
    for (String raw = lines.readLine(); raw != null; raw = lines.readLine()) {
      String text = raw.trim();
      boolean isQuery = !text.isEmpty() && !text.startsWith("#");
      if (isQuery) {
        try {
          parsed.add(parser.parse(text));
        } catch (ParseException e) {
          System.err.println("Exception: " + e.getMessage() + " occurred while parsing line: " + lineNum + " Text: " + text);
        }
      }
      lineNum++;
    }
  } finally {
    source.close();
  }

  return parsed.toArray(new Query[parsed.size()]);
}

Example 5

Source File: StopwordAnalyzerBase.java From lucene-solr with Apache License 2.0

3 votes

/**
 * Creates a CharArraySet from a file resource associated with a class. (See
 * {@link Class#getResourceAsStream(String)}). The resource is read as UTF-8.
 * 
 * @param ignoreCase
 *          <code>true</code> if the set should ignore the case of the
 *          stopwords, otherwise <code>false</code>
 * @param aClass
 *          a class that is associated with the given stopwordResource
 * @param resource
 *          name of the resource file associated with the given class
 * @param comment
 *          comment string to ignore in the stopword file
 * @return a CharArraySet containing the distinct stopwords from the given
 *         file
 * @throws IOException
 *           if the resource cannot be found, or if loading the stopwords
 *           throws an {@link IOException}
 */
protected static CharArraySet loadStopwordSet(final boolean ignoreCase,
    final Class<? extends Analyzer> aClass, final String resource,
    final String comment) throws IOException {
  // Class#getResourceAsStream returns null when the resource does not exist; report that
  // explicitly instead of handing a null stream to the decoder.
  final java.io.InputStream stream = aClass.getResourceAsStream(resource);
  if (stream == null) {
    throw new IOException("Stopword resource '" + resource + "' not found relative to " + aClass.getName());
  }
  // the reader, and the stream it wraps, is closed on every path
  try (Reader reader = IOUtils.getDecodingReader(stream, StandardCharsets.UTF_8)) {
    return WordlistLoader.getWordSet(reader, comment, new CharArraySet(16, ignoreCase));
  }
}

Example 6

Source File: FileDictionary.java From lucene-solr with Apache License 2.0

2 votes

/**
 * Builds a dictionary that reads its entries from the given input stream,
 * remembering <code>fieldDelimiter</code> as the separator between the
 * fields of a line.
 * <p>
 * NOTE: the stream content is decoded as UTF-8
 *
 * @param dictFile       stream supplying the dictionary content
 * @param fieldDelimiter separator between the fields in a line
 */
public FileDictionary(InputStream dictFile, String fieldDelimiter) {
  this.fieldDelimiter = fieldDelimiter;
  this.in = new BufferedReader(
      IOUtils.getDecodingReader(dictFile, StandardCharsets.UTF_8));
}

Example 7

Source File: PlainTextDictionary.java From lucene-solr with Apache License 2.0

2 votes

/**
 * Builds a dictionary that reads its content from the given input stream
 * through a buffered reader.
 * <p>
 * NOTE: the stream content is decoded as UTF-8
 *
 * @param dictFile stream supplying the dictionary content
 */
public PlainTextDictionary(InputStream dictFile) {
  this.in = new BufferedReader(
      IOUtils.getDecodingReader(dictFile, StandardCharsets.UTF_8));
}