Java Code Examples for org.apache.lucene.analysis.WordlistLoader

The following examples show how to use org.apache.lucene.analysis.WordlistLoader. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: datawave   Source File: TokenSearch.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Load stopwords from the specified file located in the classpath.
 * <p>
 * If the name contains a <code>/</code>, e.g: <code>tmp/stopwords.txt</code>, it is used verbatim as the resource path. Otherwise the name is resolved
 * relative to the package containing the {@code Factory} class.
 * <p>
 * The current thread's context classloader will be used to load the specified filename as a resource.
 *
 * @param filename
 *            the filename containing the stoplist to load, located using the rules described above.
 * @return a lucene {@code CharArraySet} containing the stopwords. This is configured to be case insensitive.
 * @throws IOException
 *             if there is a problem finding or loading the specified stop word file.
 */
public static CharArraySet loadStopWords(String filename) throws IOException {
    CharArraySet stopSet = new CharArraySet(16, true /* ignore case */);

    // A bare file name is looked up next to Factory; anything containing '/' is treated as a full resource path.
    String pkg = Factory.class.getPackage().getName().replace('.', '/');
    String resource = filename.contains("/") ? filename : (pkg + "/" + filename);

    InputStream resourceStream = Thread.currentThread().getContextClassLoader().getResourceAsStream(resource);
    logger.info("Loading stopwords file " + filename + " from resource " + resource);
    if (resourceStream == null) {
        throw new FileNotFoundException("Unable to load stopword file as resource " + filename);
    }

    // try-with-resources closes both the reader and the underlying stream, and records any failure from
    // close() as a suppressed exception instead of letting it replace an exception thrown while loading.
    try (InputStream in = resourceStream; Reader reader = IOUtils.getDecodingReader(in, StandardCharsets.UTF_8)) {
        // "#" is handed to WordlistLoader as the comment marker for the word list.
        CharArraySet set = WordlistLoader.getWordSet(reader, "#", stopSet);
        logger.info("Loaded " + set.size() + " stopwords from " + filename + " (" + resource + ")");
        return set;
    }
}
 
Example 2
Source Project: lucene-solr   Source File: TestFilesystemResourceLoader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Writes a one-line file into a fresh temp directory and checks that a
 * {@code FilesystemResourceLoader} rooted there serves it by relative name
 * and by absolute path, then runs the shared delegation / not-found checks.
 */
public void testBaseDir() throws Exception {
  final Path base = createTempDir("fsResourceLoaderBase");
  final Path template = base.resolve("template.txt");

  Writer writer = Files.newBufferedWriter(template, StandardCharsets.UTF_8);
  try {
    writer.write("foobar\n");
  } finally {
    IOUtils.closeWhileHandlingException(writer);
  }

  @SuppressWarnings("deprecation")
  ResourceLoader rl = new FilesystemResourceLoader(base);

  // Lookup relative to the loader's base directory.
  String firstLineRelative =
      WordlistLoader.getLines(rl.openResource("template.txt"), StandardCharsets.UTF_8).get(0);
  assertEquals("foobar", firstLineRelative);

  // Same with full path name:
  String fullPath = template.toAbsolutePath().toString();
  String firstLineAbsolute =
      WordlistLoader.getLines(rl.openResource(fullPath), StandardCharsets.UTF_8).get(0);
  assertEquals("foobar", firstLineAbsolute);

  assertClasspathDelegation(rl);
  assertNotFound(rl);
}
 
Example 3
Source Project: lucene-solr   Source File: TestFilesystemResourceLoader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that the given loader can open a stop word resource by its
 * classpath name and can instantiate a class by name.
 */
private void assertClasspathDelegation(ResourceLoader rl) throws Exception {
  // try a stopwords file from classpath
  InputStreamReader stopwordReader = new InputStreamReader(
      rl.openResource("org/apache/lucene/analysis/snowball/english_stop.txt"),
      StandardCharsets.UTF_8);
  CharArraySet stopwords = WordlistLoader.getSnowballWordSet(stopwordReader);
  assertTrue(stopwords.contains("you"));

  // try to load a class; we use string comparison because classloader may be different...
  final String className = "org.apache.lucene.analysis.util.RollingCharBuffer";
  Object instance = rl.newInstance(className, Object.class);
  assertEquals(className, instance.getClass().getName());
}
 
Example 4
Source Project: lucene-solr   Source File: TestSnowballVocab.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Run all languages against their snowball vocabulary tests.
 * <p>
 * Each line of {@code test_languages.txt} names a data file; the language
 * name is that line with its first character upper-cased, and the archive
 * checked is the line with {@code .zip} appended.
 */
public void testStemmers() throws IOException {
  try (InputStream languageList = getClass().getResourceAsStream("test_languages.txt")) {
    for (String datafile : WordlistLoader.getLines(languageList, StandardCharsets.UTF_8)) {
      char initial = Character.toUpperCase(datafile.charAt(0));
      String language = initial + datafile.substring(1);
      String archive = datafile + ".zip";
      assertCorrectOutput(language, archive);
    }
  }
}
 
Example 5
Source Project: lucene-solr   Source File: SolrResourceLoader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Opens the named resource and returns its lines decoded with the given charset.
 * A {@link CharacterCodingException} raised while reading is rethrown as a
 * {@code SolrException} with error code {@code SERVER_ERROR}, keeping the
 * original exception as the cause.
 */
public List<String> getLines(String resource, Charset charset) throws IOException {
  final List<String> lines;
  try {
    lines = WordlistLoader.getLines(openResource(resource), charset);
  } catch (CharacterCodingException ex) {
    String message = "Error loading resource (wrong encoding?): " + resource;
    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, message, ex);
  }
  return lines;
}
 
Example 6
Source Project: lucene-solr   Source File: SmartChineseAnalyzer.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Loads the word set stored in {@code DEFAULT_STOPWORD_FILE} (read as UTF-8,
 * resolved against {@code SmartChineseAnalyzer.class}) and returns it wrapped
 * as an unmodifiable set.
 */
static CharArraySet loadDefaultStopWordSet() throws IOException {
  CharArraySet loaded = WordlistLoader.getWordSet(
      IOUtils.getDecodingReader(
          SmartChineseAnalyzer.class, DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8),
      STOPWORD_FILE_COMMENT);
  // make sure it is unmodifiable as we expose it in the outer class
  return CharArraySet.unmodifiableSet(loaded);
}
 
Example 7
Source Project: lucene-solr   Source File: AbstractAnalysisFactory.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Opens {@code resource} through the given loader and returns its lines,
 * decoding the content as UTF-8.
 */
protected final List<String> getLines(ResourceLoader loader, String resource) throws IOException {
  final List<String> lines =
      WordlistLoader.getLines(loader.openResource(resource), StandardCharsets.UTF_8);
  return lines;
}
 
Example 8
Source Project: lucene-solr   Source File: TestFilesystemResourceLoader.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Builds a {@code FilesystemResourceLoader} over an empty temp directory with a
 * {@code StringMockResourceLoader} as second argument, and checks that opening
 * {@code template.txt} yields the mock's content.
 */
public void testDelegation() throws Exception {
  final Path emptyDir = createTempDir("empty");
  final ResourceLoader fallback = new StringMockResourceLoader("foobar\n");
  ResourceLoader rl = new FilesystemResourceLoader(emptyDir, fallback);

  String firstLine =
      WordlistLoader.getLines(rl.openResource("template.txt"), StandardCharsets.UTF_8).get(0);
  assertEquals("foobar", firstLine);
}
 
Example 9
/** Returns the lines of {@code resource}, opened via {@code loader} and decoded as UTF-8. */
private List<String> getLines(ResourceLoader loader, String resource) throws IOException {
    final List<String> lines =
            WordlistLoader.getLines(loader.openResource(resource), StandardCharsets.UTF_8);
    return lines;
}
 
Example 10
/**
 * Builds an analyzer with the stop words from the given file.
 *
 * @param matchVersion Lucene version to match; see <a href="#version">above</a>
 * @param stopwords File to read stop words from
 * @throws IOException presumably raised by {@link WordlistLoader#getWordSet(File)}
 *         when the file cannot be read (confirm against that method)
 * @see WordlistLoader#getWordSet(File)
 */
public ClassicAnalyzer(final Version matchVersion, final File stopwords) throws IOException {
  // Delegates to another constructor of this class, passing the result of WordlistLoader.getWordSet.
  this(matchVersion, WordlistLoader.getWordSet(stopwords));
}
 
Example 11
/**
 * Builds an analyzer with the stop words from the given reader.
 *
 * @param matchVersion Lucene version to match; see <a href="#version">above</a>
 * @param stopwords Reader to read stop words from
 * @throws IOException presumably raised by {@link WordlistLoader#getWordSet(Reader)}
 *         when reading fails (confirm against that method)
 * @see WordlistLoader#getWordSet(Reader)
 */
public ClassicAnalyzer(final Version matchVersion, final Reader stopwords) throws IOException {
  // Delegates to another constructor of this class, passing the result of WordlistLoader.getWordSet.
  this(matchVersion, WordlistLoader.getWordSet(stopwords));
}
 
Example 12
/**
 * Builds an analyzer with the stop words from the given file.
 *
 * @param matchVersion Lucene version to match; see <a href="#version">above</a>
 * @param stopwords File to read stop words from
 * @throws IOException presumably raised by {@link WordlistLoader#getWordSet(File)}
 *         when the file cannot be read (confirm against that method)
 * @see WordlistLoader#getWordSet(File)
 */
public StandardAnalyzer(final Version matchVersion, final File stopwords) throws IOException {
  // Delegates to another constructor of this class, passing the result of WordlistLoader.getWordSet.
  this(matchVersion, WordlistLoader.getWordSet(stopwords));
}
 
Example 13
/**
 * Builds an analyzer with the stop words from the given reader.
 *
 * @param matchVersion Lucene version to match; see <a href="#version">above</a>
 * @param stopwords Reader to read stop words from
 * @throws IOException presumably raised by {@link WordlistLoader#getWordSet(Reader)}
 *         when reading fails (confirm against that method)
 * @see WordlistLoader#getWordSet(Reader)
 */
public StandardAnalyzer(final Version matchVersion, final Reader stopwords) throws IOException {
  // Delegates to another constructor of this class, passing the result of WordlistLoader.getWordSet.
  this(matchVersion, WordlistLoader.getWordSet(stopwords));
}