Java Code Examples for org.apache.lucene.analysis.synonym.SynonymMap#Builder

The following examples show how to use org.apache.lucene.analysis.synonym.SynonymMap#Builder. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TestConcatenateGraphFilter.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
@Test
public void testSeparatorWithSynonyms() throws IOException {
  // Map "mykeyword" to one single-token and one multi-word synonym, keeping the original token.
  SynonymMap.Builder synonyms = new SynonymMap.Builder(true);
  synonyms.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);
  synonyms.add(new CharsRef("mykeyword"), new CharsRef("three words synonym"), true);

  Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  source.setReader(new StringReader(" mykeyword another keyword   "));

  SynonymGraphFilter graph = new SynonymGraphFilter(source, synonyms.build(), true);
  // Concatenate every path through the synonym graph, joining tokens with '-'.
  ConcatenateGraphFilter concatenated = new ConcatenateGraphFilter(graph, '-', false, 100);

  assertTokenStreamContents(concatenated, new String[] {
      "mykeyword-another-keyword",
      "mysynonym-another-keyword",
      "three words synonym-another-keyword"
  }, null, null, new int[] { 1, 0 ,0});
}
 
Example 2
Source File: TestRemoveDuplicatesTokenFilter.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  final int iterations = atLeast(3);
  for (int iter = 0; iter < iterations; iter++) {
    // Build a fresh random synonym map each iteration; dedup flag is randomized.
    SynonymMap.Builder builder = new SynonymMap.Builder(random().nextBoolean());
    final int entryCount = atLeast(10);
    for (int entry = 0; entry < entryCount; entry++) {
      add(builder, randomNonEmptyString(), randomNonEmptyString(), random().nextBoolean());
    }
    final SynonymMap synonyms = builder.build();
    final boolean ignoreCase = random().nextBoolean();

    final Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
        TokenStream stream = new SynonymGraphFilter(tokenizer, synonyms, ignoreCase);
        return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(stream));
      }
    };

    checkRandomData(random(), analyzer, 200);
    analyzer.close();
  }
}
 
Example 3
Source File: TestLimitTokenPositionFilter.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
public void testMaxPosition3WithSynomyms() throws IOException {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    MockTokenizer tokenizer = whitespaceMockTokenizer("one two three four five");
    // if we are consuming all tokens, we can use the checks, otherwise we can't
    tokenizer.setEnableChecks(consumeAll);

    // "one" gets three single-token synonyms plus one four-word synonym;
    // "two" gets a two-word synonym. Originals are kept (keepOrig = true).
    SynonymMap.Builder synonymBuilder = new SynonymMap.Builder(true);
    synonymBuilder.add(new CharsRef("one"), new CharsRef("first"), true);
    synonymBuilder.add(new CharsRef("one"), new CharsRef("alpha"), true);
    synonymBuilder.add(new CharsRef("one"), new CharsRef("beguine"), true);
    CharsRefBuilder scratch = new CharsRefBuilder();
    SynonymMap.Builder.join(new String[]{"and", "indubitably", "single", "only"}, scratch);
    synonymBuilder.add(new CharsRef("one"), scratch.get(), true);
    SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, scratch);
    synonymBuilder.add(new CharsRef("two"), scratch.get(), true);
    SynonymMap synonymMap = synonymBuilder.build();
    @SuppressWarnings("deprecation")
    TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
    stream = new LimitTokenPositionFilter(stream, 3, consumeAll);

    // "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
    assertTokenStreamContents(stream,
        new String[]{"one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger"},
        new int[]{1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0});
  }
}
 
Example 4
Source File: TestConcatenateGraphFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Test
public void testWithSynonym() throws Exception {
  // Single mapping: "mykeyword" -> "mysynonym", original kept.
  SynonymMap.Builder synonyms = new SynonymMap.Builder(true);
  synonyms.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);

  Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  source.setReader(new StringReader("mykeyword"));

  @SuppressWarnings("deprecation")
  SynonymFilter synonymFilter = new SynonymFilter(source, synonyms.build(), true);
  ConcatenateGraphFilter concatenated = new ConcatenateGraphFilter(synonymFilter);

  // Both the original token and its synonym come out, at the same position.
  assertTokenStreamContents(concatenated, new String[] {"mykeyword", "mysynonym"}, null, null, new int[] { 1, 0 });
}
 
Example 5
Source File: TestConcatenateGraphFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Test
public void testWithSynonyms() throws Exception {
  // "mykeyword" -> "mysynonym", original kept, so the graph has two paths.
  SynonymMap.Builder synonyms = new SynonymMap.Builder(true);
  synonyms.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);

  Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  source.setReader(new StringReader("mykeyword another keyword"));

  @SuppressWarnings("deprecation")
  SynonymFilter synonymFilter = new SynonymFilter(source, synonyms.build(), true);
  ConcatenateGraphFilter concatenated = new ConcatenateGraphFilter(synonymFilter, SEP_LABEL, false, 100);

  // Expected outputs differ only in the leading token: the original vs. its synonym.
  String[] leadingTokens = { "mykeyword", "mysynonym" };
  String[] expectedOutputs = new String[leadingTokens.length];
  CharsRefBuilder scratch = new CharsRefBuilder();
  for (int i = 0; i < leadingTokens.length; i++) {
    scratch.clear();
    scratch.append(leadingTokens[i]);
    scratch.append(SEP_LABEL);
    scratch.append("another");
    scratch.append(SEP_LABEL);
    scratch.append("keyword");
    expectedOutputs[i] = scratch.toCharsRef().toString();
  }
  assertTokenStreamContents(concatenated, expectedOutputs, null, null, new int[]{1, 0});
}
 
Example 6
Source File: TestConcatenateGraphFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Test
public void testValidNumberOfExpansions() throws IOException {
  // Register 256 single-token synonyms: "1" -> "1001", "2" -> "1002", ..., "256" -> "1256".
  SynonymMap.Builder builder = new SynonymMap.Builder(true);
  for (int i = 1; i <= 256; i++) {
    builder.add(new CharsRef("" + i), new CharsRef("" + (1000 + i)), true);
  }
  // Input of 8 whitespace-separated tokens "1 2 ... 8 "; each token has one synonym,
  // so the graph expands to 2^8 = 256 distinct paths.
  StringBuilder valueBuilder = new StringBuilder();
  for (int token = 1; token <= 8; token++) {
    valueBuilder.append(token);
    valueBuilder.append(" ");
  }
  MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
  tokenizer.setReader(new StringReader(valueBuilder.toString()));
  @SuppressWarnings("deprecation")
  SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);

  int count;
  try (ConcatenateGraphFilter stream = new ConcatenateGraphFilter(filter)) {
    stream.reset();
    ConcatenateGraphFilter.BytesRefBuilderTermAttribute attr = stream.addAttribute(ConcatenateGraphFilter.BytesRefBuilderTermAttribute.class);
    count = 0;
    while (stream.incrementToken()) {
      count++;
      // Every emitted path must carry a non-empty term payload.
      assertNotNull(attr.getBytesRef());
      assertTrue(attr.getBytesRef().length > 0);
    }
  }
  // Fix: expected value comes first in assertEquals(expected, actual).
  assertEquals(256, count);
}
 
Example 7
Source File: TestRandomChains.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override public Object apply(Random random) {
  // Populate a builder with random synonym pairs; dedup and keepOrig flags are randomized.
  SynonymMap.Builder builder = new SynonymMap.Builder(random.nextBoolean());
  final int entries = atLeast(10);
  for (int i = 0; i < entries; i++) {
    addSyn(builder, randomNonEmptyString(random), randomNonEmptyString(random), random.nextBoolean());
  }
  try {
    return builder.build();
  } catch (Exception ex) {
    Rethrow.rethrow(ex);
    return null; // unreachable code
  }
}
 
Example 8
Source File: QueryAutoFilteringComponent.java    From query-autofiltering-component with Apache License 2.0 5 votes vote down vote up
private void addTerms( NamedList<NamedList<Number>> terms, SynonymMap.Builder fieldBuilder, SynonymMap.Builder termBuilder, ArrayList<String> searchFields ) throws IOException {
  // Wrap the raw shard response so terms can be read per field.
  TermsResponse response = new TermsResponse( terms );
  for (String field : searchFields) {
    List<TermsResponse.Term> fieldTerms = response.getTerms( field );
    if (fieldTerms == null) {
      continue;  // this shard returned no terms for the field
    }
    CharsRef fieldChars = new CharsRef( field );
    for (TermsResponse.Term t : fieldTerms) {
      String value = t.getTerm();
      Log.debug( "Add distributed term: " + field + " = " + value );
      addTerm( fieldChars, value, fieldBuilder, termBuilder );
    }
  }
}
 
Example 9
Source File: QueryAutoFilteringComponent.java    From query-autofiltering-component with Apache License 2.0 5 votes vote down vote up
private void buildFieldMap( ResponseBuilder rb ) throws IOException {
  Log.debug( "buildFieldMap" );
  SolrIndexSearcher searcher = rb.req.getSearcher();
  // build a synonym map from the SortedDocValues -
  // for each field value: lower case, stemmed, lookup synonyms from synonyms.txt - map to fieldValue
  SynonymMap.Builder fieldBuilder = new SynonymMap.Builder( true );
  SynonymMap.Builder termBuilder = new SynonymMap.Builder( true );

  ArrayList<String> searchFields = getStringFields( searcher );

  for (String searchField : searchFields ) {
    Log.debug( "adding searchField " + searchField );
    CharsRef fieldChars = new CharsRef( searchField );
    SortedSetDocValues docValues = FieldCache.DEFAULT.getDocTermOrds( searcher.getAtomicReader( ), searchField );
    if (docValues == null) continue;
    Log.debug( "got SortedSetDocValues for " + searchField );
    // Walk every indexed term of this field and register it with both builders.
    TermsEnum termsEnum = docValues.termsEnum();
    while (termsEnum.next() != null) {
      String fieldValue = termsEnum.term().utf8ToString( );
      addTerm ( fieldChars, fieldValue, fieldBuilder, termBuilder );
    }
  }

  // Merge in terms from the other shards before freezing the maps.
  addDistributedTerms( rb, fieldBuilder, termBuilder, searchFields );

  fieldMap = fieldBuilder.build( );
  termMap = termBuilder.build( );
}
 
Example 10
Source File: QueryAutoFilteringComponent.java    From query-autofiltering-component with Apache License 2.0 5 votes vote down vote up
private void addTerms( NamedList<NamedList<Number>> terms, SynonymMap.Builder fieldBuilder, SynonymMap.Builder termBuilder, ArrayList<String> searchFields ) throws IOException {
  // Feed every term reported by a shard into the field and term synonym builders.
  TermsResponse termsResponse = new TermsResponse( terms );
  for (String fieldName : searchFields ) {
    List<TermsResponse.Term> termList = termsResponse.getTerms( fieldName );
    if (termList != null) {
      CharsRef fieldChars = new CharsRef( fieldName );
      for (TermsResponse.Term termEntry : termList) {
        String termText = termEntry.getTerm();
        Log.debug( "Add distributed term: " + fieldName + " = " + termText );
        addTerm( fieldChars, termText, fieldBuilder, termBuilder );
      }
    }
  }
}
 
Example 11
Source File: SynonymTokenFilterFactory.java    From crate with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a {@link SynonymMap} by parsing {@code rules} with the parser matching
 * {@code format}: "wordnet" (case-insensitive) selects the WordNet parser,
 * anything else falls back to the Solr synonym format.
 *
 * @param analyzer analyzer used to normalize the synonym entries
 * @param rules    reader over the raw synonym rules
 * @return the built synonym map
 * @throws IllegalArgumentException if parsing or building fails (cause preserved)
 */
protected SynonymMap buildSynonyms(Analyzer analyzer, Reader rules) {
    try {
        // Use the concrete parser type in each branch so no downcast is needed
        // to reach parse(); both parsers extend SynonymMap.Builder.
        SynonymMap.Builder parser;
        if ("wordnet".equalsIgnoreCase(format)) {
            ESWordnetSynonymParser wordnetParser = new ESWordnetSynonymParser(true, expand, lenient, analyzer);
            wordnetParser.parse(rules);
            parser = wordnetParser;
        } else {
            ESSolrSynonymParser solrParser = new ESSolrSynonymParser(true, expand, lenient, analyzer);
            solrParser.parse(rules);
            parser = solrParser;
        }
        return parser.build();
    } catch (Exception e) {
        throw new IllegalArgumentException("failed to build synonyms", e);
    }
}
 
Example 12
Source File: TestRemoveDuplicatesTokenFilter.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
private void add(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
  // SynonymMap separates multi-word entries with NUL; collapse runs of spaces into it.
  String inputEntry = input.replaceAll(" +", "\u0000");
  String outputEntry = output.replaceAll(" +", "\u0000");
  b.add(new CharsRef(inputEntry), new CharsRef(outputEntry), keepOrig);
}
 
Example 13
Source File: TestRandomChains.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
private void addSyn(SynonymMap.Builder b, String input, String output, boolean keepOrig) {
  // Replace space runs with NUL, the word separator SynonymMap expects internally.
  CharsRef in = new CharsRef(input.replaceAll(" +", "\u0000"));
  CharsRef out = new CharsRef(output.replaceAll(" +", "\u0000"));
  b.add(in, out, keepOrig);
}
 
Example 14
Source File: QueryAutoFilteringComponent.java    From query-autofiltering-component with Apache License 2.0 4 votes vote down vote up
/**
 * In a distributed setup, issues a /terms request to every shard and feeds the
 * returned terms into the field and term synonym builders via addTerms().
 * No-op when the request is not distributed.
 *
 * @param rb           the current response builder (supplies searcher and shard list)
 * @param fieldBuilder builder collecting field-name synonyms
 * @param termBuilder  builder collecting term synonyms
 * @param searchFields fields whose terms should be requested from each shard
 */
private void addDistributedTerms( ResponseBuilder rb, SynonymMap.Builder fieldBuilder, SynonymMap.Builder termBuilder, ArrayList<String> searchFields ) throws IOException {
  SolrIndexSearcher searcher = rb.req.getSearcher();
  CoreContainer container = searcher.getCore().getCoreDescriptor().getCoreContainer();
    
  ShardHandlerFactory shardHandlerFactory = container.getShardHandlerFactory( );
  ShardHandler shardHandler = shardHandlerFactory.getShardHandler();
  shardHandler.checkDistributed( rb );
    
  Log.debug( "Is Distributed = " + rb.isDistrib );
    
  if( rb.isDistrib ) {
    // create a ShardRequest that contains a Terms Request.
    // don't send to this shard???
    ShardRequest sreq = new ShardRequest();
    sreq.purpose = ShardRequest.PURPOSE_GET_TERMS;
    sreq.actualShards = rb.shards;
    ModifiableSolrParams params = new ModifiableSolrParams( );
      
    params.set( TermsParams.TERMS_LIMIT, -1);  // -1 = no limit: fetch all terms
    params.set( TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_INDEX);
    String[] fields = searchFields.toArray( new String[ searchFields.size( )] );
    params.set( TermsParams.TERMS_FIELD, fields );
      
    // Mark the sub-request as non-distributed so shards don't fan out again.
    params.set( CommonParams.DISTRIB, "false" );
    params.set( ShardParams.IS_SHARD, true );
    params.set( ShardParams.SHARDS_PURPOSE, sreq.purpose );
    params.set( CommonParams.QT, termsHandler );
    params.set( TermsParams.TERMS, "true" );
      
    if (rb.requestInfo != null) {
      params.set("NOW", Long.toString(rb.requestInfo.getNOW().getTime()));
    }
    sreq.params = params;
      
    for (String shard : rb.shards ) {
      Log.debug( "sending request to shard " + shard );
      params.set(ShardParams.SHARD_URL, shard );
      shardHandler.submit( sreq, shard, params );
    }
      
    ShardResponse rsp = shardHandler.takeCompletedIncludingErrors( );
    if (rsp != null) {
      Log.debug( "got " + rsp.getShardRequest().responses.size( ) + " responses" );
      for ( ShardResponse srsp : rsp.getShardRequest().responses ) {
        Log.debug( "Got terms response from " + srsp.getShard( ));
      
        if (srsp.getException() != null) {
          // Fix: log shard failures at warn, not debug - otherwise the error is
          // invisible at default log levels (matches the sibling implementation).
          Log.warn( "ShardResponse Exception!! " + srsp.getException( ) );
        }
      
        @SuppressWarnings("unchecked")
        NamedList<NamedList<Number>> terms = (NamedList<NamedList<Number>>) srsp.getSolrResponse().getResponse().get("terms");
        if (terms != null) {
          addTerms( terms, fieldBuilder, termBuilder, searchFields );
        }
        else {
          Log.warn( "terms was NULL! - make sure that /terms request handler is defined in solrconfig.xml" );
        }
      }
    }
  }
}
 
Example 15
Source File: QueryAutoFilteringComponent.java    From query-autofiltering-component with Apache License 2.0 4 votes vote down vote up
/**
 * In a distributed setup, issues a /terms request to every shard and feeds the
 * returned terms into the field and term synonym builders via addTerms().
 * No-op when the request is not distributed.
 *
 * @param rb           the current response builder (supplies searcher and shard list)
 * @param fieldBuilder builder collecting field-name synonyms
 * @param termBuilder  builder collecting term synonyms
 * @param searchFields fields whose terms should be requested from each shard
 */
private void addDistributedTerms( ResponseBuilder rb, SynonymMap.Builder fieldBuilder, SynonymMap.Builder termBuilder, ArrayList<String> searchFields ) throws IOException {
  SolrIndexSearcher searcher = rb.req.getSearcher();
  CoreContainer container = searcher.getCore().getCoreDescriptor().getCoreContainer();
    
  ShardHandlerFactory shardHandlerFactory = container.getShardHandlerFactory( );
  ShardHandler shardHandler = shardHandlerFactory.getShardHandler();
  shardHandler.checkDistributed( rb );
    
  Log.debug( "Is Distributed = " + rb.isDistrib );
    
  if( rb.isDistrib ) {
    // create a ShardRequest that contains a Terms Request.
    // don't send to this shard???
    ShardRequest sreq = new ShardRequest();
    sreq.purpose = ShardRequest.PURPOSE_GET_TERMS;
    sreq.actualShards = rb.shards;
    ModifiableSolrParams params = new ModifiableSolrParams( );
      
    params.set( TermsParams.TERMS_LIMIT, -1);  // -1 = no limit: fetch all terms
    params.set( TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_INDEX);
    String[] fields = searchFields.toArray( new String[ searchFields.size( )] );
    params.set( TermsParams.TERMS_FIELD, fields );
      
    // Mark the sub-request as non-distributed so shards don't fan out again.
    params.set( CommonParams.DISTRIB, "false" );
    params.set( ShardParams.IS_SHARD, true );
    // Fix: use the ShardParams constant instead of the magic string "shards.purpose",
    // consistent with the other ShardParams usages in this method.
    params.set( ShardParams.SHARDS_PURPOSE, sreq.purpose );
    params.set( CommonParams.QT, termsHandler );
    params.set( TermsParams.TERMS, "true" );
      
    if (rb.requestInfo != null) {
      params.set("NOW", Long.toString(rb.requestInfo.getNOW().getTime()));
    }
    sreq.params = params;
      
    for (String shard : rb.shards ) {
      Log.debug( "sending request to shard " + shard );
      params.set(ShardParams.SHARD_URL, shard );
      shardHandler.submit( sreq, shard, params );
    }
      
    ShardResponse rsp = shardHandler.takeCompletedIncludingErrors( );
    if (rsp != null) {
      Log.debug( "got " + rsp.getShardRequest().responses.size( ) + " responses" );
      for ( ShardResponse srsp : rsp.getShardRequest().responses ) {
        Log.debug( "Got terms response from " + srsp.getShard( ));
      
        if (srsp.getException() != null) {
          Log.warn( "ShardResponse Exception!! " + srsp.getException( ) );
        }
      
        @SuppressWarnings("unchecked")
        NamedList<NamedList<Number>> terms = (NamedList<NamedList<Number>>) srsp.getSolrResponse().getResponse().get("terms");
        if (terms != null) {
          addTerms( terms, fieldBuilder, termBuilder, searchFields );
        }
        else {
          Log.warn( "terms was NULL! - make sure that /terms request handler is defined in solrconfig.xml" );
        }
      }
    }
  }
}