org.apache.solr.analysis.TokenizerChain Java Examples

The following examples show how to use org.apache.solr.analysis.TokenizerChain. They are drawn from open source projects; the originating source file and project license are noted above each example.
Example #1
Source File: SolrQueryParserBase.java    From lucene-solr with Apache License 2.0
protected ReversedWildcardFilterFactory getReversedWildcardFilterFactory(FieldType fieldType) {
  if (leadingWildcards == null) leadingWildcards = new HashMap<>();
  ReversedWildcardFilterFactory fac = leadingWildcards.get(fieldType);
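  // note: a cached null is meaningful here; it records that this fieldType was
  // already inspected and contains no ReversedWildcardFilterFactory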
  if (fac != null || leadingWildcards.containsKey(fieldType)) {
    return fac;
  }

  Analyzer a = fieldType.getIndexAnalyzer();
  if (a instanceof TokenizerChain) {
    // examine the index-time analysis chain to see if it supports leading wildcards
    TokenizerChain tc = (TokenizerChain)a;
    TokenFilterFactory[] factories = tc.getTokenFilterFactories();
    for (TokenFilterFactory factory : factories) {
      if (factory instanceof ReversedWildcardFilterFactory) {
        fac = (ReversedWildcardFilterFactory)factory;
        break;
      }
    }
  }

  leadingWildcards.put(fieldType, fac);
  return fac;
}
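
The guard (fac != null || leadingWildcards.containsKey(fieldType)) is worth noting: a mapping to null is cached too, so field types without the filter are only inspected once. A minimal, generic sketch of the same memoize-nullable idiom (hypothetical class, not part of Solr):

import java.util.HashMap;
import java.util.Map;
import java.util.function.Function;

class NullableCache<K, V> {
  private final Map<K, V> cache = new HashMap<>();

  // compute may legitimately return null; containsKey() distinguishes
  // "computed as null" from "never computed"
  V get(K key, Function<K, V> compute) {
    V v = cache.get(key);
    if (v != null || cache.containsKey(key)) {
      return v;
    }
    v = compute.apply(key);
    cache.put(key, v);
    return v;
  }
}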
 
Example #2
Source File: PhrasesIdentificationComponent.java    From lucene-solr with Apache License 2.0
/** 
 * Helper method, public for testing purposes only.
 * <p>
 * Given an analyzer, inspects it to determine if:
 * <ul>
 *  <li>it is a {@link TokenizerChain}</li>
 *  <li>it contains exactly one instance of {@link ShingleFilterFactory}</li>
 * </ul>
 * <p>
 * If these conditions are met, then this method returns the <code>maxShingleSize</code>
 * in effect for this analyzer, otherwise returns -1.
 * </p>
 *
 * @param analyzer The analyzer to inspect
 * @return <code>maxShingleSize</code> if available, otherwise -1
 * @lucene.internal
 */
public static int getMaxShingleSize(Analyzer analyzer) {
  if (!TokenizerChain.class.isInstance(analyzer)) {
    return -1;
  }
  
  final TokenFilterFactory[] factories = ((TokenizerChain) analyzer).getTokenFilterFactories();
  if (0 == factories.length) {
    return -1;
  }
  int result = -1;
  for (TokenFilterFactory tff : factories) {
    if (ShingleFilterFactory.class.isInstance(tff)) {
      if (0 < result) {
        // more than one shingle factory in our analyzer, which is weird, so make no assumptions...
        return -1;
      }
      // it would be nice if there were an easy way to just ask a factory for the
      // effective value of an argument...
      final Map<String,String> args = tff.getOriginalArgs();
      result = args.containsKey("maxShingleSize")
        ? Integer.parseInt(args.get("maxShingleSize")) : ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE;
    }
  }
  return result;
}
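
For illustration, an analyzer satisfying these conditions can be assembled directly and fed to the helper above. This is a sketch, not project code: it assumes the Lucene 8.x analysis-factory constructors that take a mutable args map, and the import paths of that era.

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
import org.apache.lucene.analysis.shingle.ShingleFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;

Map<String, String> args = new HashMap<>();
args.put("maxShingleSize", "3");
Analyzer a = new TokenizerChain(
    new KeywordTokenizerFactory(new HashMap<>()),
    new TokenFilterFactory[] { new ShingleFilterFactory(args) });

// the factory keeps its original args, so the helper above reports 3
int size = getMaxShingleSize(a);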
 
Example #3
Source File: PayloadUtils.java    From lucene-solr with Apache License 2.0
public static String getPayloadEncoder(FieldType fieldType) {
  // TODO: support custom payload encoding fields too somehow - maybe someone has a custom component that encodes payloads as floats
  String encoder = null;
  Analyzer a = fieldType.getIndexAnalyzer();
  if (a instanceof TokenizerChain) {
    // examine the indexing analysis chain for DelimitedPayloadTokenFilterFactory or NumericPayloadTokenFilterFactory
    TokenizerChain tc = (TokenizerChain)a;
    TokenFilterFactory[] factories = tc.getTokenFilterFactories();
    for (TokenFilterFactory factory : factories) {
      if (factory instanceof DelimitedPayloadTokenFilterFactory) {
        encoder = factory.getOriginalArgs().get(DelimitedPayloadTokenFilterFactory.ENCODER_ATTR);
        break;
      }

      if (factory instanceof NumericPayloadTokenFilterFactory) {
        // encodes using `PayloadHelper.encodeFloat(payload)`
        encoder = "float";
        break;
      }
    }
  }

  return encoder;
}
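
The string returned here is just an identifier; "float" signals that payloads were written with PayloadHelper.encodeFloat. A hedged sketch of a consumer (decodePayload is illustrative, not part of PayloadUtils):

import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.util.BytesRef;

static float decodePayload(String encoder, BytesRef payload) {
  if ("float".equals(encoder)) {
    // counterpart of PayloadHelper.encodeFloat(payload)
    return PayloadHelper.decodeFloat(payload.bytes, payload.offset);
  }
  throw new IllegalArgumentException("unsupported payload encoder: " + encoder);
}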
 
Example #4
Source File: NestPathField.java    From lucene-solr with Apache License 2.0
@Override
public void setArgs(IndexSchema schema, Map<String, String> args) {
  args.putIfAbsent("stored", "false");
  args.putIfAbsent("omitTermFreqAndPositions", "true");
  args.putIfAbsent("omitNorms", "true");
  args.putIfAbsent("maxCharsForDocValues", "-1");
  super.setArgs(schema, args);

  // CustomAnalyzer is easy to use
  CustomAnalyzer customAnalyzer;
  try {
    customAnalyzer = CustomAnalyzer.builder(schema.getResourceLoader())
        .withDefaultMatchVersion(schema.getDefaultLuceneMatchVersion())
        .withTokenizer(KeywordTokenizerFactory.class)
        .addTokenFilter(PatternReplaceFilterFactory.class,
            "pattern", "#\\d*",
            "replace", "all")
        .build();
  } catch (IOException e) {
    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); // impossible?
  }
  // Solr HTTP Schema APIs don't know about CustomAnalyzer so use TokenizerChain instead
  setIndexAnalyzer(new TokenizerChain(customAnalyzer));
  // leave queryAnalyzer as literal
}
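
The wrapping matters because Solr's HTTP schema APIs introspect a TokenizerChain's factories rather than arbitrary Analyzer subclasses. For comparison, a roughly equivalent direct construction, sketched on the assumption that PatternReplaceFilterFactory accepts the same "pattern"/"replace" keys in its args map:

Map<String, String> prArgs = new HashMap<>();
prArgs.put("pattern", "#\\d*");
prArgs.put("replace", "all"); // "replacement" defaults to the empty string
Analyzer equivalent = new TokenizerChain(
    new KeywordTokenizerFactory(new HashMap<>()),
    new TokenFilterFactory[] { new PatternReplaceFilterFactory(prArgs) });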
 
Example #5
Source File: CustomAnalyzerStrField.java    From lucene-solr with Apache License 2.0
public CustomAnalyzerStrField() {
  Random r = LuceneTestCase.random();

  // two arg constructor
  Analyzer a2 = new TokenizerChain
    (new KeywordTokenizerFactory(new HashMap<>()),
     r.nextBoolean() ? null : new TokenFilterFactory[0]);
  
  // three arg constructor
  Analyzer a3 = new TokenizerChain
    (r.nextBoolean() ? null : new CharFilterFactory[0],
     new KeywordTokenizerFactory(new HashMap<>()),
     r.nextBoolean() ? null : new TokenFilterFactory[0]);

  if (r.nextBoolean()) {
    indexAnalyzer = a2;
    queryAnalyzer = a3;
  } else {
    queryAnalyzer = a2;
    indexAnalyzer = a3;
  }
}
 
Example #6
Source File: SolrStopwordsCarrot2LexicalDataFactory.java    From lucene-solr with Apache License 2.0
/**
 * Obtains stop words for a field from the associated
 * {@link StopFilterFactory}, if any.
 */
private List<CharArraySet> getSolrStopWordsForField(String fieldName) {
  // No need to synchronize here, Carrot2 ensures that instances
  // of this class are not used by multiple threads at a time.
  synchronized (solrStopWords) {
    if (!solrStopWords.containsKey(fieldName)) {
      solrStopWords.put(fieldName, new ArrayList<>());

      IndexSchema schema = core.getLatestSchema();
      final Analyzer fieldAnalyzer = schema.getFieldType(fieldName).getIndexAnalyzer();
      if (fieldAnalyzer instanceof TokenizerChain) {
        final TokenFilterFactory[] filterFactories = 
            ((TokenizerChain) fieldAnalyzer).getTokenFilterFactories();
        for (TokenFilterFactory factory : filterFactories) {
          if (factory instanceof StopFilterFactory) {
            // StopFilterFactory holds the stop words in a CharArraySet
            CharArraySet stopWords = ((StopFilterFactory) factory).getStopWords();
            solrStopWords.get(fieldName).add(stopWords);
          }

          if (factory instanceof CommonGramsFilterFactory) {
            CharArraySet commonWords = ((CommonGramsFilterFactory) factory).getCommonWords();
            solrStopWords.get(fieldName).add(commonWords);
          }
        }
      }
    }
    return solrStopWords.get(fieldName);
  }
}
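
Since all access happens under the lock, the contains/put/get dance can be collapsed with computeIfAbsent. A behavior-equivalent sketch of the same method:

private List<CharArraySet> getSolrStopWordsForField(String fieldName) {
  synchronized (solrStopWords) {
    return solrStopWords.computeIfAbsent(fieldName, name -> {
      List<CharArraySet> sets = new ArrayList<>();
      Analyzer a = core.getLatestSchema().getFieldType(name).getIndexAnalyzer();
      if (a instanceof TokenizerChain) {
        for (TokenFilterFactory f : ((TokenizerChain) a).getTokenFilterFactories()) {
          if (f instanceof StopFilterFactory) {
            sets.add(((StopFilterFactory) f).getStopWords());
          }
          if (f instanceof CommonGramsFilterFactory) {
            sets.add(((CommonGramsFilterFactory) f).getCommonWords());
          }
        }
      }
      return sets;
    });
  }
}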
 
Example #7
Source File: TaggerRequestHandler.java    From lucene-solr with Apache License 2.0
private boolean fieldHasIndexedStopFilter(String field, SolrQueryRequest req) {
  FieldType fieldType = req.getSchema().getFieldType(field);
  Analyzer analyzer = fieldType.getIndexAnalyzer(); // index analyzer
  if (analyzer instanceof TokenizerChain) {
    TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
    TokenFilterFactory[] tokenFilterFactories = tokenizerChain.getTokenFilterFactories();
    for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) {
      if (tokenFilterFactory instanceof StopFilterFactory)
        return true;
    }
  }
  return false;
}
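
To exercise this check without a full schema, a chain containing a StopFilterFactory can be hand-built (hypothetical test code; with an empty args map the factory falls back to Lucene's built-in English stop set once informed):

TokenizerChain chainWithStops = new TokenizerChain(
    new KeywordTokenizerFactory(new HashMap<>()),
    new TokenFilterFactory[] { new StopFilterFactory(new HashMap<>()) });
// an index analyzer like this makes fieldHasIndexedStopFilter(...) return true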
 
Example #8
Source File: ManagedIndexSchema.java    From lucene-solr with Apache License 2.0
/**
 * Informs analyzers used by a fieldType.
 */
protected void informResourceLoaderAwareObjectsForFieldType(FieldType fieldType) {
  // must inform any sub-components used in the
  // tokenizer chain if they are ResourceLoaderAware
  if (!fieldType.supportsAnalyzers())
    return;

  Analyzer indexAnalyzer = fieldType.getIndexAnalyzer();
  if (indexAnalyzer != null && indexAnalyzer instanceof TokenizerChain)
    informResourceLoaderAwareObjectsInChain((TokenizerChain)indexAnalyzer);

  Analyzer queryAnalyzer = fieldType.getQueryAnalyzer();
  // reference comparison (rather than equals) is correct here: the two may be the
  // same object, in which case we don't need to inform twice ... though it's
  // actually safe to call inform multiple times on an object anyway
  if (queryAnalyzer != null &&
      queryAnalyzer != indexAnalyzer &&
      queryAnalyzer instanceof TokenizerChain)
    informResourceLoaderAwareObjectsInChain((TokenizerChain)queryAnalyzer);

  // if fieldType is a TextField, it might have a multi-term analyzer
  if (fieldType instanceof TextField) {
    TextField textFieldType = (TextField)fieldType;
    Analyzer multiTermAnalyzer = textFieldType.getMultiTermAnalyzer();
    if (multiTermAnalyzer != null && multiTermAnalyzer != indexAnalyzer &&
        multiTermAnalyzer != queryAnalyzer && multiTermAnalyzer instanceof TokenizerChain)
      informResourceLoaderAwareObjectsInChain((TokenizerChain)multiTermAnalyzer);
  }
}
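
For context, a simplified sketch of what the referenced informResourceLoaderAwareObjectsInChain has to do; the real ManagedIndexSchema version does extra bookkeeping for managed resources, and loader stands in for an assumed SolrResourceLoader field:

private void informResourceLoaderAwareObjectsInChain(TokenizerChain chain) throws IOException {
  for (CharFilterFactory cff : chain.getCharFilterFactories()) {
    if (cff instanceof ResourceLoaderAware) ((ResourceLoaderAware) cff).inform(loader);
  }
  TokenizerFactory tf = chain.getTokenizerFactory();
  if (tf instanceof ResourceLoaderAware) ((ResourceLoaderAware) tf).inform(loader);
  for (TokenFilterFactory tff : chain.getTokenFilterFactories()) {
    if (tff instanceof ResourceLoaderAware) ((ResourceLoaderAware) tff).inform(loader);
  }
}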
 
Example #9
Source File: FieldTypePluginLoader.java    From lucene-solr with Apache License 2.0
private Analyzer constructMultiTermAnalyzer(Analyzer queryAnalyzer) {
  if (queryAnalyzer == null) return null;

  if (!(queryAnalyzer instanceof TokenizerChain)) {
    return new KeywordAnalyzer();
  }

  return ((TokenizerChain) queryAnalyzer).getMultiTermAnalyzer();
}
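
The KeywordAnalyzer fallback is the conservative choice: for an opaque Analyzer the input is passed through untouched at multi-term query time, while a TokenizerChain can derive a real variant from its MultiTermAware components. Hypothetical usage from within the class:

Analyzer opaque = new KeywordAnalyzer();            // not a TokenizerChain
Analyzer mt1 = constructMultiTermAnalyzer(opaque);  // fresh KeywordAnalyzer

Analyzer chain = new TokenizerChain(
    new KeywordTokenizerFactory(new HashMap<>()), new TokenFilterFactory[0]);
Analyzer mt2 = constructMultiTermAnalyzer(chain);   // chain.getMultiTermAnalyzer()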
 
Example #10
Source File: FieldAnalysisRequestHandlerTest.java    From lucene-solr with Apache License 2.0
@Test //See SOLR-8460
public void testCustomAttribute() throws Exception {
  FieldAnalysisRequest request = new FieldAnalysisRequest();
  request.addFieldType("skutype1");
  request.setFieldValue("hi, 3456-12 a Test");
  request.setShowMatch(false);
  FieldType fieldType = new TextField();
  Analyzer analyzer = new TokenizerChain(
      new TokenizerFactory(Collections.emptyMap()) {
        @Override
        public Tokenizer create(AttributeFactory factory) {
          return new CustomTokenizer(factory);
        }
      },
      new TokenFilterFactory[] {
          new TokenFilterFactory(Collections.emptyMap()) {
            @Override
            public TokenStream create(TokenStream input) {
              return new CustomTokenFilter(input);
            }
          }
      }
  );
  fieldType.setIndexAnalyzer(analyzer);

  @SuppressWarnings({"rawtypes"})
  NamedList<NamedList> result = handler.analyzeValues(request, fieldType, "fieldNameUnused");
  // just test that we see "900" in the flags attribute here
  @SuppressWarnings({"unchecked", "rawtypes"})
  List<NamedList> tokenInfoList = (List<NamedList>) result.findRecursive("index", CustomTokenFilter.class.getName());
  // '1' from CustomTokenFilter plus 900 from CustomFlagsAttributeImpl.
  assertEquals(901, tokenInfoList.get(0).get("org.apache.lucene.analysis.tokenattributes.FlagsAttribute#flags"));
}
 
Example #11
Source File: SolrSchemaUtil.java    From jesterj with Apache License 2.0
private Analyzer constructMultiTermAnalyzer(Analyzer queryAnalyzer) {
  if (queryAnalyzer == null) return null;

  if (!(queryAnalyzer instanceof TokenizerChain)) {
    return new KeywordAnalyzer();
  }
  return ((TokenizerChain) queryAnalyzer).getMultiTermAnalyzer();

}
 
Example #12
Source File: MMSegTokenizerFactoryTest.java    From mmseg4j-solr with Apache License 2.0
private Dictionary getDictionaryByFieldType(String fieldTypeName) {
  FieldType ft = h.getCore().getLatestSchema().getFieldTypeByName(fieldTypeName);
  Analyzer a = ft.getIndexAnalyzer();
  Assert.assertEquals(a.getClass(), TokenizerChain.class);

  TokenizerChain tc = (TokenizerChain) a;
  TokenizerFactory tf = tc.getTokenizerFactory();
  Assert.assertEquals(tf.getClass(), MMSegTokenizerFactory.class);

  MMSegTokenizerFactory mtf = (MMSegTokenizerFactory) tf;

  Assert.assertNotNull(mtf.dic);
  return mtf.dic;
}
 
Example #13
Source File: TaggerRequestHandler.java    From SolrTextTagger with Apache License 2.0
private boolean fieldHasIndexedStopFilter(String field, SolrQueryRequest req) {
  FieldType fieldType = req.getSchema().getFieldType(field);
  Analyzer analyzer = fieldType.getIndexAnalyzer(); // index analyzer
  if (analyzer instanceof TokenizerChain) {
    TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
    TokenFilterFactory[] tokenFilterFactories = tokenizerChain.getTokenFilterFactories();
    for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) {
      if (tokenFilterFactory instanceof StopFilterFactory)
        return true;
    }
  }
  return false;
}
 
Example #14
Source File: AlfrescoFieldType.java    From SearchServices with GNU Lesser General Public License v3.0
public TokenizerChain build()
{
    CharFilterFactory[] charFilterArr = charFilters == null ? null : charFilters.toArray(new CharFilterFactory[charFilters.size()]);
    TokenFilterFactory[] filterArr = filters == null ? new TokenFilterFactory[0] : filters.toArray(new TokenFilterFactory[filters.size()]);
    return new TokenizerChain(charFilterArr, tokenizer, filterArr);
}
 
Example #15
Source File: SolrSchemaUtil.java    From jesterj with Apache License 2.0
/**
 * Read an analyzer from a dom node. This is adapted from {@link org.apache.solr.schema.FieldTypePluginLoader} with
 * changes to avoid requiring a SolrResourceLoader.
 *
 * @param node        The dom node representing the analyzer
 * @param luceneMatch The lucene version match (must be supplied since we don't load a SolrConfig.xml)
 * @param loader      The Resource loader that can provide accessory files such as stopwords.txt
 * @return A freshly instantiated analyzer
 * @throws XPathExpressionException if there are problems with the DOM created from the schema.xml file.
 */
private Analyzer readAnalyzer(Node node, final String luceneMatch, ResourceLoader loader) throws XPathExpressionException {


  // parent node used to be passed in as "fieldtype"
  // if (!fieldtype.hasChildNodes()) return null;
  // Node node = DOMUtil.getChild(fieldtype,"analyzer");

  if (node == null) return null;
  NamedNodeMap attrs = node.getAttributes();
  String analyzerClassName = DOMUtil.getAttr(attrs, "class");

  // check for all of these up front, so we can error if used in
  // conjunction with an explicit analyzer class.
  NodeList charFilterNodes = (NodeList) xpath.evaluate
      ("./charFilter", node, XPathConstants.NODESET);
  NodeList tokenizerNodes = (NodeList) xpath.evaluate
      ("./tokenizer", node, XPathConstants.NODESET);
  NodeList tokenFilterNodes = (NodeList) xpath.evaluate
      ("./filter", node, XPathConstants.NODESET);

  if (analyzerClassName != null) {

    // explicitly check for child analysis factories instead of
    // just any child nodes, because the user might have their
    // own custom nodes (ie: <description> or something like that)
    if (0 != charFilterNodes.getLength() ||
        0 != tokenizerNodes.getLength() ||
        0 != tokenFilterNodes.getLength()) {
      throw new SolrException
          (SolrException.ErrorCode.SERVER_ERROR,
              "Configuration Error: Analyzer class='" + analyzerClassName +
                  "' can not be combined with nested analysis factories");
    }

    try {
      // No need to be core-aware as Analyzers are not in the core-aware list
      final Class<? extends Analyzer> clazz = findClass(analyzerClassName, Analyzer.class);
      Analyzer analyzer = clazz.newInstance();

      final String matchVersionStr = DOMUtil.getAttr(attrs, LUCENE_MATCH_VERSION_PARAM);
      final Version luceneMatchVersion = (matchVersionStr == null) ?
          Version.parse(luceneMatch) :
          SolrConfig.parseLuceneVersionString(matchVersionStr);
      if (luceneMatchVersion == null) {
        throw new SolrException
            (SolrException.ErrorCode.SERVER_ERROR,
                "Configuration Error: Analyzer '" + clazz.getName() +
                    "' needs a 'luceneMatchVersion' parameter");
      }
      analyzer.setVersion(luceneMatchVersion);
      if (analyzer instanceof ResourceLoaderAware) {
        ((ResourceLoaderAware) analyzer).inform(loader);
      }
      return analyzer;
    } catch (Exception e) {
      log.error("Cannot load analyzer: " + analyzerClassName, e);
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
          "Cannot load analyzer: " + analyzerClassName, e);
    }
  }

  // Load the CharFilters

  final ArrayList<CharFilterFactory> charFilters = new ArrayList<>();
  load(charFilterNodes, SCHEMA_XML_ANALYZER_CHAR_FILTER, charFilters, CharFilterFactory.class, luceneMatch, loader);

  // Load the Tokenizer
  // Although an analyzer only allows a single Tokenizer, we load a list to make sure
  // the configuration is ok

  final ArrayList<TokenizerFactory> tokenizers = new ArrayList<>(1);
  load(tokenizerNodes, SCHEMA_XML_ANALYZER_TOKENIZER, tokenizers, TokenizerFactory.class, luceneMatch, loader);

  // Make sure something was loaded
  if (tokenizers.isEmpty()) {
    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "analyzer without class or tokenizer");
  }

  // Load the Filters

  final ArrayList<TokenFilterFactory> filters = new ArrayList<>();
  load(tokenFilterNodes, SCHEMA_XML_ANALYZER_FILTER, filters, TokenFilterFactory.class, luceneMatch, loader);

  return new TokenizerChain(charFilters.toArray(new CharFilterFactory[charFilters.size()]),
      tokenizers.get(0), filters.toArray(new TokenFilterFactory[filters.size()]));

}
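
A hedged usage sketch from inside the same class: pull an <analyzer> element out of a parsed schema.xml and build it. The file name, field type name, and resourceLoader variable are illustrative only:

DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
Document schemaDom = db.parse(new File("schema.xml"));
Node analyzerNode = (Node) xpath.evaluate(
    "/schema/fieldType[@name='text_general']/analyzer",
    schemaDom, XPathConstants.NODE);
Analyzer analyzer = readAnalyzer(analyzerNode, "8.0.0", resourceLoader);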