org.apache.solr.analysis.TokenizerChain Java Examples
The following examples show how to use
org.apache.solr.analysis.TokenizerChain.
You can vote up the ones you like or vote down the ones you don't like,
You can also navigate to the original project or source file via the links above each example, and review related API usage in the sidebar.
Example #1
Source File: SolrQueryParserBase.java From lucene-solr with Apache License 2.0 | 6 votes |
protected ReversedWildcardFilterFactory getReversedWildcardFilterFactory(FieldType fieldType) { if (leadingWildcards == null) leadingWildcards = new HashMap<>(); ReversedWildcardFilterFactory fac = leadingWildcards.get(fieldType); if (fac != null || leadingWildcards.containsKey(fieldType)) { return fac; } Analyzer a = fieldType.getIndexAnalyzer(); if (a instanceof TokenizerChain) { // examine the indexing analysis chain if it supports leading wildcards TokenizerChain tc = (TokenizerChain)a; TokenFilterFactory[] factories = tc.getTokenFilterFactories(); for (TokenFilterFactory factory : factories) { if (factory instanceof ReversedWildcardFilterFactory) { fac = (ReversedWildcardFilterFactory)factory; break; } } } leadingWildcards.put(fieldType, fac); return fac; }
Example #2
Source File: PhrasesIdentificationComponent.java From lucene-solr with Apache License 2.0 | 6 votes |
/** * Helper method, public for testing purposes only. * <p> * Given an analyzer, inspects it to determine if: * <ul> * <li>it is a {@link TokenizerChain}</li> * <li>it contains exactly one instance of {@link ShingleFilterFactory}</li> * </ul> * <p> * If these these conditions are met, then this method returns the <code>maxShingleSize</code> * in effect for this analyzer, otherwise returns -1. * </p> * * @param analyzer An analyzer inspect * @return <code>maxShingleSize</code> if available * @lucene.internal */ public static int getMaxShingleSize(Analyzer analyzer) { if (!TokenizerChain.class.isInstance(analyzer)) { return -1; } final TokenFilterFactory[] factories = ((TokenizerChain) analyzer).getTokenFilterFactories(); if (0 == factories.length) { return -1; } int result = -1; for (TokenFilterFactory tff : factories) { if (ShingleFilterFactory.class.isInstance(tff)) { if (0 < result) { // more then one shingle factory in our analyzer, which is weird, so make no assumptions... return -1; } // would be nice if there was an easy way to just ask a factory for the effective value // of an arguement... final Map<String,String> args = tff.getOriginalArgs(); result = args.containsKey("maxShingleSize") ? Integer.parseInt(args.get("maxShingleSize")) : ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE; } } return result; }
Example #3
Source File: PayloadUtils.java From lucene-solr with Apache License 2.0 | 6 votes |
public static String getPayloadEncoder(FieldType fieldType) { // TODO: support custom payload encoding fields too somehow - maybe someone has a custom component that encodes payloads as floats String encoder = null; Analyzer a = fieldType.getIndexAnalyzer(); if (a instanceof TokenizerChain) { // examine the indexing analysis chain for DelimitedPayloadTokenFilterFactory or NumericPayloadTokenFilterFactory TokenizerChain tc = (TokenizerChain)a; TokenFilterFactory[] factories = tc.getTokenFilterFactories(); for (TokenFilterFactory factory : factories) { if (factory instanceof DelimitedPayloadTokenFilterFactory) { encoder = factory.getOriginalArgs().get(DelimitedPayloadTokenFilterFactory.ENCODER_ATTR); break; } if (factory instanceof NumericPayloadTokenFilterFactory) { // encodes using `PayloadHelper.encodeFloat(payload)` encoder = "float"; break; } } } return encoder; }
Example #4
Source File: NestPathField.java From lucene-solr with Apache License 2.0 | 6 votes |
@Override public void setArgs(IndexSchema schema, Map<String, String> args) { args.putIfAbsent("stored", "false"); args.putIfAbsent("omitTermFreqAndPositions", "true"); args.putIfAbsent("omitNorms", "true"); args.putIfAbsent("maxCharsForDocValues", "-1"); super.setArgs(schema, args); // CustomAnalyzer is easy to use CustomAnalyzer customAnalyzer; try { customAnalyzer = CustomAnalyzer.builder(schema.getResourceLoader()) .withDefaultMatchVersion(schema.getDefaultLuceneMatchVersion()) .withTokenizer(KeywordTokenizerFactory.class) .addTokenFilter(PatternReplaceFilterFactory.class, "pattern", "#\\d*", "replace", "all") .build(); } catch (IOException e) { throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);//impossible? } // Solr HTTP Schema APIs don't know about CustomAnalyzer so use TokenizerChain instead setIndexAnalyzer(new TokenizerChain(customAnalyzer)); // leave queryAnalyzer as literal }
Example #5
Source File: CustomAnalyzerStrField.java From lucene-solr with Apache License 2.0 | 6 votes |
public CustomAnalyzerStrField() { Random r = LuceneTestCase.random(); // two arg constructor Analyzer a2 = new TokenizerChain (new KeywordTokenizerFactory(new HashMap<>()), r.nextBoolean() ? null : new TokenFilterFactory[0]); // three arg constructor Analyzer a3 = new TokenizerChain (r.nextBoolean() ? null : new CharFilterFactory[0], new KeywordTokenizerFactory(new HashMap<>()), r.nextBoolean() ? null : new TokenFilterFactory[0]); if (r.nextBoolean()) { indexAnalyzer = a2; queryAnalyzer = a3; } else { queryAnalyzer = a2; indexAnalyzer = a3; } }
Example #6
Source File: SolrStopwordsCarrot2LexicalDataFactory.java From lucene-solr with Apache License 2.0 | 5 votes |
/** * Obtains stop words for a field from the associated * {@link StopFilterFactory}, if any. */ private List<CharArraySet> getSolrStopWordsForField(String fieldName) { // No need to synchronize here, Carrot2 ensures that instances // of this class are not used by multiple threads at a time. synchronized (solrStopWords) { if (!solrStopWords.containsKey(fieldName)) { solrStopWords.put(fieldName, new ArrayList<>()); IndexSchema schema = core.getLatestSchema(); final Analyzer fieldAnalyzer = schema.getFieldType(fieldName).getIndexAnalyzer(); if (fieldAnalyzer instanceof TokenizerChain) { final TokenFilterFactory[] filterFactories = ((TokenizerChain) fieldAnalyzer).getTokenFilterFactories(); for (TokenFilterFactory factory : filterFactories) { if (factory instanceof StopFilterFactory) { // StopFilterFactory holds the stop words in a CharArraySet CharArraySet stopWords = ((StopFilterFactory) factory).getStopWords(); solrStopWords.get(fieldName).add(stopWords); } if (factory instanceof CommonGramsFilterFactory) { CharArraySet commonWords = ((CommonGramsFilterFactory) factory).getCommonWords(); solrStopWords.get(fieldName).add(commonWords); } } } } return solrStopWords.get(fieldName); } }
Example #7
Source File: TaggerRequestHandler.java From lucene-solr with Apache License 2.0 | 5 votes |
private boolean fieldHasIndexedStopFilter(String field, SolrQueryRequest req) { FieldType fieldType = req.getSchema().getFieldType(field); Analyzer analyzer = fieldType.getIndexAnalyzer();//index analyzer if (analyzer instanceof TokenizerChain) { TokenizerChain tokenizerChain = (TokenizerChain) analyzer; TokenFilterFactory[] tokenFilterFactories = tokenizerChain.getTokenFilterFactories(); for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) { if (tokenFilterFactory instanceof StopFilterFactory) return true; } } return false; }
Example #8
Source File: ManagedIndexSchema.java From lucene-solr with Apache License 2.0 | 5 votes |
/** * Informs analyzers used by a fieldType. */ protected void informResourceLoaderAwareObjectsForFieldType(FieldType fieldType) { // must inform any sub-components used in the // tokenizer chain if they are ResourceLoaderAware if (!fieldType.supportsAnalyzers()) return; Analyzer indexAnalyzer = fieldType.getIndexAnalyzer(); if (indexAnalyzer != null && indexAnalyzer instanceof TokenizerChain) informResourceLoaderAwareObjectsInChain((TokenizerChain)indexAnalyzer); Analyzer queryAnalyzer = fieldType.getQueryAnalyzer(); // ref comparison is correct here (vs. equals) as they may be the same // object in which case, we don't need to inform twice ... however, it's // actually safe to call inform multiple times on an object anyway if (queryAnalyzer != null && queryAnalyzer != indexAnalyzer && queryAnalyzer instanceof TokenizerChain) informResourceLoaderAwareObjectsInChain((TokenizerChain)queryAnalyzer); // if fieldType is a TextField, it might have a multi-term analyzer if (fieldType instanceof TextField) { TextField textFieldType = (TextField)fieldType; Analyzer multiTermAnalyzer = textFieldType.getMultiTermAnalyzer(); if (multiTermAnalyzer != null && multiTermAnalyzer != indexAnalyzer && multiTermAnalyzer != queryAnalyzer && multiTermAnalyzer instanceof TokenizerChain) informResourceLoaderAwareObjectsInChain((TokenizerChain)multiTermAnalyzer); } }
Example #9
Source File: FieldTypePluginLoader.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Builds the analyzer used for multi-term queries from the given query analyzer: a
 * {@link TokenizerChain} supplies its own multi-term variant, anything else falls back
 * to a {@link KeywordAnalyzer}, and {@code null} propagates as {@code null}.
 */
private Analyzer constructMultiTermAnalyzer(Analyzer queryAnalyzer) {
  if (queryAnalyzer == null) {
    return null;
  }
  if (queryAnalyzer instanceof TokenizerChain) {
    return ((TokenizerChain) queryAnalyzer).getMultiTermAnalyzer();
  }
  return new KeywordAnalyzer();
}
Example #10
Source File: FieldAnalysisRequestHandlerTest.java From lucene-solr with Apache License 2.0 | 5 votes |
@Test //See SOLR-8460 public void testCustomAttribute() throws Exception { FieldAnalysisRequest request = new FieldAnalysisRequest(); request.addFieldType("skutype1"); request.setFieldValue("hi, 3456-12 a Test"); request.setShowMatch(false); FieldType fieldType = new TextField(); Analyzer analyzer = new TokenizerChain( new TokenizerFactory(Collections.emptyMap()) { @Override public Tokenizer create(AttributeFactory factory) { return new CustomTokenizer(factory); } }, new TokenFilterFactory[] { new TokenFilterFactory(Collections.emptyMap()) { @Override public TokenStream create(TokenStream input) { return new CustomTokenFilter(input); } } } ); fieldType.setIndexAnalyzer(analyzer); @SuppressWarnings({"rawtypes"}) NamedList<NamedList> result = handler.analyzeValues(request, fieldType, "fieldNameUnused"); // just test that we see "900" in the flags attribute here @SuppressWarnings({"unchecked", "rawtypes"}) List<NamedList> tokenInfoList = (List<NamedList>) result.findRecursive("index", CustomTokenFilter.class.getName()); // '1' from CustomTokenFilter plus 900 from CustomFlagsAttributeImpl. assertEquals(901, tokenInfoList.get(0).get("org.apache.lucene.analysis.tokenattributes.FlagsAttribute#flags")); }
Example #11
Source File: SolrSchemaUtil.java From jesterj with Apache License 2.0 | 5 votes |
/**
 * Derives the multi-term analyzer from the query analyzer. {@code null} stays {@code null};
 * a {@link TokenizerChain} yields its own multi-term analyzer; any other analyzer type is
 * replaced by a {@link KeywordAnalyzer}.
 */
private Analyzer constructMultiTermAnalyzer(Analyzer queryAnalyzer) {
  if (queryAnalyzer == null) {
    return null;
  }
  return (queryAnalyzer instanceof TokenizerChain)
      ? ((TokenizerChain) queryAnalyzer).getMultiTermAnalyzer()
      : new KeywordAnalyzer();
}
Example #12
Source File: MMSegTokenizerFactoryTest.java From mmseg4j-solr with Apache License 2.0 | 5 votes |
/**
 * Resolves the dictionary backing the MMSeg tokenizer configured on the given field type's
 * index analyzer, asserting the expected analyzer/factory types along the way.
 *
 * @param fieldTypeName name of the schema field type to inspect
 * @return the non-null dictionary used by the field type's {@link MMSegTokenizerFactory}
 */
private Dictionary getDictionaryByFieldType(String fieldTypeName) {
  FieldType ft = h.getCore().getLatestSchema().getFieldTypeByName(fieldTypeName);
  Analyzer a = ft.getIndexAnalyzer();
  // JUnit's assertEquals takes (expected, actual); the original passed them reversed,
  // which produces misleading failure messages.
  Assert.assertEquals(TokenizerChain.class, a.getClass());

  TokenizerChain tc = (TokenizerChain) a;
  TokenizerFactory tf = tc.getTokenizerFactory();
  Assert.assertEquals(MMSegTokenizerFactory.class, tf.getClass());

  MMSegTokenizerFactory mtf = (MMSegTokenizerFactory) tf;
  Assert.assertNotNull(mtf.dic);
  return mtf.dic;
}
Example #13
Source File: TaggerRequestHandler.java From SolrTextTagger with Apache License 2.0 | 5 votes |
private boolean fieldHasIndexedStopFilter(String field, SolrQueryRequest req) { FieldType fieldType = req.getSchema().getFieldType(field); Analyzer analyzer = fieldType.getIndexAnalyzer();//index analyzer if (analyzer instanceof TokenizerChain) { TokenizerChain tokenizerChain = (TokenizerChain) analyzer; TokenFilterFactory[] tokenFilterFactories = tokenizerChain.getTokenFilterFactories(); for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) { if (tokenFilterFactory instanceof StopFilterFactory) return true; } } return false; }
Example #14
Source File: AlfrescoFieldType.java From SearchServices with GNU Lesser General Public License v3.0 | 4 votes |
/**
 * Assembles the accumulated char filters, tokenizer, and token filters into a
 * {@link TokenizerChain}. A null char-filter list is passed through as a null array,
 * whereas a null token-filter list becomes an empty array.
 */
public TokenizerChain build() {
  CharFilterFactory[] charFilterArr =
      (charFilters == null) ? null : charFilters.toArray(new CharFilterFactory[0]);
  TokenFilterFactory[] filterArr =
      (filters == null) ? new TokenFilterFactory[0] : filters.toArray(new TokenFilterFactory[0]);
  return new TokenizerChain(charFilterArr, tokenizer, filterArr);
}
Example #15
Source File: SolrSchemaUtil.java From jesterj with Apache License 2.0 | 4 votes |
/** * Read an analyzer from a dom node. This is adapted from {@link org.apache.solr.schema.FieldTypePluginLoader} with * changes to avoid requiring a SolrResourceLoader. * * @param node The dom node representing the analyzer * @param luceneMatch The lucene version match (must be supplied since we don't load a SolrConfig.xml) * @param loader The Resource loader that can provide accessory files such as stopwords.txt * @return A freshly instantiated analyzer * @throws XPathExpressionException if there are problems with the DOM created from the schema.xml file. */ private Analyzer readAnalyzer(Node node, final String luceneMatch, ResourceLoader loader) throws XPathExpressionException { // parent node used to be passed in as "fieldtype" // if (!fieldtype.hasChildNodes()) return null; // Node node = DOMUtil.getChild(fieldtype,"analyzer"); if (node == null) return null; NamedNodeMap attrs = node.getAttributes(); String analyzerClassName = DOMUtil.getAttr(attrs, "class"); // check for all of these up front, so we can error if used in // conjunction with an explicit analyzer class. NodeList charFilterNodes = (NodeList) xpath.evaluate ("./charFilter", node, XPathConstants.NODESET); NodeList tokenizerNodes = (NodeList) xpath.evaluate ("./tokenizer", node, XPathConstants.NODESET); NodeList tokenFilterNodes = (NodeList) xpath.evaluate ("./filter", node, XPathConstants.NODESET); if (analyzerClassName != null) { // explicitly check for child analysis factories instead of // just any child nodes, because the user might have their // own custom nodes (ie: <description> or something like that) if (0 != charFilterNodes.getLength() || 0 != tokenizerNodes.getLength() || 0 != tokenFilterNodes.getLength()) { throw new SolrException (SolrException.ErrorCode.SERVER_ERROR, "Configuration Error: Analyzer class='" + analyzerClassName + "' can not be combined with nested analysis factories"); } try { // No need to be core-aware as Analyzers are not in the core-aware list final Class<? 
extends Analyzer> clazz = findClass(analyzerClassName, Analyzer.class); Analyzer analyzer = clazz.newInstance(); final String matchVersionStr = DOMUtil.getAttr(attrs, LUCENE_MATCH_VERSION_PARAM); final Version luceneMatchVersion = (matchVersionStr == null) ? Version.parse(luceneMatch) : SolrConfig.parseLuceneVersionString(matchVersionStr); if (luceneMatchVersion == null) { throw new SolrException (SolrException.ErrorCode.SERVER_ERROR, "Configuration Error: Analyzer '" + clazz.getName() + "' needs a 'luceneMatchVersion' parameter"); } analyzer.setVersion(luceneMatchVersion); if (analyzer instanceof ResourceLoaderAware) { ((ResourceLoaderAware) analyzer).inform(loader); } return analyzer; } catch (Exception e) { log.error("Cannot load analyzer: " + analyzerClassName, e); throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Cannot load analyzer: " + analyzerClassName, e); } } // Load the CharFilters final ArrayList<CharFilterFactory> charFilters = new ArrayList<>(); load(charFilterNodes, SCHEMA_XML_ANALYZER_CHAR_FILTER, charFilters, CharFilterFactory.class, luceneMatch, loader); // Load the Tokenizer // Although an analyzer only allows a single Tokenizer, we load a list to make sure // the configuration is ok final ArrayList<TokenizerFactory> tokenizers = new ArrayList<>(1); load(tokenizerNodes, SCHEMA_XML_ANALYZER_TOKENIZER, tokenizers, TokenizerFactory.class, luceneMatch, loader); // Make sure something was loaded if (tokenizers.isEmpty()) { throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "analyzer without class or tokenizer"); } // Load the Filters final ArrayList<TokenFilterFactory> filters = new ArrayList<>(); load(tokenFilterNodes, SCHEMA_XML_ANALYZER_FILTER, filters, TokenFilterFactory.class, luceneMatch, loader); return new TokenizerChain(charFilters.toArray(new CharFilterFactory[charFilters.size()]), tokenizers.get(0), filters.toArray(new TokenFilterFactory[filters.size()])); }