org.apache.solr.analysis.TokenizerChain Java Examples
The following examples show how to use
org.apache.solr.analysis.TokenizerChain.
You can vote up the ones you like or vote down the ones you don't like,
You can also navigate to the original project or source file via the links above each example, and review related API usage in the sidebar.
Example #1
Source File: SolrQueryParserBase.java From lucene-solr with Apache License 2.0 | 6 votes |
protected ReversedWildcardFilterFactory getReversedWildcardFilterFactory(FieldType fieldType) { if (leadingWildcards == null) leadingWildcards = new HashMap<>(); ReversedWildcardFilterFactory fac = leadingWildcards.get(fieldType); if (fac != null || leadingWildcards.containsKey(fieldType)) { return fac; } Analyzer a = fieldType.getIndexAnalyzer(); if (a instanceof TokenizerChain) { // examine the indexing analysis chain if it supports leading wildcards TokenizerChain tc = (TokenizerChain)a; TokenFilterFactory[] factories = tc.getTokenFilterFactories(); for (TokenFilterFactory factory : factories) { if (factory instanceof ReversedWildcardFilterFactory) { fac = (ReversedWildcardFilterFactory)factory; break; } } } leadingWildcards.put(fieldType, fac); return fac; }
Example #2
Source File: PhrasesIdentificationComponent.java From lucene-solr with Apache License 2.0 | 6 votes |
/** * Helper method, public for testing purposes only. * <p> * Given an analyzer, inspects it to determine if: * <ul> * <li>it is a {@link TokenizerChain}</li> * <li>it contains exactly one instance of {@link ShingleFilterFactory}</li> * </ul> * <p> * If these these conditions are met, then this method returns the <code>maxShingleSize</code> * in effect for this analyzer, otherwise returns -1. * </p> * * @param analyzer An analyzer inspect * @return <code>maxShingleSize</code> if available * @lucene.internal */ public static int getMaxShingleSize(Analyzer analyzer) { if (!TokenizerChain.class.isInstance(analyzer)) { return -1; } final TokenFilterFactory[] factories = ((TokenizerChain) analyzer).getTokenFilterFactories(); if (0 == factories.length) { return -1; } int result = -1; for (TokenFilterFactory tff : factories) { if (ShingleFilterFactory.class.isInstance(tff)) { if (0 < result) { // more then one shingle factory in our analyzer, which is weird, so make no assumptions... return -1; } // would be nice if there was an easy way to just ask a factory for the effective value // of an arguement... final Map<String,String> args = tff.getOriginalArgs(); result = args.containsKey("maxShingleSize") ? Integer.parseInt(args.get("maxShingleSize")) : ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE; } } return result; }
Example #3
Source File: PayloadUtils.java From lucene-solr with Apache License 2.0 | 6 votes |
public static String getPayloadEncoder(FieldType fieldType) { // TODO: support custom payload encoding fields too somehow - maybe someone has a custom component that encodes payloads as floats String encoder = null; Analyzer a = fieldType.getIndexAnalyzer(); if (a instanceof TokenizerChain) { // examine the indexing analysis chain for DelimitedPayloadTokenFilterFactory or NumericPayloadTokenFilterFactory TokenizerChain tc = (TokenizerChain)a; TokenFilterFactory[] factories = tc.getTokenFilterFactories(); for (TokenFilterFactory factory : factories) { if (factory instanceof DelimitedPayloadTokenFilterFactory) { encoder = factory.getOriginalArgs().get(DelimitedPayloadTokenFilterFactory.ENCODER_ATTR); break; } if (factory instanceof NumericPayloadTokenFilterFactory) { // encodes using `PayloadHelper.encodeFloat(payload)` encoder = "float"; break; } } } return encoder; }
Example #4
Source File: NestPathField.java From lucene-solr with Apache License 2.0 | 6 votes |
@Override public void setArgs(IndexSchema schema, Map<String, String> args) { args.putIfAbsent("stored", "false"); args.putIfAbsent("omitTermFreqAndPositions", "true"); args.putIfAbsent("omitNorms", "true"); args.putIfAbsent("maxCharsForDocValues", "-1"); super.setArgs(schema, args); // CustomAnalyzer is easy to use CustomAnalyzer customAnalyzer; try { customAnalyzer = CustomAnalyzer.builder(schema.getResourceLoader()) .withDefaultMatchVersion(schema.getDefaultLuceneMatchVersion()) .withTokenizer(KeywordTokenizerFactory.class) .addTokenFilter(PatternReplaceFilterFactory.class, "pattern", "#\\d*", "replace", "all") .build(); } catch (IOException e) { throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);//impossible? } // Solr HTTP Schema APIs don't know about CustomAnalyzer so use TokenizerChain instead setIndexAnalyzer(new TokenizerChain(customAnalyzer)); // leave queryAnalyzer as literal }
Example #5
Source File: CustomAnalyzerStrField.java From lucene-solr with Apache License 2.0 | 6 votes |
public CustomAnalyzerStrField() { Random r = LuceneTestCase.random(); // two arg constructor Analyzer a2 = new TokenizerChain (new KeywordTokenizerFactory(new HashMap<>()), r.nextBoolean() ? null : new TokenFilterFactory[0]); // three arg constructor Analyzer a3 = new TokenizerChain (r.nextBoolean() ? null : new CharFilterFactory[0], new KeywordTokenizerFactory(new HashMap<>()), r.nextBoolean() ? null : new TokenFilterFactory[0]); if (r.nextBoolean()) { indexAnalyzer = a2; queryAnalyzer = a3; } else { queryAnalyzer = a2; indexAnalyzer = a3; } }
Example #6
Source File: SolrStopwordsCarrot2LexicalDataFactory.java From lucene-solr with Apache License 2.0 | 5 votes |
/** * Obtains stop words for a field from the associated * {@link StopFilterFactory}, if any. */ private List<CharArraySet> getSolrStopWordsForField(String fieldName) { // No need to synchronize here, Carrot2 ensures that instances // of this class are not used by multiple threads at a time. synchronized (solrStopWords) { if (!solrStopWords.containsKey(fieldName)) { solrStopWords.put(fieldName, new ArrayList<>()); IndexSchema schema = core.getLatestSchema(); final Analyzer fieldAnalyzer = schema.getFieldType(fieldName).getIndexAnalyzer(); if (fieldAnalyzer instanceof TokenizerChain) { final TokenFilterFactory[] filterFactories = ((TokenizerChain) fieldAnalyzer).getTokenFilterFactories(); for (TokenFilterFactory factory : filterFactories) { if (factory instanceof StopFilterFactory) { // StopFilterFactory holds the stop words in a CharArraySet CharArraySet stopWords = ((StopFilterFactory) factory).getStopWords(); solrStopWords.get(fieldName).add(stopWords); } if (factory instanceof CommonGramsFilterFactory) { CharArraySet commonWords = ((CommonGramsFilterFactory) factory).getCommonWords(); solrStopWords.get(fieldName).add(commonWords); } } } } return solrStopWords.get(fieldName); } }
Example #7
Source File: TaggerRequestHandler.java From lucene-solr with Apache License 2.0 | 5 votes |
private boolean fieldHasIndexedStopFilter(String field, SolrQueryRequest req) { FieldType fieldType = req.getSchema().getFieldType(field); Analyzer analyzer = fieldType.getIndexAnalyzer();//index analyzer if (analyzer instanceof TokenizerChain) { TokenizerChain tokenizerChain = (TokenizerChain) analyzer; TokenFilterFactory[] tokenFilterFactories = tokenizerChain.getTokenFilterFactories(); for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) { if (tokenFilterFactory instanceof StopFilterFactory) return true; } } return false; }
Example #8
Source File: ManagedIndexSchema.java From lucene-solr with Apache License 2.0 | 5 votes |
/** * Informs analyzers used by a fieldType. */ protected void informResourceLoaderAwareObjectsForFieldType(FieldType fieldType) { // must inform any sub-components used in the // tokenizer chain if they are ResourceLoaderAware if (!fieldType.supportsAnalyzers()) return; Analyzer indexAnalyzer = fieldType.getIndexAnalyzer(); if (indexAnalyzer != null && indexAnalyzer instanceof TokenizerChain) informResourceLoaderAwareObjectsInChain((TokenizerChain)indexAnalyzer); Analyzer queryAnalyzer = fieldType.getQueryAnalyzer(); // ref comparison is correct here (vs. equals) as they may be the same // object in which case, we don't need to inform twice ... however, it's // actually safe to call inform multiple times on an object anyway if (queryAnalyzer != null && queryAnalyzer != indexAnalyzer && queryAnalyzer instanceof TokenizerChain) informResourceLoaderAwareObjectsInChain((TokenizerChain)queryAnalyzer); // if fieldType is a TextField, it might have a multi-term analyzer if (fieldType instanceof TextField) { TextField textFieldType = (TextField)fieldType; Analyzer multiTermAnalyzer = textFieldType.getMultiTermAnalyzer(); if (multiTermAnalyzer != null && multiTermAnalyzer != indexAnalyzer && multiTermAnalyzer != queryAnalyzer && multiTermAnalyzer instanceof TokenizerChain) informResourceLoaderAwareObjectsInChain((TokenizerChain)multiTermAnalyzer); } }
Example #9
Source File: FieldTypePluginLoader.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Builds the analyzer used for multi-term queries from the given query analyzer: a
 * {@link TokenizerChain} supplies its own multi-term variant, anything else falls back
 * to a {@link KeywordAnalyzer}, and {@code null} propagates as {@code null}.
 */
private Analyzer constructMultiTermAnalyzer(Analyzer queryAnalyzer) {
  if (queryAnalyzer == null) {
    return null;
  }
  if (queryAnalyzer instanceof TokenizerChain) {
    return ((TokenizerChain) queryAnalyzer).getMultiTermAnalyzer();
  }
  return new KeywordAnalyzer();
}
Example #10
Source File: FieldAnalysisRequestHandlerTest.java From lucene-solr with Apache License 2.0 | 5 votes |
@Test //See SOLR-8460 public void testCustomAttribute() throws Exception { FieldAnalysisRequest request = new FieldAnalysisRequest(); request.addFieldType("skutype1"); request.setFieldValue("hi, 3456-12 a Test"); request.setShowMatch(false); FieldType fieldType = new TextField(); Analyzer analyzer = new TokenizerChain( new TokenizerFactory(Collections.emptyMap()) { @Override public Tokenizer create(AttributeFactory factory) { return new CustomTokenizer(factory); } }, new TokenFilterFactory[] { new TokenFilterFactory(Collections.emptyMap()) { @Override public TokenStream create(TokenStream input) { return new CustomTokenFilter(input); } } } ); fieldType.setIndexAnalyzer(analyzer); @SuppressWarnings({"rawtypes"}) NamedList<NamedList> result = handler.analyzeValues(request, fieldType, "fieldNameUnused"); // just test that we see "900" in the flags attribute here @SuppressWarnings({"unchecked", "rawtypes"}) List<NamedList> tokenInfoList = (List<NamedList>) result.findRecursive("index", CustomTokenFilter.class.getName()); // '1' from CustomTokenFilter plus 900 from CustomFlagsAttributeImpl. assertEquals(901, tokenInfoList.get(0).get("org.apache.lucene.analysis.tokenattributes.FlagsAttribute#flags")); }
Example #11
Source File: SolrSchemaUtil.java From jesterj with Apache License 2.0 | 5 votes |
/**
 * Derives the multi-term analyzer from the query analyzer. {@code null} stays {@code null};
 * a {@link TokenizerChain} yields its own multi-term analyzer; any other analyzer type is
 * replaced by a {@link KeywordAnalyzer}.
 */
private Analyzer constructMultiTermAnalyzer(Analyzer queryAnalyzer) {
  if (queryAnalyzer == null) {
    return null;
  }
  return (queryAnalyzer instanceof TokenizerChain)
      ? ((TokenizerChain) queryAnalyzer).getMultiTermAnalyzer()
      : new KeywordAnalyzer();
}
Example #12
Source File: MMSegTokenizerFactoryTest.java From mmseg4j-solr with Apache License 2.0 | 5 votes |
/**
 * Resolves the dictionary backing the MMSeg tokenizer configured on the given field type's
 * index analyzer, asserting the expected analyzer/factory types along the way.
 *
 * @param fieldTypeName name of the schema field type to inspect
 * @return the non-null dictionary used by the field type's {@link MMSegTokenizerFactory}
 */
private Dictionary getDictionaryByFieldType(String fieldTypeName) {
  FieldType ft = h.getCore().getLatestSchema().getFieldTypeByName(fieldTypeName);
  Analyzer a = ft.getIndexAnalyzer();
  // JUnit's assertEquals takes (expected, actual); the original passed them reversed,
  // which produces misleading failure messages.
  Assert.assertEquals(TokenizerChain.class, a.getClass());

  TokenizerChain tc = (TokenizerChain) a;
  TokenizerFactory tf = tc.getTokenizerFactory();
  Assert.assertEquals(MMSegTokenizerFactory.class, tf.getClass());

  MMSegTokenizerFactory mtf = (MMSegTokenizerFactory) tf;
  Assert.assertNotNull(mtf.dic);
  return mtf.dic;
}
Example #13
Source File: TaggerRequestHandler.java From SolrTextTagger with Apache License 2.0 | 5 votes |
private boolean fieldHasIndexedStopFilter(String field, SolrQueryRequest req) { FieldType fieldType = req.getSchema().getFieldType(field); Analyzer analyzer = fieldType.getIndexAnalyzer();//index analyzer if (analyzer instanceof TokenizerChain) { TokenizerChain tokenizerChain = (TokenizerChain) analyzer; TokenFilterFactory[] tokenFilterFactories = tokenizerChain.getTokenFilterFactories(); for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) { if (tokenFilterFactory instanceof StopFilterFactory) return true; } } return false; }
Example #14
Source File: AlfrescoFieldType.java From SearchServices with GNU Lesser General Public License v3.0 | 4 votes |
/**
 * Assembles the accumulated char filters, tokenizer, and token filters into a
 * {@link TokenizerChain}. A null char-filter list is passed through as a null array,
 * whereas a null token-filter list becomes an empty array.
 */
public TokenizerChain build() {
  CharFilterFactory[] charFilterArr =
      (charFilters == null) ? null : charFilters.toArray(new CharFilterFactory[0]);
  TokenFilterFactory[] filterArr =
      (filters == null) ? new TokenFilterFactory[0] : filters.toArray(new TokenFilterFactory[0]);
  return new TokenizerChain(charFilterArr, tokenizer, filterArr);
}
Example #15
Source File: SolrSchemaUtil.java From jesterj with Apache License 2.0 | 4 votes |
/** * Read an analyzer from a dom node. This is adapted from {@link org.apache.solr.schema.FieldTypePluginLoader} with * changes to avoid requiring a SolrResourceLoader. * * @param node The dom node representing the analyzer * @param luceneMatch The lucene version match (must be supplied since we don't load a SolrConfig.xml) * @param loader The Resource loader that can provide accessory files such as stopwords.txt * @return A freshly instantiated analyzer * @throws XPathExpressionException if there are problems with the DOM created from the schema.xml file. */ private Analyzer readAnalyzer(Node node, final String luceneMatch, ResourceLoader loader) throws XPathExpressionException { // parent node used to be passed in as "fieldtype" // if (!fieldtype.hasChildNodes()) return null; // Node node = DOMUtil.getChild(fieldtype,"analyzer"); if (node == null) return null; NamedNodeMap attrs = node.getAttributes(); String analyzerClassName = DOMUtil.getAttr(attrs, "class"); // check for all of these up front, so we can error if used in // conjunction with an explicit analyzer class. NodeList charFilterNodes = (NodeList) xpath.evaluate ("./charFilter", node, XPathConstants.NODESET); NodeList tokenizerNodes = (NodeList) xpath.evaluate ("./tokenizer", node, XPathConstants.NODESET); NodeList tokenFilterNodes = (NodeList) xpath.evaluate ("./filter", node, XPathConstants.NODESET); if (analyzerClassName != null) { // explicitly check for child analysis factories instead of // just any child nodes, because the user might have their // own custom nodes (ie: <description> or something like that) if (0 != charFilterNodes.getLength() || 0 != tokenizerNodes.getLength() || 0 != tokenFilterNodes.getLength()) { throw new SolrException (SolrException.ErrorCode.SERVER_ERROR, "Configuration Error: Analyzer class='" + analyzerClassName + "' can not be combined with nested analysis factories"); } try { // No need to be core-aware as Analyzers are not in the core-aware list final Class<? 
extends Analyzer> clazz = findClass(analyzerClassName, Analyzer.class); Analyzer analyzer = clazz.newInstance(); final String matchVersionStr = DOMUtil.getAttr(attrs, LUCENE_MATCH_VERSION_PARAM); final Version luceneMatchVersion = (matchVersionStr == null) ? Version.parse(luceneMatch) : SolrConfig.parseLuceneVersionString(matchVersionStr); if (luceneMatchVersion == null) { throw new SolrException (SolrException.ErrorCode.SERVER_ERROR, "Configuration Error: Analyzer '" + clazz.getName() + "' needs a 'luceneMatchVersion' parameter"); } analyzer.setVersion(luceneMatchVersion); if (analyzer instanceof ResourceLoaderAware) { ((ResourceLoaderAware) analyzer).inform(loader); } return analyzer; } catch (Exception e) { log.error("Cannot load analyzer: " + analyzerClassName, e); throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Cannot load analyzer: " + analyzerClassName, e); } } // Load the CharFilters final ArrayList<CharFilterFactory> charFilters = new ArrayList<>(); load(charFilterNodes, SCHEMA_XML_ANALYZER_CHAR_FILTER, charFilters, CharFilterFactory.class, luceneMatch, loader); // Load the Tokenizer // Although an analyzer only allows a single Tokenizer, we load a list to make sure // the configuration is ok final ArrayList<TokenizerFactory> tokenizers = new ArrayList<>(1); load(tokenizerNodes, SCHEMA_XML_ANALYZER_TOKENIZER, tokenizers, TokenizerFactory.class, luceneMatch, loader); // Make sure something was loaded if (tokenizers.isEmpty()) { throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "analyzer without class or tokenizer"); } // Load the Filters final ArrayList<TokenFilterFactory> filters = new ArrayList<>(); load(tokenFilterNodes, SCHEMA_XML_ANALYZER_FILTER, filters, TokenFilterFactory.class, luceneMatch, loader); return new TokenizerChain(charFilters.toArray(new CharFilterFactory[charFilters.size()]), tokenizers.get(0), filters.toArray(new TokenFilterFactory[filters.size()])); }