Java Code Examples for org.apache.lucene.util.TestUtil#randomRealisticUnicodeString()

The following examples show how to use org.apache.lucene.util.TestUtil#randomRealisticUnicodeString(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TestUseDocValuesAsStored.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Produces {@code arity} random values of the requested type, each rendered as a string.
 *
 * @param arity number of values to generate
 * @param valueType one of "int", "double", "long", "float", "enum", "str" or "date"
 * @return a new array of length {@code arity} filled with random values
 * @throws Exception if {@code valueType} is not one of the names listed above
 */
private String[] nextValues(int arity, String valueType) throws Exception {
  final String[] generated = new String[arity];
  for (int slot = 0; slot < generated.length; slot++) {
    final String value;
    if (valueType.equals("int")) {
      value = String.valueOf(random().nextInt());
    } else if (valueType.equals("double")) {
      // Any 64-bit pattern is reinterpreted as a double.
      final long bits = random().nextLong();
      value = String.valueOf(Double.longBitsToDouble(bits));
    } else if (valueType.equals("long")) {
      value = String.valueOf(random().nextLong());
    } else if (valueType.equals("float")) {
      // Any 32-bit pattern is reinterpreted as a float.
      final int bits = random().nextInt();
      value = String.valueOf(Float.intBitsToFloat(bits));
    } else if (valueType.equals("enum")) {
      final int pick = TestUtil.nextInt(random(), 0, SEVERITY.length - 1);
      value = SEVERITY[pick];
    } else if (valueType.equals("str")) {
      // Everything BAD_CHAR_PATTERN matches is swapped for U+FFFD.
      final String raw = TestUtil.randomRealisticUnicodeString(random());
      value = BAD_CHAR_PATTERN.matcher(raw).replaceAll("\uFFFD");
    } else if (valueType.equals("date")) {
      final long epochMillis =
          TestUtil.nextLong(random(), START_RANDOM_EPOCH_MILLIS, END_RANDOM_EPOCH_MILLIS);
      value = Instant.ofEpochMilli(epochMillis).toString();
    } else {
      throw new Exception("unknown type '" + valueType + "'");
    }
    generated[slot] = value;
  }
  return generated;
}
 
Example 2
Source File: TestBlockJoin.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds between 2 and 4 fields, each holding an array of random unicode values.
 * The first field always gets exactly two values; every other field gets between
 * 1 and {@code maxUniqueValues} values.
 *
 * @param maxUniqueValues upper bound on the value count of every field but the first
 * @return one value array per field
 */
private String[][] getRandomFields(int maxUniqueValues) {
  final int fieldCount = TestUtil.nextInt(random(), 2, 4);
  final String[][] fields = new String[fieldCount][];

  for (int fieldID = 0; fieldID < fieldCount; fieldID++) {
    final int valueCount = (fieldID == 0) ? 2 : TestUtil.nextInt(random(), 1, maxUniqueValues);

    final String[] values = new String[valueCount];
    fields[fieldID] = values;

    int filled = 0;
    while (filled < valueCount) {
      // Alternative generator, kept for reference: TestUtil.randomSimpleString(random())
      values[filled++] = TestUtil.randomRealisticUnicodeString(random());
    }
  }

  return fields;
}
 
Example 3
Source File: TestLongPostings.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a random unicode term that a {@link MockAnalyzer} emits as exactly one
 * token whose bytes decode back to the same string.
 *
 * <p>Candidates equal to {@code other}, candidates that analyze to zero or several
 * tokens, and candidates whose first token differs from the input are discarded and
 * a new candidate is drawn.
 *
 * @param other a term the result must differ from, or {@code null} for no restriction
 * @return a term that passes through analysis unchanged as a single token
 * @throws IOException if the token stream fails
 */
private String getRandomTerm(String other) throws IOException {
  // The analyzer is Closeable; try-with-resources releases it on the normal return
  // as well as when analysis throws.
  try (Analyzer a = new MockAnalyzer(random())) {
    while(true) {
      String s = TestUtil.randomRealisticUnicodeString(random());
      if (other != null && s.equals(other)) {
        continue;
      }
      try (TokenStream ts = a.tokenStream("foo", s)) {
        final TermToBytesRefAttribute termAtt = ts.getAttribute(TermToBytesRefAttribute.class);
        ts.reset();

        int count = 0;
        boolean changed = false;

        while(ts.incrementToken()) {
          final BytesRef termBytes = termAtt.getBytesRef();
          if (count == 0 && !termBytes.utf8ToString().equals(s)) {
            // The value was changed during analysis.  Keep iterating so the
            // tokenStream is exhausted.
            changed = true;
          }
          count++;
        }

        ts.end();
        // Did we iterate just once and the value was unchanged?
        if (!changed && count == 1) {
          return s;
        }
      }
    }
  }
}
 
Example 4
Source File: SolrTestCaseJ4.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * We want "realistic" unicode strings beyond simple ascii, but because our
 * updates use XML we need to ensure we don't get "special" code block.
 *
 * <p>If the realistic string contains any character from the Unicode Specials
 * block, a simple random string is returned instead.
 *
 * @return a random string containing no character from the Specials block,
 *         provided the simple-string fallback never produces one
 */
public static String randomXmlUsableUnicodeString() {
  String result = TestUtil.randomRealisticUnicodeString(random());
  // (?s) makes '.' match line terminators as well. Without it, matches() returns
  // false for any string containing a line break, so a Specials character in such
  // a string would slip through.
  if (result.matches("(?s).*\\p{InSpecials}.*")) {
    result = TestUtil.randomSimpleString(random());
  }
  return result;
}
 
Example 5
Source File: TestUTF32ToUTF8.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * For many random strings, converts the single-string automaton to UTF-8 and checks
 * that its finite strings are exactly the UTF-8 encoding of the original string.
 */
public void testSingleton() throws Exception {
  for (int remaining = atLeast(100); remaining > 0; remaining--) {
    final String text = TestUtil.randomRealisticUnicodeString(random());

    final Automaton utf32 = Automata.makeString(text);
    final Automaton utf8 = new UTF32ToUTF8().convert(utf32);

    // Expected result: a set holding only the UTF-8 bytes of the text as ints.
    final IntsRefBuilder encoded = new IntsRefBuilder();
    Util.toIntsRef(new BytesRef(text), encoded);
    final Set<IntsRef> expected = new HashSet<>();
    expected.add(encoded.get());

    assertEquals(expected, TestOperations.getFiniteStrings(utf8));
  }
}
 
Example 6
Source File: TestAutomaton.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that {@code Operations.getSingleton} recovers the string a single-string
 * automaton was built from, over many random strings.
 */
public void testGetSingleton() {
  for (int remaining = atLeast(10000); remaining > 0; remaining--) {
    final String text = TestUtil.randomRealisticUnicodeString(random());
    final Automaton single = Automata.makeString(text);
    assertEquals(toIntsRef(text), Operations.getSingleton(single));
  }
}
 
Example 7
Source File: TestFieldUpdatesBuffer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a random binary doc-values update against the "binary" field.
 *
 * <p>The target term uses one of three field names and a doc id in [0, 10). The
 * value is rarely {@code null}, otherwise a random unicode string. The update is
 * rarely returned already prepared for apply.
 */
DocValuesUpdate.BinaryDocValuesUpdate getRandomBinaryUpdate() {
  final String field =
      RandomPicks.randomFrom(random(), Arrays.asList("id", "_id", "some_other_field"));
  final String id = String.valueOf(random().nextInt(10));
  final Term target = new Term(field, id);

  final BytesRef payload;
  if (rarely()) {
    payload = null;
  } else {
    payload = new BytesRef(TestUtil.randomRealisticUnicodeString(random()));
  }

  final DocValuesUpdate.BinaryDocValuesUpdate update =
      new DocValuesUpdate.BinaryDocValuesUpdate(target, "binary", payload);
  if (rarely()) {
    return update.prepareForApply(randomDocUpTo());
  }
  return update;
}
 
Example 8
Source File: CursorMarkTest.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Runs a random unicode string through the index analyzer of the given field and
 * returns a private copy of the bytes of the single token it produces.
 *
 * <p>Asserts that the analyzer emits exactly one token.
 *
 * @param sf the schema field whose index analyzer is used
 * @return a deep copy of the token's bytes
 * @throws IOException if the token stream fails
 */
private static Object getRandomCollation(SchemaField sf) throws IOException {
  final Analyzer indexAnalyzer = sf.getType().getIndexAnalyzer();
  final String input = TestUtil.randomRealisticUnicodeString(random());
  try (TokenStream stream = indexAnalyzer.tokenStream("fake", input)) {
    final TermToBytesRefAttribute bytesAtt = stream.addAttribute(TermToBytesRefAttribute.class);
    stream.reset();
    assertTrue(stream.incrementToken());
    // Copy now: the attribute's bytes belong to the stream.
    final Object collated = BytesRef.deepCopyOf(bytesAtt.getBytesRef());
    assertFalse(stream.incrementToken());
    stream.end();
    return collated;
  }
}
 
Example 9
Source File: FSTTester.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a random term: with equal probability either a realistic unicode string
 * or a string from {@code simpleRandomString}.
 *
 * @param random source of randomness
 * @return the generated term
 */
static String getRandomString(Random random) {
  // we want to mix in limited-alphabet symbols so
  // we get more sharing of the nodes given how few
  // terms we are testing...
  return random.nextBoolean()
      ? TestUtil.randomRealisticUnicodeString(random)
      : simpleRandomString(random);
}
 
Example 10
Source File: BaseTermVectorsFormatTestCase.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Prepares the pools of field names and terms that random documents draw from.
 *
 * @param distinctFieldNames number of distinct simple-string field names to
 *        generate; the name "id" is never included
 * @param disctinctTerms number of random unicode terms to generate (duplicates
 *        are possible)
 */
protected RandomDocumentFactory(int distinctFieldNames, int disctinctTerms) {
  final Set<String> uniqueNames = new HashSet<>();
  while (uniqueNames.size() < distinctFieldNames) {
    final String candidate = TestUtil.randomSimpleString(random());
    // "id" must never end up in the pool of field names.
    if (!"id".equals(candidate)) {
      uniqueNames.add(candidate);
    }
  }
  this.fieldNames = uniqueNames.toArray(new String[0]);

  final String[] randomTerms = new String[disctinctTerms];
  final BytesRef[] randomTermBytes = new BytesRef[disctinctTerms];
  int slot = 0;
  while (slot < disctinctTerms) {
    final String term = TestUtil.randomRealisticUnicodeString(random());
    randomTerms[slot] = term;
    randomTermBytes[slot] = new BytesRef(term);
    slot++;
  }
  terms = randomTerms;
  termBytes = randomTermBytes;
}
 
Example 11
Source File: FacetTestCase.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Generates {@code count} random unicode tokens using length bounds 1 and 10.
 *
 * @param count number of tokens to produce
 * @return a new array holding the tokens
 */
protected String[] getRandomTokens(int count) {
  final String[] result = new String[count];
  int next = 0;
  while (next < result.length) {
    // Alternative generator, kept for reference: _TestUtil.randomSimpleString(random(), 1, 10)
    result[next++] = TestUtil.randomRealisticUnicodeString(random(), 1, 10);
  }
  return result;
}
 
Example 12
Source File: AbstractGroupingTestCase.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a random unicode string, retrying until the result is not the empty string.
 *
 * <p>Empty strings are rejected because the DV based impl cannot tell an empty
 * string from a null value, so empty-string groups are never generated.
 */
protected String generateRandomNonEmptyString() {
  while (true) {
    // Alternative generator, kept for reference: _TestUtil.randomSimpleString(random())
    final String candidate = TestUtil.randomRealisticUnicodeString(random());
    if (!"".equals(candidate)) {
      return candidate;
    }
  }
}
 
Example 13
Source File: TestTermsEnum.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/** Returns a random realistic unicode string drawn from the test's random source. */
private String getRandomString() {
  // Alternative generator, kept for reference: _TestUtil.randomSimpleString(random())
  final String generated = TestUtil.randomRealisticUnicodeString(random());
  return generated;
}
 
Example 14
Source File: TokenSourcesTest.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Round-trips a random token stream through term vectors: the stream is indexed into
 * a single document with term vectors (offsets always, positions and payloads at
 * random), read back via {@code TokenSources.getTermVectorTokenStreamOrNull}, and
 * compared against the original terms, offsets, position increments and, sometimes,
 * payloads.
 *
 * <p>The test returns early without asserting anything when the random stream has
 * adjacent tokens with the same start offset that cannot be told apart by position.
 */
@Repeat(iterations = 10)
//@Seed("947083AB20AB2D4F")
public void testRandomizedRoundTrip() throws Exception {
  // Number of distinct terms the random stream draws from.
  final int distinct = TestUtil.nextInt(random(), 1, 10);

  String[] terms = new String[distinct];
  BytesRef[] termBytes = new BytesRef[distinct];
  for (int i = 0; i < distinct; ++i) {
    terms[i] = TestUtil.randomRealisticUnicodeString(random());
    termBytes[i] = new BytesRef(terms[i]);
  }

  final BaseTermVectorsFormatTestCase.RandomTokenStream rTokenStream =
      new BaseTermVectorsFormatTestCase.RandomTokenStream(TestUtil.nextInt(random(), 1, 10), terms, termBytes);
  //check to see if the token streams might have non-deterministic testable result
  final boolean storeTermVectorPositions = random().nextBoolean();
  final int[] startOffsets = rTokenStream.getStartOffsets();
  final int[] positionsIncrements = rTokenStream.getPositionsIncrements();
  for (int i = 1; i < positionsIncrements.length; i++) {
    // With positions stored, a non-zero increment puts this token at a different
    // position from the previous one, so this pair needs no further check.
    if (storeTermVectorPositions && positionsIncrements[i] != 0) {
      continue;
    }
    //TODO should RandomTokenStream ensure endOffsets for tokens at same position and same startOffset are greater
    // than previous token's endOffset?  That would increase the testable possibilities.
    if (startOffsets[i] == startOffsets[i-1]) {
      if (VERBOSE)
        System.out.println("Skipping test because can't easily validate random token-stream is correct.");
      // Release the stream before bailing out; nothing else has been opened yet.
      rTokenStream.close();
      return;
    }
  }

  //sanity check itself
  assertTokenStreamContents(rTokenStream,
      rTokenStream.getTerms(), rTokenStream.getStartOffsets(), rTokenStream.getEndOffsets(),
      rTokenStream.getPositionsIncrements());

  // Index the stream as the only field of a single document, with term vectors on.
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  FieldType myFieldType = new FieldType(TextField.TYPE_NOT_STORED);
  myFieldType.setStoreTermVectors(true);
  myFieldType.setStoreTermVectorOffsets(true);
  myFieldType.setStoreTermVectorPositions(storeTermVectorPositions);
  //payloads require positions; it will throw an error otherwise
  myFieldType.setStoreTermVectorPayloads(storeTermVectorPositions && random().nextBoolean());

  Document doc = new Document();
  doc.add(new Field("field", rTokenStream, myFieldType));
  writer.addDocument(doc);

  IndexReader reader = writer.getReader();
  writer.close();
  assertEquals(1, reader.numDocs());

  // Rebuild a token stream from the term vectors of document 0.
  TokenStream vectorTokenStream =
      TokenSources.getTermVectorTokenStreamOrNull("field", reader.getTermVectors(0), -1);

  //sometimes check payloads
  PayloadAttribute payloadAttribute = null;
  if (myFieldType.storeTermVectorPayloads() && usually()) {
    payloadAttribute = vectorTokenStream.addAttribute(PayloadAttribute.class);
  }
  // Position increments are only compared when positions were stored.
  assertTokenStreamContents(vectorTokenStream,
      rTokenStream.getTerms(), rTokenStream.getStartOffsets(), rTokenStream.getEndOffsets(),
      myFieldType.storeTermVectorPositions() ? rTokenStream.getPositionsIncrements() : null);
  //test payloads
  if (payloadAttribute != null) {
    // Second pass over the rebuilt stream, comparing each payload with the original.
    vectorTokenStream.reset();
    for (int i = 0; vectorTokenStream.incrementToken(); i++) {
      assertEquals(rTokenStream.getPayloads()[i], payloadAttribute.getPayload());
    }
  }

  reader.close();
  dir.close();
  rTokenStream.close();
}
 
Example 15
Source File: TestAutomaton.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/** Returns a random realistic unicode string drawn from the test's random source. */
private String getRandomString() {
  // Alternative generator, kept for reference: TestUtil.randomSimpleString(random())
  final String generated = TestUtil.randomRealisticUnicodeString(random());
  return generated;
}
 
Example 16
Source File: TestIDVersionPostingsFormat.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Randomly picks one of six ID generation strategies: random simple strings, random
 * realistic unicode strings, sequential numbers, zero-padded sequential numbers,
 * random longs, or zero-padded random longs.
 *
 * <p>The zero-padded variants left-pad with '0' up to the width of their pad string;
 * values already at least that wide are returned unpadded.
 *
 * @return a source that yields one ID per {@code next()} call
 */
private IDSource getRandomIDs() {
  IDSource ids;
  switch (random().nextInt(6)) {
  case 0:
    // random simple
    if (VERBOSE) {
      System.out.println("TEST: use random simple ids");
    }
    ids = new IDSource() {
        @Override
        public String next() {
          return TestUtil.randomSimpleString(random());
        }
      };
    break;
  case 1:
    // random realistic unicode
    if (VERBOSE) {
      System.out.println("TEST: use random realistic unicode ids");
    }
    ids = new IDSource() {
        @Override
        public String next() {
          return TestUtil.randomRealisticUnicodeString(random());
        }
      };
    break;
  case 2:
    // sequential
    if (VERBOSE) {
      System.out.println("TEST: use seuquential ids");
    }
    ids = new IDSource() {
        int upto;
        @Override
        public String next() {
          return Integer.toString(upto++);
        }
      };
    break;
  case 3:
    // zero-pad sequential
    if (VERBOSE) {
      System.out.println("TEST: use zero-pad seuquential ids");
    }
    ids = new IDSource() {
        final int radix = TestUtil.nextInt(random(), Character.MIN_RADIX, Character.MAX_RADIX);
        final String zeroPad = String.format(Locale.ROOT, "%0" + TestUtil.nextInt(random(), 5, 20) + "d", 0);
        int upto;
        @Override
        public String next() {
          // Render in the randomly chosen radix, which was previously drawn but ignored.
          String s = Integer.toString(upto++, radix);
          // Pad up to the pad width. The old substring(zeroPad.length() - s.length())
          // prepended s.length() zeros instead and threw once s outgrew the pad.
          return s.length() < zeroPad.length() ? zeroPad.substring(s.length()) + s : s;
        }
      };
    break;
  case 4:
    // random long
    if (VERBOSE) {
      System.out.println("TEST: use random long ids");
    }
    ids = new IDSource() {
        final int radix = TestUtil.nextInt(random(), Character.MIN_RADIX, Character.MAX_RADIX);
        @Override
        public String next() {
          // The mask clears the top six bits, so the value is never negative.
          return Long.toString(random().nextLong() & 0x3ffffffffffffffL, radix);
        }
      };
    break;
  case 5:
    // zero-pad random long
    if (VERBOSE) {
      System.out.println("TEST: use zero-pad random long ids");
    }
    ids = new IDSource() {
        final int radix = TestUtil.nextInt(random(), Character.MIN_RADIX, Character.MAX_RADIX);
        final String zeroPad = String.format(Locale.ROOT, "%015d", 0);
        @Override
        public String next() {
          String s = Long.toString(random().nextLong() & 0x3ffffffffffffffL, radix);
          // Apply the padding; it was declared but never used, which made this
          // strategy identical to the unpadded one.
          return s.length() < zeroPad.length() ? zeroPad.substring(s.length()) + s : s;
        }
      };
    break;
  default:
    throw new AssertionError();
  }

  return ids;
}
 
Example 17
Source File: TestCompiledAutomaton.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/** Returns a random realistic unicode string drawn from the test's random source. */
private String randomString() {
  // Alternative generator, kept for reference: _TestUtil.randomSimpleString(random)
  final String generated = TestUtil.randomRealisticUnicodeString(random());
  return generated;
}
 
Example 18
Source File: TestSearcherTaxonomyManager.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Indexes documents carrying 1 to 5 facet paths each until the taxonomy writer's
 * size reaches {@code ordLimit}, then sets the {@code stop} flag (also on failure).
 *
 * <p>Each path is usually reused from the ones created so far; otherwise a new,
 * non-empty, previously unseen random path is created. When a manager is present,
 * both writers are occasionally committed and the manager is asked to refresh.
 */
@Override
public void run() {
  try {
    final Set<String> usedPaths = new HashSet<>();
    final List<String> pathPool = new ArrayList<>();
    do {
      final Document doc = new Document();
      final int numPaths = TestUtil.nextInt(random(), 1, 5);
      for (int added = 0; added < numPaths; added++) {
        String path;
        final boolean reuse = !pathPool.isEmpty() && random().nextInt(5) != 4;
        if (reuse) {
          // Use previous path
          path = pathPool.get(random().nextInt(pathPool.size()));
        } else {
          // Create new path: draw until it is non-empty and not seen before
          do {
            path = TestUtil.randomRealisticUnicodeString(random());
          } while (path.length() == 0 || !usedPaths.add(path));
          pathPool.add(path);
        }
        doc.add(new FacetField("field", path));
      }

      try {
        w.addDocument(config.build(tw, doc));
        final boolean refresh = mgr != null && random().nextDouble() < 0.02;
        if (refresh) {
          w.commit();
          tw.commit();
          mgr.maybeRefresh();
        }
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      }

      if (VERBOSE) {
        System.out.println("TW size=" + tw.getSize() + " vs " + ordLimit);
      }
    } while (tw.getSize() < ordLimit);
  } finally {
    stop.set(true);
  }
}
 
Example 19
Source File: TestFSTs.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Builds an FST over 100 one-character prefixes (plus 20 random-suffix terms for
 * every prefix except "m"), looks up the arc for 'm', illegally truncates that arc's
 * output in place, and then looks the arc up again. An {@link AssertionError} from
 * the second lookup is tolerated.
 *
 * <p>The test only runs when assertions are enabled.
 */
public void testIllegallyModifyRootArc() throws Exception {
  assumeTrue("test relies on assertions", assertsAreEnabled);

  final Set<BytesRef> uniqueTerms = new HashSet<>();
  int offset = 0;
  while (offset < 100) {
    final String head = Character.toString((char) ('a' + offset));
    uniqueTerms.add(new BytesRef(head));
    if (!head.equals("m")) {
      for (int n = 0; n < 20; n++) {
        // Make a big enough FST that the root cache will be created:
        final String tail = TestUtil.randomRealisticUnicodeString(random(), 10, 20);
        uniqueTerms.add(new BytesRef(head + tail));
      }
    }
    offset++;
  }

  // Terms are added to the compiler in sorted order.
  final List<BytesRef> sortedTerms = new ArrayList<>(uniqueTerms);
  Collections.sort(sortedTerms);

  final ByteSequenceOutputs byteOutputs = ByteSequenceOutputs.getSingleton();
  final FSTCompiler<BytesRef> compiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, byteOutputs);

  // Each term is stored as its own output.
  final IntsRefBuilder scratch = new IntsRefBuilder();
  for (BytesRef term : sortedTerms) {
    Util.toIntsRef(term, scratch);
    compiler.add(scratch.get(), term);
  }

  final FST<BytesRef> fst = compiler.compile();

  Arc<BytesRef> cursor = new FST.Arc<>();
  fst.getFirstArc(cursor);
  final FST.BytesReader in = fst.getBytesReader();
  cursor = fst.findTargetArc((int) 'm', cursor, cursor, in);
  assertNotNull(cursor);
  assertEquals(new BytesRef("m"), cursor.output());

  // NOTE: illegal:
  cursor.output().length = 0;

  fst.getFirstArc(cursor);
  try {
    cursor = fst.findTargetArc((int) 'm', cursor, cursor, in);
  } catch (AssertionError expected) {
    // expected
  }
}
 
Example 20
Source File: TestDocTermOrds.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Indexes documents that each carry a random subset of a fixed pool of random terms
 * in "field", records the sorted term ordinals expected for every document id, and
 * then calls {@code verify} on every leaf reader and on a slow composite wrapper of
 * the top-level reader.
 */
public void testRandom() throws Exception {
  Directory dir = newDirectory();

  // Build a pool of at least 20 distinct, non-empty random terms.
  final int NUM_TERMS = atLeast(20);
  final Set<BytesRef> terms = new HashSet<>();
  while(terms.size() < NUM_TERMS) {
    final String s = TestUtil.randomRealisticUnicodeString(random());
    //final String s = _TestUtil.randomSimpleString(random);
    if (s.length() > 0) {
      terms.add(new BytesRef(s));
    }
  }
  // Sorted, so that an index into termsArray serves as the term's ordinal below.
  final BytesRef[] termsArray = terms.toArray(new BytesRef[terms.size()]);
  Arrays.sort(termsArray);
  
  final int NUM_DOCS = atLeast(100);

  IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));

  // Sometimes swap in codec that impls ord():
  if (random().nextInt(10) == 7) {
    // Make sure terms index has ords:
    Codec codec = TestUtil.alwaysPostingsFormat(TestUtil.getPostingsFormatWithOrds(random()));
    conf.setCodec(codec);
  }
  
  final RandomIndexWriter w = new RandomIndexWriter(random(), dir, conf);

  // idToOrds[id] holds the sorted ordinals of the terms added to document id.
  final int[][] idToOrds = new int[NUM_DOCS][];
  // Scratch set reused for every document; cleared at the end of each iteration.
  final Set<Integer> ordsForDocSet = new HashSet<>();

  for(int id=0;id<NUM_DOCS;id++) {
    Document doc = new Document();

    doc.add(new LegacyIntField("id", id, Field.Store.YES));
    
    // Draw termCount distinct ordinals; the set drops repeated draws.
    final int termCount = TestUtil.nextInt(random(), 0, 20 * RANDOM_MULTIPLIER);
    while(ordsForDocSet.size() < termCount) {
      ordsForDocSet.add(random().nextInt(termsArray.length));
    }
    final int[] ordsForDoc = new int[termCount];
    int upto = 0;
    if (VERBOSE) {
      System.out.println("TEST: doc id=" + id);
    }
    for(int ord : ordsForDocSet) {
      ordsForDoc[upto++] = ord;
      Field field = newStringField("field", termsArray[ord].utf8ToString(), Field.Store.NO);
      if (VERBOSE) {
        System.out.println("  f=" + termsArray[ord].utf8ToString());
      }
      doc.add(field);
    }
    ordsForDocSet.clear();
    // Set iteration order is unspecified, so sort before recording the expectation.
    Arrays.sort(ordsForDoc);
    idToOrds[id] = ordsForDoc;
    w.addDocument(doc);
  }
  
  final DirectoryReader r = w.getReader();
  w.close();

  if (VERBOSE) {
    System.out.println("TEST: reader=" + r);
  }

  // Check each segment on its own.
  for(LeafReaderContext ctx : r.leaves()) {
    if (VERBOSE) {
      System.out.println("\nTEST: sub=" + ctx.reader());
    }
    verify(ctx.reader(), idToOrds, termsArray, null);
  }

  // Also test top-level reader: its enum does not support
  // ord, so this forces the OrdWrapper to run:
  if (VERBOSE) {
    System.out.println("TEST: top reader");
  }
  LeafReader slowR = SlowCompositeReaderWrapper.wrap(r);
  TestUtil.checkReader(slowR);
  verify(slowR, idToOrds, termsArray, null);

  // Purge the field-cache entries keyed on the wrapper before closing the reader.
  FieldCache.DEFAULT.purgeByCacheKey(slowR.getCoreCacheHelper().getKey());

  r.close();
  dir.close();
}