com.ibm.icu.text.RuleBasedCollator Java Examples

The following examples show how to use com.ibm.icu.text.RuleBasedCollator. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CollationSpecifier.java    From sql-layer with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Configures the strength and case level of {@code collator} from the
 * case-sensitivity and accent-sensitivity flags of {@code specifier}.
 *
 * <ul>
 *   <li>case + accent sensitive: TERTIARY strength, case level off</li>
 *   <li>case sensitive only: PRIMARY strength, case level on</li>
 *   <li>accent sensitive only: SECONDARY strength, case level off</li>
 *   <li>neither: PRIMARY strength, case level off</li>
 * </ul>
 *
 * @param collator  the collator to configure (mutated in place)
 * @param specifier source of the two sensitivity flags
 */
private static void setCollatorStrength(RuleBasedCollator collator, CollationSpecifier specifier) {
    final boolean caseMatters = specifier.caseSensitive();
    final boolean accentMatters = specifier.accentSensitive();

    if (caseMatters) {
        if (accentMatters) {
            collator.setStrength(Collator.TERTIARY);
            collator.setCaseLevel(false);
        } else {
            // Case differences are kept via the case level while accents are ignored.
            collator.setCaseLevel(true);
            collator.setStrength(Collator.PRIMARY);
        }
        return;
    }

    // Case-insensitive: only the strength depends on accent sensitivity.
    collator.setStrength(accentMatters ? Collator.SECONDARY : Collator.PRIMARY);
    collator.setCaseLevel(false);
}
 
Example #2
Source File: ICUCollationField.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a {@link RuleBasedCollator} from custom rules stored in a resource.
 * The whole resource is read as UTF-8 and passed to the collator unchanged, so
 * the file cannot support comments: a '#' might be part of the rules.
 *
 * @param fileName name of the resource holding the rules
 * @param loader   loader used to open the resource
 * @return a collator built from the rules
 * @throws RuntimeException wrapping any failure to read the resource or to
 *         construct the collator from its contents
 */
static Collator createFromRules(String fileName, ResourceLoader loader) {
  InputStream rulesStream = null;
  try {
    rulesStream = loader.openResource(fileName);
    final String ruleText = IOUtils.toString(rulesStream, "UTF-8");
    return new RuleBasedCollator(ruleText);
  } catch (Exception e) {
    // covers both I/O errors and invalid rules
    throw new RuntimeException(e);
  } finally {
    // best-effort close
    IOUtils.closeQuietly(rulesStream);
  }
}
 
Example #3
Source File: TestICUCollationField.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Ugly: but what to do? We want to test custom sort, which reads rules in as a resource.
 * These are largish files, and jvm-specific (as our documentation says, you should always
 * look out for jvm differences with collation).
 * So it's preferable to create this file on-the-fly.
 *
 * <p>Creates a temporary Solr home with a {@code collection1} core (data and conf
 * directories), copies the ICU collation config and schema into it, writes German
 * rules tailored for DIN 5007-2 to {@code customrules.dat}, and checks that
 * {@link ICUCollationField#createFromRules} reads back an equal collator.
 *
 * @return absolute path of the temporary Solr home directory
 * @throws Exception if any directory, copy or write step fails
 */
public static String setupSolrHome() throws Exception {
  String tmpFile = createTempDir().toFile().getAbsolutePath();
  // make data and conf dirs
  new File(tmpFile  + "/collection1", "data").mkdirs();
  File confDir = new File(tmpFile + "/collection1", "conf");
  confDir.mkdirs();
  
  // copy over configuration files
  FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml"), new File(confDir, "solrconfig.xml"));
  FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/schema-icucollate.xml"), new File(confDir, "schema.xml"));
  
  // generate custom collation rules (DIN 5007-2), saving to customrules.dat
  RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de", "DE"));

  // Each umlaut is tailored next to its two-letter expansion, in lower and upper case.
  // The last pair must use an uppercase U with diaeresis: repeating the lowercase form
  // would re-tailor the lowercase umlaut after "UE" and leave the uppercase one untailored.
  String DIN5007_2_tailorings =
    "& ae , a\u0308 & AE , A\u0308"+
    "& oe , o\u0308 & OE , O\u0308"+
    "& ue , u\u0308 & UE , U\u0308";

  RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
  String tailoredRules = tailoredCollator.getRules();
  final String osFileName = "customrules.dat";
  // try-with-resources so the stream is closed even if the write fails
  try (FileOutputStream os = new FileOutputStream(new File(confDir, osFileName))) {
    IOUtils.write(tailoredRules, os, "UTF-8");
  }

  // randomly exercise either the in-memory loader or the filesystem loader
  final ResourceLoader loader;
  if (random().nextBoolean()) {
    loader = new StringMockResourceLoader(tailoredRules);
  } else {
    loader = new FilesystemResourceLoader(confDir.toPath());
  }
  final Collator readCollator = ICUCollationField.createFromRules(osFileName, loader);
  assertEquals(tailoredCollator, readCollator);

  return tmpFile;
}
 
Example #4
Source File: TestICUCollationFieldDocValues.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Ugly: but what to do? We want to test custom sort, which reads rules in as a resource.
 * These are largish files, and jvm-specific (as our documentation says, you should always
 * look out for jvm differences with collation).
 * So it's preferable to create this file on-the-fly.
 *
 * <p>Creates a temporary Solr home with a {@code collection1} core (data and conf
 * directories), copies the ICU collation config and the doc-values schema into it,
 * and writes German rules tailored for DIN 5007-2 to {@code customrules.dat}.
 *
 * @return absolute path of the temporary Solr home directory
 * @throws Exception if any directory, copy or write step fails
 */
public static String setupSolrHome() throws Exception {
  File tmpFile = createTempDir().toFile();
  
  // make data and conf dirs
  new File(tmpFile + "/collection1", "data").mkdirs();
  File confDir = new File(tmpFile + "/collection1", "conf");
  confDir.mkdirs();
  
  // copy over configuration files
  FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml"), new File(confDir, "solrconfig.xml"));
  FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/schema-icucollate-dv.xml"), new File(confDir, "schema.xml"));
  
  // generate custom collation rules (DIN 5007-2), saving to customrules.dat
  RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de", "DE"));

  // Each umlaut is tailored next to its two-letter expansion, in lower and upper case.
  // The last pair must use an uppercase U with diaeresis: repeating the lowercase form
  // would re-tailor the lowercase umlaut after "UE" and leave the uppercase one untailored.
  String DIN5007_2_tailorings =
    "& ae , a\u0308 & AE , A\u0308"+
    "& oe , o\u0308 & OE , O\u0308"+
    "& ue , u\u0308 & UE , U\u0308";

  RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
  String tailoredRules = tailoredCollator.getRules();
  // try-with-resources so the stream is closed even if the write fails
  try (FileOutputStream os = new FileOutputStream(new File(confDir, "customrules.dat"))) {
    IOUtils.write(tailoredRules, os, "UTF-8");
  }

  return tmpFile.getAbsolutePath();
}
 
Example #5
Source File: SortingTestCase.java    From vespa with Apache License 2.0 5 votes vote down vote up
/**
 * Asserts that {@code col} matches the expected Arabic collator: pinned UCA and
 * collator version strings, reorder codes consisting of {@code UScript.ARABIC}
 * only, a few sample comparisons, and the exact tailoring rule string.
 *
 * @param col collator under test; cast to {@link RuleBasedCollator} to read its rules
 */
private void requireThatArabicHasCorrectRules(Collator col) {
    final int[] expectedReorder = {UScript.ARABIC};

    // version pins
    assertEquals("6.2.0.0", col.getUCAVersion().toString());
    assertEquals("58.0.0.6", col.getVersion().toString());

    // script reordering
    assertEquals(Arrays.toString(expectedReorder), Arrays.toString(col.getReorderCodes()));

    // sample orderings: Latin among itself, and Arabic relative to Latin
    assertTrue(col.compare("a", "b") < 0);
    assertTrue(col.compare("a", "aس") < 0);
    assertFalse(col.compare("س", "a") < 0);

    // exact tailoring rules
    final String actualRules = ((RuleBasedCollator) col).getRules();
    assertEquals(" [reorder Arab]&ت<<ة<<<ﺔ<<<ﺓ&ي<<ى<<<ﯨ<<<ﯩ<<<ﻰ<<<ﻯ<<<ﲐ<<<ﱝ", actualRules);
    assertFalse(col.compare("س", "a") < 0);
}
 
Example #6
Source File: SortingTestCase.java    From vespa with Apache License 2.0 5 votes vote down vote up
/**
 * Asserts that {@code col} matches the expected Chinese collator: pinned UCA and
 * collator version strings, reorder codes consisting of {@code UScript.HAN} only,
 * and a rule string that is not empty.
 *
 * @param col collator under test; cast to {@link RuleBasedCollator} to read its rules
 */
private void requireThatChineseHasCorrectRules(Collator col) {
    final int[] expectedReorder = {UScript.HAN};

    // version pins
    assertEquals("8.0.0.0", col.getUCAVersion().toString());
    assertEquals("153.64.29.0", col.getVersion().toString());

    // script reordering
    assertEquals(Arrays.toString(expectedReorder), Arrays.toString(col.getReorderCodes()));

    // only the presence of tailoring rules is checked, not their content
    final String actualRules = ((RuleBasedCollator) col).getRules();
    assertNotEquals("", actualRules);
}
 
Example #7
Source File: CollationSpecifier.java    From sql-layer with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Creates a collator for this specifier: builds the locale, validates it against
 * the scheme, applies the keyword settings, obtains the collator and validates the
 * keywords against the collator's valid locale. The strength is configured only
 * when {@code shouldSetStrength()} says so.
 *
 * @return the newly obtained collator
 */
public RuleBasedCollator createCollator() {
    final ULocale baseLocale = new ULocale(locale);
    checkLocale(baseLocale, scheme);
    final ULocale keyedLocale = setKeywords(baseLocale, keywordsToValues);

    final RuleBasedCollator result = (RuleBasedCollator) RuleBasedCollator.getInstance(keyedLocale);

    // validate keywords against the locale the collator actually resolved to
    final ULocale validLocale = result.getLocale(ULocale.VALID_LOCALE);
    checkKeywords(validLocale, keywordsToValues, scheme);

    if (shouldSetStrength()) {
        setCollatorStrength(result, this);
    }
    return result;
}
 
Example #8
Source File: AkCollatorFactory.java    From sql-layer with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Construct an actual ICU Collator given a collation specifier. A collator
 * is created once per specifier string and kept in {@code sourceMap}; every
 * call hands out a thawed clone of that stored instance. The result is a
 * Collator that must be used in a thread-private manner.
 *
 * @param specifier collation specifier; its string form is the map key
 * @return a thawed clone of the collator stored for the specifier
 */
static synchronized Collator forScheme(final CollationSpecifier specifier) {
    RuleBasedCollator prototype = (RuleBasedCollator) sourceMap.get(specifier.toString());
    if (prototype == null) {
        // first request for this specifier: build and remember the prototype
        prototype = specifier.createCollator();
        sourceMap.put(specifier.toString(), prototype);
    }
    return prototype.cloneAsThawed();
}
 
Example #9
Source File: IcuCollationAnalyzerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Verifies that an {@code icu_collation} analyzer configured with custom rules
 * (German rules tailored for DIN 5007-2) at primary strength produces equal
 * collation keys for an umlaut and its two-letter expansion.
 *
 * @throws Exception if the test analysis cannot be created or a token stream fails
 */
public void testCustomRules() throws Exception {
    RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
    // Each umlaut is tailored next to its two-letter expansion, in lower and upper case.
    // The last pair must use an uppercase U with diaeresis: repeating the lowercase form
    // would re-tailor the lowercase umlaut after "UE" and leave the uppercase one untailored.
    String DIN5007_2_tailorings =
            "& ae , a\u0308 & AE , A\u0308& oe , o\u0308 & OE , O\u0308& ue , u\u0308 & UE , U\u0308";

    RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
    String tailoredRules = tailoredCollator.getRules();

    Settings settings = Settings.builder()
            .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
            .put("index.analysis.analyzer.myAnalyzer.rules", tailoredRules)
            .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Analyzer analyzer = analysis.indexAnalyzers.get("myAnalyzer");

    // collation key of the umlaut spelling
    String germanUmlaut = "Töne";
    TokenStream tsUmlaut = analyzer.tokenStream(null, germanUmlaut);
    BytesRef b1 = bytesFromTokenStream(tsUmlaut);

    // collation key of the expanded spelling
    String germanExpandedUmlaut = "Toene";
    TokenStream tsExpanded = analyzer.tokenStream(null, germanExpandedUmlaut);
    BytesRef b2 = bytesFromTokenStream(tsExpanded);

    assertTrue(compare(b1.bytes, b2.bytes) == 0);
}
 
Example #10
Source File: RbnfScannerProviderImpl.java    From fitnotifications with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a scanner that uses the given collator.
 *
 * @param rbc the collator stored in this scanner's {@code collator} field
 */
private RbnfLenientScannerImpl(RuleBasedCollator rbc) {
    this.collator = rbc;
}
 
Example #11
Source File: ICUCollationField.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Setup the field according to the provided parameters.
 *
 * <p>All recognised options are removed from {@code args}. Exactly one of
 * {@code custom} (a rules resource) or {@code locale} must be given; it determines
 * how the collator is created. The remaining options ({@code strength},
 * {@code decomposition}, {@code alternate}, {@code caseLevel}, {@code caseFirst},
 * {@code numeric}, {@code variableTop}) are applied only when present. Finally the
 * analyzer is built from the configured collator.
 *
 * @param loader resource loader used to read a custom ruleset
 * @param args   field arguments; recognised keys are removed from this map
 * @throws SolrException if neither or both of custom/locale are given, or if an
 *         option has an unrecognised value
 */
private void setup(ResourceLoader loader, Map<String,String> args) {
  final String customRules = args.remove("custom");
  final String localeName = args.remove("locale");
  final String strengthArg = args.remove("strength");
  final String decompositionArg = args.remove("decomposition");

  final String alternateArg = args.remove("alternate");
  final String caseLevelArg = args.remove("caseLevel");
  final String caseFirstArg = args.remove("caseFirst");
  final String numericArg = args.remove("numeric");
  final String variableTopArg = args.remove("variableTop");

  final boolean hasCustom = customRules != null;
  final boolean hasLocale = localeName != null;

  if (!hasCustom && !hasLocale) {
    throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale is required.");
  }

  if (hasCustom && hasLocale) {
    throw new SolrException(ErrorCode.SERVER_ERROR, "Cannot specify both locale and custom. "
        + "To tailor rules for a built-in language, see the javadocs for RuleBasedCollator. "
        + "Then save the entire customized ruleset to a file, and use with the custom parameter");
  }

  // exactly one source is available from here on
  final Collator collator;
  if (hasLocale) {
    // system collator for the given locale
    collator = createFromLocale(localeName);
  } else {
    // collator built from a custom ruleset
    collator = createFromRules(customRules, loader);
  }

  // strength: left at the collator's default unless specified
  if (strengthArg != null) {
    final int strengthValue;
    if (strengthArg.equalsIgnoreCase("primary")) {
      strengthValue = Collator.PRIMARY;
    } else if (strengthArg.equalsIgnoreCase("secondary")) {
      strengthValue = Collator.SECONDARY;
    } else if (strengthArg.equalsIgnoreCase("tertiary")) {
      strengthValue = Collator.TERTIARY;
    } else if (strengthArg.equalsIgnoreCase("quaternary")) {
      strengthValue = Collator.QUATERNARY;
    } else if (strengthArg.equalsIgnoreCase("identical")) {
      strengthValue = Collator.IDENTICAL;
    } else {
      throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid strength: " + strengthArg);
    }
    collator.setStrength(strengthValue);
  }

  // decomposition: left at the collator's default unless specified
  if (decompositionArg != null) {
    final int decompositionMode;
    if (decompositionArg.equalsIgnoreCase("no")) {
      decompositionMode = Collator.NO_DECOMPOSITION;
    } else if (decompositionArg.equalsIgnoreCase("canonical")) {
      decompositionMode = Collator.CANONICAL_DECOMPOSITION;
    } else {
      throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid decomposition: " + decompositionArg);
    }
    collator.setDecomposition(decompositionMode);
  }

  // expert options: concrete subclasses are always a RuleBasedCollator
  final RuleBasedCollator ruleBased = (RuleBasedCollator) collator;

  if (alternateArg != null) {
    final boolean shifted;
    if (alternateArg.equalsIgnoreCase("shifted")) {
      shifted = true;
    } else if (alternateArg.equalsIgnoreCase("non-ignorable")) {
      shifted = false;
    } else {
      throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid alternate: " + alternateArg);
    }
    ruleBased.setAlternateHandlingShifted(shifted);
  }

  if (caseLevelArg != null) {
    ruleBased.setCaseLevel(Boolean.parseBoolean(caseLevelArg));
  }

  if (caseFirstArg != null) {
    if (caseFirstArg.equalsIgnoreCase("lower")) {
      ruleBased.setLowerCaseFirst(true);
    } else if (caseFirstArg.equalsIgnoreCase("upper")) {
      ruleBased.setUpperCaseFirst(true);
    } else {
      throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid caseFirst: " + caseFirstArg);
    }
  }

  if (numericArg != null) {
    ruleBased.setNumericCollation(Boolean.parseBoolean(numericArg));
  }

  if (variableTopArg != null) {
    ruleBased.setVariableTop(variableTopArg);
  }

  analyzer = new ICUCollationKeyAnalyzer(collator);
}
 
Example #12
Source File: TestICUPortabilityBug.java    From database with GNU General Public License v2.0 4 votes vote down vote up
/**
 * Unit test for ICU generation of Unicode sort keys. The expected key depends
 * on the ICU version found at runtime; only 3.6 and 4.8 are recognised and any
 * other version fails the test.
 * <pre>
 * Input   : "__globalRowStore"
 * 
 * Expected: [7, -124, 7, -124, 53, 63, 69, 43, 41, 63, 75, 69, 85, 77, 79, 69, 75, 49, 1, 20, 1, 126, -113, -124, -113, 8]
 * </pre>
 */
public void test_ICU_Unicode_SortKey() {

    final String input = "__globalRowStore";

    /*
     * Collator for en_US with explicit strength and decomposition mode.
     */
    final RuleBasedCollator collator = (RuleBasedCollator) Collator
            .getInstance(new Locale("en", "US"));
    collator.setStrength(Collator.TERTIARY);
    collator.setDecomposition(Collator.NO_DECOMPOSITION);

    // Scratch buffer that receives the raw sort key.
    final RawCollationKey raw = new RawCollationKey(128);
    collator.getRawCollationKey(input, raw);

    // Copy the key out of the buffer, leaving off the trailing nul byte.
    final byte[] actual = Arrays.copyOf(raw.bytes, raw.size - 1);

    if (log.isInfoEnabled()) {
        log.info("Actual  : " + Arrays.toString(actual));
    }

    /*
     * Select the expected sort key for the ICU version found at runtime.
     */
    final int icuMajor = VersionInfo.ICU_VERSION.getMajor();
    final int icuMinor = VersionInfo.ICU_VERSION.getMinor();

    final byte[] expected;
    if (icuMajor == 3 && icuMinor == 6) {
        // bigdata was initially deployed against v3.6.
        expected = new byte[] {
                7, -124, 7, -124, 53, 63, 69, 43, 41, 63, 75, 69, 85,
                77, 79, 69, 75, 49, 1, 20, 1, 126, -113, -124, -113, 8 };
    } else if (icuMajor == 4 && icuMinor == 8) {
        // The next bundled version was 4.8.
        expected = new byte[] {
                6, 12, 6, 12, 51, 61, 67, 41, 39, 61, 73, 67, 83,
                75, 77, 67, 73, 47, 1, 20, 1, 126, -113, -124, -113, 8 };
    } else {
        throw new AssertionFailedError("Not an expected ICU version: "
                + VersionInfo.ICU_VERSION);
    }

    if (log.isInfoEnabled()) {
        log.info("Expected: " + Arrays.toString(expected));
    }

    if (!Arrays.equals(expected, actual)) {
        fail("Expected: " + Arrays.toString(expected) + ", " + //
                "Actual: " + Arrays.toString(actual));
    }

}
 
Example #13
Source File: BlobDescriptorList.java    From aard2-android with GNU General Public License v3.0 4 votes vote down vote up
/**
 * Creates an empty descriptor list backed by {@code store}.
 *
 * <p>Sets up the comparators used for sorting (by key, by creation time, by last
 * access), selects time-descending as the initial sort order, and prepares a
 * primary-strength root-locale collator with shifted alternate handling for
 * filtering, plus a handler bound to the main looper.
 *
 * @param app     the owning application
 * @param store   the store assigned to this list
 * @param maxSize the size limit assigned to this list
 */
BlobDescriptorList(Application app, DescriptorStore<BlobDescriptor> store, int maxSize) {
    this.app = app;
    this.store = store;
    this.maxSize = maxSize;
    this.list = new ArrayList<BlobDescriptor>();
    this.filteredList = new ArrayList<BlobDescriptor>();
    this.dataSetObservable = new DataSetObservable();
    this.filter = "";
    this.keyComparator = Slob.Strength.QUATERNARY.comparator;

    // Orders descriptors by key using the quaternary-strength key comparator.
    this.nameComparatorAsc = new Comparator<BlobDescriptor>() {
        @Override
        public int compare(BlobDescriptor left, BlobDescriptor right) {
            return keyComparator.compare(left.key, right.key);
        }
    };
    this.nameComparatorDesc = Collections.reverseOrder(this.nameComparatorAsc);

    // Orders descriptors by creation time.
    this.timeComparatorAsc = new Comparator<BlobDescriptor>() {
        @Override
        public int compare(BlobDescriptor left, BlobDescriptor right) {
            return Util.compare(left.createdAt, right.createdAt);
        }
    };
    this.timeComparatorDesc = Collections.reverseOrder(this.timeComparatorAsc);

    // Orders descriptors by last access; the arguments are deliberately swapped.
    this.lastAccessComparator = new Comparator<BlobDescriptor>() {
        @Override
        public int compare(BlobDescriptor left, BlobDescriptor right) {
            return Util.compare(right.lastAccess, left.lastAccess);
        }
    };

    // Initial sort: by time, descending.
    this.order = SortOrder.TIME;
    this.ascending = false;
    setSort(this.order, this.ascending);

    // Work on a private clone of the root collator before changing its settings.
    try {
        this.filterCollator = (RuleBasedCollator) Collator.getInstance(Locale.ROOT).clone();
    } catch (CloneNotSupportedException e) {
        throw new RuntimeException(e);
    }
    this.filterCollator.setStrength(Collator.PRIMARY);
    this.filterCollator.setAlternateHandlingShifted(true);

    this.handler = new Handler(Looper.getMainLooper());
}
 
Example #14
Source File: CollatorObject.java    From es6draft with MIT License 4 votes vote down vote up
/**
 * Builds an ICU collator from this object's settings: locale tag, usage,
 * numeric flag, case-first option, sensitivity and punctuation handling.
 *
 * @return the configured collator
 * @throws AssertionError if {@code caseFirst} or {@code sensitivity} holds a
 *         value other than the ones handled below
 */
private Collator createCollator() {
    ULocale target = ULocale.forLanguageTag(this.locale);
    if ("search".equals(usage)) {
        // "search" usage cannot be set through unicode extensions (u-co-search), handle here:
        target = target.setKeywordValue("collation", "search");
    }

    final RuleBasedCollator result = (RuleBasedCollator) Collator.getInstance(target);
    result.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
    result.setNumericCollation(numeric);

    // case-first ordering
    switch (caseFirst) {
    case "false":
        // clear whichever preference the collator currently has
        if (result.isLowerCaseFirst()) {
            result.setLowerCaseFirst(false);
        }
        if (result.isUpperCaseFirst()) {
            result.setUpperCaseFirst(false);
        }
        break;
    case "lower":
        result.setLowerCaseFirst(true);
        break;
    case "upper":
        result.setUpperCaseFirst(true);
        break;
    default:
        throw new AssertionError();
    }

    // sensitivity -> strength (and case level)
    switch (sensitivity) {
    case "variant":
        result.setStrength(Collator.TERTIARY);
        break;
    case "case":
        // primary strength plus case level: case differences count, accents do not
        result.setStrength(Collator.PRIMARY);
        result.setCaseLevel(true);
        break;
    case "accent":
        result.setStrength(Collator.SECONDARY);
        break;
    case "base":
        result.setStrength(Collator.PRIMARY);
        break;
    default:
        throw new AssertionError();
    }

    result.setAlternateHandlingShifted(ignorePunctuation);
    return result;
}