Java Code Examples for com.ibm.icu.lang.UCharacter#hasBinaryProperty()

The following examples show how to use com.ibm.icu.lang.UCharacter#hasBinaryProperty(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source File: GenerateUTR30DataFiles.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Downloads the normalization data files from the ICU project into the current
 * working directory.
 *
 * <p>{@code NFKC_TXT} and {@code NFKC_CF_TXT} are downloaded as-is. {@code NFC_TXT}
 * is rewritten while it is copied: every line matching
 * {@code ROUND_TRIP_MAPPING_LINE_PATTERN} whose right-hand side contains at least one
 * code point with the {@code Diacritic} binary property (or one of U+0653-U+0656) is
 * turned into a one-way mapping ({@code >}), with a trailing comment listing the
 * diacritics that triggered the change. All other lines are copied unchanged.
 *
 * @throws IOException if a download fails or the output file cannot be written
 */
private static void getNFKCDataFilesFromIcuProject() throws IOException {
  URL icuTagsURL = new URL(ICU_GIT_TAG_URL + "/");
  URL icuReleaseTagURL = new URL(icuTagsURL, ICU_RELEASE_TAG + "/");
  URL norm2url = new URL(icuReleaseTagURL, ICU_DATA_NORM2_PATH + "/");

  System.err.print("Downloading " + NFKC_TXT + " ... ");
  download(new URL(norm2url, NFKC_TXT), NFKC_TXT);
  System.err.println("done.");
  System.err.print("Downloading " + NFKC_CF_TXT + " ... ");
  download(new URL(norm2url, NFKC_CF_TXT), NFKC_CF_TXT);
  System.err.println("done.");

  // The file processed below is NFC_TXT, so that is the name reported to the user.
  System.err.print("Downloading " + NFC_TXT + " and making diacritic rules one-way ... ");
  URLConnection connection = openConnection(new URL(norm2url, NFC_TXT));
  // try-with-resources guarantees both streams are closed even if opening the
  // output file fails or closing one of them throws.
  try (BufferedReader reader = new BufferedReader(
           new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8));
       Writer writer = new OutputStreamWriter(
           new FileOutputStream(NFC_TXT), StandardCharsets.UTF_8)) {
    String line;
    while (null != (line = reader.readLine())) {
      Matcher matcher = ROUND_TRIP_MAPPING_LINE_PATTERN.matcher(line);
      if (matcher.matches()) {
        final String leftHandSide = matcher.group(1);
        final String rightHandSide = matcher.group(2).trim();
        // Collect the output code points that force this mapping to be one-way.
        List<String> diacritics = new ArrayList<>();
        for (String outputCodePoint : rightHandSide.split("\\s+")) {
          int ch = Integer.parseInt(outputCodePoint, 16);
          if (UCharacter.hasBinaryProperty(ch, UProperty.DIACRITIC)
              // gennorm2 fails if U+0653-U+0656 are included in round-trip mappings
              || (ch >= 0x653 && ch <= 0x656)) {
            diacritics.add(outputCodePoint);
          }
        }
        if ( ! diacritics.isEmpty()) {
          StringBuilder replacementLine = new StringBuilder();
          replacementLine.append(leftHandSide).append(">").append(rightHandSide);
          replacementLine.append("  # one-way: diacritic");
          if (diacritics.size() > 1) {
            replacementLine.append("s");
          }
          for (String diacritic : diacritics) {
            replacementLine.append(" ").append(diacritic);
          }
          line = replacementLine.toString();
        }
      }
      writer.write(line);
      writer.write("\n");
    }
  }
  System.err.println("done.");
}
 
Example 2
Source File: UTR30DataFileGenerator.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 4 votes vote down vote up
/**
 * Downloads the normalization data files for the given ICU release tag.
 *
 * <p>{@code NFKC_TXT} and {@code NFKC_CF_TXT} are downloaded as-is. {@code NFC_TXT}
 * is rewritten while it is copied: every line matching
 * {@code ROUND_TRIP_MAPPING_LINE_PATTERN} whose right-hand side contains at least one
 * code point with the {@code Diacritic} binary property (or one of U+0653-U+0656) is
 * turned into a one-way mapping ({@code >}), with a trailing comment listing the
 * diacritics that triggered the change. All other lines are copied unchanged.
 *
 * @param releaseTag name of the ICU release tag, resolved relative to {@code ICU_SVN_TAG_URL}
 * @param dir        prefix for the output file names; it is concatenated verbatim with each
 *                   file name, so it must already end with a path separator
 * @throws IOException if a download fails or an output file cannot be written
 */
@SuppressForbidden(reason = "fetching resources from ICU repository is trusted")
private static void getNFKCDataFilesFromIcuProject(String releaseTag, String dir) throws IOException {
    URL icuTagsURL = new URL(ICU_SVN_TAG_URL + "/");
    URL icuReleaseTagURL = new URL(icuTagsURL, releaseTag + "/");
    URL norm2url = new URL(icuReleaseTagURL, ICU_DATA_NORM2_PATH + "/");
    logger.info("Downloading " + NFKC_TXT + " ... ");
    download(new URL(norm2url, NFKC_TXT), dir + NFKC_TXT);
    logger.info("done.");
    logger.info("Downloading " + NFKC_CF_TXT + " ... ");
    download(new URL(norm2url, NFKC_CF_TXT), dir + NFKC_CF_TXT);
    logger.info("done.");
    // The file processed below is NFC_TXT, so that is the name that gets logged.
    logger.info("Downloading " + NFC_TXT + " and making diacritic rules one-way ... ");
    URLConnection connection = openConnection(new URL(norm2url, NFC_TXT));
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(),
            StandardCharsets.UTF_8));
         Writer writer = new OutputStreamWriter(new FileOutputStream(dir + NFC_TXT), StandardCharsets.UTF_8)) {
        String line;
        while (null != (line = reader.readLine())) {
            Matcher matcher = ROUND_TRIP_MAPPING_LINE_PATTERN.matcher(line);
            if (matcher.matches()) {
                final String leftHandSide = matcher.group(1);
                final String rightHandSide = matcher.group(2).trim();
                // Collect the output code points that force this mapping to be one-way.
                List<String> diacritics = new ArrayList<>();
                for (String outputCodePoint : rightHandSide.split("\\s+")) {
                    int ch = Integer.parseInt(outputCodePoint, 16);
                    if (UCharacter.hasBinaryProperty(ch, UProperty.DIACRITIC)
                            // gennorm2 fails if U+0653-U+0656 are included in round-trip mappings
                            || (ch >= 0x653 && ch <= 0x656)) {
                        diacritics.add(outputCodePoint);
                    }
                }
                if (!diacritics.isEmpty()) {
                    StringBuilder replacementLine = new StringBuilder();
                    replacementLine.append(leftHandSide).append(">").append(rightHandSide);
                    replacementLine.append("  # one-way: diacritic");
                    if (diacritics.size() > 1) {
                        replacementLine.append("s");
                    }
                    for (String diacritic : diacritics) {
                        replacementLine.append(" ").append(diacritic);
                    }
                    line = replacementLine.toString();
                }
            }
            writer.write(line);
            writer.write("\n");
        }
    }
    logger.info("done.");
}
 
Example 3
Source File: UnicodeData.java    From es6draft with MIT License 4 votes vote down vote up
/**
 * Checks the given code point against the binary property identified by
 * {@code propertyId}.
 *
 * @param codePoint
 *            the code point to test
 * @return the result of {@code UCharacter.hasBinaryProperty} for the code point
 */
public boolean has(int codePoint) {
    final boolean hasProperty = UCharacter.hasBinaryProperty(codePoint, propertyId);
    return hasProperty;
}
 
Example 4
Source File: Characters.java    From es6draft with MIT License 3 votes vote down vote up
/**
 * <strong>[11.6] Names and Keywords</strong>
 * 
 * <pre>
 * IdentifierStart ::
 *     UnicodeIDStart
 *     $
 *     _
 *     \ UnicodeEscapeSequence
 * UnicodeIDStart ::
 *     any Unicode character with the Unicode property "ID_Start".
 * </pre>
 * 
 * Code points up to 127 are decided locally (ASCII letters, {@code $} and {@code _});
 * everything above is looked up through the {@code ID_Start} binary property.
 * 
 * @param c
 *            the code point to test
 * @return {@code true} if the code point may start an identifier
 */
public static boolean isIdentifierStart(int c) {
    if (c > 127) {
        return UCharacter.hasBinaryProperty(c, UProperty.ID_START);
    }
    if (c == '$' || c == '_') {
        return true;
    }
    final boolean upperCaseLetter = 'A' <= c && c <= 'Z';
    final boolean lowerCaseLetter = 'a' <= c && c <= 'z';
    return upperCaseLetter || lowerCaseLetter;
}
 
Example 5
Source File: Characters.java    From es6draft with MIT License 3 votes vote down vote up
/**
 * <strong>[11.6] Names and Keywords</strong>
 * 
 * <pre>
 * IdentifierPart ::
 *     UnicodeIDContinue
 *     $
 *     _
 *     \ UnicodeEscapeSequence
 *     &lt;ZWNJ&gt;
 *     &lt;ZWJ&gt;
 * UnicodeIDContinue ::
 *     any Unicode character with the Unicode property "ID_Continue"
 * </pre>
 * 
 * Code points up to 127 are decided locally (ASCII letters, digits, {@code $} and
 * {@code _}); above that, U+200C and U+200D are accepted directly and everything
 * else is looked up through the {@code ID_Continue} binary property.
 * 
 * @param c
 *            the code point to test
 * @return {@code true} if the code point may appear inside an identifier
 */
public static boolean isIdentifierPart(int c) {
    if (c > 127) {
        // ZWNJ (U+200C) and ZWJ (U+200D) are accepted without a property lookup.
        final boolean joiner = c == '\u200C' || c == '\u200D';
        return joiner || UCharacter.hasBinaryProperty(c, UProperty.ID_CONTINUE);
    }
    if (c == '$' || c == '_') {
        return true;
    }
    final boolean letter = ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z');
    final boolean digit = '0' <= c && c <= '9';
    return letter || digit;
}
 
Example 6
Source File: Characters.java    From es6draft with MIT License 3 votes vote down vote up
/**
 * <strong>[11.6] Names and Keywords</strong>
 * 
 * <pre>
 * UnicodeIDStart ::
 *     any Unicode character with the Unicode property "ID_Start".
 * </pre>
 * 
 * Code points up to 127 are decided locally (ASCII letters only); everything above
 * is looked up through the {@code ID_Start} binary property.
 * 
 * @param c
 *            the code point to test
 * @return {@code true} if the code point is a UnicodeIDStart character
 */
public static boolean isUnicodeIDStart(int c) {
    if (c > 127) {
        return UCharacter.hasBinaryProperty(c, UProperty.ID_START);
    }
    final boolean upperCaseLetter = 'A' <= c && c <= 'Z';
    final boolean lowerCaseLetter = 'a' <= c && c <= 'z';
    return upperCaseLetter || lowerCaseLetter;
}
 
Example 7
Source File: Characters.java    From es6draft with MIT License 3 votes vote down vote up
/**
 * <strong>[11.6] Names and Keywords</strong>
 * 
 * <pre>
 * UnicodeIDContinue ::
 *     any Unicode character with the Unicode property "ID_Continue"
 * </pre>
 * 
 * Code points up to 127 are decided locally (ASCII letters, digits and {@code _});
 * everything above is looked up through the {@code ID_Continue} binary property.
 * 
 * @param c
 *            the code point to test
 * @return {@code true} if the code point is a UnicodeIDContinue character
 */
public static boolean isUnicodeIDContinue(int c) {
    if (c > 127) {
        return UCharacter.hasBinaryProperty(c, UProperty.ID_CONTINUE);
    }
    if (c == '_') {
        return true;
    }
    final boolean letter = ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z');
    final boolean digit = '0' <= c && c <= '9';
    return letter || digit;
}