package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.tools; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UProperty; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UnicodeSetIterator; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.lucene.util.SuppressForbidden; import java.io.BufferedReader; import java.io.File; import java.io.FileFilter; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Writer; import java.net.URL; import java.net.URLConnection; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Generates UTR30 data files from icu-project.org. * <ol> * <li> * Downloads nfc.txt, nfkc.txt and nfkc_cf.txt from icu-project.org, * </li> * <li> * Converts round-trip mappings in nfc.txt (containing '=') * that map to at least one [:Diacritic:] character * into one-way mappings ('>' instead of '='). * </li> * </ol> */ public class UTR30DataFileGenerator { private static final Logger logger = LogManager.getLogger(UTR30DataFileGenerator.class.getName()); private static final String ICU_SVN_TAG_URL = "http://source.icu-project.org/repos/icu/icu/tags"; private static final String ICU_DATA_NORM2_PATH = "icu4c/source/data/unidata/norm2"; private static final String NFC_TXT = "nfc.txt"; private static final String NFKC_TXT = "nfkc.txt"; private static final String NFKC_CF_TXT = "nfkc_cf.txt"; private static final Pattern ROUND_TRIP_MAPPING_LINE_PATTERN = Pattern.compile("^\\s*([^=]+?)\\s*=\\s*(.*)$"); private static final Pattern VERBATIM_RULE_LINE_PATTERN = Pattern.compile("^#\\s*Rule:\\s*verbatim\\s*$", Pattern.CASE_INSENSITIVE); private static final Pattern RULE_LINE_PATTERN = Pattern.compile("^#\\s*Rule:\\s*(.*)>(.*)", Pattern.CASE_INSENSITIVE); private static final Pattern BLANK_OR_COMMENT_LINE_PATTERN = Pattern.compile("^\\s*(?:#.*)?$"); private static final Pattern NUMERIC_VALUE_PATTERN = Pattern.compile("Numeric[-\\s_]*Value", Pattern.CASE_INSENSITIVE); private static byte[] bytes = new byte[8192]; public void execute(String releaseTag, String dir) throws IOException { getNFKCDataFilesFromIcuProject(releaseTag, dir); expandRulesInUTR30DataFiles(dir); } @SuppressForbidden(reason = "fetching resources from ICU repository is trusted") private static void getNFKCDataFilesFromIcuProject(String releaseTag, String dir) throws IOException { URL icuTagsURL = new URL(ICU_SVN_TAG_URL + "/"); URL icuReleaseTagURL = new URL(icuTagsURL, releaseTag + "/"); URL norm2url = new URL(icuReleaseTagURL, ICU_DATA_NORM2_PATH + "/"); logger.info("Downloading " + NFKC_TXT + " ... "); download(new URL(norm2url, NFKC_TXT), dir + NFKC_TXT); logger.info("done."); logger.info("Downloading " + NFKC_CF_TXT + " ... "); download(new URL(norm2url, NFKC_CF_TXT), dir + NFKC_CF_TXT); logger.info("done."); logger.info("Downloading " + NFKC_CF_TXT + " and making diacritic rules one-way ... "); URLConnection connection = openConnection(new URL(norm2url, NFC_TXT)); try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8)); Writer writer = new OutputStreamWriter(new FileOutputStream(dir + NFC_TXT), StandardCharsets.UTF_8)) { String line; while (null != (line = reader.readLine())) { Matcher matcher = ROUND_TRIP_MAPPING_LINE_PATTERN.matcher(line); if (matcher.matches()) { final String leftHandSide = matcher.group(1); final String rightHandSide = matcher.group(2).trim(); List<String> diacritics = new ArrayList<>(); for (String outputCodePoint : rightHandSide.split("\\s+")) { int ch = Integer.parseInt(outputCodePoint, 16); if (UCharacter.hasBinaryProperty(ch, UProperty.DIACRITIC) // gennorm2 fails if U+0653-U+0656 are included in round-trip mappings || (ch >= 0x653 && ch <= 0x656)) { diacritics.add(outputCodePoint); } } if (!diacritics.isEmpty()) { StringBuilder replacementLine = new StringBuilder(); replacementLine.append(leftHandSide).append(">").append(rightHandSide); replacementLine.append(" # one-way: diacritic"); if (diacritics.size() > 1) { replacementLine.append("s"); } for (String diacritic : diacritics) { replacementLine.append(" ").append(diacritic); } line = replacementLine.toString(); } } writer.write(line); writer.write("\n"); } } logger.info("done."); } @SuppressForbidden(reason = "fetching resources from ICU repository is trusted") private static void download(URL url, String outputFile) throws IOException { final URLConnection connection = openConnection(url); int numBytes; try (InputStream inputStream = connection.getInputStream(); OutputStream outputStream = new FileOutputStream(outputFile)) { while (-1 != (numBytes = inputStream.read(bytes))) { outputStream.write(bytes, 0, numBytes); } } } @SuppressForbidden(reason = "fetching resources from ICU repository is trusted") private static URLConnection openConnection(URL url) throws IOException { final URLConnection connection = url.openConnection(); connection.setUseCaches(false); connection.addRequestProperty("Cache-Control", "no-cache"); connection.connect(); return connection; } @SuppressForbidden(reason = "nio file porting will be done later") private static void expandRulesInUTR30DataFiles(String dir) throws IOException { FileFilter filter = pathname -> { String name = pathname.getName(); return pathname.isFile() && name.matches(".*\\.(?s:txt)") && !name.equals(NFC_TXT) && !name.equals(NFKC_TXT) && !name.equals(NFKC_CF_TXT); }; if (dir != null) { File dirFile = new File(dir); File[] files = dirFile.listFiles(filter); if (files != null) { for (File file : files) { expandDataFileRules(file); } } } } @SuppressForbidden(reason = "nio file porting will be done later") private static void expandDataFileRules(File file) throws IOException { StringBuilder builder = new StringBuilder(); String line; boolean verbatim = false; boolean modified = false; int lineNum = 0; try (InputStream inputStream = new FileInputStream(file); BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) { while (null != (line = bufferedReader.readLine())) { ++lineNum; if (VERBATIM_RULE_LINE_PATTERN.matcher(line).matches()) { verbatim = true; builder.append(line).append("\n"); } else { Matcher ruleMatcher = RULE_LINE_PATTERN.matcher(line); if (ruleMatcher.matches()) { verbatim = false; builder.append(line).append("\n"); try { String leftHandSide = ruleMatcher.group(1).trim(); String rightHandSide = ruleMatcher.group(2).trim(); expandSingleRule(builder, leftHandSide, rightHandSide); } catch (IllegalArgumentException e) { logger.error("ERROR in " + file.getName() + " line #" + lineNum + ":", e); throw e; } modified = true; } else { if (BLANK_OR_COMMENT_LINE_PATTERN.matcher(line).matches()) { builder.append(line).append("\n"); } else { if (verbatim) { builder.append(line).append("\n"); } else { modified = true; } } } } } } if (modified) { logger.info("Expanding rules in and overwriting " + file.getName()); try (Writer writer = new OutputStreamWriter(new FileOutputStream(file, false), StandardCharsets.UTF_8)) { writer.write(builder.toString()); } } } private static void expandSingleRule(StringBuilder builder, String leftHandSide, String rightHandSide) { UnicodeSet set = new UnicodeSet(leftHandSide, UnicodeSet.IGNORE_SPACE); boolean numericValue = NUMERIC_VALUE_PATTERN.matcher(rightHandSide).matches(); for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.nextRange(); ) { if (it.codepoint != UnicodeSetIterator.IS_STRING) { if (numericValue) { for (int cp = it.codepoint; cp <= it.codepointEnd; ++cp) { builder.append(String.format(Locale.ROOT, "%04X", cp)).append('>'); builder.append(String.format(Locale.ROOT, "%04X", 0x30 + UCharacter.getNumericValue(cp))); builder.append(" # ").append(UCharacter.getName(cp)); builder.append("\n"); } } else { builder.append(String.format(Locale.ROOT, "%04X", it.codepoint)); if (it.codepointEnd > it.codepoint) { builder.append("..").append(String.format(Locale.ROOT, "%04X", it.codepointEnd)); } builder.append('>').append(rightHandSide).append("\n"); } } else { logger.error("ERROR: String '" + it.getString() + "' found in UnicodeSet"); } } } }